In [91]:
import numpy as np
from data_generator.data_loader import DataCreator

model_names = [
    'Llama-2-13b-chat-hf',
    'Llama-2-70b-chat-hf',
    'Mistral-7B-Instruct-v0.2',
    'Mixtral-8x7B-Instruct-v0.1',
    'gemma-2b',
    'gemma-7b',
    'phi-2',
]
task_name = "gsm8k"

datacreator = DataCreator(task_name, model_names=model_names, task_type="lang")

# Only the first item produced by load() is used in this notebook.
train_data, test_data, num_models, ds_name = next(iter(datacreator.load()))

# The last column of train_data is taken as the label. The remaining columns
# are cut into len(model_names) equal-width chunks, and the arg-max inside
# each chunk is taken as that model's prediction.
labels = train_data[:, -1]
per_model_chunks = np.split(train_data[:, :-1], len(model_names), axis=1)
model_preds = np.array([chunk.argmax(1) for chunk in per_model_chunks]).T

# Fraction of rows on which each model's prediction equals the label.
for name, model_col in zip(model_names, model_preds.T):
    print(name, np.mean(labels == model_col))

['Llama-2-13b-chat-hf', 'Llama-2-70b-chat-hf', 'Mistral-7B-Instruct-v0.2', 'Mixtral-8x7B-Instruct-v0.1', 'gemma-2b', 'gemma-7b', 'phi-2']
Truncating the space size
Truncating the space size
Llama-2-13b-chat-hf 0.41189746896662904
Llama-2-70b-chat-hf 0.5661776559729164
Mistral-7B-Instruct-v0.2 0.5779461550862486
Mixtral-8x7B-Instruct-v0.1 0.6886990166048687
gemma-2b 0.24214089956472676
gemma-7b 0.745445752055457
phi-2 0.7173948089634048


In [90]:
# Evaluate a "follow the previous winner" selection rule on the test split,
# restricted to the models at positions `comb_idx` of `model_names`.
comb_idx = [3, 5, 6]

# Seeded generator so the printed score is reproducible across re-runs.
rng = np.random.default_rng(seed=0)

# The test-split arrays get their own names on purpose: later cells read the
# globals `labels` and `model_preds` together with `train_data` and expect
# one prediction column per entry of `model_names`, so this cell must not
# overwrite them with test-split / 3-column arrays.
test_labels = test_data[:, -1]
test_model_preds = np.array(
    [chunk.argmax(1) for chunk in np.split(test_data[:, :-1], len(model_names), axis=1)]
).T
comb_preds = test_model_preds[:, comb_idx]

preds = []
for i in range(len(comb_preds)):
    # Default: choose uniformly among all selected models' predictions.
    # This covers the first sample and the case where no selected model was
    # right on the previous sample.
    candidates = comb_preds[i]
    if i > 0:
        prev_correct = comb_preds[i - 1] == test_labels[i - 1]
        if prev_correct.any():
            # Otherwise choose only among the models that matched the label
            # on the previous sample.
            candidates = comb_preds[i, prev_correct]
    preds.append(candidates[rng.integers(len(candidates))])

preds = np.array(preds)
# Accuracy of the selection rule on the test split, in percent.
print(np.mean(test_labels == preds) * 100)

72.24456958970234


In [93]:
# Display the current contents of the `labels` array.
labels

array([0., 0., 0., ..., 6., 0., 0.])

In [92]:
# Display the current `model_preds` array (arg-max predictions, one column per model).
model_preds

array([[0, 0, 0, ..., 1, 0, 0],
       [1, 1, 0, ..., 3, 0, 0],
       [2, 0, 0, ..., 2, 0, 0],
       ...,
       [0, 0, 0, ..., 7, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 4, 0, 0]])

In [95]:
num_models = len(model_names)

# Expand the label vector to one column per model and compare element-wise.
# Despite the name, error_arr holds 1 where a prediction EQUALS the label and
# 0 where it differs.
label_grid = np.broadcast_to(labels[:, None], (labels.shape[0], num_models))
error_arr = (model_preds == label_grid).astype(int)
error_arr

array([[1, 1, 1, ..., 0, 1, 1],
       [0, 0, 1, ..., 0, 1, 1],
       [0, 1, 1, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 1, 1],
       [0, 1, 1, ..., 0, 1, 1]])

In [96]:
from env.ens_env import HistData
from env.ens_metrics import calc_div_acc


# Bundle the per-sample arrays under the keys passed to HistData.
# NOTE(review): `error_arr` is 1 where prediction == label (see the cell that
# builds it) — confirm this is the convention HistData / calc_div_acc expect.
data_dict = dict(
    prob_arr=train_data[:, :-1],  # every column of train_data except the last
    error_arr=error_arr,
    pred_arr=model_preds,
    label_arr=labels,
)

hist_data = HistData(data_dict)


In [98]:
import itertools

# Score every subset of at least two models with calc_div_acc.
ens_sizes = np.arange(2, num_models + 1)
scores = []
for ens_size in ens_sizes:
    print(ens_size)
    combinations = list(itertools.combinations(range(num_models), ens_size))
    for comb in combinations:
        # 0/1 membership mask over all models: 1 for members of this subset.
        comb_idx = np.array([int(m in comb) for m in range(num_models)], dtype=int)
        scores += [calc_div_acc(comb_idx, hist_data)]
scores = np.array(scores)

2
3
4
5
6
7


In [115]:
from env.diversity_stats import calc_stat_matrices

# Map each model name to its own column of error_arr, then compute the
# statistic matrices from that mapping.
error_dict = {}
for col in range(num_models):
    error_dict[model_names[col]] = error_arr[:, col]
stats_matrices = calc_stat_matrices(error_dict)

In [116]:
# List the statistics available in the result of calc_stat_matrices.
stats_matrices.keys()

dict_keys(['q_statistics', 'correlation_co-efficiency', 'binary_disagreement', 'kappa_statistics'])

In [119]:
# Show the "binary_disagreement" entry of the calc_stat_matrices result.
stats_matrices["binary_disagreement"]

Unnamed: 0,Llama-2-13b-chat-hf,Llama-2-70b-chat-hf,Mistral-7B-Instruct-v0.2,Mixtral-8x7B-Instruct-v0.1,gemma-2b,gemma-7b,phi-2
Llama-2-13b-chat-hf,0.0,0.324198,0.33242,0.438014,0.316782,0.399,0.38836
Llama-2-70b-chat-hf,0.324198,0.0,0.313236,0.335966,0.420764,0.314042,0.315976
Mistral-7B-Instruct-v0.2,0.33242,0.313236,0.0,0.337417,0.417056,0.278414,0.289699
Mixtral-8x7B-Instruct-v0.1,0.438014,0.335966,0.337417,0.0,0.531033,0.27922,0.28728
gemma-2b,0.316782,0.420764,0.417056,0.531033,0.0,0.530066,0.517492
gemma-7b,0.399,0.314042,0.278414,0.27922,0.530066,0.0,0.208931
phi-2,0.38836,0.315976,0.289699,0.28728,0.517492,0.208931,0.0


In [122]:
# For every subset of at least two models, print the mean of the
# "binary_disagreement" entries over all unordered pairs inside the subset.
disagreement = stats_matrices["binary_disagreement"].values
ens_sizes = np.arange(2, num_models + 1)
for ens_size in ens_sizes:
    print(ens_size)
    combinations = list(itertools.combinations(range(num_models), ens_size))
    for comb in combinations:
        pairs = list(itertools.combinations(comb, 2))
        # Each pair is a (row, col) index into the matrix.
        val = sum(disagreement[pair] for pair in pairs) / len(pairs)
        print(comb, val)

2
(0, 1) 0.32419796872481055
(0, 2) 0.3324197968724811
(0, 3) 0.4380138642592294
(0, 4) 0.31678220216024505
(0, 5) 0.39900048363694984
(0, 6) 0.38836047073996455
(1, 2) 0.31323553119458325
(1, 3) 0.3359664678381428
(1, 4) 0.42076414638078347
(1, 5) 0.3140415927776882
(1, 6) 0.3159761405771401
(2, 3) 0.33741737868773175
(2, 4) 0.41705626309850075
(2, 5) 0.27841367080444945
(2, 6) 0.28969853296791875
(3, 4) 0.5310333709495405
(3, 5) 0.27921973238755443
(3, 6) 0.2872803482186039
(4, 5) 0.5300660970498146
(4, 6) 0.5174915363533774
(5, 6) 0.20893116234080283
3
(0, 1, 2) 0.32328443226395825
(0, 1, 3) 0.3660594336073943
(0, 1, 4) 0.3539147724219463
(0, 1, 5) 0.34574668171314954
(0, 1, 6) 0.3428448600139717
(0, 2, 3) 0.36928367993981404
(0, 2, 4) 0.35541942071040894
(0, 2, 5) 0.33661131710462683
(0, 2, 6) 0.33682626686012146
(0, 3, 4) 0.4286098124563383
(0, 3, 5) 0.3720780267612445
(0, 3, 6) 0.37121822773926594
(0, 4, 5) 0.4152829276156698
(0, 4, 6) 0.4075447364178624
(0, 5, 6) 0.3320973722392