In [1]:
import pandas as pd
import os
from collections import defaultdict
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pickle
import torch
def defaultdict_to_dict(d):
    """Recursively convert nested ``defaultdict`` objects into plain ``dict`` objects.

    The recursion follows the values of every mapping, including plain
    ``dict`` containers, so a ``defaultdict`` stored inside a regular ``dict``
    is converted as well. This matters before pickling, because a
    ``defaultdict`` whose factory is a lambda cannot be pickled.

    Args:
        d: Any object. Mappings are rebuilt; everything else is returned as is.

    Returns:
        A new plain ``dict`` (recursively) when ``d`` is a mapping, otherwise
        ``d`` itself.
    """
    if isinstance(d, dict):
        # ``defaultdict`` is a ``dict`` subclass, so this branch covers both.
        return {key: defaultdict_to_dict(value) for key, value in d.items()}
    return d

In [2]:
def find_first_files_with_str(directory, str_contain, index):
    """Return one entry of ``directory`` whose name starts with ``str_contain``.

    Matching names are sorted lexicographically and the entry at position
    ``index`` of that sorted list is returned.

    Args:
        directory: Path of the directory to list.
        str_contain: Prefix the entry name must start with (any length).
        index: Position in the sorted list of matches; negative values index
            from the end, as with normal list indexing.

    Returns:
        The selected entry name (not a full path), or ``None`` when there is
        no match at that position.
    """
    # ``startswith`` works for any prefix length, not only 4 characters.
    matching = sorted(
        name for name in os.listdir(directory) if name.startswith(str_contain)
    )
    try:
        return matching[index]
    except IndexError:
        # Covers both "no matches at all" and "fewer matches than requested".
        return None

In [3]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Reading data and models
groups = [str(i) for i in range(1, 4)]
peers = [str(i) for i in range(8090, 8100)]

# Peer identifiers 8090..8099 are exposed as peer numbers 1..10 in variable names.
PEER_ID_OFFSET = 8089
# Position (in sorted file-name order) of the saved model to load for every peer.
# Original note: 10 is the largest number of saved models that all peers have
# finished training.
MODEL_CHECKPOINT_INDEX = 10

# Later cells refer to the loaded objects by name (train_df_group1,
# train_df_group1_peer1, test_df_group1, model_group1_peer1, ...), so they are
# registered in the notebook namespace directly instead of through exec().
notebook_ns = globals()

# Reading individual peer datasets & group datasets:
# train_df_group{g}, train_df_group{g}_peer{p}, test_df_group{g}, test_df_group{g}_peer{p}
for group in groups:
    datasets_folder = os.path.join('aggregated_results', f'10_peers_10kEpochs_group{group}', 'datasets')

    peer_train_frames = []
    for peer in peers:
        peer_nbr = int(peer) - PEER_ID_OFFSET
        peer_train_df = pd.read_csv(os.path.join(datasets_folder, f'{peer}_df.csv'))
        notebook_ns[f'train_df_group{group}_peer{peer_nbr}'] = peer_train_df
        peer_train_frames.append(peer_train_df)

    # One concat over all peers instead of growing a DataFrame inside the loop.
    group_train_df = pd.concat(peer_train_frames).drop_duplicates()
    notebook_ns[f'train_df_group{group}'] = group_train_df

    # The group test set is restricted to documents that occur in the group's training data.
    group_test_df = pd.read_csv(os.path.join(datasets_folder, 'test_df.csv'))
    group_test_df = group_test_df[group_test_df['doc_id'].isin(group_train_df['doc_id'].unique())]
    notebook_ns[f'test_df_group{group}'] = group_test_df

    # Each peer's test set is restricted to the documents in that peer's training data.
    for peer in peers:
        peer_nbr = int(peer) - PEER_ID_OFFSET
        peer_doc_ids = notebook_ns[f'train_df_group{group}_peer{peer_nbr}']['doc_id'].unique()
        notebook_ns[f'test_df_group{group}_peer{peer_nbr}'] = group_test_df[group_test_df['doc_id'].isin(peer_doc_ids)]

# Reading models: model_group{g}_peer{p}
for group in groups:
    model_folder = os.path.join('aggregated_results', f'10_peers_10kEpochs_group{group}', 'models')
    for peer in peers:
        model_file = find_first_files_with_str(model_folder, peer, MODEL_CHECKPOINT_INDEX)
        print (group, peer, model_file)
        if model_file is None:
            # Fail with a clear message instead of a TypeError from os.path.join(None).
            raise FileNotFoundError(
                f'No saved model at position {MODEL_CHECKPOINT_INDEX} for peer {peer} in {model_folder}'
            )
        notebook_ns[f'model_group{group}_peer{int(peer) - PEER_ID_OFFSET}'] = (
            T5ForConditionalGeneration.from_pretrained(os.path.join(model_folder, model_file))
        )

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1 8090 8090_2023-12-16 175932_my_t5_model
1 8091 8091_2023-12-16 180341_my_t5_model
1 8092 8092_2023-12-16 180103_my_t5_model
1 8093 8093_2023-12-16 191901_my_t5_model
1 8094 8094_2023-12-16 180320_my_t5_model
1 8095 8095_2023-12-16 180348_my_t5_model
1 8096 8096_2023-12-16 180129_my_t5_model
1 8097 8097_2023-12-16 180007_my_t5_model
1 8098 8098_2023-12-16 191911_my_t5_model
1 8099 8099_2023-12-16 191700_my_t5_model
2 8090 8090_2023-12-17 123459_my_t5_model
2 8091 8091_2023-12-17 123536_my_t5_model
2 8092 8092_2023-12-17 123202_my_t5_model
2 8093 8093_2023-12-17 123152_my_t5_model
2 8094 8094_2023-12-17 124010_my_t5_model
2 8095 8095_2023-12-17 123723_my_t5_model
2 8096 8096_2023-12-17 140032_my_t5_model
2 8097 8097_2023-12-17 123518_my_t5_model
2 8098 8098_2023-12-17 123711_my_t5_model
2 8099 8099_2023-12-17 123633_my_t5_model
3 8090 8090_2023-12-18 082034_my_t5_model
3 8091 8091_2023-12-18 070742_my_t5_model
3 8092 8092_2023-12-18 070621_my_t5_model
3 8093 8093_2023-12-18 081950_my_t

In [4]:
# Number of distinct documents per group, first for the train frames and then
# for the test frames.
for split_label, split_frames in (
    ('train', (train_df_group1, train_df_group2, train_df_group3)),
    ('test', (test_df_group1, test_df_group2, test_df_group3)),
):
    print (split_label)
    for split_frame in split_frames:
        print (split_frame['doc_id'].nunique())

train
2001
2032
2002
test
2001
2032
2002


### CALCULATING ACCURACIES ON MODELS' OWN DATA

In [5]:
def read_model_and_evaluate(group, peer, mode = 'global', sample_size = 1000):
    """Evaluate one peer's model on random samples of the train and test data.

    The model and DataFrames are looked up by name in the notebook namespace:
    ``model_group{group}_peer{peer}`` plus, for ``mode='global'``,
    ``train_df_group{group}`` / ``test_df_group{group}`` and, for
    ``mode='local'``, ``train_df_group{group}_peer{peer}`` /
    ``test_df_group{group}_peer{peer}``. Nothing is written back to the
    namespace.

    Args:
        group: Group identifier used in the variable names.
        peer: Peer number used in the variable names.
        mode: ``'global'`` (group-wide data) or ``'local'`` (the peer's own data).
        sample_size: Maximum number of rows sampled from each frame; frames
            with fewer rows are used in full.

    Returns:
        ``(acc_train, acc_test, df_tot, df_tst)``: exact-match accuracies of
        ``generate_text`` on the sampled train and test rows, and the sampled
        frames with the added ``generated_doc_id`` and
        ``generated_doc_id_log`` columns.

    Raises:
        ValueError: If ``mode`` is neither ``'global'`` nor ``'local'``.
        KeyError: If a required model or DataFrame variable does not exist.
    """
    if mode == 'global':
        data_suffix = f'group{group}'
    elif mode == 'local':
        data_suffix = f'group{group}_peer{peer}'
    else:
        raise ValueError(f"mode must be 'global' or 'local', got {mode!r}")

    def exact_match(df, column):
        # Share of rows whose prediction in `column` equals the true doc_id.
        return df[df['doc_id'] == df[column]].shape[0]/df.shape[0]

    namespace = globals()
    print ('group ', group, 'peer ', peer)
    model = namespace[f'model_group{group}_peer{peer}']
    train_source = namespace[f'train_df_{data_suffix}']
    test_source = namespace[f'test_df_{data_suffix}']

    # Unseeded sampling, capped at the frame size so small frames do not raise.
    df_tot = train_source.sample(min(sample_size, len(train_source))).copy()
    df_tst = test_source.sample(min(sample_size, len(test_source))).copy()
    print (df_tot.shape, df_tst.shape)

    df_tot['generated_doc_id'] = df_tot['query'].apply(lambda x: generate_text(x, model))
    df_tst['generated_doc_id'] = df_tst['query'].apply(lambda x: generate_text(x, model))
    acc_train = exact_match(df_tot, 'generated_doc_id')
    acc_test = exact_match(df_tst, 'generated_doc_id')

    print (f'{mode} training set accuracy: ', acc_train)
    print (f'{mode} test set accuracy: ', acc_test)

    # Second prediction path: argmax over the logits of a forward pass that
    # is given the true doc_id as labels (see generate_text_through_logits).
    df_tot['generated_doc_id_log'] = df_tot['query'].apply(lambda x: generate_text_through_logits(x, model, df_tot))
    df_tst['generated_doc_id_log'] = df_tst['query'].apply(lambda x: generate_text_through_logits(x, model, df_tst))

    acc_train_log = exact_match(df_tot, 'generated_doc_id_log')
    acc_test_log = exact_match(df_tst, 'generated_doc_id_log')

    print (f'{mode} training set accuracy log: ', acc_train_log)
    print (f'{mode} test set accuracy log: ', acc_test_log)
    return acc_train, acc_test, df_tot, df_tst

In [6]:
# Function to generate text
def generate_text(query, model):
    """Decode the first sequence ``model.generate`` returns for ``query``.

    The query is encoded with the notebook-level ``tokenizer``, generation is
    capped at ``max_length=20``, and special tokens are dropped when decoding.
    """
    encoded_query = tokenizer.encode(query, return_tensors='pt')
    generated = model.generate(encoded_query, max_length = 20)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [7]:
def generate_text_through_logits(query,model, df_tst):
    """Predict a doc id for ``query`` from the logits of one labelled forward pass.

    The true ``doc_id`` is taken from the first row of ``df_tst`` whose
    ``query`` equals ``query`` and is passed to the model as ``labels``. The
    prediction is the decoded per-position argmax of the returned logits.

    NOTE(review): because the true doc id is fed to the model as labels, this
    is not an independent prediction and the resulting accuracy is not
    comparable to free generation - confirm this is intended.

    Args:
        query: Query text; must occur in ``df_tst['query']``.
        model: Callable accepting ``input_ids`` and ``labels`` and returning
            an object with a ``logits`` attribute.
        df_tst: DataFrame with ``query`` and ``doc_id`` columns.

    Returns:
        The decoded argmax token sequence, with special tokens removed.

    Raises:
        IndexError: If ``query`` does not occur in ``df_tst``.
    """
    doc_id = df_tst[df_tst['query'] == query]['doc_id'].iloc[0]
    inputs = tokenizer(query, padding=False, return_tensors="pt", truncation=True).input_ids
    labels = tokenizer(doc_id, padding=True, return_tensors="pt", truncation=True).input_ids

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids=inputs, labels = labels)

    # Most likely token at each output position of the first (only) batch item.
    predicted_token_ids = torch.argmax(outputs.logits, dim=-1)[0]
    return tokenizer.decode(predicted_token_ids, skip_special_tokens=True)

In [8]:
# Accuracy store indexed as global_accuracies[group][peer][split][mode].
global_accuracies = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))))
for group in groups:
    for peer in peers:

        # Peer identifiers start at 8090; the per-peer variables are numbered from 1.
        p = int(peer) - 8089

        # Direct calls instead of exec(); int(group) keeps the argument the
        # same integer the exec-built call used to pass.
        acc_train_local, acc_test_local, df_tot_l, df_tst_l = read_model_and_evaluate(int(group), p, 'local')
        global_accuracies[group][peer]['train']['local'] = acc_train_local
        global_accuracies[group][peer]['test']['local'] = acc_test_local

        acc_train_global, acc_test_global, df_tot_g, df_tst_g = read_model_and_evaluate(int(group), p, 'global')
        global_accuracies[group][peer]['train']['global'] = acc_train_global
        global_accuracies[group][peer]['test']['global'] = acc_test_global

group  1 peer  1
(1000, 5) (1000, 5)
local training set accuracy:  0.981
local test set accuracy:  0.909
local training set accuracy log:  0.981
local test set accuracy log:  0.909
group  1 peer  1
(1000, 5) (1000, 5)
global training set accuracy:  0.977
global test set accuracy:  0.884
global training set accuracy log:  0.977
global test set accuracy log:  0.884
group  1 peer  2
(1000, 5) (1000, 5)



KeyboardInterrupt



In [11]:
# Rows where free generation and the logits-based prediction disagree.
predictions_differ = df_tot_g['generated_doc_id'].ne(df_tot_g['generated_doc_id_log'])
df_tot_g.loc[predictions_differ]

Unnamed: 0.1,Unnamed: 0,query_id,query,doc_id,doc,generated_doc_id,generated_doc_id_log
162,12967572,9682982,quality control duties,D2450576,https://www.bls.gov/ooh/production/quality-con...,D2531578,D25450576
1964,14534584,11604881,sss math,D235655,http://mathopenref.com/congruentsss.html,D1667190,D165655
2552,2151563,4382814,bouteflika,D1960677,https://www.britannica.com/biography/Abdelaziz...,D1969822,D1969877
893,2609542,2624344,car loans,D29479,https://santanderconsumerusa.com/,D1242700,D129479
555,14531330,3752354,ssi/ssdi benefits,D2197809,https://www.ssa.gov/disability/disability.html,D610732,D6197809
1586,13559621,3452459,russian federation countries,D1664655,https://history.state.gov/countries/russia,D1664542,D1664555
855,16111839,2906342,united airlines booking number,D2790515,http://www.bookmyflightticket.com/flights/unit...,D2791406,D2791415
231,17588713,10309374,what is social security disability income,D610732,https://www.disabilitysecrets.com/page5-13.html,D2197809,D210732
998,14295142,5600135,social security vs disability,D610732,https://www.disabilitysecrets.com/page5-13.html,D2197809,D210732
734,18809142,3630454,Ð½Ð¾ÑÐ²ÐµÐ³Ð¸Ñ,D176337,https://en.wikipedia.org/wiki/Norway,D2085252,D2086337


In [None]:
# `dump` alone is not defined in this notebook; the function lives in `pickle`.
# The nested defaultdict uses lambda factories, which pickle cannot serialise,
# so it is converted to plain dicts before saving.
with open('localnglobal_accuracies_allgroups_allpeers.pkl', 'wb') as file:
    pickle.dump(defaultdict_to_dict(global_accuracies), file)

# To load the saved accuracies again:
# with open('localnglobal_accuracies_allgroups_allpeers.pkl', 'rb') as file:
#     loaded_dict = pickle.load(file)

In [None]:
# Plain-dict copy of the accuracies (the defaultdict factories are dropped).
regular_dict = defaultdict_to_dict(global_accuracies)

# Write the pickled bytes to disk.
with open('localnglobal_accuracies_allgroups_allpeers.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

# Read the file back; the result is a plain nested dict, not a defaultdict.
with open('localnglobal_accuracies_allgroups_allpeers.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())


In [None]:
# Restore the saved accuracies into `global_accuracies` as a plain nested dict
# (missing keys raise KeyError instead of being created on access).
with open('localnglobal_accuracies_allgroups_allpeers.pkl', 'rb') as file:
    global_accuracies = pickle.loads(file.read())

### CALCULATING ACCURACIES ON TOP5 FOR EACH PEER

In [None]:
# (rows, columns) of each group's test frame, in group order.
tuple(frame.shape for frame in (test_df_group1, test_df_group2, test_df_group3))

In [5]:
import threading

# Maps (group, peer) -> the test DataFrame with its 'generated_doc_id' column,
# as stored by ModelEvaluator.read_model_and_evaluate.
global_objects = {}
# Accuracy store, written as global_accuracies[group][peer][split][mode].
global_accuracies = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))))
class ModelEvaluator:
    """Top-5 beam-search evaluation of one peer's model.

    The model and DataFrames are resolved by name from the notebook namespace
    using this instance's own ``group`` and ``peer``, so instances running in
    different threads do not depend on the launcher's loop variables.
    """

    def __init__(self, group, peer):
        self.group = group
        self.peer = peer
        # Number of queries processed so far; drives the progress messages.
        self.counter = 0

    def read_model_and_evaluate(self, mode='global'):
        """Compute the share of test rows whose doc_id is among the 5 beams.

        Args:
            mode: ``'global'`` uses ``train_df_group{group}`` /
                ``test_df_group{group}``; ``'local'`` uses the
                ``..._peer{peer}`` variants.

        Returns:
            ``(acc_train, acc_test)``. ``acc_train`` is always ``-1.0``
            because the training set is not evaluated here.

        Raises:
            ValueError: If ``mode`` is neither ``'global'`` nor ``'local'``.
            KeyError: If a required model or DataFrame variable is missing.
        """
        acc_train = -1.0
        print('group', self.group, 'peer', self.peer, 'mode', mode)

        namespace = globals()
        model = namespace[f'model_group{self.group}_peer{self.peer}']

        if mode == 'global':
            data_suffix = f'group{self.group}'
        elif mode == 'local':
            data_suffix = f'group{self.group}_peer{self.peer}'
        else:
            raise ValueError(f"mode must be 'global' or 'local', got {mode!r}")

        # The train frame is only used for its shape, so it is not copied.
        df_tot = namespace[f'train_df_{data_suffix}']
        self.df_tst = namespace[f'test_df_{data_suffix}'].copy()

        print(df_tot.shape, self.df_tst.shape)

        self.df_tst['generated_doc_id'] = self.df_tst['query'].apply(lambda x: self.generate_text_beams(x, model))
        acc_test = self.df_tst.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1).sum() / self.df_tst.shape[0]

        # Keep the predictions for later inspection. The key has no mode, so a
        # second call for the same (group, peer) replaces the earlier entry.
        global_objects[(self.group, self.peer)] = self.df_tst

        print(f'{mode} training set accuracy: ', acc_train)
        print(f'{mode} test set accuracy: ', acc_test)
        return acc_train, acc_test

    def generate_text_beams(self, query, model):
        """Return the 5 decoded beam-search sequences generated for ``query``."""
        self.counter += 1
        if self.counter % 1000 == 0:
            print(f"Processed {self.counter} queries")
        input_ids = tokenizer.encode(query, return_tensors='pt')
        output = model.generate(input_ids, do_sample=False, max_length=20,
                                num_beams=5, num_return_sequences=5)
        return [tokenizer.decode(i, skip_special_tokens=True) for i in output]

    def thread_function(self):
        """Run the 'global' evaluation and record both accuracies in ``global_accuracies``."""
        acc_train_global, acc_test_global = self.read_model_and_evaluate('global')
        global_accuracies[self.group][self.peer]['train']['global'] = acc_train_global
        global_accuracies[self.group][self.peer]['test']['global'] = acc_test_global
        print(f'finished global work for group {self.group} and peer {self.peer}, acc test global :{acc_test_global}')

def evaluate_in_thread(group, peer):
    """Thread target: evaluate one (group, peer) pair without letting errors escape.

    Any exception is reported on stdout (same message as before) and its full
    traceback is written to stderr, so a failing worker can be diagnosed
    instead of leaving only the bare exception text.
    """
    import traceback  # local import: only needed on the error path

    try:
        evaluator = ModelEvaluator(group, peer)
        evaluator.thread_function()
    except Exception as e:
        print(f"Error in thread for group {group} and peer {peer}: {e}")
        traceback.print_exc()

    
# One worker thread per (group, peer). Threads are launched group by group and
# each group's batch is awaited before the next group starts.
threads = []
for group in groups:
    group_batch = []
    for peer in peers:
        p = int(peer) - 8089
        worker = threading.Thread(target=evaluate_in_thread, args=(group, p,))
        worker.start()
        group_batch.append(worker)
    threads.extend(group_batch)

    # Earlier batches have already finished, so only this batch needs joining.
    for thread in group_batch:
        thread.join()


groupgroup 1 peer 2 mode global
group 1 peer 3 mode global
 1 peer 1 mode global
group 1 peer 4 mode global
group 1 peer 5 mode global
group 1 peer 6 mode global
(16432, 5) (16464, 5)
(16432, 5) (16464, 5)
(16432, 5) (16464, 5)
group 1 peer 7 mode global
(16432, 5) (16464, 5)
(16432, 5) (16464, 5)
(16432, 5) (16464, 5)
(16432, 5) (16464, 5)
group 1 peer 8 mode global
group 1 peer 9 mode global
group 1 peer 10 mode global
(16432, 5) (16464, 5)
(16432, 5) (16464, 5)
(16432, 5) (16464, 5)
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 3000 queries
Processed 3000 queries
Pro

Processed 9000 queries
Processed 9000 queries
Processed 9000 queries
Processed 9000 queries
Processed 9000 queries
Processed 8000 queries
Processed 9000 queries
Processed 10000 queries
Processed 9000 queries
Processed 10000 queries
Processed 9000 queries
Processed 10000 queries
Processed 10000 queries
Processed 10000 queries
Processed 10000 queries
Processed 11000 queries
Processed 10000 queries
Processed 9000 queries
Processed 11000 queries
Processed 11000 queries
Processed 11000 queries
Processed 10000 queries
Processed 10000 queries
Processed 11000 queries
Processed 11000 queries
Processed 12000 queries
Processed 12000 queries
Processed 11000 queries
Processed 12000 queries
Processed 12000 queries
Processed 10000 queries
Processed 11000 queries
Processed 11000 queries
Processed 12000 queries
Processed 12000 queries
Processed 13000 queries
Processed 13000 queries
Processed 13000 queries
Processed 12000 queries
Processed 13000 queries
Processed 12000 queries
Processed 12000 queries
Pr

global training set accuracy:  -1.0
global test set accuracy:  0.9186757826556315
finished global work for group 3 and peer 7, acc test global :0.9186757826556315
Processed 15000 queries
Processed 16000 queries
Processed 14000 queries
Processed 15000 queries
global training set accuracy:  -1.0
global test set accuracy:  0.9274919035624325
finished global work for group 3 and peer 10, acc test global :0.9274919035624325
global training set accuracy:  -1.0
global test set accuracy:  0.9314501619287513
finished global work for group 3 and peer 6, acc test global :0.9314501619287513
global training set accuracy:  -1.0
global test set accuracy:  0.9292911119107593
finished global work for group 3 and peer 2, acc test global :0.9292911119107593
Processed 16000 queries
Processed 16000 queries
Processed 15000 queries
global training set accuracy:  -1.0
global test set accuracy:  0.9319899244332494
finished global work for group 3 and peer 4, acc test global :0.9319899244332494
global training 

In [6]:
# Display the nested accuracy store, indexed as [group][peer][split][mode].
global_accuracies

defaultdict(<function __main__.<lambda>()>,
            {'1': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {1: defaultdict(<function __main__.<lambda>.<locals>.<lambda>.<locals>.<lambda>()>,
                                      {'train': defaultdict(float,
                                                   {'global': -1.0}),
                                       'test': defaultdict(float,
                                                   {'global': 0.9277818270165209})}),
                          7: defaultdict(<function __main__.<lambda>.<locals>.<lambda>.<locals>.<lambda>()>,
                                      {'train': defaultdict(float,
                                                   {'global': -1.0}),
                                       'test': defaultdict(float,
                                                   {'global': 0.9149659863945578})}),
                          10: defaultdict(<function __main__.<lambda>.<locals>.<

In [8]:
# Plain-dict copy of the accuracy store, so that it can be pickled.
regular_dict = defaultdict_to_dict(global_accuracies)

# Write the pickled bytes to disk.
with open('global_accuracies_5beams.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

In [9]:
# Read the saved accuracies back as a plain nested dict and display them.
with open('global_accuracies_5beams.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

{'1': {1: {'train': {'global': -1.0}, 'test': {'global': 0.9277818270165209}},
  7: {'train': {'global': -1.0}, 'test': {'global': 0.9149659863945578}},
  10: {'train': {'global': -1.0}, 'test': {'global': 0.8914601554907677}},
  3: {'train': {'global': -1.0}, 'test': {'global': 0.9186710398445093}},
  8: {'train': {'global': -1.0}, 'test': {'global': 0.9234086491739553}},
  9: {'train': {'global': -1.0}, 'test': {'global': 0.924198250728863}},
  5: {'train': {'global': -1.0}, 'test': {'global': 0.9241375121477162}},
  6: {'train': {'global': -1.0}, 'test': {'global': 0.9289358600583091}},
  4: {'train': {'global': -1.0}, 'test': {'global': 0.9233479105928085}},
  2: {'train': {'global': -1.0}, 'test': {'global': 0.9275388726919339}}},
 '2': {8: {'train': {'global': -1.0}, 'test': {'global': 0.9123434704830053}},
  1: {'train': {'global': -1.0}, 'test': {'global': 0.9195568122799931}},
  9: {'train': {'global': -1.0}, 'test': {'global': 0.9211148941081424}},
  5: {'train': {'global': -

# Sampling random models and aggregating their suggestions - 5 beams

In [None]:
# three_groups_list holds every loaded model; two_groups_list holds only the
# models of groups 1 and 2. Models are looked up by variable name in the
# notebook namespace instead of being appended through exec().
two_groups_list = []
three_groups_list = []
for group in groups:
    for peer in peers:
        peer_model = globals()[f'model_group{group}_peer{int(peer)-8089}']
        three_groups_list.append(peer_model)
        if int(group)<3:
            two_groups_list.append(peer_model)
            
            

In [None]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Ensemble evaluation: sample models per query and vote on doc ids.

    For every query a random subset of ``model_list`` generates 5 beams each.
    A doc id gets one vote per model that produced it, and the ``top_k`` most
    voted doc ids form the prediction.

    Sampling uses the module-level ``random`` state; call ``random.seed`` first
    for repeatable runs.
    """

    # Class-level defaults, overridable per instance through __init__.
    n_models = 5
    top_k = 5

    def __init__(self, model_list, train_df, test_df, tokenizer, n_models=5, top_k=5):
        """Store the model pool and tokenizer and take private copies of both frames.

        Args:
            model_list: Pool of models to sample from.
            train_df, test_df: DataFrames with ``query`` and ``doc_id`` columns.
            tokenizer: Object providing ``encode`` and ``decode``.
            n_models: Models sampled per query (capped at the pool size).
            top_k: Number of suggestions returned per query.
        """
        self.model_list = model_list
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()
        self.n_models = n_models
        self.top_k = top_k

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0

    def generate_text_beams(self, query):
        """Return the most voted doc ids for ``query`` across the sampled models."""
        self.counter += 1
        if self.counter % 2000 == 0:
            print(f"Processed {self.counter} queries")
        results = defaultdict(float)
        # Never ask for more models than the pool contains.
        sampled_models = random.sample(self.model_list, min(self.n_models, len(self.model_list)))

        # The encoding does not depend on the model, so it is computed once.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=5, num_return_sequences=5)

            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            # One vote per distinct doc id per model. dict.fromkeys removes
            # duplicates while keeping beam order, so ties in the ranking are
            # broken reproducibly rather than by set iteration order.
            for res in dict.fromkeys(model_res):
                results[res] += 1

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the ``top_k`` keys of ``input_dict`` with the highest values.

        The sort is stable, so keys with equal values keep insertion order.
        """
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate suggestions for both frames and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Share of rows whose ``doc_id`` is among ``generated_doc_id``; NaN for an empty frame."""
        if df.shape[0] == 0:
            return float('nan')
        return df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1).sum() / df.shape[0]

# run_evaluation relies on names created in earlier cells of this notebook:
# two_groups_list and three_groups_list (lists of loaded models),
# train_df_group{N} and test_df_group{N} (one pair of DataFrames per group),
# and the shared `tokenizer` instance.

def run_evaluation(group_nbr, model_list_nbr, tokenizer):
    """Evaluate one model pool on one group's data and record the result.

    Builds a ``ModelManager`` for the chosen pool and the group's
    ``train_df_group{group_nbr}`` / ``test_df_group{group_nbr}`` frames (looked
    up in the notebook namespace). The manager is stored in ``model_managers``
    and the accuracies in ``global_accuracies``, both under the key
    ``(group_nbr, model_list_nbr)``.

    Args:
        group_nbr: Group number used in the DataFrame variable names.
        model_list_nbr: ``'two'`` for ``two_groups_list`` or ``'three'`` for
            ``three_groups_list``.
        tokenizer: Tokenizer handed to the ``ModelManager``.

    Raises:
        ValueError: If ``model_list_nbr`` is neither ``'two'`` nor ``'three'``.
    """
    if model_list_nbr == 'two':
        model_list = two_groups_list
    elif model_list_nbr == 'three':
        model_list = three_groups_list
    else:
        # Otherwise `model_list` would be unbound and fail further down.
        raise ValueError(f"model_list_nbr must be 'two' or 'three', got {model_list_nbr!r}")

    train_df = globals()[f'train_df_group{group_nbr}']
    test_df = globals()[f'test_df_group{group_nbr}']

    manager = ModelManager(model_list, train_df, test_df, tokenizer)

    # Registered before evaluation starts, so the manager can be inspected
    # while the evaluation is still running.
    key = (group_nbr, model_list_nbr)
    model_managers[key] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[key] = {'acc_train': acc_train, 'acc_test': acc_test}

    print(f"Group: {group_nbr}, Model List: {model_list_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Registries filled in by run_evaluation, keyed by (group_nbr, model_list_nbr).
# Rebinding global_accuracies discards whatever that name held before.
model_managers = dict()
global_accuracies = dict()

# One worker thread per evaluated (group, model list) combination.
threads = []

for group_nbr in range(1, 4):
    for model_list_nbr in ['two', 'three']:
        # Every combination except group 3 with the 'two' list is evaluated.
        if group_nbr != 3 or model_list_nbr != 'two':
            thread = threading.Thread(target=run_evaluation, args=(group_nbr, model_list_nbr, tokenizer))
            thread.start()
            threads.append(thread)

# Block until every evaluation has finished.
for thread in threads:
    thread.join()


In [None]:
# Plain-dict copy of the results collected by run_evaluation.
regular_dict = defaultdict_to_dict(global_accuracies)

# Write the pickled bytes to disk.
with open('global_accuracies_samplingmodels_5beams.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

In [None]:
# Read the saved 5-beam sampling results back and display them.
with open('global_accuracies_samplingmodels_5beams.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

In [None]:
# Same load as the previous cell: read the 5-beam sampling results and display them.
with open('global_accuracies_samplingmodels_5beams.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

# Sampling random models and aggregating their suggestions - 1 beam - main suggestion

In [5]:
# three_groups_list holds every loaded model; two_groups_list holds only the
# models of groups 1 and 2. Models are looked up by variable name in the
# notebook namespace instead of being appended through exec().
two_groups_list = []
three_groups_list = []
for group in groups:
    for peer in peers:
        peer_model = globals()[f'model_group{group}_peer{int(peer)-8089}']
        three_groups_list.append(peer_model)
        if int(group)<3:
            two_groups_list.append(peer_model)
        

In [6]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Ensemble evaluation: sample models per query and keep the most voted doc id.

    For every query a random subset of ``model_list`` generates 5 beams each.
    A doc id gets one vote per model that produced it, and the ``top_k`` most
    voted doc ids (1 by default) form the prediction.

    Sampling uses the module-level ``random`` state; call ``random.seed`` first
    for repeatable runs.
    """

    # Class-level defaults, overridable per instance through __init__.
    n_models = 5
    top_k = 1

    def __init__(self, model_list, train_df, test_df, tokenizer, n_models=5, top_k=1):
        """Store the model pool and tokenizer and take private copies of both frames.

        Args:
            model_list: Pool of models to sample from.
            train_df, test_df: DataFrames with ``query`` and ``doc_id`` columns.
            tokenizer: Object providing ``encode`` and ``decode``.
            n_models: Models sampled per query (capped at the pool size).
            top_k: Number of suggestions returned per query.
        """
        self.model_list = model_list
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()
        self.n_models = n_models
        self.top_k = top_k

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0

    def generate_text_beams(self, query):
        """Return the most voted doc id(s) for ``query`` across the sampled models."""
        self.counter += 1
        if self.counter % 1000 == 0:
            print(f"Processed {self.counter} queries")
        results = defaultdict(float)
        # Never ask for more models than the pool contains.
        sampled_models = random.sample(self.model_list, min(self.n_models, len(self.model_list)))

        # The encoding does not depend on the model, so it is computed once.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=5, num_return_sequences=5)

            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            # One vote per distinct doc id per model. dict.fromkeys removes
            # duplicates while keeping beam order, so a tie for first place is
            # resolved reproducibly rather than by set iteration order.
            for res in dict.fromkeys(model_res):
                results[res] += 1

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the ``top_k`` keys of ``input_dict`` with the highest values.

        Despite the name, this variant returns a single suggestion by default.
        The sort is stable, so keys with equal values keep insertion order.
        """
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate suggestions for both frames and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Share of rows whose ``doc_id`` is among ``generated_doc_id``; NaN for an empty frame."""
        if df.shape[0] == 0:
            return float('nan')
        return df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1).sum() / df.shape[0]

# run_evaluation relies on names created in earlier cells of this notebook:
# two_groups_list and three_groups_list (lists of loaded models),
# train_df_group{N} and test_df_group{N} (one pair of DataFrames per group),
# and the shared `tokenizer` instance.

def run_evaluation(group_nbr, model_list_nbr, tokenizer):
    """Evaluate one model pool on one group's data and record the result.

    Builds a ``ModelManager`` for the chosen pool and the group's
    ``train_df_group{group_nbr}`` / ``test_df_group{group_nbr}`` frames (looked
    up in the notebook namespace). The manager is stored in ``model_managers``
    and the accuracies in ``global_accuracies``, both under the key
    ``(group_nbr, model_list_nbr)``.

    Args:
        group_nbr: Group number used in the DataFrame variable names.
        model_list_nbr: ``'two'`` for ``two_groups_list`` or ``'three'`` for
            ``three_groups_list``.
        tokenizer: Tokenizer handed to the ``ModelManager``.

    Raises:
        ValueError: If ``model_list_nbr`` is neither ``'two'`` nor ``'three'``.
    """
    if model_list_nbr == 'two':
        model_list = two_groups_list
    elif model_list_nbr == 'three':
        model_list = three_groups_list
    else:
        # Otherwise `model_list` would be unbound and fail further down.
        raise ValueError(f"model_list_nbr must be 'two' or 'three', got {model_list_nbr!r}")

    train_df = globals()[f'train_df_group{group_nbr}']
    test_df = globals()[f'test_df_group{group_nbr}']

    manager = ModelManager(model_list, train_df, test_df, tokenizer)

    # Registered before evaluation starts, so the manager can be inspected
    # while the evaluation is still running.
    key = (group_nbr, model_list_nbr)
    model_managers[key] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[key] = {'acc_train': acc_train, 'acc_test': acc_test}

    print(f"Group: {group_nbr}, Model List: {model_list_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Registries filled in by run_evaluation, keyed by (group_nbr, model_list_nbr).
# Rebinding global_accuracies discards whatever that name held before.
model_managers = dict()
global_accuracies = dict()

# One worker thread per evaluated (group, model list) combination.
threads = []

for group_nbr in range(1, 4):
    for model_list_nbr in ['two', 'three']:
        # Every combination except group 3 with the 'two' list is evaluated.
        if group_nbr != 3 or model_list_nbr != 'two':
            thread = threading.Thread(target=run_evaluation, args=(group_nbr, model_list_nbr, tokenizer))
            thread.start()
            threads.append(thread)

# Block until every evaluation has finished.
for thread in threads:
    thread.join()


train set size:train set size: 16432
test set size: 16464
 16432
test set size: 16464
train set size: 17311
test set size: 17329
train set size: 17311
test set size: 17329
train set size: 16647
test set size: 16674



KeyboardInterrupt



In [None]:
# Plain-dict copy of the results collected by run_evaluation.
regular_dict = defaultdict_to_dict(global_accuracies)

# Write the pickled bytes to disk.
with open('global_accuracies_samplingmodels.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

In [None]:
# Display the results stored by run_evaluation, keyed by (group_nbr, model_list_nbr).
global_accuracies

In [None]:
# Read the saved sampling results back and display them.
with open('global_accuracies_samplingmodels.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

# Sampling random models and aggregating their suggestions - 5 beams, with probabilities

In [7]:
# three_groups_list holds every loaded model; two_groups_list holds only the
# models of groups 1 and 2. Models are looked up by variable name in the
# notebook namespace instead of being appended through exec().
two_groups_list = []
three_groups_list = []
for group in groups:
    for peer in peers:
        peer_model = globals()[f'model_group{group}_peer{int(peer)-8089}']
        three_groups_list.append(peer_model)
        if int(group)<3:
            two_groups_list.append(peer_model)
        

In [11]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Evaluate an ensemble of generative models that map a query to a document id.

    For each query a random subset of ``model_list`` is drawn, every sampled
    model runs beam search, and the decoded candidates are merged by summing
    their softmax-normalised beam scores. The highest-scoring candidates are
    kept and compared with the expected ``doc_id``.

    Args:
        model_list: Models exposing a Hugging Face style ``generate`` method
            (presumably the T5 peer models loaded earlier in the notebook).
        train_df: DataFrame with ``query`` and ``doc_id`` columns; copied, so
            the caller's frame is never modified.
        test_df: Same layout as ``train_df``; also copied.
        tokenizer: Object providing ``encode`` and ``decode``.
        n_models: Number of models sampled per query (capped at the size of
            ``model_list``).
        num_beams: Beam width; the same number of sequences is returned.
        top_k: Number of aggregated candidates kept per query.
    """

    def __init__(self, model_list, train_df, test_df, tokenizer,
                 n_models=5, num_beams=5, top_k=5):
        self.model_list = model_list
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0  # number of queries processed so far (progress logging only)
        self.n_models = n_models
        self.num_beams = num_beams
        self.top_k = top_k

    def generate_text_beams(self, query):
        """Return the best aggregated candidates for ``query`` as a list of strings."""
        self.counter += 1
        if self.counter % 1000 == 0:
            print(f"Processed {self.counter} queries")

        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the sample size at the pool size.
        sample_size = min(self.n_models, len(self.model_list))
        sampled_models = random.sample(self.model_list, sample_size)

        # The encoded query is identical for every model: tokenise it once.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        results = defaultdict(float)
        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=self.num_beams, num_return_sequences=self.num_beams)

            # Normalise this model's beam scores so that each model contributes
            # a total weight of 1 to the aggregate.
            probabilities = softmax(output.sequences_scores, dim=0).tolist()
            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            for res, prob in zip(model_res, probabilities):
                results[res] += prob

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the ``top_k`` keys of ``input_dict`` ordered by descending score."""
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate candidates for every train/test query and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Return the fraction of rows whose ``doc_id`` is among ``generated_doc_id``.

        Returns NaN for an empty frame, where the ratio is undefined.
        """
        if df.shape[0] == 0:
            return float('nan')
        return df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1).sum() / df.shape[0]

# Assuming you have predefined dictionaries/lists for models and datasets, such as:
# two_groups_list = [...]
# three_groups_list = [...]
# train_df_group1 = ...
# test_df_group1 = ...
# ... and so on for other groups
# And a tokenizer instance

def run_evaluation(group_nbr, model_list_nbr, tokenizer):
    """Evaluate one (group, model pool) configuration and record its accuracies.

    Args:
        group_nbr: Group number; selects the notebook-global frames
            ``train_df_group{group_nbr}`` and ``test_df_group{group_nbr}``.
        model_list_nbr: ``'two'`` for ``two_groups_list`` or ``'three'`` for
            ``three_groups_list``.
        tokenizer: Tokenizer handed to the ModelManager.

    Raises:
        ValueError: If ``model_list_nbr`` is neither ``'two'`` nor ``'three'``.
        KeyError: If the train/test frame for ``group_nbr`` is not defined.

    Side effects:
        Stores the manager in ``model_managers`` and the accuracies in
        ``global_accuracies`` under the key ``(group_nbr, model_list_nbr)``,
        and prints a summary line.
    """
    if model_list_nbr == 'two':
        model_list = two_groups_list
    elif model_list_nbr == 'three':
        model_list = three_groups_list
    else:
        # Fail fast: without this branch model_list would be unbound below.
        raise ValueError(f"Unknown model_list_nbr {model_list_nbr!r}; expected 'two' or 'three'")

    train_df = globals()[f'train_df_group{group_nbr}']
    test_df = globals()[f'test_df_group{group_nbr}']

    manager = ModelManager(model_list, train_df, test_df, tokenizer)

    # Register the manager before the long evaluation so that partial results
    # remain inspectable if the run is interrupted.
    key = (group_nbr, model_list_nbr)
    model_managers[key] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[key] = {'acc_train': acc_train, 'acc_test': acc_test}

    print(f"Group: {group_nbr}, Model List: {model_list_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Shared result stores filled in by run_evaluation, keyed by (group_nbr, model_list_nbr):
# the ModelManager instances and their train/test accuracies.
model_managers, global_accuracies = {}, {}

# Every (group, model list) pair to evaluate; group 3 is skipped for the 'two' list.
evaluation_configs = [
    (group_nbr, model_list_nbr)
    for group_nbr in range(1, 4)
    for model_list_nbr in ['two', 'three']
    if (group_nbr, model_list_nbr) != (3, 'two')
]

# One worker thread per configuration, started as soon as it is created.
threads = []
for group_nbr, model_list_nbr in evaluation_configs:
    thread = threading.Thread(target=run_evaluation, args=(group_nbr, model_list_nbr, tokenizer))
    thread.start()
    threads.append(thread)

# Block the cell until every evaluation thread has finished.
for thread in threads:
    thread.join()


Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Thread(Thread-13 (run_evaluation), stopped 18488766464)>>
Traceback (most recent call last):
  File "/Users/petruneague/anaconda3/lib/python3.11/threading.py", line 995, in _bootstrap
    self._bootstrap_inner()
  File "/Users/petruneague/anaconda3/lib/python3.11/threading.py", line 1040, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/Users/petruneague/anaconda3/lib/python3.11/threading.py", line 1352, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/Users/petruneague/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 559, in flush
    self.pub_thread.schedule(self._flush)
  File "/Users/petruneague/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 251, in schedule
    self._event_pipe.send(b"")
  File "/Users/petruneague/anaconda3/lib/python3.11/site-packages/zmq/sugar/so


train set size: 16432
test set size: 16464




MODEL INDEXES [4, 1, 12, 19, 16]

RESULTS WITH PROBS
ALL defaultdict(<class 'float'>, {'D176337': 1.3625423908233643, 'D3254647': 0.09982369095087051, 'D3134428': 0.09474887698888779, 'D3258344': 0.08658268302679062, 'D3147911': 0.07561706006526947, 'D1763373': 0.09104155004024506, 'D1763372': 0.06819207221269608, 'D1763187': 0.061951905488967896, 'D1763106': 0.059499699622392654, 'D2585416': 0.22938866913318634, 'D2689316': 0.22170831263065338, 'D2585336': 0.2109907567501068, 'D2541782': 0.5361056625843048, 'D3175149': 0.14857006072998047, 'D1835464': 0.2236683964729309, 'D1850707': 0.15044063329696655, 'D1957508': 0.5873086750507355, 'D1959455': 0.13026893138885498, 'D1953908': 0.16385704278945923, 'D354714': 0.14231367409229279, 'D1956508': 0.1314542591571808, 'D1955508': 0.12392492592334747})
TOP 5 ['D176337', 'D1957508', 'D2541782', 'D2585416', 'D1835464']
D176337
True

RESULTS WITHOUT PROBS
ALL defaultdict(<class 'float'>, {'D176337': 2.0, 'D3254647': 1.0, 'D3134428': 1.0, 'D3258

MODEL INDEXES [7, 15, 3, 8, 4]

RESULTS WITH PROBS
ALL defaultdict(<class 'float'>, {'D3339549': 2.5131624713540077, 'D3385373': 0.10721108317375183, 'D3339769': 0.3039279133081436, 'D842733': 0.08849169313907623, 'D1334916': 0.07284964621067047, 'D551156': 0.32521554827690125, 'D2558370': 0.18580549955368042, 'D1636847': 0.17590709030628204, 'D1626677': 0.1575108766555786, 'D2281196': 0.15556101500988007, 'D33395494': 0.2737727016210556, 'D33395492': 0.09835365414619446, 'D33395493': 0.0783371850848198, 'D33395496': 0.07264116406440735, 'D3489849': 0.11040704697370529, 'D3489033': 0.09697996079921722, 'D3489066': 0.09525743871927261, 'D3337294': 0.08860819786787033})
TOP 5 ['D3339549', 'D551156', 'D3339769', 'D33395494', 'D2558370']
D3339549
True

RESULTS WITHOUT PROBS
ALL defaultdict(<class 'float'>, {'D3339549': 5.0, 'D3385373': 1.0, 'D3339769': 3.0, 'D842733': 1.0, 'D1334916': 1.0, 'D551156': 1.0, 'D2558370': 1.0, 'D1636847': 1.0, 'D1626677': 1.0, 'D2281196': 1.0, 'D33395494': 2.0,

MODEL INDEXES [0, 3, 8, 1, 12]

RESULTS WITH PROBS
ALL defaultdict(<class 'float'>, {'D576017': 1.978517770767212, 'D578420': 0.17356082797050476, 'D578417': 0.13349057734012604, 'D578430': 0.1262706220149994, 'D578401': 0.12428362667560577, 'D603621': 0.12882114946842194, 'D5760172': 0.11842114478349686, 'D578017': 0.11146648228168488, 'D1676047': 0.10491794347763062, 'D583161': 0.09902426600456238, 'D562659': 0.08927366882562637, 'D576111': 0.07820598781108856, 'D576117': 0.07782918959856033, 'D576098': 0.3114544749259949, 'D596098': 0.13545458018779755, 'D635807': 0.10688721388578415, 'D63543': 0.10212058573961258, 'D1803897': 0.2638486921787262, 'D1805775': 0.19984786212444305, 'D1803740': 0.1847630888223648, 'D1113889': 0.17927545309066772, 'D1805520': 0.17226482927799225})
TOP 5 ['D576017', 'D576098', 'D1803897', 'D1805775', 'D1803740']
D576017
True

RESULTS WITHOUT PROBS
ALL defaultdict(<class 'float'>, {'D576017': 4.0, 'D578420': 1.0, 'D578417': 1.0, 'D578430': 1.0, 'D578401': 

KeyboardInterrupt: 

In [None]:
# Display the accuracies recorded so far, keyed by (group_nbr, model_list_nbr).
global_accuracies

In [None]:
# Persist the accuracies collected in `global_accuracies` by the evaluation cell above.
# run_evaluation only adds an entry once a configuration has finished, so an
# interrupted run saves partial results.
# defaultdict_to_dict (defined in the first cell) recursively turns defaultdicts into
# plain dicts and returns any other object unchanged.
regular_dict = defaultdict_to_dict(global_accuracies)

# Serialize the result to disk with pickle (binary mode).
with open('global_accuracies_samplingmodels_5beams_probabilities.pkl', 'wb') as file:
    pickle.dump(regular_dict, file)

In [None]:
# Reload the accuracies written to 'global_accuracies_samplingmodels_5beams_probabilities.pkl'.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('global_accuracies_samplingmodels_5beams_probabilities.pkl', 'rb') as file:
    loaded_dict = pickle.load(file)
    # Optionally convert back to defaultdict
    # (convert_to_defaultdict is not defined in the visible part of this notebook)
    # my_defaultdict = convert_to_defaultdict(loaded_dict)
    
# Last expression of the cell: displays the reloaded dict.
loaded_dict

# Sampling random models and aggregating their suggestions - 1 beam (top-1 suggestion kept), with probabilities

In [None]:
# Build the model pools that ModelManager samples from:
#   three_groups_list -> every peer model of every group in `groups`
#   two_groups_list   -> only the peer models of groups 1 and 2
two_groups_list = []
three_groups_list = []
for group in groups:
    for peer in peers:
        # `peers` holds the strings '8090'..'8099'; subtracting 8089 maps them
        # to the 1-based peer index used in the model variable names.
        # Look the variable up directly instead of exec-ing generated source code.
        peer_model = globals()[f'model_group{group}_peer{int(peer) - 8089}']
        three_groups_list.append(peer_model)
        if int(group) < 3:
            two_groups_list.append(peer_model)
        

In [None]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Evaluate an ensemble of generative models that map a query to a document id.

    For each query a random subset of ``model_list`` is drawn, every sampled
    model runs beam search, and the decoded candidates are merged by summing
    their softmax-normalised beam scores. In this variant only the single
    best aggregated candidate is kept by default (``top_k=1``) and compared
    with the expected ``doc_id``.

    Args:
        model_list: Models exposing a Hugging Face style ``generate`` method
            (presumably the T5 peer models loaded earlier in the notebook).
        train_df: DataFrame with ``query`` and ``doc_id`` columns; copied, so
            the caller's frame is never modified.
        test_df: Same layout as ``train_df``; also copied.
        tokenizer: Object providing ``encode`` and ``decode``.
        n_models: Number of models sampled per query (capped at the size of
            ``model_list``).
        num_beams: Beam width; the same number of sequences is returned.
        top_k: Number of aggregated candidates kept per query.
    """

    def __init__(self, model_list, train_df, test_df, tokenizer,
                 n_models=5, num_beams=5, top_k=1):
        self.model_list = model_list
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0  # number of queries processed so far (progress logging only)
        self.n_models = n_models
        self.num_beams = num_beams
        self.top_k = top_k

    def generate_text_beams(self, query):
        """Return the best aggregated candidates for ``query`` as a list of strings."""
        self.counter += 1
        if self.counter % 1000 == 0:
            print(f"Processed {self.counter} queries")

        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the sample size at the pool size.
        sample_size = min(self.n_models, len(self.model_list))
        sampled_models = random.sample(self.model_list, sample_size)

        # The encoded query is identical for every model: tokenise it once.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        results = defaultdict(float)
        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=self.num_beams, num_return_sequences=self.num_beams)

            # Normalise this model's beam scores so that each model contributes
            # a total weight of 1 to the aggregate.
            probabilities = softmax(output.sequences_scores, dim=0).tolist()
            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            for res, prob in zip(model_res, probabilities):
                results[res] += prob

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the ``top_k`` keys of ``input_dict`` ordered by descending score.

        The method name is kept for compatibility; with the default
        ``top_k=1`` only the single best candidate is returned.
        """
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate candidates for every train/test query and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Return the fraction of rows whose ``doc_id`` is among ``generated_doc_id``.

        Returns NaN for an empty frame, where the ratio is undefined.
        """
        if df.shape[0] == 0:
            return float('nan')
        return df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1).sum() / df.shape[0]

# Assuming you have predefined dictionaries/lists for models and datasets, such as:
# two_groups_list = [...]
# three_groups_list = [...]
# train_df_group1 = ...
# test_df_group1 = ...
# ... and so on for other groups
# And a tokenizer instance

def run_evaluation(group_nbr, model_list_nbr, tokenizer):
    """Evaluate one (group, model pool) configuration and record its accuracies.

    Args:
        group_nbr: Group number; selects the notebook-global frames
            ``train_df_group{group_nbr}`` and ``test_df_group{group_nbr}``.
        model_list_nbr: ``'two'`` for ``two_groups_list`` or ``'three'`` for
            ``three_groups_list``.
        tokenizer: Tokenizer handed to the ModelManager.

    Raises:
        ValueError: If ``model_list_nbr`` is neither ``'two'`` nor ``'three'``.
        KeyError: If the train/test frame for ``group_nbr`` is not defined.

    Side effects:
        Stores the manager in ``model_managers`` and the accuracies in
        ``global_accuracies`` under the key ``(group_nbr, model_list_nbr)``,
        and prints a summary line.
    """
    if model_list_nbr == 'two':
        model_list = two_groups_list
    elif model_list_nbr == 'three':
        model_list = three_groups_list
    else:
        # Fail fast: without this branch model_list would be unbound below.
        raise ValueError(f"Unknown model_list_nbr {model_list_nbr!r}; expected 'two' or 'three'")

    train_df = globals()[f'train_df_group{group_nbr}']
    test_df = globals()[f'test_df_group{group_nbr}']

    manager = ModelManager(model_list, train_df, test_df, tokenizer)

    # Register the manager before the long evaluation so that partial results
    # remain inspectable if the run is interrupted.
    key = (group_nbr, model_list_nbr)
    model_managers[key] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[key] = {'acc_train': acc_train, 'acc_test': acc_test}

    print(f"Group: {group_nbr}, Model List: {model_list_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Shared result stores filled in by run_evaluation, keyed by (group_nbr, model_list_nbr):
# the ModelManager instances and their train/test accuracies.
model_managers, global_accuracies = {}, {}

# Every (group, model list) pair to evaluate; group 3 is skipped for the 'two' list.
evaluation_configs = [
    (group_nbr, model_list_nbr)
    for group_nbr in range(1, 4)
    for model_list_nbr in ['two', 'three']
    if (group_nbr, model_list_nbr) != (3, 'two')
]

# One worker thread per configuration, started as soon as it is created.
threads = []
for group_nbr, model_list_nbr in evaluation_configs:
    thread = threading.Thread(target=run_evaluation, args=(group_nbr, model_list_nbr, tokenizer))
    thread.start()
    threads.append(thread)

# Block the cell until every evaluation thread has finished.
for thread in threads:
    thread.join()


In [None]:
# Persist the accuracies collected in `global_accuracies` by the evaluation cell above.
# run_evaluation only adds an entry once a configuration has finished, so an
# interrupted run saves partial results.
# defaultdict_to_dict (defined in the first cell) recursively turns defaultdicts into
# plain dicts and returns any other object unchanged.
regular_dict = defaultdict_to_dict(global_accuracies)

# Serialize the result to disk with pickle (binary mode).
with open('global_accuracies_samplingmodels_probabilities.pkl', 'wb') as file:
    pickle.dump(regular_dict, file)

In [None]:
# Reload the accuracies written to 'global_accuracies_samplingmodels_probabilities.pkl'.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('global_accuracies_samplingmodels_probabilities.pkl', 'rb') as file:
    loaded_dict = pickle.load(file)
    # Optionally convert back to defaultdict
    # (convert_to_defaultdict is not defined in the visible part of this notebook)
    # my_defaultdict = convert_to_defaultdict(loaded_dict)
    
# Last expression of the cell: displays the reloaded dict.
loaded_dict


In [None]:
# Display the most recently loaded accuracies dict again.
loaded_dict