In [1]:
import pandas as pd
import os
from collections import defaultdict
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pickle

def defaultdict_to_dict(d):
    """Recursively convert defaultdicts (at any nesting depth) into plain dicts.

    Plain ``dict`` containers are traversed as well, so a defaultdict stored
    inside a regular dict is converted too (previously the recursion stopped
    as soon as it met anything that was not itself a defaultdict).

    Parameters
    ----------
    d : object
        A defaultdict, a dict, or any other value.

    Returns
    -------
    object
        A new plain ``dict`` tree when ``d`` is a defaultdict or a plain dict;
        otherwise ``d`` itself, unchanged (other mapping types such as
        ``OrderedDict`` are deliberately left alone).
    """
    if isinstance(d, defaultdict) or type(d) is dict:
        return {key: defaultdict_to_dict(value) for key, value in d.items()}
    return d

In [2]:
def find_first_files_with_str(directory, str_contain, index):
    """Pick one entry of ``directory`` whose name starts with ``str_contain``.

    The matching names are sorted lexicographically and the one at position
    ``index`` is returned.

    Parameters
    ----------
    directory : str
        Folder whose entries are listed with ``os.listdir``.
    str_contain : str
        Required file-name prefix. Any prefix length works; the previous
        implementation compared against the first 4 characters only.
    index : int
        Position in the sorted list of matches (0-based; negative values
        count from the end, as with normal list indexing).

    Returns
    -------
    str or None
        The selected file name (not the full path), or ``None`` when there is
        no match at that position -- either nothing matched at all or
        ``index`` is out of range.
    """
    matches = sorted(name for name in os.listdir(directory) if name.startswith(str_contain))

    try:
        return matches[index]
    except IndexError:
        # Fewer matches than requested: report "not found" instead of crashing.
        return None

In [3]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Experiment layout: three groups, each with ten peers identified by their port number.
groups = [str(i) for i in range(1,4)]
peers = [str(i) for i in range(8090, 8100)]

# Later cells look the loaded objects up by name (train_df_group1,
# test_df_group1_peer3, model_group2_peer10, ...), so they are still published
# as module-level variables -- but by writing into globals() directly instead
# of exec()-ing generated source strings.
notebook_vars = globals()

# Reading individual peer datasets & group datasets: train_df_group1, train_df_group1_peer1
for group in groups:
    datasets_folder = os.path.join('aggregated_results',f'10_peers_10kEpochs_group{group}','datasets')

    peer_train_dfs = []
    for peer in peers:
        peer_nbr = int(peer) - 8089  # port 8090 -> peer 1, ..., port 8099 -> peer 10
        peer_train_df = pd.read_csv(os.path.join(datasets_folder, f'{peer}_df.csv'))
        notebook_vars[f'train_df_group{group}_peer{peer_nbr}'] = peer_train_df
        peer_train_dfs.append(peer_train_df)

    # The group training set is the de-duplicated union of its peers' sets.
    # Concatenate once rather than growing a DataFrame inside the loop.
    group_train_df = pd.concat(peer_train_dfs).drop_duplicates()
    notebook_vars[f'train_df_group{group}'] = group_train_df

    # Creating test_df's: test_df_group1, test_df_group1_peer1.
    # Only test rows whose doc_id occurs in the corresponding training set are kept.
    group_test_df = pd.read_csv(os.path.join(datasets_folder,'test_df.csv'))
    group_test_df = group_test_df[group_test_df['doc_id'].isin(group_train_df['doc_id'].unique())]
    notebook_vars[f'test_df_group{group}'] = group_test_df

    for peer in peers:
        peer_nbr = int(peer) - 8089
        peer_doc_ids = notebook_vars[f'train_df_group{group}_peer{peer_nbr}']['doc_id'].unique()
        notebook_vars[f'test_df_group{group}_peer{peer_nbr}'] = group_test_df[group_test_df['doc_id'].isin(peer_doc_ids)]

# Reading models: model_group1_peer1
for group in groups:
    model_folder = os.path.join('aggregated_results',f'10_peers_10kEpochs_group{group}', 'models')
    for peer in peers:
        # 10 is the largest number of saved models that all peers have finished training
        model_file = find_first_files_with_str(model_folder, peer, 10)
        print (group, peer, model_file)
        if model_file is None:
            # Fail with a clear message instead of a TypeError from os.path.join(..., None).
            raise FileNotFoundError(f'No saved model found for peer {peer} in {model_folder}')
        notebook_vars[f'model_group{group}_peer{int(peer) - 8089}'] = T5ForConditionalGeneration.from_pretrained(os.path.join(model_folder, model_file))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1 8090 8090_2023-12-16 175932_my_t5_model
1 8091 8091_2023-12-16 180341_my_t5_model
1 8092 8092_2023-12-16 180103_my_t5_model
1 8093 8093_2023-12-16 191901_my_t5_model
1 8094 8094_2023-12-16 180320_my_t5_model
1 8095 8095_2023-12-16 180348_my_t5_model
1 8096 8096_2023-12-16 180129_my_t5_model
1 8097 8097_2023-12-16 180007_my_t5_model
1 8098 8098_2023-12-16 191911_my_t5_model
1 8099 8099_2023-12-16 191700_my_t5_model
2 8090 8090_2023-12-17 123459_my_t5_model
2 8091 8091_2023-12-17 123536_my_t5_model
2 8092 8092_2023-12-17 123202_my_t5_model
2 8093 8093_2023-12-17 123152_my_t5_model
2 8094 8094_2023-12-17 124010_my_t5_model
2 8095 8095_2023-12-17 123723_my_t5_model
2 8096 8096_2023-12-17 140032_my_t5_model
2 8097 8097_2023-12-17 123518_my_t5_model
2 8098 8098_2023-12-17 123711_my_t5_model
2 8099 8099_2023-12-17 123633_my_t5_model
3 8090 8090_2023-12-18 082034_my_t5_model
3 8091 8091_2023-12-18 070742_my_t5_model
3 8092 8092_2023-12-18 070621_my_t5_model
3 8093 8093_2023-12-18 081950_my_t

In [18]:
# Number of distinct doc_ids in group 2's test set.
test_df_group2['doc_id'].nunique()

2032

In [4]:
# Number of distinct doc_ids in each group's training set, groups 1 to 3 in order.
for df_group in (train_df_group1, train_df_group2, train_df_group3):
    print (df_group['doc_id'].nunique())

2001
2032
2002


# Sampling random models and aggregating their suggestions - 5 beams

In [5]:
# One list of loaded models per group, ordered by peer (peer 1 .. peer 10).
models_group1_list, models_group2_list, models_group3_list = [], [], []

for group in groups:
    group_models = globals()[f'models_group{group}_list']
    for i, peer in enumerate(peers):
        # Ports 8090..8099 map to peers 1..10.
        group_models.append(globals()[f'model_group{group}_peer{int(peer)-8089}'])
       

In [5]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Evaluate a random-subset ensemble of models by majority vote (top-5 answers).

    For every query, ``n_models`` models are drawn at random from
    ``model_list``. Each of them produces 5 beam-search candidates, every
    distinct decoded candidate receives one vote per model, and the ``top_k``
    most-voted candidates are kept as the prediction list.

    Parameters
    ----------
    model_list : list
        Models exposing ``generate(input_ids, ...)``.
    train_df, test_df : pandas.DataFrame
        Must contain the columns ``query`` and ``doc_id``. Both are copied, so
        the caller's frames are never modified.
    tokenizer
        Object exposing ``encode(query, return_tensors='pt')`` and
        ``decode(ids, skip_special_tokens=True)``.
    n_models : int, default 5
        Number of models sampled per query (clamped to ``len(model_list)``).
    top_k : int, default 5
        Number of aggregated candidates kept per query.
    seed : int or None, default None
        When given, model sampling uses a private ``random.Random(seed)`` and
        becomes reproducible; when ``None`` the module-level ``random``
        generator is used.
    """

    def __init__(self, model_list, train_df, test_df, tokenizer, n_models=5, top_k=5, seed=None):
        self.model_list = model_list
        # Work on copies: evaluate_accuracy() adds a column to both frames.
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0  # number of queries processed so far (progress logging only)
        self.n_models = n_models
        self.top_k = top_k
        self._rng = random.Random(seed) if seed is not None else random

    def generate_text_beams(self, query):
        """Return the ``top_k`` most-voted decoded candidates for ``query``."""
        self.counter += 1
        if self.counter % 1000 == 0:
            print(f"Processed {self.counter} queries")

        results = defaultdict(float)
        # random.sample raises ValueError when asked for more items than exist.
        sample_size = min(self.n_models, len(self.model_list))
        sampled_models = self._rng.sample(self.model_list, sample_size)

        # The encoding does not depend on the model, so tokenize once per query.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=5, num_return_sequences=5)

            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]
            # One vote per model and distinct text. dict.fromkeys de-duplicates
            # while keeping beam order, so ties are broken deterministically
            # (a set would order strings differently from run to run).
            for res in dict.fromkeys(model_res):
                results[res] += 1

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the keys of ``input_dict`` with the highest scores, best first.

        At most ``self.top_k`` keys are returned (5 by default, hence the name).
        """
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate predictions for the train and test sets and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Fraction of rows whose ``doc_id`` appears in the row's ``generated_doc_id`` list.

        Returns ``nan`` for an empty frame (there is nothing to score).
        """
        if df.shape[0] == 0:
            return float('nan')
        hits = df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1)
        return hits.sum() / df.shape[0]

# run_evaluation() below looks up these module-level names, which must already
# exist when the threads are started:
#   models_group{N}_list  - list of loaded models for group N
#   train_df_group{N}     - training DataFrame for group N
#   test_df_group{N}      - test DataFrame for group N
# The shared tokenizer instance is passed in explicitly.

def run_evaluation(group_nbr, tokenizer):
    """Evaluate one group and record its manager and accuracies.

    Reads ``models_group{group_nbr}_list``, ``train_df_group{group_nbr}`` and
    ``test_df_group{group_nbr}`` from the module namespace, builds a
    ``ModelManager`` from them, and stores the manager in ``model_managers``
    and the resulting accuracies in ``global_accuracies`` under ``group_nbr``.
    """
    namespace = globals()
    group_models = namespace[f'models_group{group_nbr}_list']
    group_train = namespace[f'train_df_group{group_nbr}']
    group_test = namespace[f'test_df_group{group_nbr}']

    manager = ModelManager(group_models, group_train, group_test, tokenizer)
    # Register the manager before the long evaluation so it can be inspected meanwhile.
    model_managers[group_nbr] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[group_nbr] = dict(acc_train=acc_train, acc_test=acc_test)

    print(f"Group: {group_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Module-level result stores, keyed by group number and filled by run_evaluation().
model_managers = {}
global_accuracies = {}

# One worker thread per group (1..3); each is started as soon as it is created.
threads = []
for group_nbr in (1, 2, 3):
    threads.append(threading.Thread(target=run_evaluation, args=(group_nbr, tokenizer)))
    threads[-1].start()

# Block until every group has finished.
for thread in threads:
    thread.join()


NameError: name 'groups' is not defined

In [7]:
global_accuracies

{3: {'acc_train': 0.9999399291163573, 'acc_test': 0.9535204510015594},
 1: {'acc_train': 0.9996957156767283, 'acc_test': 0.9482507288629738},
 2: {'acc_train': 0.9995378660967015, 'acc_test': 0.9451786023428934}}

In [8]:
# Persist the accuracies collected above. Any defaultdict is turned into a
# plain dict first, so the pickle holds only built-in containers.
regular_dict = defaultdict_to_dict(global_accuracies)

with open('accuracies_samplingmodels_5beams_intragroup.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

In [9]:
# Read the saved accuracies back and display them.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
with open('accuracies_samplingmodels_5beams_intragroup.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

{3: {'acc_train': 0.9999399291163573, 'acc_test': 0.9535204510015594},
 1: {'acc_train': 0.9996957156767283, 'acc_test': 0.9482507288629738},
 2: {'acc_train': 0.9995378660967015, 'acc_test': 0.9451786023428934}}

# Sampling random models and aggregating their suggestions - 5 beams, with probabilities

In [5]:
# One list of loaded models per group, ordered by peer (peer 1 .. peer 10).
models_group1_list, models_group2_list, models_group3_list = [], [], []

for group in groups:
    group_models = globals()[f'models_group{group}_list']
    for i, peer in enumerate(peers):
        # Ports 8090..8099 map to peers 1..10.
        group_models.append(globals()[f'model_group{group}_peer{int(peer)-8089}'])
       

In [6]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Evaluate a random-subset ensemble of models with probability-weighted voting (top-5 answers).

    For every query, ``n_models`` models are drawn at random from
    ``model_list``. Each of them produces 5 beam-search candidates (at most 20
    tokens long); the beam scores of one model are turned into weights with a
    softmax, the weights are summed per decoded text over all sampled models,
    and the ``top_k`` highest-weighted texts are kept as the prediction list.

    Parameters
    ----------
    model_list : list
        Models exposing ``generate(input_ids, ...)`` whose output has
        ``sequences`` and ``sequences_scores``.
    train_df, test_df : pandas.DataFrame
        Must contain the columns ``query`` and ``doc_id``. Both are copied, so
        the caller's frames are never modified.
    tokenizer
        Object exposing ``encode(query, return_tensors='pt')`` and
        ``decode(ids, skip_special_tokens=True)``.
    n_models : int, default 5
        Number of models sampled per query (clamped to ``len(model_list)``).
    top_k : int, default 5
        Number of aggregated candidates kept per query.
    seed : int or None, default None
        When given, model sampling uses a private ``random.Random(seed)`` and
        becomes reproducible; when ``None`` the module-level ``random``
        generator is used.
    """

    def __init__(self, model_list, train_df, test_df, tokenizer, n_models=5, top_k=5, seed=None):
        self.model_list = model_list
        # Work on copies: evaluate_accuracy() adds a column to both frames.
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0  # number of queries processed so far (progress logging only)
        self.n_models = n_models
        self.top_k = top_k
        self._rng = random.Random(seed) if seed is not None else random

    def generate_text_beams(self, query):
        """Return the ``top_k`` highest-weighted decoded candidates for ``query``."""
        self.counter += 1
        if self.counter % 1000 == 0:
            print(f"Processed {self.counter} queries")

        results = defaultdict(float)
        # random.sample raises ValueError when asked for more items than exist.
        sample_size = min(self.n_models, len(self.model_list))
        sampled_models = self._rng.sample(self.model_list, sample_size)

        # The encoding does not depend on the model, so tokenize once per query.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=5, num_return_sequences=5, max_length = 20)

            # Softmax over this model's beam scores: relative weights that sum to 1 per model.
            beam_scores = output.sequences_scores
            probabilities = softmax(beam_scores, dim=0).tolist()
            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            # Identical decoded texts accumulate their weights (no de-duplication on purpose).
            for res, prob in zip(model_res, probabilities):
                results[res] += prob

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the keys of ``input_dict`` with the highest scores, best first.

        At most ``self.top_k`` keys are returned (5 by default, hence the name).
        """
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate predictions for the train and test sets and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Fraction of rows whose ``doc_id`` appears in the row's ``generated_doc_id`` list.

        Returns ``nan`` for an empty frame (there is nothing to score).
        """
        if df.shape[0] == 0:
            return float('nan')
        hits = df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1)
        return hits.sum() / df.shape[0]

# run_evaluation() below looks up these module-level names, which must already
# exist when the threads are started:
#   models_group{N}_list  - list of loaded models for group N
#   train_df_group{N}     - training DataFrame for group N
#   test_df_group{N}      - test DataFrame for group N
# The shared tokenizer instance is passed in explicitly.

def run_evaluation(group_nbr, tokenizer):
    """Evaluate one group and record its manager and accuracies.

    Reads ``models_group{group_nbr}_list``, ``train_df_group{group_nbr}`` and
    ``test_df_group{group_nbr}`` from the module namespace, builds a
    ``ModelManager`` from them, and stores the manager in ``model_managers``
    and the resulting accuracies in ``global_accuracies`` under ``group_nbr``.
    """
    namespace = globals()
    group_models = namespace[f'models_group{group_nbr}_list']
    group_train = namespace[f'train_df_group{group_nbr}']
    group_test = namespace[f'test_df_group{group_nbr}']

    manager = ModelManager(group_models, group_train, group_test, tokenizer)
    # Register the manager before the long evaluation so it can be inspected meanwhile.
    model_managers[group_nbr] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[group_nbr] = dict(acc_train=acc_train, acc_test=acc_test)

    print(f"Group: {group_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Module-level result stores, keyed by group number and filled by run_evaluation().
model_managers = {}
global_accuracies = {}

# One worker thread per group (1..3); each is started as soon as it is created.
threads = []
for group_nbr in (1, 2, 3):
    threads.append(threading.Thread(target=run_evaluation, args=(group_nbr, tokenizer)))
    threads[-1].start()

# Block until every group has finished.
for thread in threads:
    thread.join()


train set size: 16432
test set size: 16464
train set size: 17311
test set size: 17329
train set size: 16647
test set size: 16674
Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 3000 queries
Processed 3000 queries
Processed 3000 queries
Processed 4000 queries
Processed 4000 queries
Processed 4000 queries
Processed 5000 queries
Processed 5000 queries
Processed 5000 queries
Processed 6000 queries
Processed 6000 queries
Processed 6000 queries
Processed 7000 queries
Processed 7000 queries
Processed 7000 queries
Processed 8000 queries
Processed 8000 queries
Processed 8000 queries
Processed 9000 queries
Processed 9000 queries
Processed 9000 queries
Processed 10000 queries
Processed 10000 queries
Processed 10000 queries
Processed 11000 queries
Processed 11000 queries
Processed 11000 queries
Processed 12000 queries
Processed 12000 queries
Processed 12000 queries
Processed 13000 queries
Processed 

In [8]:
# Persist the accuracies collected above. Any defaultdict is turned into a
# plain dict first, so the pickle holds only built-in containers.
regular_dict = defaultdict_to_dict(global_accuracies)

with open('accuracies_samplingmodels_5beams_probabilities_intragroup.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

In [9]:
# Read the saved accuracies back and display them.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
with open('accuracies_samplingmodels_5beams_probabilities_intragroup.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

{3: {'acc_train': 0.9999399291163573, 'acc_test': 0.9568789732517692},
 1: {'acc_train': 0.9998782862706913, 'acc_test': 0.9523809523809523},
 2: {'acc_train': 0.9998844665241754, 'acc_test': 0.9489872468117029}}

# Sampling random models and aggregating their suggestions - top-1 prediction (still 5 beams per model), with probabilities

In [5]:
# One list of loaded models per group, ordered by peer (peer 1 .. peer 10).
models_group1_list, models_group2_list, models_group3_list = [], [], []

for group in groups:
    group_models = globals()[f'models_group{group}_list']
    for i, peer in enumerate(peers):
        # Ports 8090..8099 map to peers 1..10.
        group_models.append(globals()[f'model_group{group}_peer{int(peer)-8089}'])
       

In [6]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Evaluate a random-subset ensemble of models with probability-weighted voting (top-1 answer).

    For every query, ``n_models`` models are drawn at random from
    ``model_list``. Each of them still produces 5 beam-search candidates; the
    beam scores of one model are turned into weights with a softmax, the
    weights are summed per decoded text over all sampled models, and only the
    ``top_k`` (default: 1) highest-weighted texts are kept as the prediction.

    Parameters
    ----------
    model_list : list
        Models exposing ``generate(input_ids, ...)`` whose output has
        ``sequences`` and ``sequences_scores``.
    train_df, test_df : pandas.DataFrame
        Must contain the columns ``query`` and ``doc_id``. Both are copied, so
        the caller's frames are never modified.
    tokenizer
        Object exposing ``encode(query, return_tensors='pt')`` and
        ``decode(ids, skip_special_tokens=True)``.
    n_models : int, default 5
        Number of models sampled per query (clamped to ``len(model_list)``).
    top_k : int, default 1
        Number of aggregated candidates kept per query.
    seed : int or None, default None
        When given, model sampling uses a private ``random.Random(seed)`` and
        becomes reproducible; when ``None`` the module-level ``random``
        generator is used.
    """

    def __init__(self, model_list, train_df, test_df, tokenizer, n_models=5, top_k=1, seed=None):
        self.model_list = model_list
        # Work on copies: evaluate_accuracy() adds a column to both frames.
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0  # number of queries processed so far (progress logging only)
        self.n_models = n_models
        self.top_k = top_k
        self._rng = random.Random(seed) if seed is not None else random

    def generate_text_beams(self, query):
        """Return the ``top_k`` highest-weighted decoded candidates for ``query``."""
        self.counter += 1
        if self.counter % 1000 == 0:
            print(f"Processed {self.counter} queries")

        results = defaultdict(float)
        # random.sample raises ValueError when asked for more items than exist.
        sample_size = min(self.n_models, len(self.model_list))
        sampled_models = self._rng.sample(self.model_list, sample_size)

        # The encoding does not depend on the model, so tokenize once per query.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=5, num_return_sequences=5)

            # Softmax over this model's beam scores: relative weights that sum to 1 per model.
            beam_scores = output.sequences_scores
            probabilities = softmax(beam_scores, dim=0).tolist()
            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            # Identical decoded texts accumulate their weights (no de-duplication on purpose).
            for res, prob in zip(model_res, probabilities):
                results[res] += prob

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the keys of ``input_dict`` with the highest scores, best first.

        The method name is kept for compatibility; the number of keys returned
        is ``self.top_k``, which defaults to 1 in this variant.
        """
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate predictions for the train and test sets and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Fraction of rows whose ``doc_id`` appears in the row's ``generated_doc_id`` list.

        Returns ``nan`` for an empty frame (there is nothing to score).
        """
        if df.shape[0] == 0:
            return float('nan')
        hits = df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1)
        return hits.sum() / df.shape[0]

# run_evaluation() below looks up these module-level names, which must already
# exist when the threads are started:
#   models_group{N}_list  - list of loaded models for group N
#   train_df_group{N}     - training DataFrame for group N
#   test_df_group{N}      - test DataFrame for group N
# The shared tokenizer instance is passed in explicitly.

def run_evaluation(group_nbr, tokenizer):
    """Evaluate one group and record its manager and accuracies.

    Reads ``models_group{group_nbr}_list``, ``train_df_group{group_nbr}`` and
    ``test_df_group{group_nbr}`` from the module namespace, builds a
    ``ModelManager`` from them, and stores the manager in ``model_managers``
    and the resulting accuracies in ``global_accuracies`` under ``group_nbr``.
    """
    namespace = globals()
    group_models = namespace[f'models_group{group_nbr}_list']
    group_train = namespace[f'train_df_group{group_nbr}']
    group_test = namespace[f'test_df_group{group_nbr}']

    manager = ModelManager(group_models, group_train, group_test, tokenizer)
    # Register the manager before the long evaluation so it can be inspected meanwhile.
    model_managers[group_nbr] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[group_nbr] = dict(acc_train=acc_train, acc_test=acc_test)

    print(f"Group: {group_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Module-level result stores, keyed by group number and filled by run_evaluation().
model_managers = {}
global_accuracies = {}

# One worker thread per group (1..3); each is started as soon as it is created.
threads = []
for group_nbr in (1, 2, 3):
    threads.append(threading.Thread(target=run_evaluation, args=(group_nbr, tokenizer)))
    threads[-1].start()

# Block until every group has finished.
for thread in threads:
    thread.join()


train set size: 16432
test set size: 16464
train set size: 17311
test set size: 17329
train set size: 16647
test set size: 16674




Processed 1000 queries
Processed 1000 queries
Processed 1000 queries
Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 3000 queries
Processed 3000 queries
Processed 3000 queries
Processed 4000 queries
Processed 4000 queries
Processed 4000 queries
Processed 5000 queries
Processed 5000 queries
Processed 5000 queries
Processed 6000 queries
Processed 6000 queries
Processed 6000 queries
Processed 7000 queries
Processed 7000 queries
Processed 7000 queries
Processed 8000 queries
Processed 8000 queries
Processed 8000 queries
Processed 9000 queries
Processed 9000 queries
Processed 9000 queries
Processed 10000 queries
Processed 10000 queries
Processed 10000 queries
Processed 11000 queries
Processed 11000 queries
Processed 11000 queries
Processed 12000 queries
Processed 12000 queries
Processed 12000 queries
Processed 13000 queries
Processed 13000 queries
Processed 13000 queries
Processed 14000 queries
Processed 14000 queries
Processed 14000 queries
Processed 15000 que



Processed 17000 queries
Processed 17000 queries




Processed 17000 queries
Processed 18000 queries
Processed 18000 queries
Processed 18000 queries
Processed 19000 queries
Processed 19000 queries
Processed 19000 queries
Processed 20000 queries
Processed 20000 queries
Processed 20000 queries
Processed 21000 queries
Processed 21000 queries
Processed 21000 queries
Processed 22000 queries
Processed 22000 queries
Processed 22000 queries
Processed 23000 queries
Processed 23000 queries
Processed 23000 queries
Processed 24000 queries
Processed 24000 queries
Processed 24000 queries
Processed 25000 queries
Processed 25000 queries
Processed 25000 queries
Processed 26000 queries
Processed 26000 queries
Processed 26000 queries
Processed 27000 queries
Processed 27000 queries
Processed 27000 queries
Processed 28000 queries
Processed 28000 queries
Processed 28000 queries
Processed 29000 queries
Processed 29000 queries
Processed 29000 queries
Processed 30000 queries
Processed 30000 queries
Processed 30000 queries
Processed 31000 queries
Processed 31000 



Group: 1, Train Acc: 0.9956791626095424, Test Acc: 0.9207968901846453




Processed 34000 queries
Group: 2, Train Acc: 0.9965339957252614, Test Acc: 0.9186335045299786


In [7]:
# Persist the accuracies collected above. Any defaultdict is turned into a
# plain dict first, so the pickle holds only built-in containers.
regular_dict = defaultdict_to_dict(global_accuracies)

with open('accuracies_samplingmodels_1beam_probabilities_intragroup.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

In [8]:
# Read the saved accuracies back and display them.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
with open('accuracies_samplingmodels_1beam_probabilities_intragroup.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

{3: {'acc_train': 0.9967561722832943, 'acc_test': 0.9292311382991484},
 1: {'acc_train': 0.9956791626095424, 'acc_test': 0.9207968901846453},
 2: {'acc_train': 0.9965339957252614, 'acc_test': 0.9186335045299786}}

# Sampling random models and aggregating their suggestions - top-1 prediction (still 5 beams per model)

In [9]:
# One list of loaded models per group, ordered by peer (peer 1 .. peer 10).
models_group1_list, models_group2_list, models_group3_list = [], [], []

for group in groups:
    group_models = globals()[f'models_group{group}_list']
    for i, peer in enumerate(peers):
        # Ports 8090..8099 map to peers 1..10.
        group_models.append(globals()[f'model_group{group}_peer{int(peer)-8089}'])
       

In [10]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Evaluate a random-subset ensemble of models by vote counting (top-1 answer).

    For every query, ``n_models`` models are drawn at random from
    ``model_list``. Each of them still produces 5 beam-search candidates, every
    decoded candidate adds one vote for its text, and only the ``top_k``
    (default: 1) most-voted texts are kept as the prediction.

    Parameters
    ----------
    model_list : list
        Models exposing ``generate(input_ids, ...)`` whose output has ``sequences``.
    train_df, test_df : pandas.DataFrame
        Must contain the columns ``query`` and ``doc_id``. Both are copied, so
        the caller's frames are never modified.
    tokenizer
        Object exposing ``encode(query, return_tensors='pt')`` and
        ``decode(ids, skip_special_tokens=True)``.
    n_models : int, default 5
        Number of models sampled per query (clamped to ``len(model_list)``).
    top_k : int, default 1
        Number of aggregated candidates kept per query.
    seed : int or None, default None
        When given, model sampling uses a private ``random.Random(seed)`` and
        becomes reproducible; when ``None`` the module-level ``random``
        generator is used.
    """

    def __init__(self, model_list, train_df, test_df, tokenizer, n_models=5, top_k=1, seed=None):
        self.model_list = model_list
        # Work on copies: evaluate_accuracy() adds a column to both frames.
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.counter = 0  # number of queries processed so far (progress logging only)
        self.n_models = n_models
        self.top_k = top_k
        self._rng = random.Random(seed) if seed is not None else random

    def generate_text_beams(self, query):
        """Return the ``top_k`` most-voted decoded candidates for ``query``."""
        self.counter += 1
        if self.counter % 2000 == 0:
            print(f"Processed {self.counter} queries")

        results = defaultdict(float)
        # random.sample raises ValueError when asked for more items than exist.
        sample_size = min(self.n_models, len(self.model_list))
        sampled_models = self._rng.sample(self.model_list, sample_size)

        # The encoding does not depend on the model, so tokenize once per query.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=5, num_return_sequences=5)

            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            # NOTE(review): candidates are NOT de-duplicated per model here, so a
            # model whose beams decode to the same text votes for it more than
            # once. Kept as-is because the saved accuracies were produced this
            # way -- confirm whether one vote per model was intended.
            for res in model_res:
                results[res] += 1

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        """Return the keys of ``input_dict`` with the highest scores, best first.

        The method name is kept for compatibility; the number of keys returned
        is ``self.top_k``, which defaults to 1 in this variant.
        """
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:self.top_k]]

    def evaluate_accuracy(self):
        """Generate predictions for the train and test sets and return ``(acc_train, acc_test)``."""
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        """Fraction of rows whose ``doc_id`` appears in the row's ``generated_doc_id`` list.

        Returns ``nan`` for an empty frame (there is nothing to score).
        """
        if df.shape[0] == 0:
            return float('nan')
        hits = df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1)
        return hits.sum() / df.shape[0]

# run_evaluation() below looks up these module-level names, which must already
# exist when the threads are started:
#   models_group{N}_list  - list of loaded models for group N
#   train_df_group{N}     - training DataFrame for group N
#   test_df_group{N}      - test DataFrame for group N
# The shared tokenizer instance is passed in explicitly.

def run_evaluation(group_nbr, tokenizer):
    """Evaluate one group and record its manager and accuracies.

    Reads ``models_group{group_nbr}_list``, ``train_df_group{group_nbr}`` and
    ``test_df_group{group_nbr}`` from the module namespace, builds a
    ``ModelManager`` from them, and stores the manager in ``model_managers``
    and the resulting accuracies in ``global_accuracies`` under ``group_nbr``.
    """
    namespace = globals()
    group_models = namespace[f'models_group{group_nbr}_list']
    group_train = namespace[f'train_df_group{group_nbr}']
    group_test = namespace[f'test_df_group{group_nbr}']

    manager = ModelManager(group_models, group_train, group_test, tokenizer)
    # Register the manager before the long evaluation so it can be inspected meanwhile.
    model_managers[group_nbr] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[group_nbr] = dict(acc_train=acc_train, acc_test=acc_test)

    print(f"Group: {group_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Module-level result stores, keyed by group number and filled by run_evaluation().
model_managers = {}
global_accuracies = {}

# One worker thread per group (1..3); each is started as soon as it is created.
threads = []
for group_nbr in (1, 2, 3):
    threads.append(threading.Thread(target=run_evaluation, args=(group_nbr, tokenizer)))
    threads[-1].start()

# Block until every group has finished.
for thread in threads:
    thread.join()


train set size:train set size: 16647
test set size: 16674
 16432
test set size: 16464
train set size: 17311
test set size: 17329




Processed 2000 queries
Processed 2000 queries
Processed 2000 queries
Processed 4000 queries
Processed 4000 queries
Processed 4000 queries
Processed 6000 queries
Processed 6000 queries
Processed 6000 queries
Processed 8000 queries
Processed 8000 queries
Processed 8000 queries
Processed 10000 queries
Processed 10000 queries
Processed 10000 queries
Processed 12000 queries
Processed 12000 queries
Processed 12000 queries
Processed 14000 queries
Processed 14000 queries
Processed 14000 queries
Processed 16000 queries
Processed 16000 queries
Processed 16000 queries




Processed 18000 queries
Processed 18000 queries
Processed 18000 queries
Processed 20000 queries
Processed 20000 queries
Processed 20000 queries
Processed 22000 queries
Processed 22000 queries
Processed 22000 queries
Processed 24000 queries
Processed 24000 queries
Processed 24000 queries
Processed 26000 queries
Processed 26000 queries
Processed 26000 queries
Processed 28000 queries
Processed 28000 queries
Processed 28000 queries
Processed 30000 queries
Processed 30000 queries
Processed 30000 queries
Processed 32000 queries
Processed 32000 queries
Processed 32000 queries
Group: 3, Train Acc: 0.9962756052141527, Test Acc: 0.9232337771380592




Group: 1, Train Acc: 0.9939143135345667, Test Acc: 0.915877065111759




Processed 34000 queries
Group: 2, Train Acc: 0.9951475940153659, Test Acc: 0.9151133937330487


In [12]:
# Persist the accuracies collected above. Any defaultdict is turned into a
# plain dict first, so the pickle holds only built-in containers.
regular_dict = defaultdict_to_dict(global_accuracies)

with open('accuracies_samplingmodels_1beam_intragroup.pkl', 'wb') as file:
    file.write(pickle.dumps(regular_dict))

In [14]:
# Read the saved accuracies back and display them.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
with open('accuracies_samplingmodels_1beam_intragroup.pkl', 'rb') as file:
    loaded_dict = pickle.loads(file.read())

loaded_dict

{3: {'acc_train': 0.9962756052141527, 'acc_test': 0.9232337771380592},
 1: {'acc_train': 0.9939143135345667, 'acc_test': 0.915877065111759},
 2: {'acc_train': 0.9951475940153659, 'acc_test': 0.9151133937330487}}