In [None]:

import pickle
import numpy as np
import pandas as pd
import  os
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# Dataset to evaluate: keep exactly one assignment active.
dataset_name = 'cmu_book'
# dataset_name = 'cmu_movie'

# Embedding models to evaluate; each name is also the suffix of that model's embeddings file.
EVAL_MODEL_LIST = [
    'all-mpnet-base-v2',
    'all-distilroberta-v1',
    'all-MiniLM-L6-v2',
    'gemini',
    'multi-qa-distilbert-cos-v1',
    'paraphrase-MiniLM-L6-v2',
    'distiluse-base-multilingual-cased-v1',
    'distiluse-base-multilingual-cased-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'msmarco-distilbert-cos-v5',
    'multi-qa-mpnet-base-cos-v1',
    'text-embedding-3-small',
    'text-embedding-3-large',
    'voyage-3-lite',
]


In [None]:
# Select the benchmarking type: keep exactly one assignment active.

BENCHMARKING_TYPE = 'p_y_c_y_default'                # default benchmarking (cmu book / cmu movie)
# BENCHMARKING_TYPE = 'p_y_c_n_loc_exists_no'        # contains person, does not contain country/city etc. (cmu book only)
# BENCHMARKING_TYPE = 'p_y_c_n_country_exists_yes'   # contains person, contains country/city etc.; perturb person and country (cmu book only)
# BENCHMARKING_TYPE = 'p_y_c_n_same_country_names'   # contains person, does not contain country/city etc.; perturb person names from the same country (cmu book only)
# BENCHMARKING_TYPE = 'p_y_c_n_loc_any'              # contains person, may or may not contain country/city etc.; perturb person names only (cmu book only)


# Directory holding the pickled embeddings for the selected dataset and benchmarking type.
if BENCHMARKING_TYPE != 'p_y_c_n_same_country_names':
    path_embed_prefix = f'BENCHMARKING/{dataset_name}/EMBEDDINGS_{BENCHMARKING_TYPE}'
else:
    # The same-country variant keeps its embeddings in a per-country subdirectory.
    country_people = 'Spain'    # other values noted by the author: France, India
    path_embed_prefix = f'BENCHMARKING/{dataset_name}/EMBEDDINGS_{BENCHMARKING_TYPE}/{country_people}'

In [None]:
# These benchmarking types use a shorter model list: the two 'text-embedding-3-*' entries are left out.
if BENCHMARKING_TYPE in ('p_y_c_n_loc_exists_no', 'p_y_c_n_loc_any', 'p_y_c_n_same_country_names'):
    EVAL_MODEL_LIST = [
        'all-mpnet-base-v2',
        'all-distilroberta-v1',
        'all-MiniLM-L6-v2',
        'gemini',
        'multi-qa-distilbert-cos-v1',
        'paraphrase-MiniLM-L6-v2',
        'distiluse-base-multilingual-cased-v1',
        'distiluse-base-multilingual-cased-v2',
        'paraphrase-multilingual-MiniLM-L12-v2',
        'msmarco-distilbert-cos-v5',
        'multi-qa-mpnet-base-cos-v1',
        'voyage-3-lite',
    ]



In [None]:
# Show the embeddings directory resolved for the selected dataset / benchmarking type.
path_embed_prefix

In [None]:
# Show the final list of models that will be evaluated (after any per-type override).
EVAL_MODEL_LIST

In [None]:
# Displays the embeddings directory again (same expression as an earlier cell).
path_embed_prefix

In [None]:
# Maps each model name to the object unpickled from that model's embeddings file.
dict_embeddings_model = dict()

for model_name in EVAL_MODEL_LIST:
    # One pickle per model, stored as <path_embed_prefix>/<dataset_name>_<model_name>.
    path_model = f'{path_embed_prefix}/{dataset_name}_{model_name}'
    print('path_model ', path_model)

    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    with open(path_model, "rb") as f:
        loaded_data = pickle.load(f)
        dict_embeddings_model[model_name] = loaded_data


In [None]:
# Inspect the columns of one loaded dataset. `model_name` is the variable left over from
# the loading loop, so this shows the last model in EVAL_MODEL_LIST.
dict_embeddings_model[model_name].columns

In [None]:
# Preview the first row of the same dataset (`model_name` is still the last model from the loading loop).
display(dict_embeddings_model[model_name].head(1))

In [None]:


# Module-level stores filled by compute_similarity_numbers:
#   model_name -> {movie_id -> array of pairwise values for that row}
dict_collection_of_similarities_cosine = dict()
dict_collection_of_distances_euclidean = dict()

def get_lower_diagonal(matrix):
    """Return the entries strictly below the main diagonal of ``matrix`` as a 1-D array.

    The index grid is built from ``matrix.shape[0]`` only, so the input is
    treated as square. Entries come out in row-major order.
    """
    size = matrix.shape[0]
    row_idx, col_idx = np.tril_indices(size, k=-1)
    return matrix[row_idx, col_idx]


def compute_similarity_numbers(dataset_country_yes_city_no_applicable, model_name, embed_prefix):
    """Summarise how similar the embeddings within each row are to one another.

    For every row, the values of all columns whose name starts with
    ``embed_prefix`` and contains ``model_name`` are stacked into a matrix.
    Pairwise cosine similarities and Euclidean distances are computed between
    them, and only the strictly-lower-triangle entries are kept, so each
    unordered pair is counted once and self-comparisons are excluded.

    Side effects: resets and fills ``dict_collection_of_similarities_cosine[model_name]``
    and ``dict_collection_of_distances_euclidean[model_name]`` with the per-row
    arrays, keyed by the row's ``'movie_id'`` value. Also prints the number of
    pooled pairs.

    Args:
        dataset_country_yes_city_no_applicable: DataFrame with a ``'movie_id'``
            column and the embedding columns described above.
        model_name: Model identifier; used as a substring filter on column
            names and as the key in the module-level result dictionaries.
        embed_prefix: Prefix that embedding column names must start with.

    Returns:
        Tuple ``(mean_cosine_similarity, std_err_cosine, mean_euclidean_distance,
        std_err_euclidean, num_rows)``. Each standard error is ``np.std``
        (default ``ddof``) of the pooled values divided by the square root of
        their count.

    Raises:
        ValueError: if the frame has rows but no column matches the prefix and
            model filters.
    """
    dict_collection_of_similarities_cosine[model_name] = {}
    dict_collection_of_distances_euclidean[model_name] = {}

    # The matching columns are the same for every row, so select them once.
    embedding_columns = [
        col for col in dataset_country_yes_city_no_applicable.columns
        if col.startswith(embed_prefix) and model_name in col
    ]

    # Without this check the pairwise functions reject the empty array with a
    # far less helpful ValueError about array dimensionality.
    if len(dataset_country_yes_city_no_applicable) > 0 and not embedding_columns:
        raise ValueError(
            f"No columns start with {embed_prefix!r} and contain {model_name!r}; "
            "cannot compute pairwise similarities."
        )

    all_cosine_similarities = []
    all_euclidean_distances = []

    for index, row in dataset_country_yes_city_no_applicable.iterrows():
        # One embedding per matching column for this row.
        embeddings_list_same_show = np.array([row[col] for col in embedding_columns])

        cosine_sim_matrix_show = cosine_similarity(embeddings_list_same_show)
        euclidean_dist_matrix_show = euclidean_distances(embeddings_list_same_show)

        # Both matrices are symmetric; keep each unordered pair once.
        cosine_similarities_sample = get_lower_diagonal(cosine_sim_matrix_show)
        euclidean_distances_sample = get_lower_diagonal(euclidean_dist_matrix_show)

        all_cosine_similarities.extend(cosine_similarities_sample.tolist())
        all_euclidean_distances.extend(euclidean_distances_sample.tolist())

        movie_id = row['movie_id']
        dict_collection_of_similarities_cosine[model_name][movie_id] = cosine_similarities_sample
        dict_collection_of_distances_euclidean[model_name][movie_id] = euclidean_distances_sample

    print('len all_cosine_similarities ', len(all_cosine_similarities))
    mean_cosine_similarity = np.mean(all_cosine_similarities)
    std_err_cosine = np.std(all_cosine_similarities) / np.sqrt(len(all_cosine_similarities))

    mean_euclidean_distance = np.mean(all_euclidean_distances)
    std_err_euclidean = np.std(all_euclidean_distances) / np.sqrt(len(all_euclidean_distances))

    return (
        mean_cosine_similarity,
        std_err_cosine,
        mean_euclidean_distance,
        std_err_euclidean,
        len(dataset_country_yes_city_no_applicable),
    )

# One summary record per model; turned into a DataFrame in a later cell.
global_averages_list = []

# Embedding columns are picked by this name prefix inside compute_similarity_numbers.
embed_prefix = 'embed_plot'

for model_name in EVAL_MODEL_LIST:
    print('model_name ', model_name)
    dataset_with_embedding = dict_embeddings_model[model_name]

    (
        cosine_sim_mean,
        cosine_sim_std_err,
        euclidean_dist_mean,
        euclidean_dist_std_err,
        num_samples_applicable,
    ) = compute_similarity_numbers(dataset_with_embedding, model_name, embed_prefix)

    global_averages = dict(
        model_name=model_name,
        cosine_sim_mean=cosine_sim_mean,
        cosine_sim_std_err=cosine_sim_std_err,
        euclidean_dist_mean=euclidean_dist_mean,
        euclidean_dist_std_err=euclidean_dist_std_err,
        num_applicable_samples=num_samples_applicable,
    )
    global_averages_list.append(global_averages)

In [None]:
# Tabulate the per-model records: one row per model, one column per statistic.
global_averages_df = pd.DataFrame(data=global_averages_list)
display(global_averages_df)

In [None]:
# Round for presentation: 4 decimals for the standard error, 3 for the mean.
# Columns not named in the mapping are left untouched.
global_averages_df = global_averages_df.round({'cosine_sim_std_err': 4, 'cosine_sim_mean': 3})

In [None]:
# Show the table again with the rounded cosine columns.
display(global_averages_df)

In [None]:
# Give the mean-cosine column the heading it should carry in the exported table.
column_labels = {'cosine_sim_mean': 'Cosine sim per perturbation pair'}
global_averages_df = global_averages_df.rename(columns=column_labels)
display(global_averages_df)

In [None]:
import pandas as pd

# Combine mean and standard error into one "<mean> \pm <std_err>" string per model.
pair_col = 'Cosine sim per perturbation pair'

# Only format while the column is still numeric, so re-running this cell does not
# append a second "\pm <std_err>" to strings that were already formatted.
if pd.api.types.is_numeric_dtype(global_averages_df[pair_col]):
    global_averages_df[pair_col] = (
        global_averages_df[pair_col].astype(str)
        # Raw string: '\p' is not a valid escape sequence and triggers a warning otherwise.
        + r' \pm '
        + global_averages_df['cosine_sim_std_err'].astype(str)
    )

# escape=False keeps the backslash in "\pm" intact in the generated LaTeX.
latex_code = global_averages_df[['model_name', pair_col]].to_latex(index=False, escape=False)
print(latex_code)

In [None]:
# Show which benchmarking type and dataset the results below belong to.
BENCHMARKING_TYPE, dataset_name

In [None]:


# Results directory for the selected dataset / benchmarking type.
# No trailing slash here: every later use appends its own '/', so a trailing one
# produced doubled separators such as 'RESULTS/<type>//global_averages_...'.
path_prefix = f'BENCHMARKING/{dataset_name}/RESULTS/{BENCHMARKING_TYPE}'
if BENCHMARKING_TYPE == 'p_y_c_n_same_country_names':
    # country_people is only defined (in the benchmarking-type cell) for this type.
    path_prefix = f'{path_prefix}/{country_people}'

# Create the directory, including parents; a no-op if it already exists.
os.makedirs(path_prefix, exist_ok=True)


In [None]:
# Show the directory the results will be written to.
path_prefix

In [None]:

# Persist the summary table twice: a pickle of the DataFrame and a CSV without the index.
pickle_path = f'{path_prefix}/global_averages_{dataset_name}.pkl'
csv_path = f"{path_prefix}/global_averages_{dataset_name}.csv"

with open(pickle_path, 'wb') as out_file:
    pickle.dump(global_averages_df, out_file)

global_averages_df.to_csv(csv_path, index=False)