In [26]:
import numpy as np
import torch
from sklearn.decomposition import PCA
from scipy.spatial import procrustes
from utils import path_config as config
from tqdm import tqdm

def standardize(embeddings):
    mean = np.mean(embeddings, axis=0)
    std_dev = np.std(embeddings, axis=0)
    standardized_embeddings = (embeddings - mean) / std_dev
    return standardized_embeddings

def reduce_dimensionality(embeddings, n_components=200):
    pca = PCA(n_components=n_components, random_state=100)
    reduced_embeddings = pca.fit_transform(embeddings)
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    print(f"Cumulative explained variance: {cumulative_variance[-1]:.3f}")
    return reduced_embeddings

def apply_procrustes(X, Y):
    mtx1, mtx2, disparity = procrustes(X, Y)
    return mtx1, mtx2, disparity

def calculate_disparity(model1, model2):
    data1 = torch.load(config[model1])
    data2 = torch.load(config[model2])
    model1_sentence_embeddings = standardize(np.array(data1['mean_embeddings']))
    model2_sentence_embeddings = standardize(np.array(data2['mean_embeddings']))

    # Reduce dimensionality for comparison
    model1_sentence_reduced = reduce_dimensionality(model1_sentence_embeddings)
    model2_sentence_reduced = reduce_dimensionality(model2_sentence_embeddings)

    # Align embeddings using Procrustes analysis
    x, y, disparity = apply_procrustes(model1_sentence_reduced, model2_sentence_reduced)

    # Permutation test
    disparities = []
    num_permutations = 5000
    for _ in tqdm(range(num_permutations)):
        # Shuffle the multilingual embeddings to simulate null hypothesis
        shuffled_indices = np.random.permutation(len(model2_sentence_reduced))
        _, _, perm_disparity = apply_procrustes(model1_sentence_reduced, model2_sentence_reduced[shuffled_indices])
        disparities.append(perm_disparity)

    p_value = np.sum(disparities <= disparity) / (num_permutations + 1)
    print(f"Disparity between {model1} and {model2} embeddings:", round(disparity, 3))
    print(f"P-value (permutation test):", round(p_value, 3))

In [27]:
calculate_disparity('en_bert', 'en_multi_bert')

Cumulative explained variance: 0.934
Cumulative explained variance: 0.922


100%|██████████| 5000/5000 [02:12<00:00, 37.80it/s]


Disparity between en_bert and en_multi_bert embeddings: 0.375
P-value (permutation test): 0.0


In [28]:
calculate_disparity('ru_bert', 'ru_multi_bert')

Cumulative explained variance: 0.946
Cumulative explained variance: 0.913


100%|██████████| 5000/5000 [02:15<00:00, 36.90it/s]


Disparity between ru_bert and ru_multi_bert embeddings: 0.533
P-value (permutation test): 0.0


In [29]:
calculate_disparity('ru_bert', 'ru_gpt')

Cumulative explained variance: 0.946
Cumulative explained variance: 0.914


100%|██████████| 5000/5000 [02:17<00:00, 36.27it/s]


Disparity between ru_bert and ru_gpt embeddings: 0.483
P-value (permutation test): 0.0


In [30]:
calculate_disparity('en_bert', 'en_gpt')

Cumulative explained variance: 0.934
Cumulative explained variance: 0.901


100%|██████████| 5000/5000 [02:22<00:00, 35.20it/s]


Disparity between en_bert and en_gpt embeddings: 0.395
P-value (permutation test): 0.0
