In [1]:
import torch
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

from utils import load_data, safe_indexing
from utils import path_config as config

In [2]:
def analyze_embedding_magnitudes(embeddings):
    magnitudes = np.linalg.norm(embeddings, axis=1) #  magnitudes (norms) of the embedding vectors
    #  statistics of the magnitudes
    mean_magnitude = np.mean(magnitudes)
    std_magnitude = np.std(magnitudes)
    min_magnitude = np.min(magnitudes)
    max_magnitude = np.max(magnitudes)
    print(f"Number of embeddings: {embeddings.shape[0]}")
    print(f"Embedding dimensionality: {embeddings.shape[1]}")
    print(f"Mean magnitude: {mean_magnitude:.4f}")
    print(f"Standard deviation of magnitudes: {std_magnitude:.4f}")
    print(f"Minimum magnitude: {min_magnitude:.4f}")
    print(f"Maximum magnitude: {max_magnitude:.4f}")

In [2]:
def standardize(embeddings):
    mean = np.mean(embeddings, axis=0)
    std_dev = np.std(embeddings, axis=0)
    standardized_embeddings = (embeddings - mean) / std_dev
    return standardized_embeddings

def compute_metrics(embeddings, do_standardize=False, sample_size=20000):
    if len(embeddings) > sample_size: # random sample 20 000 
        sampled_embeddings = embeddings[np.random.choice(embeddings.shape[0], sample_size, replace=False)]
    else:
        sampled_embeddings = embeddings

    if do_standardize is True: # subtract the mean vector and divide each dimension by standard deviation
        sampled_embeddings = standardize(sampled_embeddings)
        
    # pairwise cosine similarities and euclidean distances
    cos_sim_matrix = cosine_similarity(sampled_embeddings)
    euclidean_dist_matrix = euclidean_distances(sampled_embeddings)
    
    avg_cosine_similarity = np.mean(cos_sim_matrix[np.triu_indices_from(cos_sim_matrix, k=1)])
    avg_euclidean_distance = np.mean(euclidean_dist_matrix[np.triu_indices_from(euclidean_dist_matrix, k=1)])
    return avg_cosine_similarity, avg_euclidean_distance

def compute_all_metrics(model, do_standardize):
    data = torch.load(config[model])
    contextualized_word_embeddings, labels = np.array(data['features']), np.array([safe_indexing(y) for y in data['labels']])
    contextualized_sentence_embeddings = np.array(data['mean_embeddings'])
    # filter
    embeddings_with_label_1 = contextualized_word_embeddings[labels == 1] # terms
    embeddings_with_label_0 = contextualized_word_embeddings[labels == 0] # not terms

    avg_cos_sim_label_1, avg_euclid_dist_label_1 = compute_metrics(embeddings_with_label_1, do_standardize=do_standardize) # terms
    avg_cos_sim_label_0, avg_euclid_dist_label_0 = compute_metrics(embeddings_with_label_0, do_standardize=do_standardize) # not terms
    avg_cos_sim_all, avg_euclid_dist_all = compute_metrics(contextualized_word_embeddings, do_standardize=do_standardize) # all words
    avg_cos_sim_sent, avg_euclid_dist_sent = compute_metrics(contextualized_sentence_embeddings, do_standardize=do_standardize) # for sentence representations
    print("Model", model)
    print("Avg. Cosine Similarity for embeddings (terms):", round(avg_cos_sim_label_1, 4))
    print("Avg. Cosine Similarity for embeddings (not terms):", round(avg_cos_sim_label_0, 4))
    print("Avg. Cosine Similarity for all embeddings:", round(avg_cos_sim_all, 4))
    print("Avg. Cosine Similarity for sentences:", round(avg_cos_sim_sent, 4))
    print("Avg. Euclidean Distance for embeddings (terms):", round(avg_euclid_dist_label_1, 4))
    print("Avg. Euclidean Distance for embeddings (not terms):", round(avg_euclid_dist_label_0, 4))
    print("Avg. Euclidean Distance for all embeddings:", round(avg_euclid_dist_all, 4))
    print("Avg. Euclidean Distance for sentences:", round(avg_euclid_dist_sent, 4))

In [3]:
compute_all_metrics('en_multi_bert', do_standardize=False)

Model en_multi_bert
Avg. Cosine Similarity for embeddings (terms): 0.2088
Avg. Cosine Similarity for embeddings (not terms): 0.2797
Avg. Cosine Similarity for all embeddings: 0.236
Avg. Cosine Similarity for sentences: 0.8261
Avg. Euclidean Distance for embeddings (terms): 18.436
Avg. Euclidean Distance for embeddings (not terms): 17.6143
Avg. Euclidean Distance for all embeddings: 18.1382
Avg. Euclidean Distance for sentences: 4.4547


In [4]:
compute_all_metrics('en_multi_bert', do_standardize=True)

Model en_multi_bert
Avg. Cosine Similarity for embeddings (terms): 0.0003
Avg. Cosine Similarity for embeddings (not terms): 0.0002
Avg. Cosine Similarity for all embeddings: 0.0002
Avg. Cosine Similarity for sentences: 0.0005
Avg. Euclidean Distance for embeddings (terms): 38.972
Avg. Euclidean Distance for embeddings (not terms): 38.9538
Avg. Euclidean Distance for all embeddings: 38.9795
Avg. Euclidean Distance for sentences: 38.6549


In [5]:
compute_all_metrics('ru_multi_bert', do_standardize=False)

Model ru_multi_bert
Avg. Cosine Similarity for embeddings (terms): 0.2525
Avg. Cosine Similarity for embeddings (not terms): 0.2473
Avg. Cosine Similarity for all embeddings: 0.2445
Avg. Cosine Similarity for sentences: 0.8531
Avg. Euclidean Distance for embeddings (terms): 18.5883
Avg. Euclidean Distance for embeddings (not terms): 18.5545
Avg. Euclidean Distance for all embeddings: 18.6178
Avg. Euclidean Distance for sentences: 4.3725


In [6]:
compute_all_metrics('ru_multi_bert', do_standardize=True)

Model ru_multi_bert
Avg. Cosine Similarity for embeddings (terms): 0.0002
Avg. Cosine Similarity for embeddings (not terms): 1e-04
Avg. Cosine Similarity for all embeddings: 1e-04
Avg. Cosine Similarity for sentences: 0.0006
Avg. Euclidean Distance for embeddings (terms): 39.0229
Avg. Euclidean Distance for embeddings (not terms): 39.0292
Avg. Euclidean Distance for all embeddings: 39.036
Avg. Euclidean Distance for sentences: 38.7069


In [7]:
compute_all_metrics('en_bert', do_standardize=False)

Model en_bert
Avg. Cosine Similarity for embeddings (terms): 0.2486
Avg. Cosine Similarity for embeddings (not terms): 0.2988
Avg. Cosine Similarity for all embeddings: 0.2646
Avg. Cosine Similarity for sentences: 0.7804
Avg. Euclidean Distance for embeddings (terms): 18.7823
Avg. Euclidean Distance for embeddings (not terms): 17.6589
Avg. Euclidean Distance for all embeddings: 18.2916
Avg. Euclidean Distance for sentences: 5.6214


In [5]:
analyze_embedding_magnitudes(torch.load(config['en_bert'])['features'])

Number of embeddings: 433151
Embedding dimensionality: 768
Mean magnitude: 15.0534
Standard deviation of magnitudes: 1.2632
Minimum magnitude: 9.2779
Maximum magnitude: 17.6945


In [8]:
compute_all_metrics('en_bert', do_standardize=True)

Model en_bert
Avg. Cosine Similarity for embeddings (terms): 1e-04
Avg. Cosine Similarity for embeddings (not terms): 0.0002
Avg. Cosine Similarity for all embeddings: 1e-04
Avg. Cosine Similarity for sentences: 0.0011
Avg. Euclidean Distance for embeddings (terms): 39.0514
Avg. Euclidean Distance for embeddings (not terms): 38.9959
Avg. Euclidean Distance for all embeddings: 39.0251
Avg. Euclidean Distance for sentences: 38.4402


In [9]:
compute_all_metrics('ru_bert', do_standardize=False)

Model ru_bert
Avg. Cosine Similarity for embeddings (terms): 0.215
Avg. Cosine Similarity for embeddings (not terms): 0.2125
Avg. Cosine Similarity for all embeddings: 0.21
Avg. Cosine Similarity for sentences: 0.5405
Avg. Euclidean Distance for embeddings (terms): 22.1106
Avg. Euclidean Distance for embeddings (not terms): 22.499
Avg. Euclidean Distance for all embeddings: 22.4349
Avg. Euclidean Distance for sentences: 10.8029


In [10]:
compute_all_metrics('ru_bert', do_standardize=True)

Model ru_bert
Avg. Cosine Similarity for embeddings (terms): 1e-04
Avg. Cosine Similarity for embeddings (not terms): 1e-04
Avg. Cosine Similarity for all embeddings: 1e-04
Avg. Cosine Similarity for sentences: 0.0011
Avg. Euclidean Distance for embeddings (terms): 39.0755
Avg. Euclidean Distance for embeddings (not terms): 39.0879
Avg. Euclidean Distance for all embeddings: 39.0857
Avg. Euclidean Distance for sentences: 38.6648


In [11]:
compute_all_metrics('en_gpt', do_standardize=False)

Model en_gpt
Avg. Cosine Similarity for embeddings (terms): 0.9535
Avg. Cosine Similarity for embeddings (not terms): 0.9605
Avg. Cosine Similarity for all embeddings: 0.9545
Avg. Cosine Similarity for sentences: 0.9977
Avg. Euclidean Distance for embeddings (terms): 82.6051
Avg. Euclidean Distance for embeddings (not terms): 73.1453
Avg. Euclidean Distance for all embeddings: 78.9658
Avg. Euclidean Distance for sentences: 24.2101


In [6]:
analyze_embedding_magnitudes(torch.load(config['en_gpt'])['features'])

Number of embeddings: 441859
Embedding dimensionality: 768
Mean magnitude: 248.7540
Standard deviation of magnitudes: 52.3736
Minimum magnitude: 33.9876
Maximum magnitude: 440.1469


In [12]:
compute_all_metrics('en_gpt', do_standardize=True)

Model en_gpt
Avg. Cosine Similarity for embeddings (terms): 0.0003
Avg. Cosine Similarity for embeddings (not terms): 1e-04
Avg. Cosine Similarity for all embeddings: 1e-04
Avg. Cosine Similarity for sentences: -0.0003
Avg. Euclidean Distance for embeddings (terms): 38.8804
Avg. Euclidean Distance for embeddings (not terms): 38.959
Avg. Euclidean Distance for all embeddings: 38.9419
Avg. Euclidean Distance for sentences: 38.9428


In [13]:
compute_all_metrics('ru_gpt', do_standardize=False)

Model ru_gpt
Avg. Cosine Similarity for embeddings (terms): 0.2805
Avg. Cosine Similarity for embeddings (not terms): 0.2185
Avg. Cosine Similarity for all embeddings: 0.2259
Avg. Cosine Similarity for sentences: 0.625
Avg. Euclidean Distance for embeddings (terms): 45.9859
Avg. Euclidean Distance for embeddings (not terms): 47.9673
Avg. Euclidean Distance for all embeddings: 47.7654
Avg. Euclidean Distance for sentences: 20.8832


In [14]:
compute_all_metrics('ru_gpt', do_standardize=True)

Model ru_gpt
Avg. Cosine Similarity for embeddings (terms): 0.0009
Avg. Cosine Similarity for embeddings (not terms): 0.0005
Avg. Cosine Similarity for all embeddings: 0.0005
Avg. Cosine Similarity for sentences: -1e-04
Avg. Euclidean Distance for embeddings (terms): 38.7132
Avg. Euclidean Distance for embeddings (not terms): 38.8375
Avg. Euclidean Distance for all embeddings: 38.8153
Avg. Euclidean Distance for sentences: 38.8606


In [3]:
def compute_themes_metrics(model, do_standardize=True, language='en'):
    contextualized_sentence_embeddings, labels = load_data(torch.load(config[model]), data_type= 'sentence', language=language)
    contextualized_sentence_embeddings = np.array(contextualized_sentence_embeddings)
    if do_standardize is True: # subtract the mean vector and divide each dimension by standard deviation
        contextualized_sentence_embeddings = standardize(contextualized_sentence_embeddings)

    embeddings_by_label = {label: [] for label in set(labels)}
    for embedding, label in zip(contextualized_sentence_embeddings, labels):
        embeddings_by_label[label].append(embedding)

    same_label_metrics = {label: compute_metrics(np.array(embeddings), do_standardize=False) for label, embeddings in embeddings_by_label.items()}
    diff_label_metrics = []
    for label_1, embeddings_1 in embeddings_by_label.items():
        for label_2, embeddings_2 in embeddings_by_label.items():
            if label_1 != label_2:
                avg_cosine_sim, avg_euclidean_dist = compute_metrics(np.vstack((embeddings_1, embeddings_2)), do_standardize=False)
                diff_label_metrics.append((label_1, label_2, avg_cosine_sim, avg_euclidean_dist))

    overall_avg_cosine_sim = np.mean([metrics[2] for metrics in diff_label_metrics])
    overall_avg_euclidean_dist = np.mean([metrics[3] for metrics in diff_label_metrics])
    print("Model", model)
    print("Same Label Metrics:")
    same_cosine = []
    same_euclidian = []
    for label, metrics in same_label_metrics.items():
        print(f"Label {label}: Cosine Sim {metrics[0]:.4f}, Euclidean Dist {metrics[1]:.2f}")
        same_cosine.append(metrics[0])
        same_euclidian.append(metrics[1])

    print("Overall Avg Same Label Cosine:", round(np.mean(same_cosine), 4))
    print("Overall Avg Same Label Euclidean:", round(np.mean(same_euclidian), 4))

    print("\nDifferent Label Metrics (Overall Average):")
    print(f"Overall Avg Cosine Sim: {overall_avg_cosine_sim:.4f}")
    print(f"Overall Avg Euclidean Dist: {overall_avg_euclidean_dist:.2f}")

In [4]:
compute_themes_metrics('en_multi_bert', do_standardize=False)

Model en_multi_bert
Same Label Metrics:
Label biology: Cosine Sim 0.8505, Euclidean Dist 4.11
Label mathematics: Cosine Sim 0.8558, Euclidean Dist 4.14
Label engineering: Cosine Sim 0.8580, Euclidean Dist 4.00
Label chemistry: Cosine Sim 0.8698, Euclidean Dist 3.79
Label materials science: Cosine Sim 0.8790, Euclidean Dist 3.64
Label computer science: Cosine Sim 0.8587, Euclidean Dist 4.01
Label physics: Cosine Sim 0.8555, Euclidean Dist 4.04
Label psychology: Cosine Sim 0.8474, Euclidean Dist 4.16
Label economics: Cosine Sim 0.8423, Euclidean Dist 4.28
Label medicine: Cosine Sim 0.8538, Euclidean Dist 4.09
Overall Avg Same Label Cosine: 0.8571
Overall Avg Same Label Euclidean: 4.0263

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.8430
Overall Avg Euclidean Dist: 4.22


In [5]:
compute_themes_metrics('en_multi_bert', do_standardize=True)

Model en_multi_bert
Same Label Metrics:
Label biology: Cosine Sim 0.1100, Euclidean Dist 36.25
Label mathematics: Cosine Sim 0.2554, Euclidean Dist 36.31
Label engineering: Cosine Sim 0.0838, Euclidean Dist 35.36
Label chemistry: Cosine Sim 0.2263, Euclidean Dist 33.44
Label materials science: Cosine Sim 0.1693, Euclidean Dist 32.21
Label computer science: Cosine Sim 0.1283, Euclidean Dist 35.53
Label physics: Cosine Sim 0.1183, Euclidean Dist 35.62
Label psychology: Cosine Sim 0.1218, Euclidean Dist 36.72
Label economics: Cosine Sim 0.1621, Euclidean Dist 37.99
Label medicine: Cosine Sim 0.1414, Euclidean Dist 36.06
Overall Avg Same Label Cosine: 0.1517
Overall Avg Same Label Euclidean: 35.551

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.0738
Overall Avg Euclidean Dist: 37.18


In [6]:
compute_themes_metrics('en_bert', do_standardize=False)

Model en_bert
Same Label Metrics:
Label biology: Cosine Sim 0.8277, Euclidean Dist 4.89
Label mathematics: Cosine Sim 0.8190, Euclidean Dist 5.34
Label engineering: Cosine Sim 0.8237, Euclidean Dist 5.10
Label chemistry: Cosine Sim 0.8419, Euclidean Dist 4.70
Label materials science: Cosine Sim 0.8606, Euclidean Dist 4.46
Label computer science: Cosine Sim 0.8265, Euclidean Dist 5.00
Label physics: Cosine Sim 0.8223, Euclidean Dist 5.12
Label psychology: Cosine Sim 0.8227, Euclidean Dist 5.01
Label economics: Cosine Sim 0.8045, Euclidean Dist 5.34
Label medicine: Cosine Sim 0.8244, Euclidean Dist 4.92
Overall Avg Same Label Cosine: 0.8273
Overall Avg Same Label Euclidean: 4.987

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.8047
Overall Avg Euclidean Dist: 5.30


In [7]:
compute_themes_metrics('en_bert', do_standardize=True)

Model en_bert
Same Label Metrics:
Label biology: Cosine Sim 0.1582, Euclidean Dist 33.96
Label mathematics: Cosine Sim 0.3073, Euclidean Dist 37.04
Label engineering: Cosine Sim 0.0968, Euclidean Dist 35.63
Label chemistry: Cosine Sim 0.2766, Euclidean Dist 32.60
Label materials science: Cosine Sim 0.2184, Euclidean Dist 31.00
Label computer science: Cosine Sim 0.1403, Euclidean Dist 35.04
Label physics: Cosine Sim 0.1464, Euclidean Dist 35.57
Label psychology: Cosine Sim 0.1686, Euclidean Dist 35.10
Label economics: Cosine Sim 0.1881, Euclidean Dist 37.74
Label medicine: Cosine Sim 0.1660, Euclidean Dist 34.32
Overall Avg Same Label Cosine: 0.1867
Overall Avg Same Label Euclidean: 34.7985

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.0895
Overall Avg Euclidean Dist: 36.77


In [8]:
compute_themes_metrics('en_gpt', do_standardize=False)

Model en_gpt
Same Label Metrics:
Label biology: Cosine Sim 0.9982, Euclidean Dist 22.50
Label mathematics: Cosine Sim 0.9985, Euclidean Dist 20.40
Label engineering: Cosine Sim 0.9977, Euclidean Dist 20.90
Label chemistry: Cosine Sim 0.9982, Euclidean Dist 19.48
Label materials science: Cosine Sim 0.9986, Euclidean Dist 18.38
Label computer science: Cosine Sim 0.9985, Euclidean Dist 20.34
Label physics: Cosine Sim 0.9984, Euclidean Dist 19.27
Label psychology: Cosine Sim 0.9981, Euclidean Dist 25.02
Label economics: Cosine Sim 0.9981, Euclidean Dist 22.42
Label medicine: Cosine Sim 0.9977, Euclidean Dist 25.58
Overall Avg Same Label Cosine: 0.9982
Overall Avg Same Label Euclidean: 21.4289

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.9979
Overall Avg Euclidean Dist: 23.01


In [9]:
compute_themes_metrics('en_gpt', do_standardize=True)

Model en_gpt
Same Label Metrics:
Label biology: Cosine Sim 0.1339, Euclidean Dist 36.09
Label mathematics: Cosine Sim 0.2530, Euclidean Dist 33.05
Label engineering: Cosine Sim 0.0916, Euclidean Dist 36.62
Label chemistry: Cosine Sim 0.3058, Euclidean Dist 31.16
Label materials science: Cosine Sim 0.2412, Euclidean Dist 31.69
Label computer science: Cosine Sim 0.1399, Euclidean Dist 36.41
Label physics: Cosine Sim 0.1536, Euclidean Dist 34.73
Label psychology: Cosine Sim 0.1639, Euclidean Dist 36.83
Label economics: Cosine Sim 0.2069, Euclidean Dist 36.38
Label medicine: Cosine Sim 0.1487, Euclidean Dist 36.51
Overall Avg Same Label Cosine: 0.1839
Overall Avg Same Label Euclidean: 34.9489

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.0873
Overall Avg Euclidean Dist: 37.03


In [10]:
compute_themes_metrics('ru_multi_bert', do_standardize=False, language='ru')

Model ru_multi_bert
Same Label Metrics:
Label biology: Cosine Sim 0.8698, Euclidean Dist 3.98
Label mathematics: Cosine Sim 0.8590, Euclidean Dist 4.32
Label chemistry: Cosine Sim 0.8715, Euclidean Dist 4.05
Label politics: Cosine Sim 0.8600, Euclidean Dist 4.35
Label humanities: Cosine Sim 0.8719, Euclidean Dist 4.12
Label sociology: Cosine Sim 0.8600, Euclidean Dist 4.30
Label physics: Cosine Sim 0.8731, Euclidean Dist 4.08
Label psychology: Cosine Sim 0.8549, Euclidean Dist 4.32
Label economics: Cosine Sim 0.8570, Euclidean Dist 4.33
Label medicine: Cosine Sim 0.8940, Euclidean Dist 3.61
Overall Avg Same Label Cosine: 0.8671
Overall Avg Same Label Euclidean: 4.1474

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.8617
Overall Avg Euclidean Dist: 4.24


In [11]:
compute_themes_metrics('ru_multi_bert', do_standardize=True, language='ru')

Model ru_multi_bert
Same Label Metrics:
Label biology: Cosine Sim 0.1355, Euclidean Dist 35.94
Label mathematics: Cosine Sim 0.0631, Euclidean Dist 38.87
Label chemistry: Cosine Sim 0.1586, Euclidean Dist 36.50
Label politics: Cosine Sim 0.0877, Euclidean Dist 39.21
Label humanities: Cosine Sim 0.0492, Euclidean Dist 37.22
Label sociology: Cosine Sim 0.0600, Euclidean Dist 38.70
Label physics: Cosine Sim 0.0313, Euclidean Dist 36.73
Label psychology: Cosine Sim 0.0849, Euclidean Dist 38.89
Label economics: Cosine Sim 0.1043, Euclidean Dist 39.02
Label medicine: Cosine Sim 0.2293, Euclidean Dist 32.57
Overall Avg Same Label Cosine: 0.1004
Overall Avg Same Label Euclidean: 37.3639

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.0486
Overall Avg Euclidean Dist: 38.12


In [24]:
compute_themes_metrics('ru_bert', do_standardize=False, language='ru')

Model ru_bert
Same Label Metrics:
Label politics: Cosine Sim 0.5717, Euclidean Dist 10.55
Label chemistry: Cosine Sim 0.5799, Euclidean Dist 10.58
Label economics: Cosine Sim 0.5793, Euclidean Dist 10.51
Label sociology: Cosine Sim 0.5861, Euclidean Dist 10.40
Label computer science: Cosine Sim 0.5267, Euclidean Dist 10.79
Label psychology: Cosine Sim 0.5771, Euclidean Dist 10.31
Label philosophy: Cosine Sim 0.5678, Euclidean Dist 9.97
Label mathematics: Cosine Sim 0.5593, Euclidean Dist 10.95
Label humanities: Cosine Sim 0.6010, Euclidean Dist 10.19
Label medicine: Cosine Sim 0.6217, Euclidean Dist 9.45
Label biology: Cosine Sim 0.5859, Euclidean Dist 9.93
Label physics: Cosine Sim 0.5617, Euclidean Dist 10.56
Label political science: Cosine Sim 0.5683, Euclidean Dist 10.66
Label engineering: Cosine Sim 0.5417, Euclidean Dist 10.80
Label education: Cosine Sim 0.5946, Euclidean Dist 10.56
Overall Avg Same Label Cosine: 0.5749
Overall Avg Same Label Euclidean: 10.4148

Different Label M

In [12]:
compute_themes_metrics('ru_bert', do_standardize=True, language='ru')

Model ru_bert
Same Label Metrics:
Label biology: Cosine Sim 0.1187, Euclidean Dist 35.66
Label mathematics: Cosine Sim 0.0531, Euclidean Dist 39.54
Label chemistry: Cosine Sim 0.1325, Euclidean Dist 38.24
Label politics: Cosine Sim 0.0701, Euclidean Dist 37.81
Label humanities: Cosine Sim 0.0472, Euclidean Dist 36.83
Label sociology: Cosine Sim 0.0522, Euclidean Dist 37.39
Label physics: Cosine Sim 0.0268, Euclidean Dist 38.07
Label psychology: Cosine Sim 0.0854, Euclidean Dist 36.99
Label economics: Cosine Sim 0.1125, Euclidean Dist 37.73
Label medicine: Cosine Sim 0.1859, Euclidean Dist 33.89
Overall Avg Same Label Cosine: 0.0884
Overall Avg Same Label Euclidean: 37.2153

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.0442
Overall Avg Euclidean Dist: 38.02


In [13]:
compute_themes_metrics('ru_gpt', do_standardize=False, language='ru')

Model ru_gpt
Same Label Metrics:
Label biology: Cosine Sim 0.6724, Euclidean Dist 18.53
Label mathematics: Cosine Sim 0.6855, Euclidean Dist 19.35
Label chemistry: Cosine Sim 0.7174, Euclidean Dist 17.79
Label politics: Cosine Sim 0.6640, Euclidean Dist 20.26
Label humanities: Cosine Sim 0.7328, Euclidean Dist 18.57
Label sociology: Cosine Sim 0.6833, Euclidean Dist 19.36
Label physics: Cosine Sim 0.6961, Euclidean Dist 19.31
Label psychology: Cosine Sim 0.6047, Euclidean Dist 20.61
Label economics: Cosine Sim 0.6525, Euclidean Dist 19.83
Label medicine: Cosine Sim 0.6924, Euclidean Dist 18.13
Overall Avg Same Label Cosine: 0.6801
Overall Avg Same Label Euclidean: 19.1751

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.6594
Overall Avg Euclidean Dist: 19.90


In [14]:
compute_themes_metrics('ru_gpt', do_standardize=True, language='ru')

Model ru_gpt
Same Label Metrics:
Label biology: Cosine Sim 0.1977, Euclidean Dist 34.95
Label mathematics: Cosine Sim 0.0748, Euclidean Dist 37.19
Label chemistry: Cosine Sim 0.2233, Euclidean Dist 35.48
Label politics: Cosine Sim 0.1154, Euclidean Dist 36.47
Label humanities: Cosine Sim 0.0619, Euclidean Dist 37.12
Label sociology: Cosine Sim 0.0904, Euclidean Dist 35.86
Label physics: Cosine Sim 0.0345, Euclidean Dist 38.39
Label psychology: Cosine Sim 0.1141, Euclidean Dist 36.88
Label economics: Cosine Sim 0.1521, Euclidean Dist 35.01
Label medicine: Cosine Sim 0.2719, Euclidean Dist 33.59
Overall Avg Same Label Cosine: 0.1336
Overall Avg Same Label Euclidean: 36.0939

Different Label Metrics (Overall Average):
Overall Avg Cosine Sim: 0.0626
Overall Avg Euclidean Dist: 37.57
