In [1]:
import csv
import numpy as np
from os import chdir
import pickle as pkl
import distance_funcs
import preprocess_funcs
from scipy.stats import pearsonr
import preprocess_funcs
import baseline_funcs

In [51]:
chdir("c:/Users/Raya/OneDrive/Documents/3-CSAI/CSAI-Y3-S2/Thesis/Replication")

# 1. Random baseline

## Toy simulation

In [3]:
data = np.load("data/sample_word_vects.npz", allow_pickle=True)
IDS = data['ids']
WORDS = data['words']
VECTORS = data['vectors']

print(f"First 10 ids: {IDS[:10]}")
print(f"First 10 words: {WORDS[:10]}")
print(f"First 10 vectors:\n{VECTORS[:10]}")
print(f"vectors is a {type(VECTORS)}")
print(f"vectors shape: {VECTORS.shape}")

First 10 ids: [3576 4501 3327 1102 1650  240 5480 4211 4321 6761]
First 10 words: ['league' 'album' 'game' 'party' 'women' 'church' 'song' 'station' 'town'
 'president']
First 10 vectors:
[[ 1.13337643e-01  1.40484904e-01  3.74122065e-01 ...  2.33168057e-03
  -7.16891078e-03 -8.52921721e-03]
 [ 1.50450789e-01 -4.31291522e-01  8.04129079e-03 ... -9.17724888e-03
   2.34008918e-03 -1.49247768e-03]
 [ 7.16353961e-02  2.09700719e-02  1.00467108e-01 ...  5.39575412e-03
  -3.49315731e-05 -2.24411586e-03]
 ...
 [ 9.01282158e-02  5.79194661e-02 -1.28253646e-01 ...  1.16877488e-02
   1.31945628e-02 -2.23398991e-03]
 [ 6.85584396e-02  6.90132216e-02 -6.92020170e-02 ...  4.36430862e-03
   3.91536325e-03 -7.95992962e-03]
 [ 5.07194125e-02  2.83445260e-02 -1.18512292e-02 ... -2.41039906e-04
   6.70018938e-03 -6.51316655e-04]]
vectors is a <class 'numpy.ndarray'>
vectors shape: (100, 500)


In [4]:
# Load the vocabulary
VOCAB = {int(id): str(word) for id, word in zip(IDS, WORDS)} # Format: id(int): word(str)
print(f"Vocabulary size: {len(VOCAB)}")
print("First 10 items in the vocabulary:")
for item in list(VOCAB.items())[:10]:
    print(f"\t{item}") 

# Filter words in the vocabulary by word length
words_ids_by_length = {length: [id for id, word in VOCAB.items() if len(word)==length] for length in range(3,8)} # Format: {len: [id1, id2, ...]}
print("First 2 ids and corresponding words of each length:")
for length in words_ids_by_length:
    ids2 = words_ids_by_length[length][:2]
    idswords2 = [(id, VOCAB[id]) for id in ids2]
    print(f"\tLength {length} (total items {len(words_ids_by_length[length])}): {idswords2}")

total_words_with_wordlengths = 0
ids_with_wordlengths = []
for wordlength_ids in words_ids_by_length.values(): # List of indices
    total_words_with_wordlengths += len(wordlength_ids)
    ids_with_wordlengths.extend(wordlength_ids)
print(f"Number of words with suitable word length: {total_words_with_wordlengths}")

long_words = []
for id in VOCAB.keys():
    if id not in ids_with_wordlengths:
        long_words.append(VOCAB[id])
print(f"Excluded {len(long_words)} longer/shorter words: {long_words}") 

Vocabulary size: 100
First 10 items in the vocabulary:
	(3576, 'league')
	(4501, 'album')
	(3327, 'game')
	(1102, 'party')
	(1650, 'women')
	(240, 'church')
	(5480, 'song')
	(4211, 'station')
	(4321, 'town')
	(6761, 'president')
First 2 ids and corresponding words of each length:
	Length 3 (total items 11): [(2440, 'cup'), (951, 'men')]
	Length 4 (total items 19): [(3327, 'game'), (5480, 'song')]
	Length 5 (total items 21): [(4501, 'album'), (1102, 'party')]
	Length 6 (total items 16): [(3576, 'league'), (240, 'church')]
	Length 7 (total items 9): [(4211, 'station'), (1644, 'william')]
Number of words with suitable word length: 76
Excluded 24 longer/shorter words: ['president', 'px', 'football', 'division', 'us', 'building', 'la', 'championship', 'population', 'director', 'education', 'km', 'election', 'development', 'television', 'production', 'research', 'al', 'community', 'military', 'association', 'tv', 'political', 'minister']


In [5]:
# Simualting lsa model projection object (maps word ids to vectors)
PROJECTIONS = {id:vector for id, vector in zip(IDS, VECTORS)}
# for item in list(PROJECTIONS.items())[:2]:
#     print(item)

# Initialize a dictionary to store avg correlation for each word length
avg_corr_pvalue_per_wordlength = {}

# Repeat for each word length
for wordlength, ids in words_ids_by_length.items(): 
    # print(f"--- Computing baseline for word length {wordlength} ---")   
    # Retrieve vectors and words for the IDs of the words of the current length
    vects = np.array([PROJECTIONS[id] for id in ids])
    words = [VOCAB[id] for id in ids]
    
    # Initialize a list to store correlations from all random reassignments
    correlations_for_this_wordlength = []
    pvalues_for_this_wordlength = []
    
    # Perform 10000 random reassignments
    for _ in range(10):
        # print(f"\n> Shuffling nb: {_+1} <")
        # Shuffle a copy of the list of vectors
        shuffled_vectors = vects.copy()
        np.random.shuffle(shuffled_vectors)
        # print(f"1st element of 1st vector after shuffling (should change): {shuffled_vectors[0][0]}")
        # print(f"1st word in the list of words (should remain the same per word length): {words[0]}")
        
        # Compute cosine distances
        cos_distance_matrix = distance_funcs.cosine_distances_matrix(shuffled_vectors)
        # print(f"Cosine distances matrix [2,3] (should change): {cos_distance_matrix[2][3]}")
        # Compute edit distances
        edit_distance_matrix = distance_funcs.edit_distances_matrix(words)
        
        # Make sure the matrices are symmetric
        assert distance_funcs.is_matrix_symmetric(cos_distance_matrix)
        assert distance_funcs.is_matrix_symmetric(edit_distance_matrix)
        
        # Get the indices of the upper triangle of the matrices, excluding the diagonal (because the matrices are symmetrical)
        upper_triangle_indices = np.triu_indices_from(cos_distance_matrix, k=1) 
        # Flatten the upper triangles of the matrices using the upper_triangle_indices
        cos_distances_upper = cos_distance_matrix[upper_triangle_indices]
        edit_distances_upper = edit_distance_matrix[upper_triangle_indices]
        # print(f"Cosine distances:\n{cos_distances_upper}")
        
        # Compute correlation edit and cosine distance for the given word length
        corr, p_value = pearsonr(cos_distances_upper, edit_distances_upper)
        # print(f"r: {corr}, p-value: {p_value}")
        
        # Store the correlation and p-value for the current reassignment 
        correlations_for_this_wordlength.append(corr)
        pvalues_for_this_wordlength.append(p_value)
    
    # Calculate the average correlation and p-value for this word length
    avg_corr = np.mean(correlations_for_this_wordlength)
    avg_p_value = np.mean(pvalues_for_this_wordlength)
    
    # Store the average correlation and p-value for this word length in the dictionary
    avg_corr_pvalue_per_wordlength[wordlength] = (avg_corr, avg_p_value)

print(avg_corr_pvalue_per_wordlength)

{3: (-0.004300785260867531, 0.4847084778823101), 4: (-0.022455313735972253, 0.5015854535955502), 5: (-0.029074909019385715, 0.4323354461769767), 6: (-0.0003486938998628536, 0.3547505422782318), 7: (0.038206448180044116, 0.6823608186481742)}


In [6]:
# Write the avg correlations to a CSV file of the format: word_length | pearson_r | p-value
# Specify the file name for the CSV file
csv_file = "results/rd_baseline_correlations.csv"

# Write the average correlations and p-values to the CSV file
with open(csv_file, mode="w", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    
    # Write the header row
    writer.writerow(["word_length", "pearson_r", "p-value"])
    
    # Write each word length's average correlation and p-value
    for wordlength, (avg_corr, avg_p_value) in avg_corr_pvalue_per_wordlength.items():
        writer.writerow([wordlength, avg_corr, avg_p_value])

## Real implementation scenarion

In [3]:
# Load vocabulary
vocab = preprocess_funcs.load_vocabulary()
print(f"Vocabulary size: {len(vocab)}")

# Create a dictionary storing words by length from 3 to 7 characters
words_ids_by_length = {length: [id for id, word in vocab.items() if len(word)==length] for length in range(3,8)}

for length, ids in words_ids_by_length.items():
    print(f"\tLength {length} has {len(ids)} items")

total_words_with_wordlengths = 0
ids_with_wordlengths = []
for wordlength_ids in words_ids_by_length.values(): # List of indices
    total_words_with_wordlengths += len(wordlength_ids)
    ids_with_wordlengths.extend(wordlength_ids)
print(f"Number of words with suitable word length: {total_words_with_wordlengths}")

long_words = []
for id in vocab.keys():
    if id not in ids_with_wordlengths:
        long_words.append(vocab[id])
print(f"Excluded {len(long_words)} longer/shorter words: {long_words}") 

# Get embeddings
embeddings_dict = preprocess_funcs.get_vocabulary_embeddings_dict()
print(f"{len(embeddings_dict)} embeddings")

INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model
INFO:gensim.utils:loading id2word recursively from models/wiki_lsi_model.model.id2word.* with mmap=None


Vocabulary size: 5000
	Length 3 has 257 items
	Length 4 has 584 items
	Length 5 has 744 items
	Length 6 has 837 items
	Length 7 has 789 items
Number of words with suitable word length: 3211
Excluded 1789 longer/shorter words: ['president', 'px', 'football', 'division', 'us', 'building', 'la', 'championship', 'population', 'director', 'education', 'km', 'election', 'development', 'television', 'production', 'research', 'al', 'community', 'military', 'association', 'tv', 'political', 'minister', 'do', 'australia', 'european', 'announced', 'position', 'different', 'department', 'nd', 'services', 'we', 'professional', 'california', 'northern', 'produced', 're', 'business', 'southern', 'my', 'language', 'australian', 'rd', 'returned', 'students', 'performance', 'uk', 'tournament', 'together', 'festival', 'described', 'washington', 'recorded', 'information', 'me', 'continued', 'appointed', 'institute', 'championships', 'construction', 'committee', 'considered', 'japanese', 'character', 'orig

INFO:gensim.utils:setting ignored attribute projection to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:loaded models/wiki_lsi_model.model
INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model.projection
INFO:gensim.utils:loading u from models/wiki_lsi_model.model.projection.u.npy with mmap=None
INFO:gensim.utils:loaded models/wiki_lsi_model.model.projection


5000 embeddings


In [4]:
rd_baseline_corr_dict = baseline_funcs.get_rd_baseline_corr_dict(vocab, words_ids_by_length, embeddings_dict, n=5)

INFO:root:Computing baseline for word length 3
INFO:root:Shuffling iteration number 1
INFO:root:Word length 3: Avg correlation: 0.0003328006934064357, Avg p-value: 0.4095434468306636
INFO:root:Computing baseline for word length 4
INFO:root:Shuffling iteration number 1
INFO:root:Word length 4: Avg correlation: -0.0014398214368150513, Avg p-value: 0.45387186868642804
INFO:root:Computing baseline for word length 5
INFO:root:Shuffling iteration number 1
INFO:root:Word length 5: Avg correlation: -0.001682648453883763, Avg p-value: 0.16779296742172017
INFO:root:Computing baseline for word length 6
INFO:root:Shuffling iteration number 1
INFO:root:Word length 6: Avg correlation: 0.002208237702659114, Avg p-value: 0.25942958428207075
INFO:root:Computing baseline for word length 7
INFO:root:Shuffling iteration number 1
INFO:root:Word length 7: Avg correlation: -0.0008266179596940181, Avg p-value: 0.35458012804196115


# 2. Binned baseline

In [5]:
# Testing matrix symmetry & mean (along axes) & get_unique_pairwise_scores

matrix = np.array(([0,1,2],[1,0,3],[2,3,0]))
print(f"The matrix:\n{matrix}")

print(f"Is symmetrical: {np.array_equal(matrix, matrix.T)}")
distinct = distance_funcs.get_unique_pairwise_scores(matrix)
print(f"Upper triangle scores: {distinct}")

# Mean of entire matrix
avg_distinct = np.mean(distinct, axis=0)
print(f"distinct mean: {avg_distinct}")

# Row means including diagonal
avg_array = np.mean(matrix, axis=0)
print(f"matrix row means: {avg_array}")

# Row means excluding diagonal
row_avgs = distance_funcs.average_distances(matrix)
print(f"Avg scores using function`average_distances`: {row_avgs}")

The matrix:
[[0 1 2]
 [1 0 3]
 [2 3 0]]
Is symmetrical: True
Upper triangle scores: [1 2 3]
distinct mean: 2.0
matrix row means: [1.         1.33333333 1.66666667]
Avg scores using function`average_distances`: [1.5 2.  2.5]


In [35]:
def bin_vects_by_avg_cos_sim(word_ids, embeddings, rescaling=None, num_bins=4):
    """
    Organizes word embeddings into bins based on their average cosine similarity.

    Parameters:
        word_ids (list): A list of word IDs corresponding to the embeddings.
        embeddings (np.ndarray): A 2D array of word embeddings, each row representing a word vector.
        rescaling (bool, optional): Whether to rescale the embeddings.
                                    Options: None, 'map_zero_to_one', 'angular_distance'. Defaults to None.
        num_bins (int, optional): The number of bins to organize the vectors into. Defaults to 4.

    Returns:
        dict: A dictionary where keys are bin indices (from 0 to num_bins - 1), and values are dictionaries
        mapping word IDs to their corresponding vectors within each bin.

    Description:
        The function calculates all pairwise cosine distances between word embeddings and computes the average 
        cosine distance for each embedding. Then, it sorts the vectors and word IDs based on their average distance. 
        It divides the sorted vectors and word IDs into the specified number of bins. Each bin contains a 
        dictionary where keys are word IDs and values are their corresponding word vectors.

    Note:
        - The function uses `distance_funcs.pairwise_cosine_distances` and `distance_funcs.average_distances` for distance
          calculations.
        - In case `num_bins` is 1, the function will return one bin containing all the vectors.
    """    
    # Calculate pairwise cosine distances
    all_pairwise_distances = distance_funcs.cosine_distances_matrix(embeddings, rescaling)
    # Calculate average cosine distance for each vector
    avg_distances = distance_funcs.average_distances(all_pairwise_distances)
    
    # Sort vectors based on their average distance and word_ids (in ascending order -> first index corresponds to smallest distance)
    sorted_indices = np.argsort(avg_distances)
    sorted_vects = embeddings[sorted_indices]
    sorted_word_ids = [word_ids[i] for i in sorted_indices]
    # print(f"Sorted word IDs: {sorted_word_ids}")
    
    # Calculate bin size such that vectors are equally distributed across all bins
    n = len(sorted_vects)
    bin_size = n // num_bins
    
    # Initialize a dictionary to represent bins. The values will be dictionaries mapping sorted word IDs to vectors
    bins = {i: {} for i in range(num_bins)}
    
    # Distribute the sorted word IDs and vectors into the bins
    for i in range(num_bins):
        # Set the starting index for the bin
        start_idx = i * bin_size
        # Set the final index for the bin
        if i == num_bins-1:  # For the last bin, include all remaining vectors
            end_idx = n
        else:
            end_idx = (i + 1) * bin_size
        
        # Slice the vectors and word IDs for the current bin
        bin_vects = sorted_vects[start_idx:end_idx]
        bin_word_ids = sorted_word_ids[start_idx:end_idx]
        # print(f"IDs for bin {i}: {bin_word_ids}")
        
        # Append the word IDs and corresponding vectors to the current bin
        for id, vect in zip(bin_word_ids, bin_vects):
            bins[i][id] = vect
    
    # Return the bins
    return bins

def shuffle_bins(bins):
    # Initialize a list to store the shuffled vectors
    shuffled_vects = []
    
    for _, data in bins.items():
        # Extract the vectors from the id:vector inner dictionary
        vectors = list(data.values())
        # Shuffle the vectors within the bin
        np.random.shuffle(vectors)
        # Append the shuffled vectors to the common list
        shuffled_vects.extend(vectors)
        
    # Convert the list of vectors to a numpy array
    shuffled_vects_array = np.array(shuffled_vects)
        
    return shuffled_vects_array

In [42]:
def save_distances(length, word_ids, words, cosine_distances_matrix, edit_distances_matrix, output_file_path):
    with open(output_file_path, 'w', newline='', encoding="utf-8") as f:
        csv_writer = csv.writer(f)
        # Write the header row
        csv_writer.writerow(['word_length', 'word1', 'word2', 'cos_dist', 'edit_dist'])
        
        # Iterate through all word pairs (all combinations of 2 words)
        for i in range(len(word_ids)):
            for j in range(i+1, len(word_ids)): # Don't duplicate word pairs
                # Get the words at the corresponding indices
                word1, word2 = words[i], words[j]
                
                # Get cosine and edit distances from the corresponding matrices
                cos_dist = cosine_distances_matrix[i, j]
                edit_dist = edit_distances_matrix[i, j]
                
                # Write the words and their distances to the CSV file
                csv_writer.writerow([length, word1, word2, cos_dist, edit_dist])
                
    # logging.info(f"Cosine and edit distances for word length {length} saved to {output_file_path}.")

In [None]:
# Binned baseline algorithm

# Load the vocabulary (ID:word)
# Retrieve embeddings for words in vocabulary (ID:vector)
# Filter vocabulary by word length (length:[ids])
# For each word length
    # Get word embeddings (np.array from embeddings_dict based on list of IDs)
    # Bin the embeddings based on avg cos dist (bin:{ID:vector})
    # Shuffle embeddings within bins (2D np.array -> matrix)
    # Calculate cosine distances with shuffled embeddings
    # Get list of words from vocabualry using IDs (list of str)
    # Calculate pairwise edit distances between words
    # Save to CSV cos & edit dist for each word pair

## Toy simulation

In [16]:
# Load vocabulary
vocab = preprocess_funcs.load_vocabulary()
vocab_sample = {id:vocab[id] for id in list(vocab.keys())[:50]}
print(f"Sample vocabulary: {vocab_sample}")

# Get embeddings for the words in the vocabulary
embeddings_dict = preprocess_funcs.get_vocabulary_embeddings_dict(vocab=vocab_sample)
print(f"Size of embeddings dict: {len(embeddings_dict)}")
print(embeddings_dict.keys())

# Create a dictionary storing words by length from 3 to 7 characters
words_ids_by_length = {length: [id for id, word in vocab_sample.items() if len(word)==length] for length in range(3,8)}

INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model
INFO:gensim.utils:loading id2word recursively from models/wiki_lsi_model.model.id2word.* with mmap=None
INFO:gensim.utils:setting ignored attribute projection to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:loaded models/wiki_lsi_model.model
INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model.projection
INFO:gensim.utils:loading u from models/wiki_lsi_model.model.projection.u.npy with mmap=None


Sample vocabulary: {3576: 'league', 4501: 'album', 3327: 'game', 1102: 'party', 1650: 'women', 240: 'church', 5480: 'song', 4211: 'station', 4321: 'town', 6761: 'president', 912: 'line', 5357: 'px', 3822: 'park', 2440: 'cup', 10729: 'player', 3295: 'football', 8538: 'award', 3328: 'games', 3143: 'division', 4062: 'river', 951: 'men', 1492: 'system', 9219: 'round', 1596: 'us', 1322: 'road', 1388: 'show', 2870: 'building', 3555: 'la', 104: 'art', 1690: 'air', 3610: 'london', 1734: 'band', 100: 'army', 185: 'book', 3548: 'king', 5665: 'you', 2935: 'championship', 223: 'center', 879: 'law', 2158: 'population', 663: 'french', 8330: 'director', 1635: 'white', 3634: 'man', 172: 'black', 480: 'education', 4224: 'street', 5092: 'km', 3171: 'election', 1645: 'win'}


INFO:gensim.utils:loaded models/wiki_lsi_model.model.projection


Size of embeddings dict: 50
dict_keys([3576, 4501, 3327, 1102, 1650, 240, 5480, 4211, 4321, 6761, 912, 5357, 3822, 2440, 10729, 3295, 8538, 3328, 3143, 4062, 951, 1492, 9219, 1596, 1322, 1388, 2870, 3555, 104, 1690, 3610, 1734, 100, 185, 3548, 5665, 2935, 223, 879, 2158, 663, 8330, 1635, 3634, 172, 480, 4224, 5092, 3171, 1645])


In [36]:
for wordlength, ids in words_ids_by_length.items():  
    # Retrieve embeddings for words of the current length using their IDs
    embeddings = np.array([embeddings_dict[id] for id in ids])
    print(f"{len(embeddings)} embeddings for wordlength {wordlength}")
    
    # Bin the embeddings in 4 bins based on their average cosine distance 
    bins = bin_vects_by_avg_cos_sim(ids, embeddings) # `bin` is a dict of the format { bin_nb: {id: vect} }
    
    # Shuffle the embeddings within the bins
    shuffled_embeddings = shuffle_bins(bins)
    print(f"Shuffled vectors shape: {shuffled_embeddings}")
    # print(f"First embedding after shuffling: {shuffled_embeddings[0]}")
    
    # Calculate cosine distances of the shuffled embeddings
    cos_distances_matrix = distance_funcs.cosine_distances_matrix(shuffled_embeddings)
    print(f"Cosine distances matrix:\n{cos_distances_matrix}")
    # unique_cos_distances = distance_funcs.get_unique_pairwise_scores(cos_distances_matrix)
    
    # Retrieve the words correponding to the IDs for this word length
    words = [vocab[id] for id in ids]
    # Calculate pairwise edit distances 
    edit_distances_matrix = distance_funcs.edit_distances_matrix(words)
    print(f"Edit distances matrix:\n{edit_distances_matrix}")
    # unique_edit_distances = distance_funcs.get_unique_pairwise_scores(edit_distances_matrix)
    
    output_file = f"results/cosine_edit_distances/cos_edit_dist_{wordlength}"
    
    # save_distances(wordlength, ids, words, cos_distances_matrix, edit_distances_matrix, output_file)

8 embeddings for wordlength 3
Sorted word IDs: [951, 3634, 879, 1690, 2440, 104, 1645, 5665]
IDs for bin 0: [951, 3634]
IDs for bin 1: [879, 1690]
IDs for bin 2: [2440, 104]
IDs for bin 3: [1645, 5665]
Shuffled vectors shape: [[ 0.03532289 -0.01507703 -0.00029856 ... -0.02023189  0.03852744
   0.00380326]
 [ 0.04489965  0.02221564  0.03785923 ...  0.01479159  0.02369672
   0.00374773]
 [ 0.04211781  0.019698   -0.01653891 ...  0.00478332  0.00177945
  -0.01176264]
 ...
 [ 0.05282964 -0.00836429 -0.01799915 ... -0.00622537 -0.01083229
  -0.0001308 ]
 [ 0.03377029  0.02208087  0.06499686 ... -0.00872213  0.01645583
   0.00781215]
 [ 0.05532805 -0.09630436 -0.0001041  ... -0.0092827  -0.01810851
   0.00033   ]]
Cosine distances matrix:
[[0.         0.81826598 1.01164516 0.99694552 1.00474556 1.01173224
  0.99905891 1.05769625]
 [0.81826598 0.         0.99840557 0.99446463 0.99671339 0.99939206
  1.0182088  1.00210162]
 [1.01164516 0.99840557 0.         1.00244207 1.00030728 1.00311415
  0

  avg_distances = np.nanmean(distances_matrix_copy, axis=1)


# Real implementation

In [39]:
# Load vocabulary
vocab = preprocess_funcs.load_vocabulary()
print(f"Vocabulary size: {len(vocab)}")

# Create a dictionary storing words by length from 3 to 7 characters
words_ids_by_length = {length: [id for id, word in vocab.items() if len(word)==length] for length in range(3,8)}

# Get embeddings for the words in the vocabulary
embeddings_dict = preprocess_funcs.get_vocabulary_embeddings_dict(vocab)
print(f"Size of embeddings dict: {len(embeddings_dict)}")
print(embeddings_dict.keys())

INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model
INFO:gensim.utils:loading id2word recursively from models/wiki_lsi_model.model.id2word.* with mmap=None
INFO:gensim.utils:setting ignored attribute projection to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:loaded models/wiki_lsi_model.model
INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model.projection
INFO:gensim.utils:loading u from models/wiki_lsi_model.model.projection.u.npy with mmap=None


Vocabulary size: 5000


INFO:gensim.utils:loaded models/wiki_lsi_model.model.projection


Size of embeddings dict: 5000
dict_keys([3576, 4501, 3327, 1102, 1650, 240, 5480, 4211, 4321, 6761, 912, 5357, 3822, 2440, 10729, 3295, 8538, 3328, 3143, 4062, 951, 1492, 9219, 1596, 1322, 1388, 2870, 3555, 104, 1690, 3610, 1734, 100, 185, 3548, 5665, 2935, 223, 879, 2158, 663, 8330, 1635, 3634, 172, 480, 4224, 5092, 3171, 1645, 1338, 8286, 1644, 1136, 3993, 3343, 1324, 3514, 418, 3045, 4266, 351, 1193, 1293, 4185, 1693, 289, 2368, 3341, 15758, 2492, 2019, 2473, 3678, 1155, 4871, 3675, 380, 5414, 1667, 5430, 3764, 1925, 122, 3995, 2658, 1053, 1657, 5586, 1145, 1583, 4911, 2566, 919, 252, 2263, 11921, 5086, 1317, 3723, 456, 519, 10268, 2467, 3614, 4150, 645, 1412, 868, 1024, 2563, 658, 915, 1262, 2732, 2159, 1049, 1470, 3381, 5562, 423, 5509, 1141, 1407, 643, 551, 3110, 660, 1116, 3739, 8417, 1827, 17, 1613, 3170, 1611, 1307, 4124, 1019, 923, 3473, 1323, 2374, 3933, 990, 8019, 4184, 206, 5879, 1030, 381, 1192, 1601, 1106, 1276, 1538, 5368, 2880, 5152, 550, 2166, 6543, 3117, 2661, 1951, 

In [43]:
for wordlength, ids in words_ids_by_length.items():    
    # Retrieve embeddings for words of the current length using their IDs
    embeddings = np.array([embeddings_dict[id] for id in ids])
    print(f"{len(embeddings)} for wordlength {wordlength}")
    
    # Bin the embeddings in 4 bins based on their average cosine distance 
    bins = bin_vects_by_avg_cos_sim(ids, embeddings) # `bin` is a dict of the format { bin_nb: {id: vect} }
    
    # Shuffle the embeddings within the bins
    shuffled_embeddings = shuffle_bins(bins)
    # print(shuffled_embeddings)
    
    # Calculate cosine distances of the shuffled embeddings
    cos_distances_matrix = distance_funcs.cosine_distances_matrix(shuffled_embeddings)
    
    # Retrieve the words correponding to the IDs for this word length
    words = [vocab[id] for id in ids]
    # Calculate pairwise edit distances 
    edit_distances_matrix = distance_funcs.edit_distances_matrix(words)
    
    output_file = f"results/binned_baseline/cos_edit_dist_{wordlength}"
    
    save_distances(wordlength, ids, words, cos_distances_matrix, edit_distances_matrix, output_file)

257 for wordlength 3
Sorted word IDs: [3345, 16678, 4744, 9319, 8924, 22727, 16244, 4304, 5081, 1551, 9331, 23038, 5138, 3524, 2847, 8966, 9812, 3150, 5772, 4986, 5271, 943, 1345, 8358, 2379, 5804, 8558, 29056, 9454, 4081, 3217, 5085, 13765, 4628, 8952, 918, 8908, 29006, 168, 32331, 29022, 1665, 7634, 23982, 41223, 8937, 32390, 20575, 12993, 749, 5109, 621, 7763, 1568, 5356, 198, 895, 1344, 8010, 4270, 7392, 3833, 1028, 14160, 4837, 597, 6387, 4797, 13110, 955, 2491, 1645, 5665, 23125, 19397, 9488, 1332, 21137, 4749, 60002, 8315, 12681, 74322, 3307, 13736, 10956, 27172, 25188, 4815, 8216, 25299, 10159, 1075, 2097, 16108, 12100, 3070, 3634, 4529, 5367, 3413, 6482, 31474, 960, 3580, 8245, 861, 9899, 6876, 11920, 11331, 30744, 2045, 8974, 16868, 42315, 1188, 9433, 13305, 542, 1163, 20329, 2360, 23143, 2663, 48, 3737, 24935, 11222, 9841, 13775, 15661, 2266, 17899, 15473, 7619, 5857, 4783, 2290, 32859, 2916, 3329, 9945, 3868, 5111, 1757, 23000, 3433, 15601, 9793, 23215, 50, 7556, 22917, 51,

# Results

## Random baseline

## Binned baseline

Analyses to do:
- [ ] Scatterplot of cos dist vs edit dist for each wordlength + correlation for test scores
    - [ ] Repeat for baselines
    - [ ] Repeat with log (and/or) z transformed (or subplots)
- [ ] (box and whiskers) plot of the avg cos distance of the words within the bins
- [ ] TSNE visualization of words with color based on avg edit dist