In [1]:
import os
import csv
import logging
import numpy as np
import pickle as pkl
import distance_funcs
import preprocess_funcs
from scipy.stats import pearsonr
from collections import defaultdict
import preprocess_funcs
import corr_funcs

In [2]:
os.chdir("c:/Users/Raya/OneDrive/Documents/3-CSAI/CSAI-Y3-S2/Thesis/Replication")

# Globals

In [3]:
# Sample from real data for test

# Load the vocabulary and embeddings
vocab_path = "data/vocab.pkl" 
lsa_model_path = "models/wiki_lsi_model.model" 
vocab = preprocess_funcs.load_vocabulary(vocab_path) # {id:word}
embeddings_dict = preprocess_funcs.get_vocabulary_embeddings_dict(vocab, lsa_model=lsa_model_path) # {id:vector}

# Dictionary storing words by length (3-7)
words_ids_by_length = preprocess_funcs.word_ids_by_word_length(vocab)

test_ids = [id for id in list(vocab.keys())[:30]]
print(f"Test ids (20): {test_ids}")

test_vocab = {}
for id in test_ids:
    test_vocab[id] = vocab[id]
print(f"Test vocabulary: {test_vocab}")

test_words = list(test_vocab.values())
print(f"Test words: {test_words}")

test_word_ids_by_length = {}
for word_length, ids in words_ids_by_length.items():
    kept_ids = [id for id in ids if id in test_ids]
    test_word_ids_by_length[word_length] = kept_ids
print(test_word_ids_by_length)

Test ids (20): [3576, 4501, 3327, 1102, 1650, 240, 5480, 4211, 4321, 6761, 912, 5357, 3822, 2440, 10729, 3295, 8538, 3328, 3143, 4062, 951, 1492, 9219, 1596, 1322, 1388, 2870, 3555, 104, 1690]
Test vocabulary: {3576: 'league', 4501: 'album', 3327: 'game', 1102: 'party', 1650: 'women', 240: 'church', 5480: 'song', 4211: 'station', 4321: 'town', 6761: 'president', 912: 'line', 5357: 'px', 3822: 'park', 2440: 'cup', 10729: 'player', 3295: 'football', 8538: 'award', 3328: 'games', 3143: 'division', 4062: 'river', 951: 'men', 1492: 'system', 9219: 'round', 1596: 'us', 1322: 'road', 1388: 'show', 2870: 'building', 3555: 'la', 104: 'art', 1690: 'air'}
Test words: ['league', 'album', 'game', 'party', 'women', 'church', 'song', 'station', 'town', 'president', 'line', 'px', 'park', 'cup', 'player', 'football', 'award', 'games', 'division', 'river', 'men', 'system', 'round', 'us', 'road', 'show', 'building', 'la', 'art', 'air']
{3: [2440, 951, 104, 1690], 4: [3327, 5480, 4321, 912, 3822, 1322, 13

In [4]:
# Simple toy data for understanding

toy_vects = np.array([np.array(([1.0,-2.0,3.0])), np.array(([-4.0,5.0,6.0])), np.array(([7.0,8.0,-9.0]))])
toy_words = ['cup', 'cat', 'dog']
toy_ids = [f"0{i}" for i in range(len(toy_words))]

# Random baseline

## Algorithm
Goal: get baseline distribution of correlations

1. Load:
    - vocab
    - embeddings
    - word IDs by word length

2. Initialize dictionary to store results: `{ wordlength: {'transformed_correlations':[]}, {'p_values': []} }`

3. Iterate over word lengths
    - Get vectors and words
    - Initialize lists to store transformed correlations & p-values for this word length: `[transformed_corrs]`, `[p_values]`
    - Iterate over shuffling iterations 
        - Shuffle vectors
        - Compute cosine & edit distances
        - Compute correlation & p-value
        - Transform correlation using Fisher z-transformation: `z = 0.5 * (ln(1 + r)  ln(1 - r))`
        - Store transformed correlation score & p-value in corresponding lists
4. Save results to a CSV file:  
word_length, transformed_corr, p_value

## Scripts

### Toy simulation

In [None]:
# 2. Initialize dictionary to store results: `{ wordlength: {'transformed_correlations':[]}, {'p_values': []} }`
results = {}

# 3. Iterate over word lengths
# for word_length, ids in words_ids_by_length.items(): # NOTE: uncomment
    # logging.info(f"Analyzing word length {word_length}")
    
# Retrieve vectors and words for current word length
vects = np.array([np.array(([1.0,-2.0,3.0])), np.array(([-4.0,5.0,6.0])), np.array(([7.0,8.0,-9.0]))])
print(f"Original vectors array:\n{vects}")
words = ['cup', 'cat', 'dog']
# vects = np.array([embeddings_dict[id] for id in ids]) # NOTE: uncomment
# words = [vocab[id] for id in ids] # NOTE: uncomment

# Initialize lists to store transformed correlations
transformed_correlations, p_values = [], []

cos_dist_rescaling = 'abs_cos_sim'
num_reassignments = 2
# Perform random permutations of word-vector mappings
for iteration in range(num_reassignments): 
    if iteration % 10 == 0: # NOTE: change back to 100
        logging.info(f"Reassignment iteration {iteration + 1}/{num_reassignments} for word length ") # NOTE: add {word_length}
    
    # Shuffle vectors randomly
    shuffled_vectors = vects.copy()
    np.random.shuffle(shuffled_vectors)
    print(f"Shuffled vectors array:\n{shuffled_vectors}")
    
    # Compute distances
    cos_distance_matrix = distance_funcs.cosine_distances_matrix(shuffled_vectors, rescaling=cos_dist_rescaling)
    print(f"Matrix of cosine distances:\n{cos_distance_matrix}")
    edit_distance_matrix = distance_funcs.edit_distances_matrix(words)
    print(f"Matrix of edit distances:\n{edit_distance_matrix}")
    
    # Extract unique pairwise scores
    unique_cos_distances = distance_funcs.get_unique_pairwise_scores(cos_distance_matrix)
    print(f"Unique cosine distances: {unique_cos_distances}")
    unique_edit_distances = distance_funcs.get_unique_pairwise_scores(edit_distance_matrix)
    print(f"Unique edit distances: {unique_edit_distances}")
    
    # Compute Pearson correlation between distances
    correlation, p_value = pearsonr(unique_cos_distances, unique_edit_distances)
    print(f"r: {correlation}, pvalue: {p_value}")
    
    # Transform correlation using Fisher Z-transformation: z = 0.5 * (ln(1 + r)  ln(1 - r))
    # for more info on np.log1p: https://numpy.org/doc/stable/reference/generated/numpy.log1p.html#numpy.log1p
    transformed_corr = 0.5 * (np.log1p(correlation) - np.log1p(-correlation)) 
    print(f"Transformed r: {transformed_corr}")
    
    # Store the transformed correlation
    transformed_correlations.append(transformed_corr) 
    p_values.append(p_value)

print(f"All transformed correlations: {transformed_correlations}")
print(f"All p_values: {p_values}")


### Real implementation testing

In [38]:
# ---------- Arguments & config ----------

vocab_path = "data/vocab.pkl" # sys.argv[1]  #
lsa_model_path = "models/wiki_lsi_model.model" # sys.argv[2]  #
cos_dist_rescaling = 'abs' # sys.argv[3] # options: 'none', 'abs', 'norm', 'ang'
output_folder = f"results/correlations/rd_baseline" # sys.argv[4] # 
output_file = f"rd_bl_corrs_{cos_dist_rescaling}.csv" # sys.argv[5]  #

# Set default number of random reassignments
# num_reassignments = int(sys.argv[6]) if len(sys.argv) > 5 else 10000 # Number of random vector-word permutations to be perfomed
num_reassignments = 2 # Number of random vector-word permutations to be perfomed

# Create output directory if it does not exist
os.makedirs(output_folder, exist_ok=True)

# Set a random seed for reproducibility
np.random.seed(4242)

# Define the correct string argument for the cosine_distances_matrix function based on the input
rescaling_options = ['none', 'abs', 'norm', 'ang']
if cos_dist_rescaling == 'none':
    rescaling_string = None
elif cos_dist_rescaling == 'abs':
    rescaling_string = 'abs_cos_sim'
elif cos_dist_rescaling == 'norm':
    rescaling_string = 'norm_cos_sim'
elif cos_dist_rescaling == 'ang':
    rescaling_string = 'angular_dist'
else:
    logging.error(f"Invalid input for cos_dist_rescaling. Supported values: {rescaling_options}")

logging.info(f"Config: rescaling: {rescaling_string} | num_reassignments: {num_reassignments}")

INFO:root:Config: rescaling: abs_cos_sim | num_reassignments: 2


c:\Users\Raya\OneDrive\Documents\3-CSAI\CSAI-Y3-S2\Thesis\Replication\results\correlations\rd_baseline


In [30]:
# ---------- Load data ----------

# Load the vocabulary and embeddings
vocab = preprocess_funcs.load_vocabulary(vocab_path) # {id:word}
embeddings_dict = preprocess_funcs.get_vocabulary_embeddings_dict(vocab, lsa_model=lsa_model_path) # {id:vector}

# Dictionary storing words by length (3-7)
words_ids_by_length = preprocess_funcs.word_ids_by_word_length(vocab)

# NOTE: delete print statement
for wordlength, ids in words_ids_by_length.items():
    print(f"Word length {wordlength} | First 10 IDs: {ids[:10]}")

INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model
INFO:gensim.utils:loading id2word recursively from models/wiki_lsi_model.model.id2word.* with mmap=None
INFO:gensim.utils:setting ignored attribute projection to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:loaded models/wiki_lsi_model.model
INFO:gensim.utils:loading LsiModel object from models/wiki_lsi_model.model.projection
INFO:gensim.utils:loading u from models/wiki_lsi_model.model.projection.u.npy with mmap=None
INFO:gensim.utils:loaded models/wiki_lsi_model.model.projection


Word length 3 | First 10 IDs: [2440, 951, 104, 1690, 5665, 879, 3634, 1645, 5430, 3995]
Word length 4 | First 10 IDs: [3327, 5480, 4321, 912, 3822, 1322, 1388, 1734, 100, 185]
Word length 5 | First 10 IDs: [4501, 1102, 1650, 8538, 3328, 4062, 9219, 1635, 172, 3514]
Word length 6 | First 10 IDs: [3576, 240, 10729, 1492, 3610, 223, 663, 4224, 8286, 3993]
Word length 7 | First 10 IDs: [4211, 1644, 351, 4185, 15758, 3678, 4871, 2658, 2566, 519]


In [40]:
# 2. Initialize dictionary to store results: `results = { wordlength: {'raw_correlations':[]}, {'transformed_correlations':[]}, {'p_values': []} }`
results = defaultdict(dict)

# 3. Iterate over word lengths
for word_length, ids in words_ids_by_length.items(): 
    logging.info(f"Analyzing word length {word_length}")
    
    # Retrieve vectors and words for current word length
    vects = np.array([embeddings_dict[id] for id in ids])
    vects = vects[:5]
    # print(vects[:2])
    words = [vocab[id] for id in ids] 
    words = words[:5]
    # print(words[:2])

    # Initialize lists to store transformed correlations
    transformed_correlations, p_values = [], []

    # Perform random permutations of word-vector mappings
    for iteration in range(num_reassignments): 
        if iteration % 100 == 0: 
            logging.info(f"Reassignment iteration {iteration + 1}/{num_reassignments} for word length {word_length}") 
        
        # Shuffle vectors randomly
        shuffled_vectors = vects.copy()
        np.random.shuffle(shuffled_vectors)
        # print(f"Shuffled vectors array:\n{shuffled_vectors}")
        
        # Compute distances
        cos_distance_matrix = distance_funcs.cosine_distances_matrix(shuffled_vectors, rescaling=rescaling_string)
        # print(f"Matrix of cosine distances:\n{cos_distance_matrix}")
        edit_distance_matrix = distance_funcs.edit_distances_matrix(words)
        # print(f"Matrix of edit distances:\n{edit_distance_matrix}")
        
        # Extract unique pairwise scores
        unique_cos_distances = distance_funcs.get_unique_pairwise_scores(cos_distance_matrix)
        # print(f"Unique cosine distances: {unique_cos_distances}")
        unique_edit_distances = distance_funcs.get_unique_pairwise_scores(edit_distance_matrix)
        # print(f"Unique edit distances: {unique_edit_distances}")
        
        # Compute Pearson correlation between distances
        correlation, p_value = pearsonr(unique_cos_distances, unique_edit_distances)
        # print(f"r: {correlation}, pvalue: {p_value}")
        
        # Transform correlation using Fisher Z-transformation: z = 0.5 * (ln(1 + r)  ln(1 - r))
        # for more info on np.log1p: https://numpy.org/doc/stable/reference/generated/numpy.log1p.html#numpy.log1p
        transformed_corr = 0.5 * (np.log1p(correlation) - np.log1p(-correlation)) 
        # print(f"Transformed r: {transformed_corr}")
        
        # Store the transformed correlation
        transformed_correlations.append(transformed_corr) 
        p_values.append(p_value)
    
    results[word_length]['transformed_correlations'] = transformed_correlations
    results[word_length]['p_values'] = p_values
    
# print(results)
# Save results to a CSV file: word_length, transformed_corr, p_value
output_file_path = os.path.normpath(f"{output_folder}/{output_file}")
with open(output_file_path, 'w', newline='', encoding="utf-8") as f:
    csv_writer = csv.writer(f)
    # Write the header row
    csv_writer.writerow(['word_length', 'transformed_corr', 'p_value'])
    for word_length in results:
        for i in range(num_reassignments):
            corr = results[word_length]['transformed_correlations'][i]
            p = results[word_length]['p_values'][i]
            csv_writer.writerow([word_length, corr, p])
    logging.info(f"Correlation scores for all word lengths for random baseline with rescaling {cos_dist_rescaling} saved to {output_file} in {output_folder}")

INFO:root:Analyzing word length 3
INFO:root:Reassignment iteration 1/2 for word length 3
INFO:root:Analyzing word length 4
INFO:root:Reassignment iteration 1/2 for word length 4
INFO:root:Analyzing word length 5
INFO:root:Reassignment iteration 1/2 for word length 5
INFO:root:Analyzing word length 6
INFO:root:Reassignment iteration 1/2 for word length 6
INFO:root:Analyzing word length 7
INFO:root:Reassignment iteration 1/2 for word length 7
INFO:root:Correlation scores for all word lengths for random baseline with rescaling abs saved to rd_bl_corrs_abs.csv in results/correlations/rd_baseline


# Binned baseline

In [5]:
def bin_vects_by_avg_cos_sim(word_ids, embeddings, rescaling=None, num_bins=4):
    """
    Organizes word embeddings into bins based on their average cosine similarity.

    Parameters:
        word_ids (list): A list of word IDs corresponding to the embeddings.
        embeddings (np.ndarray): A 2D array of word embeddings, each row representing a word vector.
        rescaling (bool, optional): Whether to rescale the embeddings.
                                    Options: None, 'map_zero_to_one', 'angular_distance'. Defaults to None.
        num_bins (int, optional): The number of bins to organize the vectors into. Defaults to 4.

    Returns:
        dict: A dictionary where keys are bin indices (from 0 to num_bins - 1), and values are dictionaries
        mapping word IDs to their corresponding vectors within each bin.

    Description:
        The function calculates all pairwise cosine distances between word embeddings and computes the average 
        cosine distance for each embedding. Then, it sorts the vectors and word IDs based on their average distance. 
        It divides the sorted vectors and word IDs into the specified number of bins. Each bin contains a 
        dictionary where keys are word IDs and values are their corresponding word vectors.

    Note:
        - The function uses `distance_funcs.pairwise_cosine_distances` and `distance_funcs.average_distances` for distance
          calculations.
        - In case `num_bins` is 1, the function will return one bin containing all the vectors.
    """    
    
    print(f"Words IDs: {word_ids}")
    print(f"{len(embeddings)} embeddings")
    
    # Calculate pairwise cosine distances
    all_pairwise_distances = distance_funcs.cosine_distances_matrix(embeddings, rescaling)
    # print(f"Cosine distances:\n{all_pairwise_distances}")
    # Calculate average cosine distance for each vector
    avg_distances = distance_funcs.average_distances(all_pairwise_distances)
    print(f"Average distances: {avg_distances}")
    
    # Sort vectors based on their average distance and word_ids (in ascending order -> first index corresponds to smallest distance)
    sorted_indices = np.argsort(avg_distances)
    print(f"Sorted indices: {sorted_indices}")
    sorted_vects = embeddings[sorted_indices]
    # print(f"Sorted vectors:\n{sorted_vects}")
    sorted_word_ids = [word_ids[i] for i in sorted_indices]
    print(f"Sorted word IDs by cosine distance ascending: {sorted_word_ids}")
    
    # Calculate bin size such that vectors are equally distributed across all bins
    n = len(sorted_vects)
    bin_size = n // num_bins
    
    # Initialize a dictionary to represent bins. The values will be dictionaries mapping sorted word IDs to vectors
    bins = {i: {} for i in range(num_bins)}
    
    # Distribute the sorted word IDs and vectors into the bins
    for i in range(num_bins):
        # Set the starting index for the bin
        start_idx = i * bin_size
        # Set the final index for the bin
        if i == num_bins-1:  # For the last bin, include all remaining vectors
            end_idx = n
        else:
            end_idx = (i + 1) * bin_size
        
        # Slice the vectors and word IDs for the current bin
        bin_vects = sorted_vects[start_idx:end_idx]
        bin_word_ids = sorted_word_ids[start_idx:end_idx]
        # print(f"IDs for bin {i}: {bin_word_ids}")
        
        # Append the word IDs and corresponding vectors to the current bin
        for id, vect in zip(bin_word_ids, bin_vects):
            bins[i][id] = vect
    
    # Return the bins
    return bins

def shuffle_bins(bins):
    # Initialize a list to store the shuffled vectors
    shuffled_vects_list = []
    
    for _, data in bins.items():
        # Extract the vectors from the id:vector inner dictionary
        vectors = list(data.values())
        print(f"Vectors: {vectors}")
        # Make a copy of the vectors list to ensure safe shuffling
        shuffled_vectors = vectors.copy()
        # Shuffle the copy of the vectors within the bin
        np.random.shuffle(shuffled_vectors)
        print(f"Shuffled vectors: {shuffled_vectors}")
        # Append the shuffled vectors to the common list
        shuffled_vects_list.extend(shuffled_vectors)
        
    # Convert the list of vectors to a numpy array
    shuffled_vects_array = np.array(shuffled_vects_list)
        
    return shuffled_vects_array

In [11]:
vects = toy_vects
print(f"Original vectors array:\n{vects}")
words = toy_words
ids = [f"0{i}" for i in range(len(words))]

bins = bin_vects_by_avg_cos_sim(ids, vects, rescaling='abs_cos_sim', num_bins=2)
for bin, data in bins.items():
    print(f"Bin {bin}: {data}")

shuffled_vects = shuffle_bins(bins)
print(f"All shuffled vectors:\n{shuffled_vects}")

Original vectors array:
[[ 1. -2.  3.]
 [-4.  5.  6.]
 [ 7.  8. -9.]]
Words IDs: ['00', '01', '02']
3 embeddings
Average distances: [0.59369721 0.76726577 0.48279196]
Sorted indices: [2 0 1]
Sorted word IDs by cosine distance ascending: ['02', '00', '01']
Bin 0: {'02': array([ 7.,  8., -9.])}
Bin 1: {'00': array([ 1., -2.,  3.]), '01': array([-4.,  5.,  6.])}
Vectors: [array([ 7.,  8., -9.])]
Shuffled vectors: [array([ 7.,  8., -9.])]
Vectors: [array([ 1., -2.,  3.]), array([-4.,  5.,  6.])]
Shuffled vectors: [array([ 1., -2.,  3.]), array([-4.,  5.,  6.])]
All shuffled vectors:
[[ 7.  8. -9.]
 [ 1. -2.  3.]
 [-4.  5.  6.]]


In [10]:
import os
import sys
import csv
import logging
import numpy as np
from scipy.stats import pearsonr, norm
import distance_funcs
import preprocess_funcs

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s',
                    handlers=[logging.StreamHandler(sys.stdout)])

# if len(sys.argv) < 4:
#     print("Usage: python bin_baseline.py <vocab_path> <lsa_model_path> <output_folder> [num_reassignments]")
#     sys.exit(1)

# Read arguments
vocab_path = "data/vocab.pkl"
lsa_model_path = "models/wiki_lsi_model.model"
output_folder = "results/correlations/test_bin_baseline"
num_reassignments = 100

# Create output directory if it does not exist
os.makedirs(output_folder, exist_ok=True)

# Set a random seed for reproducibility
np.random.seed(4242)

# Load the vocabulary and embeddings
vocab = preprocess_funcs.load_vocabulary(vocab_path)
vocab = {id:vocab[id] for id in list(vocab.keys())[:100]}
print(f"Vocab size: {len(vocab)}")
embeddings_dict = preprocess_funcs.get_vocabulary_embeddings_dict(vocab, lsa_model=lsa_model_path)
print(f"Size of embeddings dict: {len(embeddings_dict)}")

# Create a dictionary storing words by length (3-7)
words_ids_by_length = preprocess_funcs.word_ids_by_word_length(vocab)
# print(f"Words ids by word length: {words_ids_by_length}")

# Define rescaling options for cosine distance
rescaling_options = {
    'none': None,
    'abs': 'abs_cos_sim',
    'norm': 'norm_cos_sim',
    'ang': 'angular_dist'
}
# print(f"Rescaling options: {rescaling_options.keys()}")

# --------------- Functions for binning & shuffling  ---------------
def bin_vects_by_avg_cos_dist(word_ids, embeddings, rescaling, num_bins=2):
    """
    Organizes word embeddings into bins based on their average cosine similarity.

    Parameters:
        word_ids (list): A list of word IDs corresponding to the embeddings.
        embeddings (np.ndarray): A 2D array of word embeddings, each row representing a word vector.
        rescaling (optional): Whether to rescale the embeddings.
                                    Options: None, 'abs_cos_sim', 'norm_cos_sim', 'angular_dist'.
        num_bins (int, optional): The number of bins to organize the vectors into. Defaults to 4.

    Returns:
        dict: A dictionary where keys are bin indices (from 0 to num_bins - 1), and values are dictionaries
        mapping word IDs to their corresponding vectors within each bin.

    Description:
        The function calculates all pairwise cosine distances between word embeddings and computes the average 
        cosine distance for each embedding. Then, it sorts the vectors and word IDs based on their average distance. 
        It divides the sorted vectors and word IDs into the specified number of bins. Each bin contains a 
        dictionary where keys are word IDs and values are their corresponding word vectors.

    Note: In case `num_bins` is 1, the function will return one bin containing all the vectors.
    """    
    # Calculate pairwise cosine distances
    all_pairwise_distances = distance_funcs.cosine_distances_matrix(embeddings, rescaling)
    # Calculate average cosine distance for each vector
    avg_distances = distance_funcs.average_distances(all_pairwise_distances)
    # print(f"Avg distances: {avg_distances}")
    
    # Sort vectors based on their average distance and word_ids (in ascending order -> first index corresponds to smallest distance)
    sorted_indices = np.argsort(avg_distances)
    print(f" Sorted indices: {sorted_indices}")
    sorted_vects = embeddings[sorted_indices]
    sorted_word_ids = [word_ids[i] for i in sorted_indices]
    print(f"Sorted word IDs: {sorted_word_ids}")
    
    # Calculate bin size such that vectors are equally distributed across all bins
    n = len(sorted_vects)
    # print(f"Length of sorted vectors: {n}. Length of ordered vectors: {len(vects)}")
    bin_size = n // num_bins
    
    # Initialize a dictionary to represent bins. The values will be dictionaries mapping sorted word IDs to vectors
    bins = {i: {} for i in range(num_bins)}
    
    # Distribute the sorted word IDs and vectors into the bins
    for i in range(num_bins):
        # Set the starting index for the bin
        start_idx = i * bin_size
        # Set the final index for the bin
        if i == num_bins-1:  # For the last bin, include all remaining vectors
            end_idx = n
        else:
            end_idx = (i + 1) * bin_size
        
        # Slice the vectors and word IDs for the current bin
        bin_vects = sorted_vects[start_idx:end_idx]
        bin_word_ids = sorted_word_ids[start_idx:end_idx]
        # print(f"IDs for bin {i}: {bin_word_ids}")
        
        # Append the word IDs and corresponding vectors to the current bin
        for id, vect in zip(bin_word_ids, bin_vects):
            bins[i][id] = vect
    
    # Return the bins
    return bins

def shuffle_bins(bins):
    # Initialize a list to store the shuffled vectors
    shuffled_vects_list = []
    shuffled_ids_list = []
    
    for _, data in bins.items():
        # Extract the vectors from the id:vector inner dictionary
        vectors = list(data.values())
        ids = list(data.keys())
        
        # Make a copy of the vectors list to ensure safe shuffling
        shuffled_vectors = vectors.copy()
        shuffled_ids = ids.copy()
        
        # Shuffle the copy of the vectors within the bin
        np.random.shuffle(shuffled_vectors)
        np.random.shuffle(shuffled_ids)
        
        # Append the shuffled vectors to the common list
        shuffled_vects_list.extend(shuffled_vectors)
        shuffled_ids_list.extend(shuffled_ids)
        
    # Convert the list of vectors to a numpy array
    shuffled_vects_array = np.array(shuffled_vects_list)
    
    return shuffled_vects_array, shuffled_ids_list

# --------------- Computations ---------------
# Iterate through each rescaling option
for rescaling in rescaling_options:
    rescaling_string = rescaling_options[rescaling]
    logging.info(f"Processing for rescaling: {rescaling_string}")

    # Initialize dictionary to store results for each word length
    results = {wordlength: {
        'raw_correlations': [],
        'transformed_correlations': [],
        'p_values': []}
               for wordlength in range(3,8)}
        
    # Iterate over word lengths
    for word_length, ids in words_ids_by_length.items():
        logging.info(f"Analyzing word length {word_length} with rescaling option: {rescaling}")
        print(f"IDs for word length {word_length}: {ids}")
        
        # Retrieve vectors and words for the current word length
        vects = np.array([embeddings_dict[id] for id in ids])
        print(f"{len(vects)} vectors for word length {word_length}")
        words = [vocab[id] for id in ids]
        print(f"{len(words)} words for word length {word_length}: {words}")
        
        # Bin the vectors of the words of the current word length in 4 bins based on their average cosine distance to the embeddings of the same word length
        bins = bin_vects_by_avg_cos_dist(ids, vects, rescaling=rescaling_string) # `bin` is a dict of the format { bin_nb: {id: vect} }
        # print(f"Bins: {bins}")
        
        # Initialize lists to store raw and transformed correlations and p-values
        raw_correlations, transformed_correlations, p_values = [], [], []
        
        # Perform random permutations of word-vector mappings
        for iteration in range(num_reassignments):
            if iteration % 100 == 0:
                logging.info(f"Reassignment iteration {iteration + 1}/{num_reassignments} for word length {word_length}")
            
            # Shuffle the vectors within the bins
            shuffled_vectors, shuffled_ids = shuffle_bins(bins)
            # print(f"Shuffled vectors array has same shape as original array: {shuffled_vectors.shape == vects.shape}")
            print(f"Shuffled ids: {shuffled_ids}")
            
            # Compute distances
            cos_distance_matrix = distance_funcs.cosine_distances_matrix(shuffled_vectors, rescaling=rescaling_string)
            edit_distance_matrix = distance_funcs.edit_distances_matrix(words)
            
            # Extract unique pairwise scores
            unique_cos_distances = distance_funcs.get_unique_pairwise_scores(cos_distance_matrix)
            unique_edit_distances = distance_funcs.get_unique_pairwise_scores(edit_distance_matrix)
            
            # Compute Pearson correlation between distances
            correlation, p_value = pearsonr(unique_cos_distances, unique_edit_distances)
            
            # Transform correlation using Fisher Z-transformation
            transformed_corr = corr_funcs.fisher_z_transform(correlation)
            
            # Store the raw and transformed correlation and p-value
            raw_correlations.append(correlation)
            transformed_correlations.append(transformed_corr)
            p_values.append(p_value)
        
        results[word_length]['raw_correlations'] = raw_correlations
        results[word_length]['transformed_correlations'] = transformed_correlations
        results[word_length]['p_values'] = p_values
    
    # Save results to a CSV file:
    output_file = os.path.join(output_folder, f"bin_bl_corrs_{rescaling}.csv")
    with open(output_file, 'w', newline='', encoding="utf-8") as f:
        csv_writer = csv.writer(f)
        # Write the header row
        csv_writer.writerow(['word_length', 'raw_corr', 'transformed_corr', 'p-value'])
        
        # Write results for each word length and each permutation
        for word_length in results:
            raw_corrs = results[word_length]['raw_correlations']
            transformed_corrs = results[word_length]['transformed_correlations']
            pvalues = results[word_length]['p_values']
            for raw_corr, transf_corr, p_value in zip(raw_corrs, transformed_corrs, pvalues):
                csv_writer.writerow([word_length, raw_corr, transf_corr, p_value])
        
        logging.info(f"Correlation scores for all word lengths with rescaling option {rescaling} saved to {output_file}")

logging.info("All rescaling options have been processed.")

Vocab size: 100
Size of embeddings dict: 100
IDs for word length 3: [2440, 951, 104, 1690, 5665, 879, 3634, 1645, 5430, 3995, 5086]
11 vectors for word length 3
11 words for word length 3: ['cup', 'men', 'art', 'air', 'you', 'law', 'man', 'win', 'san', 'red', 'jpg']
 Sorted indices: [ 1  6  9  5  8  3  7  0  2 10  4]
Sorted word IDs: [951, 3634, 3995, 879, 5430, 1690, 1645, 2440, 104, 5086, 5665]
Shuffled ids: [951, 3995, 5430, 879, 3634, 1645, 104, 2440, 1690, 5086, 5665]
Shuffled ids: [879, 3995, 5430, 3634, 951, 5086, 104, 2440, 5665, 1690, 1645]
Shuffled ids: [5430, 879, 951, 3995, 3634, 104, 5665, 1645, 1690, 5086, 2440]
Shuffled ids: [951, 5430, 3634, 879, 3995, 5665, 1645, 2440, 104, 5086, 1690]
Shuffled ids: [5430, 951, 879, 3634, 3995, 2440, 5665, 104, 1690, 1645, 5086]
Shuffled ids: [879, 3995, 5430, 951, 3634, 1645, 1690, 5086, 104, 5665, 2440]
Shuffled ids: [3634, 951, 3995, 5430, 879, 2440, 5086, 5665, 104, 1690, 1645]
Shuffled ids: [951, 879, 3634, 3995, 5430, 5665, 1645,

In [16]:
import pandas as pd 

test_df = pd.read_csv(r"C:\Users\Raya\OneDrive\Documents\3-CSAI\CSAI-Y3-S2\Thesis\Replication\results\correlations\bin_baseline\bin_bl_corrs_abs.csv")
# print(test_df)

word_length = 3

mean_corr_tr = np.mean(test_df[test_df['word_length'] == word_length]['transformed_corr'])
std_corr_tr = np.std(test_df[test_df['word_length'] == word_length]['transformed_corr'])
mean_corr_raw = np.mean(test_df[test_df['word_length'] == word_length]['raw_corr'])
std_corr_raw = np.std(test_df[test_df['word_length'] == word_length]['raw_corr'])

print(f"Mean transf: {mean_corr_tr}")
print(f"Mean raw: {mean_corr_raw}")

print(F"Std transf: {std_corr_tr}")
print(f"Std raw: {std_corr_raw}")

Mean transf: 0.019747199265683937
Mean raw: 0.01974402781596958
Std transf: 0.005480850760109978
Std raw: 0.005478407651336509
