In [3]:
import numpy as np
import pickle
import os

# Input: pickled nested list of embeddings.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs as-is on the original author's machine. Consider deriving it from a
# configurable project/data directory.
file_path = '/Users/0xr4plh/Documents/Machine Learning/my-nlp-basics/Embeddings Techniques/Word2Vec/embeddings_3d_list.pkl'

# Load the embeddings from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(file_path, 'rb') as file:
    embeddings_3d_list = pickle.load(file)

# Convert the nested list to a NumPy array for efficient computation.
embeddings_3d_array = np.array(embeddings_3d_list)

# Fail fast if the data is not a regular 3-D block: with a 2-D (or ragged,
# object-dtype) array, the mean over axis=1 below would silently pool over
# the wrong dimension instead of raising.
if embeddings_3d_array.ndim != 3:
    raise ValueError(
        "Expected a 3-D embeddings array, got shape "
        f"{embeddings_3d_array.shape}"
    )

# Average pooling across the second dimension (axis=1): every item's stack of
# vectors collapses into one mean vector, giving a 2-D result.
# (The original note says this axis holds 1000 vectors per item - TODO confirm.)
avg_embeddings = np.mean(embeddings_3d_array, axis=1)

# Plain Python list-of-lists form of the result, used for pickling below.
avg_embeddings_list = avg_embeddings.tolist()

# Output path for the averaged embeddings (relative to the working directory).
output_path = 'Embeddings Techniques/Word2Vec/avg_embeddings.pkl'

# Ensure the target directory exists. exist_ok=True replaces the racy
# "check, then create" pattern; the guard skips makedirs('') which would raise
# if the output path ever had no directory component.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the averaged embeddings.
with open(output_path, 'wb') as file:
    pickle.dump(avg_embeddings_list, file)

print("Averaged embeddings saved successfully at:", output_path)


Averaged embeddings saved successfully at: Embeddings Techniques/Word2Vec/avg_embeddings.pkl


In [4]:
# Load the averaged embeddings from the saved file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('Embeddings Techniques/Word2Vec/avg_embeddings.pkl', 'rb') as file:
    avg_embeddings_list = pickle.load(file)

# Report the dimensions as rows x columns. The column count is read from the
# first row, so guard against an empty list instead of raising IndexError.
n_rows = len(avg_embeddings_list)
n_cols = len(avg_embeddings_list[0]) if n_rows else 0
print(f"Dimensions of averaged embeddings: {n_rows} x {n_cols}")

# Preview up to the first 10 rows. Slicing stops at the end of the list, so
# this also works when fewer than 10 rows are present.
for i, row in enumerate(avg_embeddings_list[:10], start=1):
    print(f"Row {i}: {row}")


Dimensions of averaged embeddings: 300 x 300
Row 1: [0.75048828125, -0.07315283268690109, -0.06377527862787247, 0.11872606724500656, 0.005638636648654938, -0.02073349803686142, 0.05923566594719887, -0.046923790127038956, 0.16413775086402893, 0.14196692407131195, -0.006731775123625994, 0.034778766334056854, -0.03227010369300842, 0.12006685137748718, -0.023853162303566933, 0.04053828865289688, -0.027185412123799324, 0.06289862096309662, -0.02853946015238762, -0.08633074909448624, 0.10577314347028732, 0.027533061802387238, 0.19639359414577484, 0.04106973856687546, -0.013925592415034771, -0.028689907863736153, -0.05014152452349663, 0.006213271990418434, 0.032719627022743225, -0.08128256350755692, -0.0775018185377121, -0.07023383677005768, -0.019752895459532738, -0.008351882919669151, 0.08584605157375336, -0.13905182480812073, 0.09367810934782028, -0.07036092132329941, -0.005088310223072767, 0.16861215233802795, -0.008984348736703396, -0.08859222382307053, 0.09107107669115067, 0.07602469623

In [6]:
import numpy as np
import pickle
import gensim.downloader as api
from tqdm import tqdm  # Import the progress bar library

# Fetch the pretrained "word2vec-google-news-300" vectors via gensim's downloader.
print("Loading the Word2Vec model...")
model = api.load("word2vec-google-news-300")

# Read back the averaged embeddings written by the earlier cell and turn them
# into a NumPy array so each row can be handed to the model as a query vector.
print("Loading averaged embeddings...")
with open('Embeddings Techniques/Word2Vec/avg_embeddings.pkl', 'rb') as file:
    avg_embeddings_list = pickle.load(file)
avg_embeddings_array = np.array(avg_embeddings_list)

# For every row vector, ask the model for its single nearest vocabulary entry
# (topn=1) and keep the (word, similarity score) pair. The iteration is wrapped
# in tqdm so progress is visible while the rows are processed.
print("Finding most similar words and scores...")
progress = tqdm(
    avg_embeddings_array,
    desc="Processing rows",
    total=len(avg_embeddings_array),
)
most_similar_words_and_scores = [
    tuple(model.most_similar([row_vector], topn=1)[0])
    for row_vector in progress
]

# One (word, score) tuple per input row.
print("\nMost similar words and scores for each vector:")
print(most_similar_words_and_scores)


Loading the Word2Vec model...
Loading averaged embeddings...
Finding most similar words and scores...


Processing rows: 100%|██████████| 300/300 [00:33<00:00,  8.84it/s]


Most similar words and scores for each vector:
[('dol##.net_index####.html_http_dol##.net', 0.8217871785163879), ('BY_MATT_PARROTT', 0.8084081411361694), ('DONNER_METALS_LTD', 0.8393741250038147), ('SO_WE_WILL', 0.8322646021842957), ('SO_WE_WILL', 0.8279878497123718), ('AND_TO_REOFFER', 0.8442307114601135), ('AND_TO_REOFFER', 0.8390525579452515), ('SO_WE_WILL', 0.8076137900352478), ('e_mail_info@srssoft.com_fax', 0.8346384167671204), ('AND_TO_REOFFER', 0.8685468435287476), ('dol##.net_index###.html_http_dol##.net', 0.7944468259811401), ('SO_WE_WILL', 0.8198202252388), ('LOW_PRICE_STOCK', 0.8003358840942383), ('AND_TO_REOFFER', 0.8678801655769348), ('News_Weather_SportsWis', 0.7893505096435547), ('bearded_Shawn_Sedonis', 0.833828330039978), ('BY_ERIK_BOLAND_erik.boland', 0.8437321186065674), ('AND_TO_REOFFER', 0.8677354454994202), ('DONNER_METALS_LTD', 0.8225232362747192), ('Actress_Keke_Palmer', 0.7652966976165771), ('SO_WE_WILL', 0.8302062749862671), ('News_Weather_SportsWis', 0.8354


