In [1]:
import numpy as np
import pickle as pkl
from os import chdir
import preprocess_funcs
from gensim.models import LsiModel

In [2]:
chdir("c:/Users/Raya/OneDrive/Documents/3-CSAI/CSAI-Y3-S2/Thesis/Replication")

In [5]:
# Import word embeddings
# NOTE: These are the embeddings of all 100'000 words in the model!
lsa_model_path = "models/wiki_lsi_model.model"
embeddings, words = preprocess_funcs.get_all_word_embeddings_and_words(lsa_model_path) # NOTE: deprecated

In [4]:
# Explore word embeddings
# NOTE: These are the embeddings of all 100'000 words in the model!
print(f"'embeddings' object type: {type(embeddings)}")
print(f"Shape of embeddings: {embeddings.shape}")
# print(f"Embedding 1: {embeddings[0]}")

# Explore words
# NOTE: These are all 100'000 words included in the model!
print(f"'words' object type: {type(words)}")
print(f"Number of words: {len(words)}")
print(f"First 10 words: {words[:10]}")

'embeddings' object type: <class 'numpy.ndarray'>
Shape of embeddings: (100000, 500)
'words' object type: <class 'list'>
Number of words: 100000
First 10 words: ['ability', 'able', 'abolish', 'abolishing', 'abolition', 'above', 'academia', 'academic', 'academics', 'accept']


In [3]:
# Load my vocabulary of the 5000 most frequent words in the corpus
vocab_path = "data/vocab.pkl"
VOCAB = preprocess_funcs.load_vocabulary(vocab_path)

In [7]:
# `VOCAB` is a dictionary mapping word ids to words (id: word)
print(f"'VOCAB' object type: {type(VOCAB)}")
print(f"Length: {len(VOCAB)}")
print(f"First 10 items: {list(VOCAB.items())[:10]}")

'VOCAB' object type: <class 'dict'>
Length: 5000
First 10 items: [(3576, 'league'), (4501, 'album'), (3327, 'game'), (1102, 'party'), (1650, 'women'), (240, 'church'), (5480, 'song'), (4211, 'station'), (4321, 'town'), (6761, 'president')]


In [6]:
# Get word ids by word length
word_ids_by_length = preprocess_funcs.word_ids_by_word_length(VOCAB)
for wordlength, ids in word_ids_by_length.items():
    print(f"Length {wordlength} | First 10 IDs: {ids[:10]}")

print("----- CHECK -----")
check_words_ids_by_length = {length: [id for id, word in VOCAB.items() if len(word)==length] for length in range(3,8)}
for wordlength, ids in check_words_ids_by_length.items():
    print(f"Length {wordlength} | First 10 IDs: {ids[:10]}")

Length 3 | First 10 IDs: [2440, 951, 104, 1690, 5665, 879, 3634, 1645, 5430, 3995]
Length 4 | First 10 IDs: [3327, 5480, 4321, 912, 3822, 1322, 1388, 1734, 100, 185]
Length 5 | First 10 IDs: [4501, 1102, 1650, 8538, 3328, 4062, 9219, 1635, 172, 3514]
Length 6 | First 10 IDs: [3576, 240, 10729, 1492, 3610, 223, 663, 4224, 8286, 3993]
Length 7 | First 10 IDs: [4211, 1644, 351, 4185, 15758, 3678, 4871, 2658, 2566, 519]
----- CHECK -----
Length 3 | First 10 IDs: [2440, 951, 104, 1690, 5665, 879, 3634, 1645, 5430, 3995]
Length 4 | First 10 IDs: [3327, 5480, 4321, 912, 3822, 1322, 1388, 1734, 100, 185]
Length 5 | First 10 IDs: [4501, 1102, 1650, 8538, 3328, 4062, 9219, 1635, 172, 3514]
Length 6 | First 10 IDs: [3576, 240, 10729, 1492, 3610, 223, 663, 4224, 8286, 3993]
Length 7 | First 10 IDs: [4211, 1644, 351, 4185, 15758, 3678, 4871, 2658, 2566, 519]


In [8]:
# Retrieve the embeddings of the words in the vocabulary
vocab_embeddings = preprocess_funcs.get_vocabulary_embeddings_dict()

In [9]:
# Explore vocabulary embeddings
print(f"'vocab_embeddings' object type: {type(vocab_embeddings)}")
print(f"Length: {len(vocab_embeddings)}")
print(f"First item: {list(vocab_embeddings.items())[0]}")

'vocab_embeddings' object type: <class 'dict'>
Length: 5000
First item: ('league', array([ 1.13337643e-01,  1.40484904e-01,  3.74122065e-01, -1.76509153e-01,
       -3.34818812e-02, -8.70434105e-02,  2.50108555e-01, -9.07004811e-02,
        7.72642569e-02,  3.72268912e-02,  2.37550695e-03, -2.85070042e-02,
       -1.72068675e-02,  2.14117853e-02, -2.05244961e-02, -1.55875799e-02,
       -1.96229697e-02, -7.37330813e-02, -6.62071557e-02,  7.94375492e-02,
        1.19980414e-01, -6.89140370e-03, -6.28666333e-02, -9.87873098e-03,
       -4.87113715e-02,  5.95749509e-02,  2.06745951e-02,  2.81389363e-03,
       -1.98662471e-01, -8.01975810e-02, -2.45947662e-02, -2.39427371e-01,
       -1.39878469e-01,  3.26120245e-01, -9.83416254e-02,  1.62333838e-02,
       -5.58726093e-03,  1.06940700e-01, -2.04707758e-03, -3.51202171e-02,
       -1.77388753e-02, -3.39146660e-02,  1.29090874e-01, -2.03333544e-02,
        2.91437346e-02, -2.02470039e-02,  7.39602940e-03,  9.42530138e-03,
       -1.7317053

In [15]:
# Check if preprocess_funcs.get_vocabulary_embeddings() functions correctly -> if it retrieves the correct word embeddings => YES!
# NOTE: the word IDs correspond to the indices of the words in `words` 
    # -> word IDs are created using a Dictionary in the make_wikicorpus script by assigning IDs to words sequentially 
    # based on their frequency in the corpus, with the most frequent words assigned lower IDs.

# Get the first word in the vocabulary
w = 'league'
print(f"Is {w} in `VOCAB`? --> {w in VOCAB.values()}")
print(f"Is {w} in `words`? --> {w in words}")

# Find the index of 'league' in words
idx_words = words.index(w)
print(f"Index of {w} in `words`: {idx_words}")
print(f"Is the ID of {w} {idx_words}? --> {VOCAB[idx_words]==w}")

# Check that word ids are linked to their frequency
# Get the smallest ID in the vocabulary
lowestID = min(VOCAB.keys())
highestID = max(VOCAB.keys())
print(f"Lowest ID in VOCAB: {lowestID}")
print(f"Highest ID in VOCAB: {highestID}")
print(f"Word with ID {lowestID}: {VOCAB[lowestID]}")
print(f"Word with ID {highestID}: {VOCAB[highestID]}")
print(f"Word with ID 100000: {words[100000-1]}")

Is league in `VOCAB`? --> True
Is league in `words`? --> True
Index of league in `words`: 3576
Is the ID of league 3576? --> True
Lowest ID in VOCAB: 0
Highest ID in VOCAB: 94948
Word with ID 0: ability
Word with ID 94948: spacewatch
Word with ID 100000: outshut


In [16]:
# Check if projection.u gives the same vectors as the ones I derived using the transpose of get_topics()
data = np.load("data/sample_word_vects.npz", allow_pickle=True)
IDS = data['ids']
WORDS = data['words']
VECTORS = data['vectors']

w_index = list(WORDS).index(w)
project_vect = VECTORS[w_index] 
my_vect = vocab_embeddings[w]
# print(my_vect)
# print(project_vect)
print(f"Vector for '{w}' is the same: {np.array_equal(np.round(project_vect, 12), np.round(my_vect, 12))}") # becomes false when rounding at the 13th decimal (probably due to numerical imprecisions)

Vector for 'league' is the same: True


In [17]:
lsa_model = LsiModel.load("models/wiki_lsi_model.model")

In [18]:
# Get the projection for word `w` using directly lsa_model.projection.u
w_id = idx_words
lsa_project_vect = lsa_model.projection.u[w_id]
print(f"Vector for '{w}' is the same: {np.array_equal(lsa_project_vect, my_vect)}")

Vector for 'league' is the same: True


In [19]:
lsa_model.__getitem__([(12,3),(1,5)])

[(0, 0.08806341287764444),
 (1, 0.01815538680828466),
 (2, -0.008926209632374088),
 (3, 0.0555912367028911),
 (4, 0.019992661473791463),
 (5, 0.01642402852437277),
 (6, -0.0031852063219334897),
 (7, -0.0016850919281656447),
 (8, 0.033444480007969055),
 (9, 0.027051612188574957),
 (10, 0.028194604997158822),
 (11, 0.019323017932398094),
 (12, -0.0064266648245496),
 (13, -0.007861994907800294),
 (14, 0.04606600388724229),
 (15, 0.03238852821554497),
 (16, 0.06421519762178574),
 (17, 0.040263682338684145),
 (18, 0.025743359658857545),
 (19, 0.03934080178604196),
 (20, 0.018455492534609368),
 (21, 0.07698093633037978),
 (22, 0.05004819148220726),
 (23, 0.017250142629727645),
 (24, -0.016076452501218915),
 (25, 0.03302820629310072),
 (26, 0.02939604279216931),
 (27, 0.00017768059953261056),
 (28, 0.03869775265275791),
 (29, 0.042610936715893305),
 (30, -0.028842948919049392),
 (31, -0.018160584743490124),
 (32, -0.008634428391806073),
 (33, 0.01604352300830321),
 (34, 0.012461661378020376),