In [1]:
import pathlib
import numpy as np
import re
from sklearn.utils.extmath import randomized_svd
from utils import tokenize, build_vocabulary, get_word_to_id_from_vocab, get_id_to_word, build_corpus, create_cooccurrence_matrix

In [2]:
datasets_folder=pathlib.Path(r"C:\Users\amrul\programming\deep_learning\dl_projects\nlp_with_dl_from_scratch\datasets")
ptb_filename="ptb.train.txt"
text = (datasets_folder/ptb_filename).read_text()

In [3]:
from utils import tokenize

words = tokenize(text,special_words=["<unk>"])
print(f"total of {len(words):,} words")

total of 937,128 words


In [4]:
from utils import build_vocabulary

vocab = build_vocabulary(words)
print(f"vocab has {len(vocab):,} words")

vocab has 9,654 words


In [5]:
word_to_id = get_word_to_id_from_vocab(list(vocab))
print(word_to_id)



In [6]:
id_to_word=get_id_to_word(word_to_id)
print(id_to_word)



In [7]:
corpus=build_corpus(words,word_to_id)
print(corpus[:100])

[3038, 7577, 2333, 6256, 4023, 4249, 5350, 2774, 5629, 5229, 7876, 7337, 6919, 1258, 9498, 7369, 8195, 3085, 4256, 3346, 4385, 7134, 4948, 7876, 5538, 2200, 3806, 462, 1237, 9490, 9273, 2131, 1645, 3358, 8447, 329, 626, 1009, 8464, 1873, 8986, 5474, 6314, 9273, 7951, 6314, 9490, 7453, 7661, 8490, 9490, 7192, 6314, 6341, 6314, 329, 6786, 5182, 6782, 3479, 9490, 9273, 2131, 1645, 1119, 7647, 7661, 8490, 4034, 2297, 7338, 6626, 8807, 4366, 8464, 1873, 8986, 8490, 928, 2025, 5997, 19, 8464, 4649, 8490, 7008, 905, 4924, 6096, 3250, 1019, 8710, 7841, 2019, 4396, 8464, 960, 3092, 8490, 4411]


In [8]:
C = create_cooccurrence_matrix(corpus,window_size=1,vocab_size=len(vocab))
print(f"cooccurrence shape :  {C.shape}")

cooccurrence shape :  (9654, 9654)


In [9]:
from utils import build_ppmi

ppmi = build_ppmi(C)

print(f"ppmi shape : {ppmi.shape}")

PPMI building: 100%|██████████| 93199716/93199716 [04:43<00:00, 328724.64it/s]

ppmi shape : (9654, 9654)





In [10]:
U,S,V = randomized_svd(ppmi,n_components=100, n_iter= 5, random_state=None)

In [11]:
def cosine_similarity(u,v):
    return u@v/(np.linalg.norm(u)*np.linalg.norm(v))

In [12]:
print(f"similarity between car and drive : {cosine_similarity(U[word_to_id['car']],U[word_to_id['drive']])}")
print(f"similarity between car and the : {cosine_similarity(U[word_to_id['car']],U[word_to_id['the']])}")
print(f"similarity between car and street : {cosine_similarity(U[word_to_id['car']],U[word_to_id['street']])}")
print(f"similarity between man and woman : {cosine_similarity(U[word_to_id['man']],U[word_to_id['woman']])}")
print(f"similarity between man and table : {cosine_similarity(U[word_to_id['man']],U[word_to_id['table']])}")
print(f"similarity between man and tree : {cosine_similarity(U[word_to_id['man']],U[word_to_id['tree']])}")
print(f"similarity between man and tree : {cosine_similarity(U[word_to_id['man']],U[word_to_id['tree']])}")
print(f"similarity between investment and portfolio : {cosine_similarity(U[word_to_id['investment']],U[word_to_id['portfolio']])}")
print(f"similarity between investment and drink : {cosine_similarity(U[word_to_id['investment']],U[word_to_id['drink']])}")

similarity between car and drive : 0.25078192353248596
similarity between car and the : 0.08148854970932007
similarity between car and street : -0.11724572628736496
similarity between man and woman : 0.5597501993179321
similarity between man and table : 0.22414074838161469
similarity between man and tree : -0.07137893885374069
similarity between man and tree : -0.07137893885374069
similarity between investment and portfolio : 0.4311946630477905
similarity between investment and drink : -0.02336348220705986


In [13]:
def most_similar(word, U:np.ndarray, word_to_id:dict, id_to_word:dict, n:int=100):
    if word not in word_to_id:
        raise Exception(f"{word} is not in the vocabulary")
    similarities = np.array([cosine_similarity(U[word_to_id[word]],word_rep) for i,word_rep in enumerate(U) if i!=word_to_id[word]])
    most_similar_word_ids = np.argsort(-1*similarities)
    for i in range(n):
        print(f"{i} :{id_to_word[most_similar_word_ids[i]]} (score : {similarities[most_similar_word_ids[i]]:.3f})")
    return most_similar_word_ids[:n]



In [19]:
ret = most_similar("cycle", U, word_to_id, id_to_word, n=10)

0 :machine (score : 0.463)
1 :boom (score : 0.451)
2 :carpet (score : 0.425)
3 :steering (score : 0.415)
4 :sister (score : 0.414)
5 :woes (score : 0.406)
6 :alley (score : 0.398)
7 :tremendous (score : 0.395)
8 :fiber (score : 0.393)
9 :rhetoric (score : 0.393)
