In [1]:
import pathlib
import numpy as np
import re
from sklearn.utils.extmath import randomized_svd
from utils import tokenize, build_vocabulary, get_word_to_id_from_vocab, get_id_to_word, build_corpus, create_cooccurrence_matrix

In [2]:
datasets_folder=pathlib.Path(r"C:\Users\amrul\programming\deep_learning\dl_projects\nlp_with_dl_from_scratch\datasets")
ptb_filename="ptb.train.txt"
text = (datasets_folder/ptb_filename).read_text()

In [3]:
words = tokenize(text,special_words=["<unk>"])
print(f"total of {len(words):,} words")

total of 937,128 words


In [4]:
vocab = build_vocabulary(words)
print(f"vocab has {len(vocab):,} words")

vocab has 9,654 words


In [5]:
word_to_id = get_word_to_id_from_vocab(list(vocab))
print(word_to_id)



In [6]:
id_to_word=get_id_to_word(word_to_id)
print(id_to_word)



In [7]:
corpus=build_corpus(words,word_to_id)
print(corpus[:100])

[3419, 1093, 5526, 8124, 6361, 2399, 6283, 6234, 4586, 7233, 3949, 9190, 1083, 7602, 5638, 8060, 2870, 8174, 2028, 9184, 5169, 885, 69, 3949, 2306, 1302, 1763, 1596, 6120, 4941, 2667, 3320, 2349, 146, 4506, 5297, 55, 6519, 6529, 655, 9428, 4441, 4043, 2667, 6757, 4043, 4941, 3241, 5669, 1061, 4941, 280, 4043, 786, 4043, 5297, 9538, 324, 5723, 1727, 4941, 2667, 3320, 2349, 107, 8707, 5669, 1061, 2358, 6576, 8427, 9556, 828, 7621, 6529, 655, 9428, 1061, 3155, 1900, 3710, 2568, 6529, 7738, 1061, 9230, 6495, 5841, 7528, 9050, 5920, 7824, 736, 3514, 3738, 6529, 5560, 7795, 1061, 9482]


In [8]:
C = create_cooccurrence_matrix(corpus,window_size=1,vocab_size=len(vocab))
print(f"cooccurrence shape :  {C.shape}")

cooccurrence shape :  (9654, 9654)


In [9]:
from utils import build_ppmi

ppmi = build_ppmi(C)

print(f"ppmi shape : {ppmi.shape}")

PPMI building: 100%|██████████| 93199716/93199716 [03:00<00:00, 515136.68it/s]

ppmi shape : (9654, 9654)





In [10]:
U,S,V = randomized_svd(ppmi,n_components=100, n_iter= 5, random_state=None)

In [11]:
def cosine_similarity(u,v):
    return u@v/(np.linalg.norm(u)*np.linalg.norm(v))

In [12]:
print(f"similarity between car and drive : {cosine_similarity(U[word_to_id['car']],U[word_to_id['drive']])}")
print(f"similarity between car and the : {cosine_similarity(U[word_to_id['car']],U[word_to_id['the']])}")
print(f"similarity between car and street : {cosine_similarity(U[word_to_id['car']],U[word_to_id['street']])}")
print(f"similarity between man and woman : {cosine_similarity(U[word_to_id['man']],U[word_to_id['woman']])}")
print(f"similarity between man and table : {cosine_similarity(U[word_to_id['man']],U[word_to_id['table']])}")
print(f"similarity between man and tree : {cosine_similarity(U[word_to_id['man']],U[word_to_id['tree']])}")
print(f"similarity between man and tree : {cosine_similarity(U[word_to_id['man']],U[word_to_id['tree']])}")
print(f"similarity between investment and portfolio : {cosine_similarity(U[word_to_id['investment']],U[word_to_id['portfolio']])}")
print(f"similarity between investment and drink : {cosine_similarity(U[word_to_id['investment']],U[word_to_id['drink']])}")

similarity between car and drive : 0.3246625065803528
similarity between car and the : 0.08762232959270477
similarity between car and street : -0.08179930597543716
similarity between man and woman : 0.5836743712425232
similarity between man and table : 0.2704184651374817
similarity between man and tree : -0.08587023615837097
similarity between man and tree : -0.08587023615837097
similarity between investment and portfolio : 0.4323898255825043
similarity between investment and drink : 0.09997933357954025


In [14]:
def most_similar(word, U:np.ndarray, word_to_id:dict, id_to_word:dict, n:int=100):
    if word not in word_to_id:
        raise Exception(f"{word} is not in the vocabulary")
    similarities = np.array([cosine_similarity(U[word_to_id[word]],word_rep) for i,word_rep in enumerate(U) if i!=word_to_id[word]])
    most_similar_word_ids = np.argsort(-1*similarities)
    for i in range(n):
        print(f"{i} :{id_to_word[most_similar_word_ids[i]]} (score : {similarities[most_similar_word_ids[i]]:.3f})")
    return most_similar_word_ids[:n]



In [17]:
ret = most_similar("auto", U, word_to_id, id_to_word, n=10)

0 :pharmaceutical (score : 0.699)
1 :developing (score : 0.684)
2 :redeemed (score : 0.671)
3 :fate (score : 0.643)
4 :withheld (score : 0.607)
5 :fee (score : 0.606)
6 :dismissed (score : 0.600)
7 :bitterly (score : 0.580)
8 :refunding (score : 0.580)
9 :airline (score : 0.579)
