In [1]:
import os
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
from numpy.linalg import norm
from tqdm import tqdm

In [2]:
# Load enwik 8
file = open("enwik88.txt", "r")
doclist = [line for line in file]
docstr = ''.join(doclist)
sentences = re.split(r'[.!?]', docstr)
sentences = [sentence.split() for sentence in sentences if len(sentence) > 1]

In [3]:
def create_vocabulary(sentences, r=200):
    '''
    create a vocabulary with words that apears in the text more then 200 times
    '''
    vocabulary = {}
    num={}
    k=0
      
    for sentance in tqdm(sentences):
        for word in sentance:
            if word in num:
                num[word]=num[word]+1
            else:
                num[word]=1
    
    for sentance in tqdm(sentences):
        for word in sentance:
            if num[word]>r and not(word in vocabulary):
                vocabulary[word]=k
                k+=1
        
    return vocabulary


In [4]:
def create_corpus_matrix(sentences, vocabulary):
    """
    create a corpus matrix of cotnexts of 5-word window
    """
    len_vocab = len(vocabulary)
    corpus_matrix=np.zeros((len_vocab,len_vocab))
    for sentence in tqdm(sentences):
        tmp=len(sentence)-1
        for i in range (len(sentence)):
            w0=sentence[i]
            if w0 in vocabulary:
                for j in range(max(0,i-2),min(tmp,i+2)):
                    w = sentence[j]
                    if (w  in vocabulary)and (i!=j):
                        corpus_matrix[vocabulary[w0],vocabulary[w]]+=1
    return corpus_matrix

In [5]:
vocab = create_vocabulary(sentences, r=200)
D = create_corpus_matrix(sentences, vocab)

100%|██████████████████████████████████████████████████████████████████████| 889156/889156 [00:03<00:00, 247467.46it/s]
100%|██████████████████████████████████████████████████████████████████████| 889156/889156 [00:02<00:00, 354158.93it/s]
100%|███████████████████████████████████████████████████████████████████████| 889156/889156 [00:28<00:00, 30708.49it/s]


In [6]:
def compute_embeddings(D, k, d=200):
    """
    create embeddings matrix for word2vec representation
    """
    length = len(D)
    sum_D = D.sum()
    w = D.sum(axis=1)
    c = D.sum(axis=0)
    length2 = length**2 
    D = np.array(D)
    M = np.zeros((length,length))
    for i in tqdm(range(length)):
        for j in range(length):
            if D[i,j] != 0:
                M[i,j] = np.log((D[i,j]*sum_D)/(k*w[i]*c[j]))
                M[i,j] = max(0,M[i,j])
    M = csr_matrix(M)
    U,S,V = svds(M,d)
    embedding_matrix=U@np.diag(np.sqrt(S))
    return embedding_matrix

In [7]:
k = 5 # negative sampling parameter
W = compute_embeddings(D, k)

100%|█████████████████████████████████████████████████████████████████████████████| 5758/5758 [00:19<00:00, 289.33it/s]


In [8]:
class WordVectors:
    def __init__(self, vocabulary, embedding_matrix):
        self.vocab = vocabulary
        self.W = embedding_matrix
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        
    def word_vector(self, word):
        """ 
        Takes word and returns its word vector.
        """
        if word in self.vocab.keys():
            i = self.vocab[word]
            word_vector = self.W[i, :]
        else:
            raise Exception("There is no such word in the dictionary :)))")
        
        return word_vector
    
    def nearest_words(self, word, top_n=10):
        """ 
        Takes word from the vocabulary and returns its top_n
        nearest neighbors in terms of cosine similarity.
        """
        neighbors = []
        v1 = self.word_vector(word)
        sims = []
        tmp = []
        
        for w in self.vocab.items():
            if w[0] == word:continue
            else:
                v2 = self.word_vector(w[0])
                cosine = (v2 @ v1) / (norm(v1)*norm(v2))
                sims.append((w[0],round(cosine,3)))
                tmp.append(cosine)
        largest_n = np.argpartition(tmp, -top_n)[-top_n:]
        for i in largest_n:
            neighbors.append(sims[i])
        return neighbors

In [9]:
model = WordVectors(vocab, W)
model.nearest_words("opportunities")

[('advantages', 0.375),
 ('education', 0.382),
 ('assistance', 0.402),
 ('chance', 0.403),
 ('funding', 0.394),
 ('protection', 0.439),
 ('employment', 0.543),
 ('benefits', 0.445),
 ('advice', 0.526),
 ('opportunity', 0.629)]