In [1]:
# Imports used throughout the notebook
import numpy as np
import pandas as pd  # NOTE(review): not referenced in any cell visible here - confirm it is still needed
import nltk  # NOTE(review): not referenced in any cell visible here - confirm it is still needed
import pickle
from package import get_dict, cosine_similarity  # presumably a project-local module; supplies the dictionary loader and the similarity measure

print("Modules import successfully")

Modules import successfully


In [2]:
# Load the pre-computed word embeddings (word -> vector mappings).
# SECURITY: pickle.load can execute arbitrary code, so only load files from a trusted source.
# The `with` blocks close each file handle as soon as it has been read.
with open('en_embeddings.p', 'rb') as en_file:
    en_embeddings = pickle.load(en_file) #english word embeddings
with open('fr_embeddings.p', 'rb') as fr_file:
    fr_embeddings = pickle.load(fr_file) #french word embeddings

In [3]:
# Report how many words each embedding table contains
print("No. of English Words: ", len(en_embeddings)) #number of entries in the English embeddings
print("No. of French Words: ", len(fr_embeddings)) #number of entries in the French embeddings

No. of English Words:  6370
No. of French Words:  5766


In [4]:
# Peek at the first ten keys of each embedding table (keys are the words themselves)
print("10 English Words: ",list(en_embeddings.keys())[:10]) #first 10 English words
print("10 French Words: ", list(fr_embeddings.keys())[:10]) #first 10 French words

10 English Words:  ['the', 'was', 'for', 'that', 'with', 'from', 'this', 'utc', 'his', 'not']
10 French Words:  ['la', 'était', 'pour', 'cela', 'avec', 'depuis', 'ce', 'tuc', 'son', 'pas']


In [5]:
# Load the English -> French word pairs; get_matrices later iterates these with .items(),
# so get_dict is expected to return a mapping of English word -> French word
en_fr_train = get_dict("en-fr.train.txt") #English-to-French training dictionary
en_fr_test = get_dict("en-fr.test.txt") #English-to-French test dictionary

In [6]:
# Report the number of word pairs in each dictionary
print("Training: ", len(en_fr_train)) #number of entries in the training dictionary
print("Test: ", len(en_fr_test)) #number of entries in the test dictionary

Training:  5000
Test:  1500


In [7]:
def get_matrices(en_fr, english_vecs, french_vecs):
    """Build row-aligned embedding matrices from a bilingual dictionary.

    Args:
        en_fr: mapping of English word -> French word.
        english_vecs: mapping of English word -> embedding vector.
        french_vecs: mapping of French word -> embedding vector.

    Returns:
        (X, Y): two matrices built with np.vstack. Row i of X is the English
        embedding and row i of Y is the French embedding of the same
        dictionary entry. Entries where either word has no embedding are
        skipped.

    Raises:
        ValueError: raised by np.vstack when no dictionary entry has
            embeddings on both sides.
    """
    # Keep only the pairs for which both words have an embedding
    aligned_pairs = [
        (english_vecs[source_word], french_vecs[target_word])
        for source_word, target_word in en_fr.items()
        if source_word in english_vecs and target_word in french_vecs
    ]

    # Stack each side separately so that the row order stays in sync
    X = np.vstack([source_vec for source_vec, _ in aligned_pairs])
    Y = np.vstack([target_vec for _, target_vec in aligned_pairs])

    return X, Y

In [8]:
# Build the aligned training matrices: row i of X_train (English) pairs with row i of Y_train (French)
X_train, Y_train = get_matrices(en_fr_train, en_embeddings, fr_embeddings) #training matrices for English and French words

In [9]:
def compute_loss(X, Y, R):
    """Return the average squared error of mapping X onto Y through R.

    Computes sum((XR - Y)**2) / m, i.e. the squared Frobenius norm of the
    residual divided by m, the number of rows of X. Dividing by m keeps the
    value comparable across datasets of different sizes.

    Args:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings.
        R: transformation matrix applied to X.

    Returns:
        The scalar loss.
    """
    residual = np.dot(X, R) - Y  # XR - Y
    squared_error_total = np.sum(residual ** 2)  # sum of element-wise squares
    return squared_error_total / X.shape[0]

In [10]:
def compute_gradient(X, Y, R):
    """Return the gradient of the loss with respect to R.

    Evaluates (2 / m) * X^T (XR - Y), where m is the number of rows of X.

    Args:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings.
        R: current transformation matrix.

    Returns:
        A matrix with the same shape as R.
    """
    row_count = X.shape[0]
    residual = np.dot(X, R) - Y  # XR - Y
    return np.dot(X.T, residual) * 2 / row_count

In [11]:
def align_embeddings(X, Y, steps=100, learning_rate=0.0003):
    """Learn a transformation matrix R with plain gradient descent.

    R is a square matrix sized by the number of columns of X, initialised
    with uniform random values and updated `steps` times using the gradient
    returned by compute_gradient. The loss is printed every 25th iteration.

    Note: this reseeds NumPy's global random generator with 11 on every call,
    so the initial R is the same on each run.

    Args:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings.
        steps: number of gradient-descent updates.
        learning_rate: step size applied to each gradient.

    Returns:
        The learned transformation matrix R.
    """
    np.random.seed(11)

    dim = X.shape[1]
    R = np.random.rand(dim, dim)  # random starting point for R

    for i in range(steps):
        should_report = (i % 25 == 0)
        if should_report:
            current_loss = compute_loss(X, Y, R)
            print(f"loss at iteration {i} is: {current_loss:.4f}")

        # Step against the gradient
        R -= learning_rate * compute_gradient(X, Y, R)

    return R

In [12]:
#validating our functions on small random matrices (10 rows, 5 columns)
np.random.seed(11) #seed so that X and Y are reproducible
m = 10 #number of rows
n = 5 #number of columns
X = np.random.rand(m, n)
Y = np.random.rand(m, n)
R = align_embeddings(X, Y, steps=200) #prints the loss every 25 iterations; it should decrease

loss at iteration 0 is: 3.4632
loss at iteration 25 is: 3.3618
loss at iteration 50 is: 3.2640
loss at iteration 75 is: 3.1697
loss at iteration 100 is: 3.0789
loss at iteration 125 is: 2.9914
loss at iteration 150 is: 2.9070
loss at iteration 175 is: 2.8257


In [13]:
def nearest_neighbor(v, candidates, k=1):
    """Return the indices of the k candidates most similar to v.

    Similarity is measured with cosine_similarity(v, candidate).

    Args:
        v: the query vector.
        candidates: iterable of candidate vectors (e.g. the rows of a matrix).
        k: how many neighbours to return. Values <= 0 yield an empty result.

    Returns:
        An array of indices into `candidates`, ordered by ascending
        similarity, so the best match is the LAST element. If k exceeds the
        number of candidates, all indices are returned.
    """
    # One similarity score per candidate, in candidate order
    similarity_l = [cosine_similarity(v, c) for c in candidates]

    # argsort is ascending: the most similar candidates end up at the tail
    sorted_ids = np.argsort(similarity_l)

    if k <= 0:
        # Guard: sorted_ids[-0:] is sorted_ids[0:], which would return every
        # index instead of none (and a negative k would slice from the front).
        return sorted_ids[:0]

    return sorted_ids[-k:]

In [14]:
#validating our nearest neighbor function
v = np.array([1, 0, 1]) #query vector
candidates = np.array([[1, 0, 5], [-2, 5, 3], [2, 0, 1], [6, -9, 5], [9, 9, 9]]) 
print(candidates[nearest_neighbor(v, candidates, 3)]) #3 most similar candidates, in ascending order of similarity (best match last)

[[9 9 9]
 [1 0 5]
 [2 0 1]]


In [15]:
def test_vocabulary(X, Y, R):
    """Measure how often the transformed row i of X lands nearest to row i of Y.

    Each row of X is mapped through R, and its nearest neighbour among the
    rows of Y is looked up with nearest_neighbor. A prediction counts as
    correct when the returned index equals the row's own index.

    Args:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings, row-aligned with X.
        R: transformation matrix applied to X.

    Returns:
        The fraction of rows predicted correctly.
    """
    projected = np.dot(X, R)  # predicted target-space embeddings
    total = len(projected)

    hits = 0
    for row_id, vector in enumerate(projected):
        # nearest_neighbor returns a one-element index array for the default k
        if nearest_neighbor(vector, Y) == row_id:
            hits += 1

    return hits / total

In [16]:
# Build the aligned evaluation matrices from the test dictionary (named *_val here)
X_val, Y_val = get_matrices(en_fr_test, en_embeddings, fr_embeddings) #matrices for our test data

In [17]:
# Fit the transformation matrix on the training pairs; the loss is printed every 25 iterations
R_train = align_embeddings(X_train, Y_train, steps=400, learning_rate=0.8) #getting the transformation matrix

loss at iteration 0 is: 969.5962
loss at iteration 25 is: 97.7116
loss at iteration 50 is: 26.8955
loss at iteration 75 is: 9.8601
loss at iteration 100 is: 4.4277
loss at iteration 125 is: 2.3598
loss at iteration 150 is: 1.4674
loss at iteration 175 is: 1.0456
loss at iteration 200 is: 0.8324
loss at iteration 225 is: 0.7190
loss at iteration 250 is: 0.6562
loss at iteration 275 is: 0.6203
loss at iteration 300 is: 0.5992
loss at iteration 325 is: 0.5865
loss at iteration 350 is: 0.5787
loss at iteration 375 is: 0.5738


In [18]:
# Evaluate the learned transformation on the held-out test pairs
acc = test_vocabulary(X_val, Y_val, R_train) #accuracy of R_train on the test matrices
print(f"accuracy on test set is {acc:.3f}") 

accuracy on test set is 0.560


In [19]:
#Trial: a tiny hand-written English -> French dictionary
# NOTE(review): "son" looks like a typo for "sont" (the French for "are") - confirm the intended
# word and that it exists in fr_embeddings before changing it, since pairs without embeddings are skipped.
dict_trial = {
    "how":"comment",
    "are":"son",
    "you":"toi"
}

In [20]:
#trial matrices: aligned English/French embeddings for the entries of dict_trial
X_trial, Y_trial = get_matrices(dict_trial, en_embeddings, fr_embeddings)

In [21]:
#trial transformation matrix, fitted on the trial pairs only
R_trial = align_embeddings(X_trial, Y_trial, steps = 300, learning_rate=0.006)

loss at iteration 0 is: 306.3372
loss at iteration 25 is: 71.9307
loss at iteration 50 is: 17.1479
loss at iteration 75 is: 4.1356
loss at iteration 100 is: 1.0060
loss at iteration 125 is: 0.2463
loss at iteration 150 is: 0.0606
loss at iteration 175 is: 0.0149
loss at iteration 200 is: 0.0037
loss at iteration 225 is: 0.0009
loss at iteration 250 is: 0.0002
loss at iteration 275 is: 0.0001


In [22]:
#accuracy on trial
# Caveat: this evaluates on the same pairs that R_trial was fitted on, so it is not a held-out score
acc_trial = test_vocabulary(X_trial, Y_trial, R_trial)

In [23]:
# Show the trial accuracy (measured on the pairs used for fitting)
print("accuracy on trial data: ",acc_trial)

accuracy on trial data:  1.0
