In [71]:
import os
import numpy as np
import nltk
import scipy
import sklearn
from utils import cosine_similarity, get_dict, process_tweet
import pickle
import string
from nltk.corpus import stopwords, twitter_samples


In [5]:
# Local folder that NLTK is allowed to search for corpora.
# Resolve it to an absolute path so the directory that is created and the
# directory that is registered with NLTK are the same one (the literal
# '/data' would point at the filesystem root instead of this folder).
data_path = os.path.abspath('data')
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError the second time the notebook is executed.
os.makedirs(data_path, exist_ok=True)
nltk.data.path.append(data_path)

In [7]:
# Download the NLTK 'stopwords' corpus; the recorded output below shows it
# being unpacked into the user's nltk_data folder.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Shahbaz
[nltk_data]     Akhtar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
# Download the NLTK 'twitter_samples' corpus; the recorded output below shows
# it being unpacked into the user's nltk_data folder.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to C:\Users\Shahbaz
[nltk_data]     Akhtar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.


True

In [11]:
# Load the pre-computed English and French word-embedding subsets.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling, so
# only load these .p files if they come from a source you trust.
# The `with` blocks guarantee the file handles are closed after loading
# (a bare open(...) passed to pickle.load is never closed explicitly).
with open('en_embeddings.p', 'rb') as en_file:
    en_embeddings_subset = pickle.load(en_file)
with open('fr_embeddings.p', 'rb') as fr_file:
    fr_embeddings_subset = pickle.load(fr_file)

In [47]:
# Summarise the loaded embeddings: vocabulary size per language and the
# length of one English vector (taken from the first key).
en_vocab_size = len(en_embeddings_subset.keys())
fr_vocab_size = len(fr_embeddings_subset.keys())
first_en_word = list(en_embeddings_subset.keys())[0]
embedding_dim = len(en_embeddings_subset[first_en_word])

print('English embeddings size: ', en_vocab_size)
print('French embeddings size:  ', fr_vocab_size)
print('Embedding dimensions:    ', embedding_dim)

English embeddings size:  6370
French embeddings size:   5766
Embedding dimensions:     300


#### Load dictionaries mapping English to French words

In [49]:
# Load the English -> French word pairs used to fit the transformation.
en_fr_train = get_dict('en-fr.train.txt')
print('EN - FR train size: ', len(en_fr_train))
# Load the held-out English -> French word pairs used for evaluation.
en_fr_test = get_dict('en-fr.test.txt')
print('EN - FR test size: ', len(en_fr_test))

EN - FR train size:  5000
EN - FR test size:  1500


#### Generate embedding matrices E, F

In [84]:
def get_matrices(en_fr, en_vecs, fr_vecs):
    """Build row-aligned English and French embedding matrices.

    Only word pairs for which BOTH the English word has a vector in
    ``en_vecs`` and the French word has a vector in ``fr_vecs`` are kept.

    Args:
        en_fr: mapping from an English word to its French translation.
        en_vecs: mapping from an English word to its embedding vector.
        fr_vecs: mapping from a French word to its embedding vector.

    Returns:
        A tuple ``(E, F, f_idx_word_map)`` where row ``i`` of ``E`` is the
        English vector and row ``i`` of ``F`` is the French vector of the
        i-th kept pair, and ``f_idx_word_map[i]`` is the French word that
        row ``i`` of ``F`` belongs to.

    Raises:
        ValueError: if no pair has embeddings on both sides (there is
            nothing to stack into a matrix).
    """
    e = []  # English vectors, one per kept pair
    f = []  # French vectors, aligned with `e`
    f_idx_word_map = []  # French word for each row of F

    en_set = en_vecs.keys()
    fr_set = fr_vecs.keys()

    for en_word, fr_word in en_fr.items():
        # Skip pairs that are missing an embedding on either side.
        if en_word in en_set and fr_word in fr_set:
            en_vec = en_vecs[en_word]
            fr_vec = fr_vecs[fr_word]

            f_idx_word_map.append(fr_word)
            e.append(en_vec)
            f.append(fr_vec)

    if not e:
        # np.vstack([]) would fail with an unhelpful message; say what is wrong.
        raise ValueError(
            'get_matrices: no word pair has both an English and a French embedding'
        )

    E = np.vstack(e)
    F = np.vstack(f)

    return E, F, f_idx_word_map

In [85]:
# Build the training matrices from the train dictionary. f_idx_word_map holds
# the French word for each row of Y_train and is used later by predict().
X_train, Y_train, f_idx_word_map = get_matrices(en_fr_train, en_embeddings_subset, fr_embeddings_subset)

#### Loss function: Modified Frobenius norm of the matrices E, F:
##### Frobenius norm: ||E*R - F||
##### Loss function: (1/m) * ||E*R - F||^2

In [54]:
def compute_loss(X, Y, R):
    """Return the sum of squared entries of (X.R - Y) divided by the row count of X.

    Args:
        X: matrix whose rows are mapped through R.
        Y: target matrix, same shape as np.dot(X, R).
        R: transformation matrix applied on the right of X.

    Returns:
        The scalar loss (1/m) * sum((X.R - Y)**2), with m = X.shape[0].
    """
    n_rows = X.shape[0]
    residual = np.dot(X, R) - Y
    return np.sum(np.square(residual)) / n_rows

##### Gradient of the loss function: (E^T)*(ER - F) * (2 / m)

In [55]:
def compute_gradient(X, Y, R):
    """Return the gradient of the loss with respect to R.

    Args:
        X: matrix whose rows are mapped through R.
        Y: target matrix, same shape as np.dot(X, R).
        R: transformation matrix applied on the right of X.

    Returns:
        X^T . (X.R - Y) scaled by 2/m, with m = X.shape[0].
    """
    n_rows = X.shape[0]
    residual = np.dot(X, R) - Y
    scale = 2 / n_rows
    return np.dot(X.T, residual) * scale

#### Perform Gradient Descent

In [57]:
def get_R(X, Y, train_steps = 100, learning_rate = 0.001, seed = 1927):
    """Fit the transformation matrix R by plain gradient descent.

    R is initialised from a standard normal distribution as a square matrix
    of side X.shape[1], then updated ``train_steps`` times with
    ``R -= learning_rate * gradient``. The loss is printed every 25 steps.

    Args:
        X: source matrix (one row per training example).
        Y: target matrix, row-aligned with X.
        train_steps: number of gradient-descent updates.
        learning_rate: step size applied to each gradient.
        seed: value passed to np.random.seed before initialising R. The
            default reproduces the previously hard-coded seed.

    Returns:
        The fitted matrix R.
    """
    # Seeding NumPy's global generator makes the initial R reproducible.
    np.random.seed(seed)

    R = np.random.randn(X.shape[1], X.shape[1])

    for i in range(train_steps):
        if i % 25 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        gradient = compute_gradient(X, Y, R)

        # In-place update is safe: R was created inside this function.
        R -= learning_rate * gradient
    return R

In [58]:
# Fit R on the training matrices (400 steps, learning rate 0.8); get_R prints
# the loss every 25 iterations, which is the log recorded below.
R_train = get_R(X_train, Y_train, train_steps=400, learning_rate=0.8)

loss at iteration 0 is: 2547.8574
loss at iteration 25 is: 303.0439
loss at iteration 50 is: 84.1080
loss at iteration 75 is: 29.6959
loss at iteration 100 is: 12.2631
loss at iteration 125 is: 5.7500
loss at iteration 150 is: 3.0422
loss at iteration 175 is: 1.8223
loss at iteration 200 is: 1.2364
loss at iteration 225 is: 0.9395
loss at iteration 250 is: 0.7821
loss at iteration 275 is: 0.6950
loss at iteration 300 is: 0.6452
loss at iteration 325 is: 0.6156
loss at iteration 350 is: 0.5977
loss at iteration 375 is: 0.5864


#### Search for translation embedding using k-NN
##### using cosine similarity

In [64]:
def knn(v, candidates, k = 1):
    """Return the indices of the k candidates most similar to v.

    Similarity is measured with ``cosine_similarity(v, row)`` for every row
    of ``candidates``.

    Args:
        v: query vector.
        candidates: iterable of candidate vectors (e.g. rows of a matrix).
        k: number of neighbours to return.

    Returns:
        An index array of length min(k, number of candidates), ordered by
        ASCENDING similarity, so the best match is the LAST element. An
        empty index array is returned when ``k <= 0``.
    """
    if k <= 0:
        # Without this guard, sorted_indices[-0:] is the whole array, so
        # asking for zero neighbours would return every candidate.
        return np.array([], dtype=np.intp)

    similarity_list = [cosine_similarity(v, row) for row in candidates]

    # argsort is ascending, so the k most similar candidates are the last k.
    sorted_indices = np.argsort(similarity_list)
    k_idx = sorted_indices[-k:]

    return k_idx

In [60]:
def test_vocabulary(X, Y, R):
    """Measure nearest-neighbour translation accuracy of R.

    Each row of X is mapped through R; the prediction for row i counts as
    correct when its nearest neighbour among the rows of Y (as returned by
    knn) is row i itself.

    Args:
        X: source matrix, one row per test word.
        Y: target matrix, row-aligned with X.
        R: transformation matrix applied on the right of X.

    Returns:
        Fraction of rows predicted correctly, or 0.0 when X has no rows.
    """
    pred = np.dot(X, R)
    total = len(pred)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    num_correct = 0
    for i in range(total):
        nearest = knn(pred[i], Y)

        # knn returns an index array; take the best match as a plain int
        # instead of relying on the truth value of an array comparison.
        if int(nearest[-1]) == i:
            num_correct += 1
    accuracy = num_correct / total
    return accuracy

#### Get accuracy

In [65]:
# get_matrices returns three values (E, F and the French index -> word list).
# Unpacking into only two names raises "too many values to unpack" on a fresh
# run. The word list gets its own name so the training-time f_idx_word_map
# used by predict() is not overwritten.
X_val, Y_val, f_idx_word_map_val = get_matrices(en_fr_test, en_embeddings_subset, fr_embeddings_subset)
acc = test_vocabulary(X_val, Y_val, R_train)
print(f'Accuracy is {acc:.3f}')

Accuracy is 0.560


In [121]:
def predict(sentence):
    """Translate an English sentence word by word using the learned mapping.

    Each whitespace-separated word is looked up in ``en_embeddings_subset``,
    mapped through ``R_train``, and replaced by the French word whose row in
    ``Y_train`` is the nearest neighbour (via ``knn``).

    Args:
        sentence: English text; it is split on whitespace.

    Returns:
        The translated words joined by single spaces, with no leading space.
        Words that have no English embedding are passed through unchanged
        instead of raising KeyError.
    """
    translated_words = []
    for word in sentence.split():
        if word not in en_embeddings_subset:
            # Out-of-vocabulary word: keep it as is rather than crashing.
            translated_words.append(word)
            continue

        embd = en_embeddings_subset[word]
        pred_embd_vec = np.dot(embd, R_train)
        pred_embd_idx = knn(pred_embd_vec, Y_train)
        # knn returns an index array; index it explicitly, since int() on a
        # 1-element array is deprecated in recent NumPy versions.
        translated_words.append(f_idx_word_map[int(pred_embd_idx[-1])])
    return ' '.join(translated_words)

# Demo: translate a short English sentence; the bare call on the last line
# makes the notebook display the returned string.
en_sentence = 'the cat was black'
predict(en_sentence)

' même chien était noirs'