<a href="https://colab.research.google.com/github/rposhala/En-Fr_Translation-Toolbar_with_AutoCorrect-AutoFill/blob/development/En_Fr_Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle
import string
import gensim
import nltk
import numpy as np
from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [2]:
def get_dict(file_name):
    """
    This function returns the english to french dictionary given a file where the each column corresponds to a word.
    Check out the files this function takes in your workspace.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    etof = {}  # the english to french dictionary to be returned
    for i in range(len(my_file)):
        # indexing into the rows.
        en = my_file.loc[i][0]
        fr = my_file.loc[i][1]
        etof[en] = fr

    return etof


def cosine_similarity(A, B):
    '''
    Input: A: a numpy array which corresponds to a word vector
          B: A numpy array which corresponds to a word vector
    Output: cos: numerical number representing the cosine similarity between A and B.
    '''
    # you have to set this variable to the true label.
    cos = -10
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)

    return cos


<a name="1"></a>

# The word embeddings data for English and French words

Data: The full dataset for English embeddings is about 3.64 gigabytes, and the French embeddings are about 629 megabytes. 

* English embeddings are downloaded from Google code archive word2vec and unzipped
[GoogleNews-vectors-negative300.bin.gz](https://code.google.com/archive/p/word2vec/)

* and the French embeddings are downloaded from
[cross_lingual_text_classification](https://github.com/vjstark/crosslingual_text_classification).
 it done through the terminal, by typing (in one line)
    `curl -o ./wiki.multi.fr.vec https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec`

In [3]:
# I used my google drive manage the dataset
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir("drive/MyDrive/Colab Notebooks/NLP_Specialization")
!ls


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Course2-NLPProbModels  GoogleNews-vectors-negative300.bin
en_embeddings.p        GoogleNews-vectors-negative300.bin.gz
en-fr.test.txt	       NLP_with_Classification_and_Vector_Spaces
en-fr.train.txt        project.ipynb
fr_embeddings.p        wiki.multi.fr.vec


In [4]:

en_embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary = True)
fr_embeddings = KeyedVectors.load_word2vec_format('./wiki.multi.fr.vec')
print("English and French embeddings are loaded")

# loading the english to french dictionaries
en_fr_train = get_dict('./en-fr.train.txt')
print('The length of the english to french training dictionary is', len(en_fr_train))
en_fr_test = get_dict('./en-fr.test.txt')
print('The length of the english to french test dictionary is', len(en_fr_train))

english_set = set(en_embeddings.vocab)
french_set = set(fr_embeddings.vocab)
en_embeddings_subset = {}
fr_embeddings_subset = {}
french_words = set(en_fr_train.values())

for en_word in en_fr_train.keys():
    fr_word = en_fr_train[en_word]
    if fr_word in french_set and en_word in english_set:
        en_embeddings_subset[en_word] = en_embeddings[en_word]
        fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]


for en_word in en_fr_test.keys():
    fr_word = en_fr_test[en_word]
    if fr_word in french_set and en_word in english_set:
        en_embeddings_subset[en_word] = en_embeddings[en_word]
        fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]


pickle.dump( en_embeddings_subset, open( "./en_embeddings.p", "wb" ) )
pickle.dump( fr_embeddings_subset, open( "./fr_embeddings.p", "wb" ) )

English and French embeddings are loaded
The length of the english to french training dictionary is 5000
The length of the english to french test dictionary is 5000


In [5]:
en_embeddings_subset = pickle.load(open("./en_embeddings.p", "rb"))
fr_embeddings_subset = pickle.load(open("./fr_embeddings.p", "rb"))

In [6]:
def get_matrices(en_fr, french_vecs, english_vecs):
    """
    Input:
        en_fr: English to French dictionary
        french_vecs: French words to their corresponding word embeddings.
        english_vecs: English words to their corresponding word embeddings.
    Output: 
        X: a matrix where the columns are the English embeddings.
        Y: a matrix where the columns correspong to the French embeddings.
        R: the projection matrix that minimizes the F norm ||X R -Y||^2.
    """

    X_l = list()
    Y_l = list()

    english_set = set(english_vecs.keys())

    french_set = set(french_vecs.keys())
    french_words = set(en_fr.values())

    for en_word, fr_word in en_fr.items():
        if fr_word in french_set and en_word in english_set:
            en_vec = english_vecs[en_word]
            fr_vec = french_vecs[fr_word]
            X_l.append(en_vec)
            Y_l.append(fr_vec)
    X = np.stack(X_l)
    Y = np.stack(Y_l)

    return X, Y


In [7]:
# getting the training set:
X_train, Y_train = get_matrices(
    en_fr_train, fr_embeddings_subset, en_embeddings_subset)

In [8]:
def compute_loss(X, Y, R):
    '''
    Inputs: 
        X: a matrix of dimension (m,n) where the columns are the English embeddings.
        Y: a matrix of dimension (m,n) where the columns correspong to the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from English to French vector space embeddings.
    Outputs:
        L: a matrix of dimension (m,n) - the value of the loss function for given X, Y and R.
    '''
    m = len(X)
    # diff is XR - Y
    diff = np.dot(X, R) - Y
    diff_squared = np.square(diff)
    sum_diff_squared = np.sum(diff_squared)
    loss = sum_diff_squared/m

    return loss


In [9]:
def compute_gradient(X, Y, R):
    '''
    Inputs: 
        X: a matrix of dimension (m,n) where the columns are the English embeddings.
        Y: a matrix of dimension (m,n) where the columns correspong to the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from English to French vector space embeddings.
    Outputs:
        g: a scalar value - gradient of the loss function L for given X, Y and R.
    '''
    m = len(X)

    # gradient is X^T(XR - Y) * 2/m
    gradient = (2*np.dot(X.T, (np.dot(X, R) - Y)))/m

    return gradient


In [10]:
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    '''
    Inputs:
        X: a matrix of dimension (m,n) where the columns are the English embeddings.
        Y: a matrix of dimension (m,n) where the columns correspong to the French embeddings.
        train_steps: positive int - describes how many steps will gradient descent algorithm do.
        learning_rate: positive float - describes how big steps will  gradient descent algorithm do.
    Outputs:
        R: a matrix of dimension (n,n) - the projection matrix that minimizes the F norm ||X R -Y||^2
    '''
    np.random.seed(130)

    R = np.random.rand(X.shape[1], X.shape[1])

    for i in range(train_steps):
        if i % 20 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        gradient = compute_gradient(X, Y, R)
        R -= learning_rate*gradient

    return R


In [11]:
R_train = align_embeddings(X_train, Y_train, train_steps=400, learning_rate=0.8)

loss at iteration 0 is: 962.0391
loss at iteration 20 is: 133.0551
loss at iteration 40 is: 43.0979
loss at iteration 60 is: 17.4929
loss at iteration 80 is: 8.2717
loss at iteration 100 is: 4.4266
loss at iteration 120 is: 2.6432
loss at iteration 140 is: 1.7458
loss at iteration 160 is: 1.2646
loss at iteration 180 is: 0.9935
loss at iteration 200 is: 0.8346
loss at iteration 220 is: 0.7384
loss at iteration 240 is: 0.6787
loss at iteration 260 is: 0.6408
loss at iteration 280 is: 0.6163
loss at iteration 300 is: 0.6001
loss at iteration 320 is: 0.5893
loss at iteration 340 is: 0.5820
loss at iteration 360 is: 0.5769
loss at iteration 380 is: 0.5734


In [12]:
def nearest_neighbor(v, candidates, k=1):
    """
    Input:
      - v, the vector you are going find the nearest neighbor for
      - candidates: a set of vectors where we will find the neighbors
      - k: top k nearest neighbors to find
    Output:
      - k_idx: the indices of the top k closest vectors in sorted form
    """
    similarity_l = []

    for row in candidates:
        cos_similarity = cosine_similarity(v, row)
        similarity_l.append(cos_similarity)
    
    sorted_ids = np.argsort(similarity_l)
    k_idx = sorted_ids[len(sorted_ids)-k:]
    
    return k_idx


In [13]:
def test_vocabulary(X, Y, R):
    '''
    Input:
        X: a matrix where the columns are the English embeddings.
        Y: a matrix where the columns correspong to the French embeddings.
        R: the transform matrix which translates word embeddings from
        English to French word vector space.
    Output:
        accuracy: for the English to French capitals
    '''
    pred = np.dot(X, R)

    num_correct = 0

    for i in range(len(pred)):
        pred_idx = nearest_neighbor(pred[i], Y, 1)

        if pred_idx == i:
            num_correct += 1
    accuracy = num_correct/len(X)

    return accuracy


In [14]:
X_val, Y_val = get_matrices(en_fr_test, fr_embeddings_subset, en_embeddings_subset)

In [15]:
acc = test_vocabulary(X_val, Y_val, R_train)
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.552
