# Natural Language Processing with classification and vector spaces

# Machine Translation System

# English to French words

In [48]:
import numpy as np
import pickle
import pandas as pd

The data

In [49]:
#from google.colab import files
#uploaded = files.upload()

In [50]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load embedding files that come from a trusted source.
# The `with` blocks close each file handle as soon as its object is loaded.
with open('en_embeddings.p', 'rb') as en_file:
    en_embeddings = pickle.load(en_file)
with open('fr_embeddings.p', 'rb') as fr_file:
    fr_embeddings = pickle.load(fr_file)

Load the two dictionaries that map English words to French words

In [51]:
#from google.colab import files
#uploaded = files.upload()

Build a function that returns the English-to-French dictionary, given a file in which each column corresponds to a word (first column English, second column French).




If an English word is repeated in the file, the last mapping encountered is kept.

In [52]:
def get_dict(file_name):
    """Build an English -> French dictionary from a space-delimited word-pair file.

    The file is parsed with ``pd.read_csv(file_name, delimiter=' ')``. With
    pandas' default header handling, the first line of the file is consumed as
    column names and therefore does not contribute a word pair.

    Args:
        file_name: path (or any input accepted by ``pd.read_csv``) of a file
            whose first column holds English words and whose second column
            holds their French translations.

    Returns:
        dict mapping each English word to a French word. When an English word
        appears on several lines, the last occurrence wins.
    """
    pairs = pd.read_csv(file_name, delimiter=' ')
    # Select the columns by position with .iloc. Indexing a label-indexed row
    # with an integer key (`row[0]`) only works through a positional fallback
    # that pandas has deprecated.
    en_words = pairs.iloc[:, 0]
    fr_words = pairs.iloc[:, 1]
    # dict() keeps the last value seen for a repeated key, matching the
    # "final mapping wins" rule, and avoids a slow row-by-row .loc loop.
    return dict(zip(en_words, fr_words))

In [53]:
# Build the training and test English -> French dictionaries from the
# space-delimited word-pair files.
en_fr_train = get_dict('en-fr.train.txt')
en_fr_test = get_dict('en-fr.test.txt')

In [54]:
# Report the number of English keys held by each dictionary.
print('The lenght of the English to French translation training dictionay : ', len(en_fr_train))
print('The lenght of the English to French translation testing dictionay : ', len(en_fr_test))

The lenght of the English to French translation training dictionay :  5000
The lenght of the English to French translation testing dictionay :  1500


Thus the training set contains 5000 word pairs and the test set contains 1500.

**Generate embeddings and the transformation matrices**

We will now implement a function get_matrices, which takes the loaded data and returns matrices X and Y.

In [55]:
def get_matrices(en_fr, fr_vectors, en_vectors):
  """Stack the embeddings of every translatable word pair into two matrices.

  Args:
    en_fr: dict mapping an English word to its French translation.
    fr_vectors: dict mapping a French word to its embedding vector.
    en_vectors: dict mapping an English word to its embedding vector.

  Returns:
    (X, Y): X stacks the English embeddings row by row and Y stacks the French
    embeddings of the matching translations, in the iteration order of
    ``en_fr``. Pairs where either word has no embedding are left out.
  """
  english_vocab = en_vectors.keys()
  french_vocab = fr_vectors.keys()

  # Keep only the pairs for which both words have an embedding.
  usable_pairs = [
      (en_vectors[en_word], fr_vectors[fr_word])
      for en_word, fr_word in en_fr.items()
      if fr_word in french_vocab and en_word in english_vocab
  ]

  X = np.vstack([en_vec for en_vec, _ in usable_pairs])
  Y = np.vstack([fr_vec for _, fr_vec in usable_pairs])
  return X, Y

Now we use get_matrices() to obtain X_train and Y_train, the matrices of English and French word embeddings for the training pairs.

In [56]:
# Rows of X_train are English embeddings; rows of Y_train are the embeddings
# of the matching French translations from the training dictionary.
X_train, Y_train = get_matrices(en_fr_train, fr_embeddings, en_embeddings)

**Implementing the Machine Translation Mechanism**

Computing the loss

In [57]:
def compute_loss(X, Y, R):
  """Squared error between X·R and Y, averaged over the rows of X.

  Args:
    X: matrix whose rows are the source vectors.
    Y: matrix whose rows are the target vectors.
    R: transformation matrix applied to X.

  Returns:
    The sum of all squared entries of (X·R - Y) divided by the number of rows
    of X.
  """
  residual = np.dot(X, R) - Y
  return np.square(residual).sum() / X.shape[0]

Compute the gradient

In [58]:
def compute_gradient(X, Y, R):
  """Gradient of the loss computed by compute_loss with respect to R.

  Args:
    X: matrix whose rows are the source vectors.
    Y: matrix whose rows are the target vectors.
    R: current transformation matrix.

  Returns:
    2 · Xᵀ · (X·R - Y) divided by the number of rows of X.
  """
  num_rows = X.shape[0]
  residual = np.dot(X, R) - Y
  projected = np.dot(X.T, residual)
  return (2 * projected) / num_rows

The Gradient Descent Algorithm

In [59]:
def align_embeddings(X, Y, training_steps=100, learning_rate = 0.0003):
  """Learn a matrix R such that X·R approximates Y, using gradient descent.

  R is a square matrix with one row and one column per column of X. It starts
  from a fixed pseudo-random initialisation (seed 129), so repeated calls with
  the same inputs give the same result. The current loss is printed every 25
  iterations.

  Args:
    X: matrix whose rows are the source vectors.
    Y: matrix whose rows are the target vectors.
    training_steps: number of gradient-descent updates to perform.
    learning_rate: step size applied to each gradient.

  Returns:
    The transformation matrix R after the final update.
  """
  # A private generator seeded with 129 yields the same initial R as
  # np.random.seed(129) followed by np.random.rand(...), but it does not reset
  # the global NumPy random state as a side effect of calling this function.
  rng = np.random.RandomState(129)
  R = rng.rand(X.shape[1], X.shape[1])

  for step in range(training_steps):
    if step % 25 == 0:
      print(f"loss at iteration {step} is: {compute_loss(X, Y, R):.4f}")
    R = R - learning_rate * compute_gradient(X, Y, R)

  return R

In [60]:
# Smoke test on small random data: 10 samples of dimension 5, with Y scaled
# down by a factor of 0.1. Uses the default number of steps and learning rate.
np.random.seed(129)
X = np.random.rand(10, 5)
Y = np.random.rand(10, 5)*.1
R=align_embeddings(X, Y)

loss at iteration 0 is: 3.7242
loss at iteration 25 is: 3.6283
loss at iteration 50 is: 3.5350
loss at iteration 75 is: 3.4442


**Calculate the transformation matrix R**

In [61]:
# Fit the English -> French transformation on the training pairs:
# 400 gradient-descent steps with a learning rate of 0.8.
R_train = align_embeddings(X_train, Y_train, training_steps=400, learning_rate=0.8)

loss at iteration 0 is: 963.0146
loss at iteration 25 is: 97.8292
loss at iteration 50 is: 26.8329
loss at iteration 75 is: 9.7893
loss at iteration 100 is: 4.3776
loss at iteration 125 is: 2.3281
loss at iteration 150 is: 1.4480
loss at iteration 175 is: 1.0338
loss at iteration 200 is: 0.8251
loss at iteration 225 is: 0.7145
loss at iteration 250 is: 0.6534
loss at iteration 275 is: 0.6185
loss at iteration 300 is: 0.5981
loss at iteration 325 is: 0.5858
loss at iteration 350 is: 0.5782
loss at iteration 375 is: 0.5735


**Search for the nearest neighbours**

In [62]:
def cosine_similarity(A, B):
  """Cosine of the angle between vectors A and B.

  Args:
    A: first vector.
    B: second vector.

  Returns:
    The dot product of A and B divided by the product of their norms.
  """
  return np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))

In [63]:
def nearest_neighbor(v, candidates, k=1):
  """Return the indices of the k candidates most cosine-similar to v.

  The similarities are computed for all candidates at once; this is the
  vectorised form of applying cosine_similarity(v, row) to every row.

  Args:
    v: query vector.
    candidates: 2-D array-like with one candidate vector per row.
    k: number of neighbours to return. Values larger than the number of
      candidates return every index; k <= 0 returns no index.

  Returns:
    1-D integer array of row indices into ``candidates``, ordered by
    increasing similarity (the best match is the last element).
  """
  v = np.asarray(v)
  candidates = np.asarray(candidates)

  # `ids[-0:]` would select every index, so "no neighbours" needs its own
  # branch; an empty candidate set has nothing to rank either.
  if k <= 0 or candidates.size == 0:
    return np.array([], dtype=np.intp)

  dots = np.dot(candidates, v)
  norms = np.linalg.norm(candidates, axis=1) * np.linalg.norm(v)

  # A zero-norm vector would give NaN, which argsort places last, i.e. it
  # would be ranked as the best match. Rank such candidates lowest instead.
  with np.errstate(divide='ignore', invalid='ignore'):
    similarities = np.where(norms > 0, dots / norms, -np.inf)

  sorted_ids = np.argsort(similarities)
  return sorted_ids[-k:]

In [64]:
# Sanity check of nearest_neighbor: fetch the 3 candidates most similar to v.
# The rows are printed in increasing order of similarity (best match last).
v = np.array([1, 0, 1])
candidates = np.array([[1, 0, 5], [-2, 5, 3], [2, 0, 1], [6, -9, 5], [9, 9, 9]])
print(candidates[nearest_neighbor(v, candidates, 3)])

[[9 9 9]
 [1 0 5]
 [2 0 1]]


In [65]:
# NOTE(review): this binds the np.array function itself (it is not called).
# v1 is not used anywhere else in the visible notebook, so this looks like an
# unfinished leftover cell. Confirm and remove.
v1 = np.array

**Test our function and determine the accuracy**

In [66]:
def test_vocabulary(X, Y, R):
  """Measure how often the transformed rows of X land nearest to their own row of Y.

  Args:
    X: matrix whose rows are the source vectors.
    Y: matrix whose rows are the target vectors; row i is the expected match
      for row i of X.
    R: transformation matrix applied to X.

  Returns:
    The fraction of rows i for which the nearest neighbour of (X·R)[i] among
    the rows of Y is row i.
  """
  projected = np.dot(X, R)
  total = len(projected)

  hits = 0
  for row, vector in enumerate(projected):
    # nearest_neighbor returns a one-element index array for the default k=1.
    if nearest_neighbor(vector, Y) == row:
      hits += 1

  return hits / total

In [67]:
# Build the evaluation matrices from the test dictionary, using the same
# embedding lookups as for the training matrices.
X_val, Y_val = get_matrices(en_fr_test, fr_embeddings, en_embeddings)

In [68]:
# Accuracy = fraction of test rows whose nearest French neighbour, after
# applying R_train, is the paired row of Y_val.
acc = test_vocabulary(X_val, Y_val, R_train)  # this might take a minute or two
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.557


**Test for few words**

In [69]:
#from google.colab import files
#uploaded = files.upload()

In [70]:
def get_eng(file_name):
    """Read the English words stored in the first column of a CSV file.

    The file is parsed with ``pd.read_csv(file_name, delimiter=',')``. With
    pandas' default header handling, the first line of the file is consumed as
    column names and is therefore not part of the result.

    Args:
        file_name: path (or any input accepted by ``pd.read_csv``) of a
            comma-delimited file whose first column holds English words.

    Returns:
        list of the values in the first column, in file order.
    """
    words_frame = pd.read_csv(file_name, delimiter=',')
    # Select the column by position with .iloc. Indexing a label-indexed row
    # with an integer key (`row[0]`) only works through a positional fallback
    # that pandas has deprecated.
    return words_frame.iloc[:, 0].tolist()

In [71]:
# Load the demo English words; the bare last expression displays the list.
en_fr_demo = get_eng('en_fr_demo2.csv')
en_fr_demo

['tools', 'dog']

In [72]:
def get_vec(en_fr_demo, en_vectors):
  """Stack the embeddings of the given English words into one matrix.

  Args:
    en_fr_demo: iterable of English words.
    en_vectors: dict mapping an English word to its embedding vector.

  Returns:
    Matrix with one row per word that has an embedding, in input order. Words
    missing from ``en_vectors`` are skipped, so the result can have fewer rows
    than there are input words.
  """
  vocabulary = en_vectors.keys()
  found_vectors = [en_vectors[word] for word in en_fr_demo if word in vocabulary]
  return np.vstack(found_vectors)

In [73]:
# Stack the embeddings of the demo words. get_vec skips words that have no
# embedding, so the row count can be smaller than len(en_fr_demo).
X_demo = get_vec(en_fr_demo, en_embeddings)
X_demo.shape

(2, 300)

In [74]:
def get_key(val):
  """Find the French word whose embedding equals ``val`` element-wise.

  Scans the module-level ``fr_embeddings`` dictionary in iteration order.

  Args:
    val: array compared against each stored embedding with ``==``.

  Returns:
    The first key whose embedding matches ``val`` in every element, or None
    when no embedding matches.
  """
  matches = (key for key, value in fr_embeddings.items() if (val == value).all())
  return next(matches, None)

In [75]:
# get_vec drops words that have no English embedding, so row i of X_demo does
# not necessarily belong to en_fr_demo[i]. Apply the same filter to the word
# list so that each word is paired with its own embedding row.
demo_words = [en_word for en_word in en_fr_demo if en_word in en_embeddings.keys()]

for en_word, en_vec in zip(demo_words, X_demo):
  # Project the English embedding into the French embedding space.
  pred = np.dot(en_vec, R_train)
  # Closest French training embedding to the projected vector.
  match_word_vec = Y_train[nearest_neighbor(pred, Y_train)]
  # Recover the French word that owns that embedding.
  word = get_key(match_word_vec)
  print(en_word, word)

tools outils
dog chien


Thus our model performs pretty well.