In [1]:
import pickle

import pdb
import string
import pandas as pd
import numpy as np
import nltk
import time
from nltk.corpus import stopwords, twitter_samples
from os import getcwd
from reference import preprocess_tweets

In [2]:
# add folder, tmp2, from our local workspace containing pre-downloaded corpora files to nltk's data path
filePath = f"{getcwd()}/tmp2/"
filePath

'/Users/pallavisingh/Documents/Data Science/NLP/Module 1/tmp2/'

In [3]:
nltk.data.path.append(filePath)

In [4]:

en_embeddings_subset = pickle.load(open("./data/en_embeddings.p", "rb"))
fr_embeddings_subset = pickle.load(open("./data/fr_embeddings.p", "rb"))

In [5]:
def get_dict(file_name):
    """
    This function returns the english to french dictionary given a file where the each column corresponds to a word.
    Check out the files this function takes in your workspace.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    etof = {}  # the english to french dictionary to be returned
    for i in range(len(my_file)):
        # indexing into the rows.
        en = my_file.loc[i][0]
        fr = my_file.loc[i][1]
        etof[en] = fr

    return etof

en_fr_train = get_dict('en-fr.train.txt')
en_fr_test = get_dict('en-fr.test.txt')

In [7]:

# loading the english to french dictionaries
en_fr_train = get_dict('./data/en-fr.train.txt')
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_dict('./data/en-fr.test.txt')
print('The length of the English to French test dictionary is', len(en_fr_test))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 1500


In [8]:
for key in en_fr_train.values():
    print(key)

la
et
était
pour
cela
avec
depuis
ce
tuc
son
pas
sont
parlez
lequel
egalement
étaient
mais
ont
one
nouveautés
premiers
page
you
eux
avais
article
who
all
leurs
là
fabriqué
son
personnes
peut
aprés
autres
devrais
deux
partition
her
peut
ferait
plus
elle
quand
heure
equipe
américains
telles
débat
liens
seule
quelques
vois
unies
ans
école
mondiale
universitaire
lors
out
état
états
nationales
wikipedia
année
most
villes
utilisée
puis
comté
externes
où
sera
quelle
effacer
ces
janvier
mars
août
juillet
être
film
lui
plusieurs
sud
septembre
aimez
entre
octobre
three
juin
bah
utilisez
war
under
eux
avril
born
decembre
lien
ultérieur
partie
novembre
joueurs
listes
svp
suivant
février
connu
seconde
noms
groupe
historique
séries
juste
nord
travailler
avant
puisque
saisons
both
élevé
via
district
maintenant
observations
parceque
football
musique
cependant
diff
century
ligue
modifications
débat
titre
articles
john
même
comprenant
pourraient
anglais
album
numéro
against
familles
usager
basé
domaine


In [9]:
def cosine_similarity(A, B):

    # you have to set this variable to the true label.
    cos = -10    
    dot = np.dot(A, B)
    normb = np.linalg.norm(B)
    
    if len(A.shape) == 1: # If A is just a vector, we get the norm
        norma = np.linalg.norm(A)
        cos = dot / (norma * normb)
    else: # If A is a matrix, then compute the norms of the word vectors of the matrix (norm of each row)
        norma = np.linalg.norm(A, axis=1)
        epsilon = 1.0e-9 # to avoid division by 0
        cos = dot / (norma * normb + epsilon)
        
    return cos

## Translating English dictionary to French by using embeddings

Input
1. en_fr dict
2. en_embeddings
3. fr_embeddings

Output

1. Matrix X and matrix Y, where each row in X is the word embedding for an english word, and the same row in Y is the word embedding for the French version of that English word.

In [10]:
def get_matrices(en_fr, french_words,english_words):
    
    X = list()
    Y = list()
    
    french_set = set(french_words.keys())
    english_set = set(english_words.keys())
  
    
    for en_word, fr_word in en_fr.items():
        if en_word in english_set and fr_word in french_set:
            X.append(english_words[en_word])
            Y.append(french_words[fr_word])
            
    X_a = np.array(X)
    Y_a = np.array(Y)
    
    return X_a, Y_a

In [11]:
X_train, Y_train = get_matrices(en_fr_train,en_embeddings_subset,fr_embeddings_subset)

## Translation

In [12]:
def compute_loss(X,Y,R):
    
    diff = np.dot(X,R)-Y
    diff_squared = diff**2
    
    sum_squared = np.sum(diff_squared)
    
    loss = sum_squared/len(X)
    
    return loss

In [13]:
def compute_gradient(X,Y,R):
    gradient = np.dot(X.T,np.dot(X,R)-Y)*2/len(X)
    return gradient

In [14]:
def align_embeddings(X,Y,train_steps = 100, learn_rate = 0.005, compute_loss=compute_loss, compute_gradient=compute_gradient):
    R = np.random.rand(X.shape[1],X.shape[1] )
    for i in range(train_steps):
        gradient = compute_gradient(X,Y,R)
        
        R -= learn_rate*gradient
        
    return R

In [15]:
R_train = align_embeddings(X_train,Y_train,train_steps = 300, learn_rate = 0.8)

In [16]:
R_train

array([[-0.03665053,  0.2885594 ,  0.53315315, ...,  0.28250543,
         0.24315463,  0.5138078 ],
       [-0.00088202, -0.21133661,  0.15371498, ...,  0.23855552,
        -0.14375687,  0.06113843],
       [ 0.2741176 ,  0.17067269,  0.3531597 , ...,  0.16648302,
         0.19292169,  0.07234712],
       ...,
       [ 0.3692297 ,  0.62006444,  0.39879522, ...,  0.39816625,
         0.33890249,  0.31121741],
       [-0.08746549,  0.09780947,  0.09971761, ..., -0.21381802,
         0.29498149,  0.06461497],
       [ 0.33801974,  0.30380701,  0.41168384, ...,  0.30064477,
         0.42490025, -0.12824767]])

In [17]:
def nearest_neighbor(v,candidates,k=1,cosine_similarity = cosine_similarity ):
    similarity = list()
    
    for i in candidates:

        cos_similarity = cosine_similarity(i,v)
        similarity.append(cos_similarity)
    
    sorted_list = np.argsort(similarity)
    # Reverse the order of the sorted_ids array
    sorted_ids = sorted_list[::-1]
    k_idx = sorted_ids[:k]
    return k_idx

In [18]:
v = np.array([1, 0, 1])
candidates = np.array([[1, 0, 5], [-2, 5, 3], [2, 0, 1], [6, -9, 5], [9, 9, 9]])
candidates[nearest_neighbor(v, candidates, 3)]


array([[2, 0, 1],
       [1, 0, 5],
       [9, 9, 9]])

In [19]:
def test_vocabulary(X,Y,R, nearest_neighbor=nearest_neighbor):
    prediction = np.dot(X,R)
    correct_predictions = 0
    for i in range(len(prediction)):
        actual = nearest_neighbor(prediction[i],Y,k=1,cosine_similarity = cosine_similarity)
        if actual == i:
            correct_predictions += 1
            
    accuracy = correct_predictions/len(prediction)
    
    return accuracy


In [20]:
accuracy = test_vocabulary(X_train,Y_train, R_train)
accuracy

0.5903307888040712

In [21]:
X_train.shape

(1179, 300)

In [22]:
Y_train.shape

(1179, 300)

In [23]:
R_train.shape

(300, 300)

In [24]:
# get the positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = all_positive_tweets + all_negative_tweets

In [25]:
en_embeddings_subset.keys()



In [26]:
def get_document_embedding(tweet, en_embeddings, preprocess_tweets = preprocess_tweets):
    preprocessed_tweet = preprocess_tweets(tweet)
    doc_embedding = np.zeros(300)
    for i in preprocessed_tweet:
        doc_embedding += en_embeddings.get(i,0)
        
    return doc_embedding 

In [27]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
tweet_embedding = get_document_embedding(custom_tweet, en_embeddings_subset)

In [28]:
tweet_embedding[-5:]

array([-0.00268555, -0.15378189, -0.55761719, -0.07216644, -0.32263184])

In [29]:
all_tweets

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing 

In [30]:
def get_document_vecs(all_docs,en_embeddings,get_document_embedding = get_document_embedding):
    document_vec_list = []
    document_indices ={}
    for i, doc in enumerate(all_docs):
        document_vec = get_document_embedding(doc,en_embeddings_subset)
        document_indices[i] = document_vec
        document_vec_list.append(document_vec)
        document_vec_matrix = np.vstack(document_vec_list)
    return document_vec_matrix, document_indices
        
        
        

In [31]:
 document_vec_matrix, document_indices = get_document_vecs(all_tweets, en_embeddings_subset)

In [32]:
my_tweet = 'i am happy'
preprocess_tweets(my_tweet)

tweet_embedding = get_document_embedding(my_tweet, en_embeddings_subset)
idx = np.argmax(cosine_similarity(document_vec_matrix,tweet_embedding))
print(all_tweets[idx])

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
