# Machine Translation and Locality Sensitive Hashing

In [1]:
#Import the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pdb
import pickle
import string
import time
import gensim
import nltk
import scipy
import sklearn
from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from os import getcwd

from utils import (cosine_similarity, get_dict, process_tweet)

## Word Embeddings fotr English and French Words

In [2]:
#Get the English and French subsets
en_embeddings_subset = pickle.load(open('en_embeddings.p', 'rb'))
fr_embeddings_subset = pickle.load(open('fr_embeddings.p', 'rb'))
print("Length of English embeddings subset dictionary: ", len(en_embeddings_subset))
print("Length of French embeddings subset dictionary: ", len(fr_embeddings_subset))

Length of English embeddings subset dictionary:  6370
Length of French embeddings subset dictionary:  5766


In [3]:
#Load the train and test dictionaries
en_fr_train = get_dict('en-fr.train.txt')
en_fr_test = get_dict('en-fr.test.txt')

print("Length of training dictionary: ", len(en_fr_train))
print("Length of testing dictionary: ", len(en_fr_test))

Length of training dictionary:  5000
Length of testing dictionary:  1500


In [4]:
#Generate the embedding and Transform matrix

#Define the function get_matrices to get the vector embedding for every English and French word
def get_matrices(en_fr, french_vecs, english_vecs):
    
    #Create lists for storing English and France Embeddings
    X_l = list()
    Y_l = list()
    
    #Get the set of English and French words
    english_set = english_vecs.keys()
    french_set = french_vecs.keys()
    
    #Store the fench words that are part of en-fr dictionary
    french_words = set(en_fr.values())
    
    #Iterate through all the english and french words in en_fr dictionary
    for en_word, fr_word in en_fr.items():
        
        #Check if both words have embeddings
        if fr_word in french_set and en_word in english_set:
            
            #Get the English and French embeddings
            en_vec = english_vecs[en_word]
            fr_vec = french_vecs[fr_word]
            
            #Add french and english words to their lists
            X_l.append(en_vec)
            Y_l.append(fr_vec)
            
    #Stack the vectors of X_l in matrix X, and Y_l to Y
    X = np.stack(X_l)
    Y = np.stack(Y_l)
    
    return X, Y

In [5]:
#Get the X_train and Y_train
X_train, Y_train = get_matrices(en_fr_train, fr_embeddings_subset, en_embeddings_subset)

## Translation

In [6]:
#Translation as linear transformation of embeddings (Using R matrix)
#Define a function to compute loss for R matrix
def compute_loss(X, Y, R):
    
    #Get the difference
    diff = np.dot(X, R) - Y
    
    #Square the differnce
    diff_squared = diff ** 2
    
    #Get the sum of squared difference 
    sum_diff_squared = np.sum(diff_squared)
    
    #Calculate the loss
    loss = sum_diff_squared / len(X)
    
    return loss

In [7]:
#Define a function to compute gradient loss wrt R
def compute_gradient(X, Y, R):
    
    gradient = np.dot(X.T, (np.dot(X, R) - Y)) * 2 / len(X)
    return gradient

In [8]:
#Finding optimal R with gradient descent
def align_embeddings(X, Y, train_steps = 100, learning_rate = 0.0003):
    np.random.seed(129)
    
    #Create random R
    R = np.random.rand(X.shape[1], X.shape[1])
    
    #Iterate over number of epochs
    for i in range(train_steps):
        if i % 25 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        gradient = compute_gradient(X, Y, R)
        
        #Update R
        R -= learning_rate * gradient
    
    return R

In [9]:
#Test the functions
np.random.seed(129)
m = 10
n = 5
X = np.random.rand(m, n)
Y = np.random.rand(m, n) * .1
R = align_embeddings(X, Y)

loss at iteration 0 is: 3.7242
loss at iteration 25 is: 3.6283
loss at iteration 50 is: 3.5350
loss at iteration 75 is: 3.4442


In [10]:
#Calculate R
R_train = align_embeddings(X_train, Y_train, train_steps=400, learning_rate=0.8)

loss at iteration 0 is: 963.0146
loss at iteration 25 is: 97.8292
loss at iteration 50 is: 26.8329
loss at iteration 75 is: 9.7893
loss at iteration 100 is: 4.3776
loss at iteration 125 is: 2.3281
loss at iteration 150 is: 1.4480
loss at iteration 175 is: 1.0338
loss at iteration 200 is: 0.8251
loss at iteration 225 is: 0.7145
loss at iteration 250 is: 0.6534
loss at iteration 275 is: 0.6185
loss at iteration 300 is: 0.5981
loss at iteration 325 is: 0.5858
loss at iteration 350 is: 0.5782
loss at iteration 375 is: 0.5735


In [11]:
#Define a function to find Nearest Neighbor
def nearest_neighbor(v, candidates, k = 1):
    #Define a list to store similarity
    similarity_l = []
    
    #For each candidate vector
    for row in candidates:
        
        #Get the cosine similarity
        cos_similarity = cosine_similarity(v, row)
        
        #Append the similarity to the list
        similarity_l.append(cos_similarity)
        
    #Sort the similarity list and get the indices
    sorted_ids = np.argsort(similarity_l)
    
    #Get the last k most similar vetors (descending)
    k_idx = sorted_ids[-k:]
    
    return k_idx

In [12]:
#Test the knn
v = np.array([1, 0, 1])
candidates = np.array([[1, 0, 5], [-2, 5, 3], [2, 0, 1], [6, -9, 5], [9, 9, 9]])
print(candidates[nearest_neighbor(v, candidates, 3)])

[[9 9 9]
 [1 0 5]
 [2 0 1]]


In [13]:
#Test the vocabulary
def test_vocabulary(X, Y, R):
    
    #Get the prediction
    pred = np.dot(X, R)
    
    #Get the counter of correct number of predictions
    num_correct = 0
    
    #Loop through each row in pred
    for i in range(len(pred)):
        
        #Get the nearest neighbor's index
        pred_idx = nearest_neighbor(pred[i], Y)
        
        #Increment counter if correct predicrtion
        if pred_idx == i:
            num_correct += 1
        
    #Calculate the accuracy
    accuracy = num_correct / len(pred)
    
    return accuracy
        

In [14]:
#Test
X_val, Y_val = get_matrices(en_fr_test, fr_embeddings_subset, en_embeddings_subset)

In [15]:
#Calculate the accuracy
acc = test_vocabulary(X_val, Y_val, R_train)  # this might take a minute or two
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.557


# LSH and Document Search

In [16]:
#Get the tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = positive_tweets + negative_tweets

In [17]:
#Get the documents embeddings

#Define a function to get the embedding of the whole document
def get_document_embedding(tweet, en_embeddings):
    
    #Create empty array of 300 dim to store document embedding
    document_embedding = np.zeros(300)
    
    #Get the processed tweet 
    processed_tweet = process_tweet(tweet)
    
    #Iterate over all the words and get the embeddings of each word and add to document embeddding
    for word in processed_tweet:
        if word in en_embeddings:
            document_embedding += en_embeddings[word]
    
    return document_embedding

In [18]:
#Test the function on a custom tweet
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
tweet_embedding = get_document_embedding(custom_tweet, en_embeddings_subset)
tweet_embedding[-5:]

array([-0.00268555, -0.15378189, -0.55761719, -0.07216644, -0.32263184])

In [19]:
#Define a function to get all the vectors of a document
def get_document_vecs(all_docs, en_embeddings):
    
    #Create a dictionary that has tweet index as key and corresponding document embedding as value
    ind2doc_dict = {}
    
    #Create a list to store alll the document vectors
    document_vectors_l = []
    
    #Iterate over all the documents
    for i, doc in enumerate(all_docs):
        
        #Get the embeddings of document
        doc_embedding = get_document_embedding(doc, en_embeddings_subset)
        
        #Save the doc embedding in dictionary
        ind2doc_dict[i] = doc_embedding
        
        #Appenf the doc_embedding in the list
        document_vectors_l.append(doc_embedding)
        
    #Convert the list to matrix
    document_vec_matrix = np.stack(document_vectors_l)
    
    return document_vec_matrix, ind2doc_dict

In [20]:
#Get all the embeddings for all tweets
document_vecs, ind2Tweet = get_document_vecs(all_tweets, en_embeddings_subset)

In [21]:
#Print the shape of list and dictionary
print(f"length of dictionary {len(ind2Tweet)}")
print(f"shape of document_vecs {document_vecs.shape}")

length of dictionary 10000
shape of document_vecs (10000, 300)


In [22]:
#Find the most similar tweet from the corpus using cosine similarity
my_tweet = 'i am very happy, I am learning'
process_tweet(my_tweet)
tweet_embedding = get_document_embedding(my_tweet, en_embeddings_subset)
idx = np.argmax(cosine_similarity(document_vecs, tweet_embedding))
print(all_tweets[idx])

@PetiteMistress DO IT! I want to start one for making small games, but I feel like I need to get a jump start before asking for support :(


# Finding the most similar tweets using LSH (Instead of searching in 10000 tweets, search only in similar regions)


In [23]:
#Get the number of dimensions and length of vector
N_VECS = len(all_tweets)       # This many vectors.
N_DIMS = len(ind2Tweet[1])     # Vector dimensionality.
print(f"Number of vectors is {N_VECS} and each has {N_DIMS} dimensions.")

Number of vectors is 10000 and each has 300 dimensions.


In [24]:
#Each plane has 2 bunckets, so n will have 2^n buckets, we need 16 vectors in each region, so will have 10 planes. (2 ^ 10 * 16)
N_PLANES = 10
# Number of times to repeat the hashing to improve the search.
N_UNIVERSE = 25

In [25]:
#Create the planes list
np.random.seed(0)
planes_l = [np.random.normal(size = (N_DIMS, N_PLANES)) for _ in range(N_UNIVERSE)]
len(planes_l)

25

In [26]:
#Define a function to get a hash value of a vector wrt multi planes
def hash_value_of_vector(v, planes):
    
    #Get the dot product of vector and plane
    dot_product = np.dot(v, planes)
    
    #Get the sign
    dot_product_sign = np.sign(dot_product)
    
    #Get the scalar sign
    h = dot_product_sign > 0
    
    #Convert 2d array to 1d
    h = np.squeeze(h)
    
    #Initialize the hash value to 0
    hash_value = 0
    
    #Get the number of planes
    n_planes = planes.shape[1]
    
    #Iterate over all planes and calculate the hash value
    for i in range(n_planes):
        hash_value += 2 **i * h[i]
    
    #Create value to integer
    hash_value = int(hash_value)
    
    return hash_value

In [27]:
#Test the function
np.random.seed(0)
idx = 0
planes = planes_l[idx]  # get one 'universe' of planes to test the function
vec = np.random.rand(1, 300)
print(f" The hash value for this vector,",
      f"and the set of planes at index {idx},",
      f"is {hash_value_of_vector(vec, planes)}")

 The hash value for this vector, and the set of planes at index 0, is 768


In [28]:
#Creating the hash buckets
def make_hash_table(vecs, planes):
    #Get the number of planes
    n_planes = planes.shape[1]
    
    #Get the number of buckets
    n_buckets = 2 ** n_planes
    
    #Create an empty hash table and id_table dictionary
    hash_table = {i:[] for i in range(n_buckets)}
    id_table = {i:[] for i in range(n_buckets)}
    
    #Iterate over each vector and place in proper bucket
    for i, v in enumerate(vecs):
        
        #Calculate the hash value
        hash_value = hash_value_of_vector(v, planes)
        
        #Append in the dictionary
        hash_table[hash_value].append(v)
        
        #Append the index in id_table
        id_table[hash_value].append(i)
        
    return hash_table, id_table

In [29]:
#Test the function
np.random.seed(0)
planes = planes_l[0]  # get one 'universe' of planes to test the function
vec = np.random.rand(1, 300)
tmp_hash_table, tmp_id_table = make_hash_table(document_vecs, planes)

print(f"The hash table at key 0 has {len(tmp_hash_table[0])} document vectors")
print(f"The id table at key 0 has {len(tmp_id_table[0])}")
print(f"The first 5 document indices stored at key 0 of are {tmp_id_table[0][0:5]}")

The hash table at key 0 has 1351 document vectors
The id table at key 0 has 1351
The first 5 document indices stored at key 0 of are [3, 8, 16, 18, 29]


In [30]:
#Create all the hash tables
hash_tables = []
id_tables = []

for universe_id in range(N_UNIVERSE):
    print('working on hash universe #:', universe_id)
    planes = planes_l[universe_id]
    hash_table, id_table = make_hash_table(document_vecs, planes)
    hash_tables.append(hash_table)
    id_tables.append(id_table)

working on hash universe #: 0
working on hash universe #: 1
working on hash universe #: 2
working on hash universe #: 3
working on hash universe #: 4
working on hash universe #: 5
working on hash universe #: 6
working on hash universe #: 7
working on hash universe #: 8
working on hash universe #: 9
working on hash universe #: 10
working on hash universe #: 11
working on hash universe #: 12
working on hash universe #: 13
working on hash universe #: 14
working on hash universe #: 15
working on hash universe #: 16
working on hash universe #: 17
working on hash universe #: 18
working on hash universe #: 19
working on hash universe #: 20
working on hash universe #: 21
working on hash universe #: 22
working on hash universe #: 23
working on hash universe #: 24


# Approximate KNN using LSH

In [31]:
#Define a function to get approximate knn
def approximate_knn(doc_id, v, planes_l, k = 1, num_universes_to_use = N_UNIVERSE):
    
    assert num_universes_to_use <= N_UNIVERSE
    
    #Create the list for vectors to be considered
    vecs_to_consider = list()
    
    #List of documents id
    ids_to_consider = list()
    
    #Set for ids to consider
    ids_to_consider_set = set()
    
    #Loop through universe of planes
    for universe_id in range(num_universes_to_use):
        
        #Get the set of planes from list
        planes = planes_l[universe_id]
        
        #Get the hash value for the vector for set of these planes
        hash_value = hash_value_of_vector(v, planes)
        
        #Get the hash table for this universe_id
        hash_table = hash_tables[universe_id]
        
        # get the list of document vectors for this hash table, where the key is the hash_value
        document_vectors_l = hash_table[hash_value]

        # get the id_table for this particular universe_id
        id_table = id_tables[universe_id]

        # get the subset of documents to consider as nearest neighbors from this id_table dictionary
        new_ids_to_consider = id_table[hash_value]

        ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###

        # remove the id of the document that we're searching
        if doc_id in new_ids_to_consider:
            new_ids_to_consider.remove(doc_id)
            print(f"removed doc_id {doc_id} of input vector from new_ids_to_search")

        # loop through the subset of document vectors to consider
        for i, new_id in enumerate(new_ids_to_consider):

            # if the document ID is not yet in the set ids_to_consider...
            if new_id not in ids_to_consider_set:
                # access document_vectors_l list at index i to get the embedding
                # then append it to the list of vectors to consider as possible nearest neighbors
                document_vector_at_i = document_vectors_l[i]
                vecs_to_consider.append(document_vector_at_i)

                # append the new_id (the index for the document) to the list of ids to consider
                ids_to_consider.append(new_id)

                # also add the new_id to the set of ids to consider
                # (use this to check if new_id is not already in the IDs to consider)
                ids_to_consider_set.add(new_id)

    # Now run k-NN on the smaller set of vecs-to-consider.
    print("Fast considering %d vecs" % len(vecs_to_consider))

    # convert the vecs to consider set to a list, then to a numpy array
    vecs_to_consider_arr = np.array(vecs_to_consider)

    # call nearest neighbors on the reduced list of candidate vectors
    nearest_neighbor_idx_l = nearest_neighbor(v, vecs_to_consider_arr, k=k)

    # Use the nearest neighbor index list as indices into the ids to consider
    # create a list of nearest neighbors by the document ids
    nearest_neighbor_ids = [ids_to_consider[idx]
                            for idx in nearest_neighbor_idx_l]

    return nearest_neighbor_ids


In [32]:
#document_vecs, ind2Tweet
doc_id = 0
doc_to_search = all_tweets[doc_id]
vec_to_search = document_vecs[doc_id]

In [33]:
#Test
nearest_neighbor_ids = approximate_knn(
    doc_id, vec_to_search, planes_l, k=3, num_universes_to_use=5)

removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
Fast considering 77 vecs


In [34]:
print(f"Nearest neighbors for document {doc_id}")
print(f"Document contents: {doc_to_search}")
print("")

for neighbor_id in nearest_neighbor_ids:
    print(f"Nearest neighbor at document id {neighbor_id}")
    print(f"document contents: {all_tweets[neighbor_id]}")

Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 2140
document contents: @PopsRamjet come one, every now and then is not so bad :)
Nearest neighbor at document id 701
document contents: With the top cutie of Bohol :) https://t.co/Jh7F6U46UB
Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)
