
<h1 style="font-family:verdana;font-size:300%;text-align:center;background-color:#f2f2f2;color:#0d0d0d">AMMI NLP - Review sessions</h1>

<h1 style="font-family:verdana;font-size:180%;text-align:Center;color:#993333"> Lab 2: Introduction to wordvectors </h1>

**Big thanks to Amr Khalifa who improved this lab and made it to a Jupyter Notebook!**

In [1]:
import io, sys
import numpy as np



In [2]:
def load_vectors(filename):
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
    return data

In [3]:
# Loading word vectors

print('')
print(' ** Word vectors ** ')
print('')

'''
word_vectors is a dictionary that maps words to their numerical word vector
[word (string)] = [np-array]
'''
word_vectors = load_vectors('wiki.en.vec')

tree_vector = word_vectors['tree']
print(type(tree_vector), len(tree_vector))


 ** Word vectors ** 

<class 'numpy.ndarray'> 300


In [4]:
## This function computes the cosine similarity between vectors u and v

def cosine(u, v):
    '''
    Parameters:
    u : 1-D numpy array
    v : 1-D numpy array

    Returns:
    cos (float) : value of the cosine similairy between vectors u, v
    '''
    ## FILL CODE
    return (u@v)/((np.linalg.norm(u,ord=2))*(np.linalg.norm(v,ord=2)))


In [5]:
# compute similarity between words

print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [6]:
## Functions for nearest neighbor
## This function returns the word corresponding to
## nearest neighbor vector of x
## The list exclude_words can be used to exclude some
## words from the nearest neighbors search

def nearest_neighbor(x, word_vectors, exclude_words=[]):
    '''
    Parameters:
    x (string): word to find its nearest neighbour
    word_vectors (Python dict): {word (string): np-array of word vector}
    exclude_words (list of strings): words to be excluded from the search

    Returns:
    best_word (string) : the word whose word vector is the nearest neighbour
    to the word vector of x
    '''
    best_score = -1.0
    best_word = None
    for word in word_vectors.keys():
        if word in exclude_words:
            continue
        score = cosine(word_vectors[word], x)
        if score > best_score:
            best_score= score
            best_word = word
    return best_word

In [7]:
print('')
%timeit
print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words = ['cat', 'cats']))


The nearest neighbor of cat is: dog


#### Hint (using python priorty queues with the heapq datastructure):
if you don't want to store all the words and scores you can use the priortiy queue and only store the best K element so far.

In [8]:

## This function return the words corresponding to the
## K nearest neighbors of vector x.
## You can use the functions heappush and heappop.

def knn(x, word_vectors, k):
    '''
    Parameters:
    x (string): word to find its nearest neighbour
    word_vectors (Python dict): {word (string): np-array of word vector}
    k (int): number of nearest neighbours to be found

    Returns:
    k_nearest_neighbors (list of tuples): [(score, word), (score, word), ....]
    '''

    ## FILL CODE
    k_nearest_neighbors = []
    for word in word_vectors.keys():
        score = cosine(x, word_vectors[word])
        if len(k_nearest_neighbors)<k:
            k_nearest_neighbors.append( ( score, word ))
        else:
            smallest_index= 0
            for i, val in enumerate(k_nearest_neighbors):
                if val[0]<score:
                    if k_nearest_neighbors[smallest_index]<k_nearest_neighbors[i]:
                        smallest_index = i
            k_nearest_neighbors[smallest_index]= (score, word)

    return k_nearest_neighbors

In [9]:
knn_cat = knn(word_vectors['cat'], word_vectors, 5)
print('')
print('cat')
print('--------------')
for score, word in knn(word_vectors['cat'], word_vectors, 5):
    print (word + '\t%.3f' % score)



cat
--------------
apples	0.231
cat	1.000
pet	0.573
cats	0.732
rabbit	0.549


#### Hint:
To find the analogies, we find the nearest neighbour associated with the wordvector d
$$ d = \frac{c}{\Vert {c} \Vert} + \frac{b}{\Vert {b} \Vert} - \frac{a}{\Vert {a} \Vert}$$


In [13]:
def normalval(x):
    return x/np.linalg.norm(x,ord=2)

In [14]:
## This function return the words d, such that a:b and c:d
## verifies the same relation

def analogy(a, b, c, word_vectors):
    '''
    Parameters:
    a (string): word a
    b (string): word b
    c (string): word c
    word_vectors (Python dict): {word (string): np-array of word vector}

    Returnrs:
    the word d (string) associated with c such that c:d is similar to a:b

    '''
    d = normalval(word_vectors[c]) + normalval(word_vectors[b]) - normalval(word_vectors[a])
    ## FILL CODE
    return nearest_neighbor(d,word_vectors)

In [15]:
# Word analogies

print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))


france - paris + rome = italy


## A word about biases in word vectors

In [18]:
## A word about biases in word vectors:

print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [19]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    '''
    Parameters:
    w (string): word w
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}

    Returnrs:
    strength (float): the value of the association strength
    '''

    strength = 0.0
    part_a = 0.0
    for a in A:
        part_a+=cosine(vectors[w],vectors[a])
    part_a = part_a/len(A)
    part_b = 0.0
    for b in B:
        part_b+=cosine(vectors[w],vectors[b])
    part_b = part_b/len(B)
    ## FILL CODE
    return part_a - part_b

In [21]:
# print(word_vectors.keys())

places = ['france', "canada", "australian", "italy", "california", "kingdom"]
months = [ "september", "october", "november","december", "february", "june"]
association_strength("january", places, months, word_vectors)


-0.7757741522698951

In [22]:
## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X, Y, A, B, vectors):
    '''
    Parameters:
    X (list of strings): The words belonging to set X
    Y (list of strings): The words belonging to set Y
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}

    Returns:
    score (float): the value of the group association strength
    '''

    score = 0.0
    scorex =0.0
    scorey = 0.0
    for wordx in X:
        scorex += association_strength(wordx, A, B, vectors)
    scorex = scorex / len(X)
    for wordy in Y:
        scorey += association_strength(wordy, A, B, vectors)
    scorey = scorey / len(Y)
    ## FILL CODE
    return scorex-scorey

In [24]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)

career = ['executive', 'management', 'professional', 'corporation',
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

print('')
print('Word embedding association test: %.3f' %
      weat(family,career, male, female, word_vectors))


Word embedding association test: -0.106


## Word translation using word vectors

In the following, we will use word vectors in English and French to translate words from English to French. The idea is to learn a linear function that maps English word vectors to their correponding French word vectors. To learn this linear mapping, we will use a small bilingual lexicon, that contains pairs of words in English and French that are translations of each other.

The following function will load the small English-French bilingual lexicon:

In [25]:
def load_lexicon(filename):
    '''
    Parameters:
    filename(string): the path of the lexicon

    Returns:
    data(list of pairs of string): the bilingual lexicon
    '''
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    data = []
    for line in fin:
        a, b = line.rstrip().split(' ')
        data.append((a, b))
    return data

In [26]:
word_vectors_en = load_vectors('wiki.en.vec')
word_vectors_fr = load_vectors('wiki.fr.vec')
lexicon = load_lexicon("lexicon-en-fr.txt")
print(lexicon[:5])

[('the', 'le'), ('the', 'les'), ('the', 'la'), ('and', 'et'), ('was', 'fut')]


In [104]:
# We split the lexicon into a train and validation set
train = lexicon[:5000]
valid = lexicon[5000:5100]

In [105]:
def mseLoss(prediction, label):
    return np.average(np.sqrt(np.sum((prediction - label)**2, axis=1)))

def dwMseLoss(x, w, label):
    return 2*x.T@x@w -2*x.T@label

The following function will learn the mapping from English to French. The idea is to build two matrices $X_{\text{en}}$ and $X_{\text{fr}}$, and to find a mapping $M$ that minimizes $||X_{\text{en}} W - X_{\text{fr}} ||_2$. In numpy, this mapping can be obtained by using the `numpy.linalg.lstsq` function.

In [109]:
def align(word_vectors_en, word_vectors_fr, lexicon):
    '''
    Parameters:
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    lexicon(list of pairs of string): bilingual training lexicon

    Returns
    mapping(np.array): the mapping from English to French vectors
    '''
    x_en, x_fr = [], []
    # w = np.zeros((len(word_vectors_en), len(word_vectors_fr)))
    for (en, fr) in lexicon:
        x_en.append(word_vectors_en[en])
        x_fr.append(word_vectors_fr[fr])
    x_en, x_fr = np.array(x_en), np.array(x_fr)


    w = np.linalg.lstsq(x_en, x_fr, rcond=None)
    w=np.array(w[0])

    EPOCHS=10
    batch_size = 5
    initial_index = 0
    lr=0.01
    for epoch in range(EPOCHS):
        losses = []
        for index in range(initial_index, len(x_en), batch_size):
            x_end = x_en[initial_index:initial_index+batch_size if initial_index+batch_size < len(x_en) else len(x_en)]
            prediction = x_end@w
            labels = x_fr[initial_index:initial_index+batch_size if initial_index+batch_size < len(x_en) else len(x_en)]
            loss = mseLoss(prediction, labels)
            losses.append(loss)
            w -= 0.01 * dwMseLoss(x_end, w, labels)
            if len(losses) % 1000 ==0:
                lr/=5
                print("Loss: ", loss)
    return w

In [110]:
mapping = align(word_vectors_en, word_vectors_fr, lexicon)

Loss:  0.9167691711041114
Loss:  0.9167691711041112
Loss:  0.9167691711041112
Loss:  0.9167691711041112
Loss:  0.9167691711041112
Loss:  0.9167691711041112
Loss:  0.9167691711041112
Loss:  0.9167691711041112
Loss:  0.9167691711041112
Loss:  0.9167691711041112


Given a mapping, a set of word English word vector and French word vectors, the next function will translate the English word to French. To do so, we apply the mapping on the English word, and retrieve the nearest neighbor of the obtained vector in the set of French word vectors. The translation is then the corresponding French word.

In [111]:
def translate(word, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    word(string): an English word
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors

    Returns
    A string containing the translation of the English word
    '''
    ## FILL CODE
    prediction = np.array(word_vectors_en[word])@mapping
    return nearest_neighbor(prediction, word_vectors_fr)

In [112]:
print(translate("world", word_vectors_en, word_vectors_fr, mapping))
print(translate("machine", word_vectors_en, word_vectors_fr, mapping))
print(translate("learning", word_vectors_en, word_vectors_fr, mapping))

monde
machine
apprentissage


Finally, let's implement a function to evaluate this method on the validation lexicon:

In [113]:
def evaluate(valid, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    valid(a list of pairs of string): the validation lexicon
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_fr(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors

    Returns
    Accuracy(float): the accuracy on the validation lexicon
    '''
    acc = 0.0
    ## FILL CODE
    x_en, x_fr = [], []
    for index, (enw, frw) in enumerate(valid):
        prediction = word_vectors_en[enw]@mapping
        word = translate(enw, word_vectors_en, word_vectors_fr, mapping)
        if word == frw:
            acc+=1
    return (acc/len(valid))*100

In [115]:
print(f"Accuracy: .3f%", evaluate(valid, word_vectors_en, word_vectors_fr, mapping))

Accuracy: .3f% 63.0


In [117]:
def translateSentence(sentence,word_vectors_en, word_vectors_fr, mapping):
    french_sentence = []
    for word in sentence:
        prediction = np.array(word_vectors_en[word])@mapping
        french_sentence.append(nearest_neighbor(prediction, word_vectors_fr))
    return french_sentence

In [118]:
print(translateSentence(["world", "is", "beautiful", "place"], word_vectors_en, word_vectors_fr, mapping))

['monde', 'est', 'beauté', 'lieu']
