## Mount Drive for Google Colab

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

## Import Modules

In [2]:
import pandas as pd
import numpy as np

## Load Word vectors

In [3]:
def load_vectors(file):
    '''Function to load the the word embedding matrix values'''
    with open(file, 'r', encoding="utf-8") as file:
        # unique words
        words = set()
        word_to_vec = {}
        # each line starts with a word then the values for the different features
        for line in file:
            line = line.strip().split()
            # take the word 
            curr_word = line[0]
            words.add(curr_word)
            # rest of the features for the word
            word_to_vec[curr_word] = np.array(line[1:], dtype=np.float64)
            
    return words, word_to_vec

In [4]:
# Get the words and their vectors from the Wikipedia 2014 dataset (6 Billions) and emedding size = 100
# words, word_to_vec = load_vectors('/content/drive/My Drive/Colab Notebooks/glove.6B.100d.txt')
words, word_to_vec = load_vectors('Datasets/glove.6B.100d.txt')

## Word Similarity (Cosine Similarity)

In [5]:
def cosine_similarity(u, v):
    '''Function to find the cosine similarity between two word vectors'''
    distance = 0.0
    # The cosine similarity measures the angle between two vectors, and has the property that it only considers the direction of the vectors, not their the magnitudes
    # find the dot product between u and v 
    dot = np.dot(u,v)
    # find the L2 norm of u 
    norm_u = np.sqrt(np.sum(u**2))
    # Compute the L2 norm of v
    norm_v = np.sqrt(np.sum(v**2))
    # Compute the cosine similarity
    cosine_sim = dot/(norm_u)/norm_v
    
    return cosine_sim

In [6]:
# Get some sample word vectors and find their cosine similarity
nepal = word_to_vec["nepal"]
kathmandu = word_to_vec["kathmandu"]
print("\nThe cosine similarity between Nepal and Kathmandu is", cosine_similarity(nepal, kathmandu))

nepal = word_to_vec["nepal"]
sea = word_to_vec["paris"]
print("\nThe cosine similarity between Nepal and Paris is", cosine_similarity(nepal, sea))


The cosine similarity between Nepal and Kathmandu is 0.7068282980813972

The cosine similarity between Nepal and Paris is 0.09641994220643965


## Mikolov et al. Word Analogy Task

In [7]:
def word_analogy(a, b, c, word_to_vec):
    '''Function to find the word analogy based on question: a is to b as c is to __?'''
    # convert words to lower case
    a = a.lower()
    b = b.lower()
    c = c.lower()
    
    # find the word embeddings for a, b, c
    e_a, e_b, e_c = word_to_vec[a], word_to_vec[b], word_to_vec[c]
    
    words = word_to_vec.keys()
    max_cosine_sim = -999              
    d = None                  

    # search for word_d in the whole word vector set
    for w in words:        
        # ignore input words
        if w in [a, b, c] :
            continue

        # Compute cosine similarity between the vectors u and v
        #u:(e_b - e_a) 
        #v:((w's vector representation) - e_c)
        cosine_sim = cosine_similarity(e_b - e_a, word_to_vec[w] - e_c)
        
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            # update word_d
            d = w
        
    return d

## Semantic Questions

In [8]:
# Get three sample word vectors that are analogous and find the fourth word
samples = [('athens','greece','berlin')]
for sample in samples:
    print ('\n> {} is to {} as {} is to {}'.format( *sample, word_analogy(*sample, word_to_vec)))


> athens is to greece as berlin is to germany


## Syntactic Questions

In [9]:
# Get three sample word vectors that are analogous and find the fourth word
samples = [('dance','dancing','fly')]
for sample in samples:
    print ('\n> {} is to {} as {} is to {}'.format( *sample, word_analogy(*sample, word_to_vec)))


> dance is to dancing as fly is to donkeys


## Word Analogy Task with User Input

In [10]:
def word_analogy_manual():
    '''Function that takes a,b,c words from user and prints d'''
    
    print('\nWord Analogy Task (aka a is to b as c is to d)')

    print('\n> Enter words for a, b, c separated by commas')
    words = input().split(',')
    d = word_analogy(*words, word_to_vec)
    print ('\n> {} is to {} as {} is to {}'.format( *words, d))
    print('\n> Analogous word is: ' + d)

In [11]:
# Get input words form user for word analogy task
word_analogy_manual()


Word Analogy Task (aka a is to b as c is to d)

> Enter words for a, b, c separated by commas
waiter,waitress,actor

> waiter is to waitress as actor is to actress

> Analogous word is: actress


## More on Word Analogy (using TorchText)
To load pre-trained GloVe embeddings, we'll use a package called torchtext. The documentation for torchtext GloVe vectors are available at: https://torchtext.readthedocs.io/en/latest/vocab.html#glove

In [12]:
# For import torch not working: !pip3 install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

import torch
import torchtext
# The first time you run this will download a ~823MB file
glove = torchtext.vocab.GloVe(name="6B", # trained on Wikipedia 2014 corpus
                              dim=50)   # embedding size = 100

In [13]:
# Print embeddings of a sample word
glove['house']

tensor([ 0.6014,  0.2852, -0.0320, -0.4303,  0.7481,  0.2622, -0.9736,  0.0786,
        -0.5759, -1.1880, -1.8507, -0.2489,  0.0555,  0.0086,  0.0680,  0.4055,
        -0.0740, -0.2132,  0.3717, -0.7179,  1.2234,  0.3555, -0.4154, -0.2193,
        -0.3966, -1.7831, -0.4151,  0.2953, -0.4125,  0.0201,  2.7425, -0.9926,
        -0.7103, -0.4681,  0.2826, -0.0776,  0.3041, -0.0664,  0.3951, -0.7075,
        -0.3889,  0.2316, -0.4951,  0.1461, -0.0231,  0.5639, -0.8619, -1.0278,
         0.0399,  0.2002])

## Word Similarity (Cosine Similarity)

In [14]:
word = 'cat'
other = ['dog', 'bike', 'kitten', 'puppy', 'kite', 'computer', 'neuron']
print("Similarities of various words with the word: cat")
for w in other:
    dist = torch.cosine_similarity(glove[word].unsqueeze(0) , glove[w].unsqueeze(0))
    print("\n> ", w, float(dist))

Similarities of various words with the word: cat

>  dog 0.9218005537986755

>  bike 0.44144073128700256

>  kitten 0.6386305689811707

>  puppy 0.7625599503517151

>  kite 0.4891083538532257

>  computer 0.3525111973285675

>  neuron 0.21150361001491547


## Most Similar Words

In [15]:
def most_similar(word, word_vec, n=5):
    '''Function that returns the n most similar words to a given input word and its word vector'''
    print("Similarities of various words with: ", word)
    dists = torch.norm(glove.vectors - word_vec, dim=1)     # compute distances to all words
    
    lst = sorted(enumerate(dists.numpy()), key=lambda x: x[1]) # sort by distance
    
    for idx, difference in lst[1:n+1]:    # take the top n
        print("\n> ",glove.itos[idx], difference)

most_similar("cat", glove["cat"], n=10)

Similarities of various words with:  cat

>  dog 1.8846031

>  rabbit 2.4572797

>  monkey 2.8102052

>  cats 2.8972247

>  rat 2.9455352

>  beast 2.9878407

>  monster 3.0022194

>  pet 3.0396757

>  snake 3.0617998

>  puppy 3.0644655


## Some vector operations

In [16]:
most_similar("(happy + sad) /2", (glove['happy'] + glove['sad']) / 2)

Similarities of various words with:  (happy + sad) /2

>  happy 1.9199749

>  feels 2.3604643

>  sorry 2.4984782

>  hardly 2.52593

>  imagine 2.5652788


In [17]:
most_similar("(king - man + woman)", (glove['king'] - glove['man'] + glove['woman']))

Similarities of various words with:  (king - man + woman)

>  queen 2.8391209

>  prince 3.6610038

>  elizabeth 3.7152522

>  daughter 3.8317878

>  widow 3.8493774


In [18]:
most_similar("(queen - woman + man)", (glove['queen'] - glove['woman'] + glove['man']))

Similarities of various words with:  (queen - woman + man)

>  king 2.8391209

>  prince 3.2508988

>  crown 3.4485192

>  knight 3.5587437

>  coronation 3.6198905


In [19]:
most_similar("(programmer + good - bad)", (glove['programmer'] - glove['bad'] + glove['good']))

Similarities of various words with:  (programmer + good - bad)

>  versatile 4.381561

>  creative 4.5690007

>  entrepreneur 4.6343737

>  enables 4.7177725

>  intelligent 4.7349973


In [20]:
most_similar("(programmer - good + bad)", (glove['programmer'] - glove['good'] + glove['bad']))

Similarities of various words with:  (programmer - good + bad)

>  hacker 3.8383653

>  glitch 4.003873

>  originator 4.041952

>  hack 4.047719

>  serial 4.2250676
