<a href="https://colab.research.google.com/github/rojinadeuja/NLP-Model-Implementations/blob/master/GloVe_Word_Analogy_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Mount Google Drive at /content/drive; the embedding file loaded later in this
# notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import pandas as pd
import numpy as np
import torch
import torchtext

In [31]:
def load_vectors(file):
    '''Load word embeddings from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the values of
    its vector, all separated by whitespace. Blank or whitespace-only lines
    (for example a trailing empty line) are skipped.

    Args:
        file: path of the embedding text file; it is opened as UTF-8.

    Returns:
        A tuple (words, word_to_vec): the set of words that were read, and a
        dict mapping each word to a np.float64 array built from the remaining
        fields of its line.

    Raises:
        ValueError: if a field after the word cannot be converted to a float.
    '''
    # unique words
    words = set()
    word_to_vec = {}
    # a separate name for the handle keeps the `file` argument (the path) intact
    with open(file, 'r', encoding="utf-8") as handle:
        # each line starts with a word then the values for the different features
        for line in handle:
            fields = line.split()
            # nothing to index on a blank line; skipping avoids an IndexError
            if not fields:
                continue
            curr_word = fields[0]
            words.add(curr_word)
            # rest of the fields are the features for the word
            word_to_vec[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec

In [37]:
# Get the words and their vectors from the GloVe file for the Wikipedia 2014 dataset (6B)
# with embedding size = 100. The file is read from the mounted Google Drive folder.
words, word_to_vec = load_vectors('/content/drive/My Drive/Colab Notebooks/glove.6B.100d.txt')

In [41]:
def cosine_similarity(u, v):
    '''Compute the cosine similarity between two word vectors.

    Args:
        u: first vector; a NumPy array or any sequence of numbers.
        v: second vector, with the same length as u.

    Returns:
        dot(u, v) / (||u|| * ||v||), where ||.|| is the L2 norm. The ratio is
        undefined when either vector has zero length; 0.0 is returned in that
        case instead of dividing by zero.
    '''
    # accept plain Python sequences as well as arrays
    u = np.asarray(u, dtype=np.float64)
    v = np.asarray(v, dtype=np.float64)

    # L2 norms of both vectors
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    # guard against 0/0, which would give NaN and a RuntimeWarning
    if norm_u == 0.0 or norm_v == 0.0:
        return 0.0

    # divide by one norm at a time so the product of norms is never formed
    return np.dot(u, v) / norm_u / norm_v

In [55]:
# Get some sample word vectors and find their cosine similarity
nepal = word_to_vec["nepal"]
kathmandu = word_to_vec["kathmandu"]
print("\nThe cosine similarity between Nepal and Kathmandu is", cosine_similarity(nepal, kathmandu))

# Second pair for comparison: "nepal" against "paris"
nepal = word_to_vec["nepal"]
# NOTE(review): despite its name, `sea` holds the vector looked up for "paris"
sea = word_to_vec["paris"]
print("\nThe cosine similarity between Nepal and Paris is", cosine_similarity(nepal, sea))


The cosine similarity between Nepal and Kathmandu is 0.7068282980813972

The cosine similarity between Nepal and Paris is 0.09641994220643965


In [61]:
def word_analogy(a, b, c, word_to_vec):
    '''Answer the analogy question "a is to b as c is to __?".

    Every word w in word_to_vec other than the three inputs is scored with the
    cosine similarity between (e_b - e_a) and (e_w - e_c), where e_x is the
    vector stored for x. The word with the highest score is returned; on equal
    scores the word encountered first is kept.

    Args:
        a, b, c: the query words. Surrounding whitespace is removed and the
            words are lower-cased before they are looked up.
        word_to_vec: dict mapping each word to its vector.

    Returns:
        The best-scoring word, or None if no candidate word produced a
        comparable score (for example when word_to_vec holds only the inputs).

    Raises:
        KeyError: if a, b or c is not a key of word_to_vec after normalisation.
    '''
    # normalise the inputs; stripping lets " boy" match the key "boy"
    a, b, c = (word.strip().lower() for word in (a, b, c))

    # fail early with a message that names the unknown word(s)
    missing = [word for word in (a, b, c) if word not in word_to_vec]
    if missing:
        raise KeyError('word(s) not found in word_to_vec: ' + ', '.join(missing))

    # find the word embeddings for a, b, c
    e_a, e_b, e_c = word_to_vec[a], word_to_vec[b], word_to_vec[c]

    # u:(e_b - e_a) is the same for every candidate, so compute it once
    direction = e_b - e_a
    inputs = {a, b, c}

    max_cosine_sim = float('-inf')
    d = None

    # search for word_d in the whole word vector set
    for w, e_w in word_to_vec.items():
        # ignore input words
        if w in inputs:
            continue

        # v:((w's vector representation) - e_c)
        cosine_sim = cosine_similarity(direction, e_w - e_c)

        # strict '>' keeps the first best word and never selects a NaN score
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            # update word_d
            d = w

    return d

In [66]:
# Take three sample words (a, b, c) that form an analogy and find the fourth word:
# here a capital -> country pair, followed by another capital
samples = [('athens','greece','berlin')]
for sample in samples:
    # each tuple is unpacked twice: into the output template and into word_analogy
    print ('\n> {} is to {} as {} is to {}'.format( *sample, word_analogy(*sample, word_to_vec)))


> athens is to greece as berlin is to germany


In [67]:
# Try the analogy on a verb triple: dance -> dancing, fly -> ?
# NOTE(review): the recorded run printed "donkeys" for this query, so the
# analogy does not give an "-ing" form of "fly" with these vectors.
samples = [('dance','dancing','fly')]
for sample in samples:
    # each tuple is unpacked twice: into the output template and into word_analogy
    print ('\n> {} is to {} as {} is to {}'.format( *sample, word_analogy(*sample, word_to_vec)))


> dance is to dancing as fly is to donkeys


In [74]:
def word_analogy_manual():
    '''Prompt for the words a, b, c and print the analogous word d.

    Reads one comma-separated line from standard input, answers
    "a is to b as c is to __?" with word_analogy and the notebook-level
    word_to_vec, and prints the result. Input that does not contain exactly
    three non-empty words, or that contains a word missing from word_to_vec,
    is reported with a message and nothing else is printed.

    Returns:
        None. All results are printed.
    '''
    print('\nWord Analogy Task (aka a is to b as c is to d)')

    print('\n> Enter words for a, b, c separated by commas')
    # drop the whitespace around each word so "man, boy, woman" works like "man,boy,woman"
    words = [word.strip() for word in input().split(',')]

    # word_analogy takes exactly three words; check before unpacking into it
    if len(words) != 3 or not all(words):
        print('\n> Expected exactly three non-empty words separated by commas')
        return

    # word_analogy lower-cases its inputs, so check the lower-cased form here too
    unknown = [word for word in words if word.lower() not in word_to_vec]
    if unknown:
        print('\n> Not found in the vocabulary: ' + ', '.join(unknown))
        return

    d = word_analogy(*words, word_to_vec)
    print ('\n> {} is to {} as {} is to {}'.format( *words, d))
    # format() instead of '+' so that a None result cannot raise a TypeError
    print('\n> Analogous word is: {}'.format(d))

In [75]:
# Get input words from the user for the word analogy task
# (the call waits for one comma-separated line on standard input)
word_analogy_manual()


Word Analogy Task (aka a is to b as c is to d)

> Enter words for a, b, c separated by commas
man,boy,woman

> man is to boy as woman is to girl

> Analogous word is: girl
