# GloVE

Let's work on implementation of GloVE.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import Counter
from string import punctuation
from scipy import spatial
import math
import time
from itertools import combinations_with_replacement

In [3]:
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import matplotlib
nltk.download('stopwords')
nltk.download('brown')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading news: Package 'news' not found in index


False

## 1. Load data

In [3]:
corpus = brown.sents()

In [4]:
stop_words = set(stopwords.words('english'))
corpus = [[word for word in sent if word.lower() not in stop_words] for sent in corpus]

# Remove punctuation from corpus
corpus = [[word for word in sent if word not in punctuation] for sent in corpus]

# Remove empty sentences
corpus = [sent for sent in corpus if len(sent) > 0]

# Remove sentences with less than 5 words
corpus = [sent for sent in corpus if len(sent) >= 5]

# Remove sentences with more than 20 words
corpus = [sent for sent in corpus if len(sent) <= 20]

# Remove rare words
word_freq = Counter([word for sent in corpus for word in sent])
corpus = [[word for word in sent if word_freq[word] > 5] for sent in corpus]

In [5]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['mm.',
 'physician',
 'Drexel',
 'maintenance',
 'Steele',
 'Would',
 'operation',
 'doctor',
 'Said',
 'breath',
 'representing',
 'Pete',
 'sponsors',
 'wired',
 'marketing',
 'melted',
 'notions',
 'raises',
 'drivers',
 'rang',
 "I'd",
 'assessors',
 'residence',
 'enabled',
 'bad',
 'dominant',
 'Diane',
 'isolate',
 'unconsciously',
 'exhaust',
 'Alabama',
 'used',
 'creates',
 'remarked',
 'interpret',
 'elect',
 'exhibition',
 'recalls',
 'turns',
 'note',
 'literary',
 'permits',
 'warfare',
 'burned',
 'conflict',
 'close',
 'requiring',
 'orange',
 'suffering',
 'shaved',
 'Lublin',
 'privilege',
 'directions',
 'swore',
 'Russell',
 'Staff',
 'gm.',
 'Bryan',
 'asks',
 'principles',
 'Jan.',
 'hate',
 'Horse',
 'textiles',
 'restorative',
 'Pops',
 'Water',
 'tilted',
 'drained',
 'mathematical',
 'witty',
 'resemblance',
 'lamb',
 'Kentucky',
 'Day',
 'stake',
 'Benington',
 'Democratic',
 'injured',
 'companies',
 'contribution',
 'uncertain',
 'steady',
 'services',
 'u

In [6]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [7]:
#vocab size
voc_size = len(vocab)
print(voc_size)

10582


In [8]:
#append UNK
vocab.append('<UNK>')

In [9]:
vocab

['mm.',
 'physician',
 'Drexel',
 'maintenance',
 'Steele',
 'Would',
 'operation',
 'doctor',
 'Said',
 'breath',
 'representing',
 'Pete',
 'sponsors',
 'wired',
 'marketing',
 'melted',
 'notions',
 'raises',
 'drivers',
 'rang',
 "I'd",
 'assessors',
 'residence',
 'enabled',
 'bad',
 'dominant',
 'Diane',
 'isolate',
 'unconsciously',
 'exhaust',
 'Alabama',
 'used',
 'creates',
 'remarked',
 'interpret',
 'elect',
 'exhibition',
 'recalls',
 'turns',
 'note',
 'literary',
 'permits',
 'warfare',
 'burned',
 'conflict',
 'close',
 'requiring',
 'orange',
 'suffering',
 'shaved',
 'Lublin',
 'privilege',
 'directions',
 'swore',
 'Russell',
 'Staff',
 'gm.',
 'Bryan',
 'asks',
 'principles',
 'Jan.',
 'hate',
 'Horse',
 'textiles',
 'restorative',
 'Pops',
 'Water',
 'tilted',
 'drained',
 'mathematical',
 'witty',
 'resemblance',
 'lamb',
 'Kentucky',
 'Day',
 'stake',
 'Benington',
 'Democratic',
 'injured',
 'companies',
 'contribution',
 'uncertain',
 'steady',
 'services',
 'u

In [10]:
word2index['<UNK>'] = 0

In [11]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [12]:
X_i = Counter(flatten(corpus))
X_i

Counter({"''": 5807,
         '``': 5800,
         '--': 2321,
         'one': 2235,
         'would': 2059,
         'said': 1593,
         'could': 1213,
         'time': 1211,
         'two': 1036,
         'may': 1000,
         'like': 967,
         'first': 964,
         'man': 909,
         'made': 846,
         'new': 816,
         'also': 810,
         'must': 799,
         'Af': 793,
         'back': 785,
         'years': 747,
         'even': 734,
         'many': 717,
         'much': 701,
         'way': 680,
         'little': 661,
         'Mr.': 645,
         'people': 632,
         'make': 620,
         'good': 608,
         'work': 589,
         'well': 576,
         'see': 574,
         'get': 573,
         'still': 573,
         'men': 555,
         'long': 548,
         'us': 528,
         'world': 515,
         'might': 514,
         'life': 504,
         'never': 501,
         'used': 495,
         'came': 494,
         'year': 487,
         'last': 487,
        

In [13]:
window_size = 5

def random_batch(corpus, window_size=2):
    skip_grams = []

    for doc in corpus:
        for i in range(window_size, len(doc)-window_size):
            center = doc[i]

            outside_start =  i - window_size
            outside_end =  i + window_size + 1

            for j in range(outside_start, outside_end):
                if i != j:  # Skip the center word
                    outside = doc[j]
                    skip_grams.append((center, outside))

            return skip_grams
        
skip_grams = random_batch(corpus, window_size)

In [14]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('investigation', 'Fulton'): 1,
         ('investigation', 'County'): 1,
         ('investigation', 'Grand'): 1,
         ('investigation', 'said'): 1,
         ('investigation', 'Friday'): 1,
         ('investigation', 'recent'): 1,
         ('investigation', 'primary'): 1,
         ('investigation', 'election'): 1,
         ('investigation', 'produced'): 1,
         ('investigation', '``'): 1})

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [15]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [16]:
X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [17]:
def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [18]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

In [19]:
x

array([[10356],
       [10356]])

In [20]:
y

array([[4292],
       [9122]])

In [21]:
cooc

array([[0.],
       [0.]])

In [22]:
weighting

array([[0.03162278],
       [0.03162278]])

## 4. Model

<img src ="../figures/glove.png" width=400>

In [23]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [24]:
#test our system
voc_size = len(vocab)
emb_size = 2
model = Glove(voc_size, emb_size)

In [25]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [26]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [27]:
loss

tensor(0.9269, grad_fn=<SumBackward0>)

## 5. Training

In [28]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [29]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
# Training
num_epochs = 1000
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    epoch_mins, epoch_secs = epoch_time(start, time.time())

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | Loss: {loss:.6f} | Time: {epoch_mins}m {epoch_secs}s")

Epoch: 100 | Loss: 0.477152 | Time: 0m 0s
Epoch: 200 | Loss: 0.332977 | Time: 0m 0s
Epoch: 300 | Loss: 0.227557 | Time: 0m 1s
Epoch: 400 | Loss: 0.152030 | Time: 0m 1s
Epoch: 500 | Loss: 0.100721 | Time: 0m 1s
Epoch: 600 | Loss: 0.067171 | Time: 0m 1s
Epoch: 700 | Loss: 0.045351 | Time: 0m 2s
Epoch: 800 | Loss: 0.030946 | Time: 0m 2s
Epoch: 900 | Loss: 0.021227 | Time: 0m 2s
Epoch: 1000 | Loss: 0.014531 | Time: 0m 3s


## 6. Plotting the embeddings

In [31]:
#list of vocabs
vocab[:10]

['mm.',
 'physician',
 'Drexel',
 'maintenance',
 'Steele',
 'Would',
 'operation',
 'doctor',
 'Said',
 'breath']

In [32]:
word = vocab[0]

In [33]:
#numericalization
id = word2index[word]
id

0

In [34]:
id_tensor = torch.LongTensor([id])
id_tensor

tensor([0])

In [35]:
#get the embedding by averaging
v_embed = model.center_embedding(id_tensor)
u_embed = model.outside_embedding(id_tensor)

v_embed, u_embed

(tensor([[-0.1028, -0.5397]], grad_fn=<EmbeddingBackward0>),
 tensor([[-1.0743, -0.1814]], grad_fn=<EmbeddingBackward0>))

In [36]:
#average to get the word embedding
word_embed = (v_embed + u_embed) / 2
word_embed

tensor([[-0.5885, -0.3606]], grad_fn=<DivBackward0>)

In [37]:
#let's write a function to get embedding given a word
def get_embed(word):
    id_tensor = torch.LongTensor([word2index[word]])
    v_embed = model.center_embedding(id_tensor)
    u_embed = model.outside_embedding(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

## 7. Cosine similarity

Formally the [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) $s$ between two vectors $p$ and $q$ is defined as:

$$s = \frac{p \cdot q}{||p|| ||q||}, \textrm{ where } s \in [-1, 1] $$ 

If $p$ and $q$ is super similar, the result is 1 otherwise 0.

In [38]:
vocab

['mm.',
 'physician',
 'Drexel',
 'maintenance',
 'Steele',
 'Would',
 'operation',
 'doctor',
 'Said',
 'breath',
 'representing',
 'Pete',
 'sponsors',
 'wired',
 'marketing',
 'melted',
 'notions',
 'raises',
 'drivers',
 'rang',
 "I'd",
 'assessors',
 'residence',
 'enabled',
 'bad',
 'dominant',
 'Diane',
 'isolate',
 'unconsciously',
 'exhaust',
 'Alabama',
 'used',
 'creates',
 'remarked',
 'interpret',
 'elect',
 'exhibition',
 'recalls',
 'turns',
 'note',
 'literary',
 'permits',
 'warfare',
 'burned',
 'conflict',
 'close',
 'requiring',
 'orange',
 'suffering',
 'shaved',
 'Lublin',
 'privilege',
 'directions',
 'swore',
 'Russell',
 'Staff',
 'gm.',
 'Bryan',
 'asks',
 'principles',
 'Jan.',
 'hate',
 'Horse',
 'textiles',
 'restorative',
 'Pops',
 'Water',
 'tilted',
 'drained',
 'mathematical',
 'witty',
 'resemblance',
 'lamb',
 'Kentucky',
 'Day',
 'stake',
 'Benington',
 'Democratic',
 'injured',
 'companies',
 'contribution',
 'uncertain',
 'steady',
 'services',
 'u

In [39]:
#let's try similarity between first and second, and second and third
cat          = get_embed('cat')
fruit        = get_embed('fruit')
animal       = get_embed('animal')

In [40]:
def cos_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim
    
print(f"cat vs. fruit: ",        cos_sim(cat, fruit))
print(f"cat vs. animal: ",       cos_sim(cat, animal))
print(f"cat vs. cat: ",          cos_sim(cat, cat))

cat vs. fruit:  0.9701588882919621
cat vs. animal:  0.26114139109912776
cat vs. cat:  1.0


In [41]:
def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

print(f"cat vs. fruit: ",        cos_sim(cat, fruit))
print(f"cat vs. animal: ",       cos_sim(cat, animal))
print(f"cat vs. cat: ",          cos_sim(cat, cat))

cat vs. fruit:  0.9701588882919621
cat vs. animal:  0.2611413910991277
cat vs. cat:  1.0


In [42]:
# Create a pickle of the model
import pickle

with open('../../app/models/glove/glove.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('../../app/models/glove/glove_word2index.pkl', 'wb') as f:
    pickle.dump(word2index, f)

with open('../../app/models/glove/glove_index2word.pkl', 'wb') as f:
    pickle.dump(index2word, f)