# GloVe

Let's implement GloVe (Global Vectors for Word Representation) step by step.


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
# Pick the computation device: Apple Metal (MPS) first, then CUDA, otherwise CPU 💻
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## 1. Load the data (Reuters corpus)


In [3]:
import nltk
from nltk.corpus import reuters

# Number of Reuters documents to load; lower it (e.g. 100) for a quick experiment.
# SAMPLE_SIZE = 100
file_ids = reuters.fileids()  # fetched once and reused below
SAMPLE_SIZE = len(file_ids)

# corpus[d] is the list of lower-cased tokens of document d; tokens that are not
# purely alphabetic (numbers, punctuation, ...) are dropped by str.isalpha().
corpus = []
for file_id in file_ids[:SAMPLE_SIZE]:  # `file_id`, so the builtin `id` is not shadowed
    tokens = [token.lower() for token in reuters.words(file_id) if token.isalpha()]
    corpus.append(tokens)

print(corpus[:1])



In [4]:
# corpus = [
#     "apple banana fruit",
#     "banana apple fruit",
#     "banana fruit apple",
#     "dog cat animal",
#     "cat animal dog",
#     "cat dog animal",
# ]

In [5]:
# corpus = [sent.split(" ") for sent in corpus]
# corpus

In [6]:
# get word sequences and unique words
def flatten(l):
    """Concatenate a list of lists into one flat list, keeping the original order."""
    return [item for sublist in l for item in sublist]


# Sorting makes the vocabulary order, and therefore every word id derived from
# it, identical on every run; the iteration order of a set of strings can
# change from one Python process to the next.
vocab = sorted(set(flatten(corpus)))
# vocab

In [7]:
# numericalization: every word gets its position in `vocab` as integer id
word2index = dict(zip(vocab, range(len(vocab))))
# print(word2index)

In [8]:
# vocab size
# NOTE: measured before "<UNK>" is appended to `vocab` in a later cell,
# so the <UNK> token is not counted in this number.
voc_size = len(vocab)
# print(voc_size)

In [9]:
# append UNK (guarded so that re-running this cell does not add duplicates)
if "<UNK>" not in vocab:
    vocab.append("<UNK>")

In [10]:
# vocab

In [11]:
# Give <UNK> its own id right after the real words. Reusing id 0 would make
# <UNK> collide with the word already stored at index 0, and that word would
# then be lost from any reverse (id -> word) mapping built from this dict.
# setdefault keeps the cell safe to re-run.
word2index.setdefault("<UNK>", len(word2index))

# Recount so that the embedding tables sized by `voc_size` get a row for <UNK> too.
voc_size = len(word2index)

In [12]:
# reverse lookup (id -> word), kept around in case ids have to be decoded later
index2word = dict(zip(word2index.values(), word2index.keys()))

## 2. Build Co-occurrence Matrix X


Here, we need to count how often two words co-occur within a given window size. We are going to use a window size of 1.


In [13]:
from collections import Counter

# X_i: number of occurrences of each word over the whole corpus
X_i = Counter(word for sent in corpus for word in sent)
# X_i

In [14]:
# Build (target, context) pairs for a window of size one.
# Sliding a three-word window over each sequence makes the middle word the
# target, so the first and the last word of a sequence are never targets
# (they only ever appear as context), and sequences shorter than three words
# contribute nothing.
skip_grams = []
for sent in corpus:
    for left, target, right in zip(sent, sent[1:], sent[2:]):
        skip_grams.extend(((target, left), (target, right)))

# skip_grams

In [15]:
# X_ik_skipgram[(target, context)]: how often the pair occurs within window size 1
X_ik_skipgram = Counter()
X_ik_skipgram.update(skip_grams)
# X_ik_skipgram

### Weighting function

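GloVe uses a weighting function so that very frequent co-occurrences are not over-weighted (the weight is capped at 1) while rare co-occurrences contribute less to the loss.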

<img src = "../figures/glove_weighting_func.png" width=400>


In [16]:
# simply a normalized function...don't worry too much
def weighting(w_i, w_j, X_ik, x_max=100, alpha=0.75):
    """Return the GloVe weight f(x_ij) of the word pair (w_i, w_j).

    f(x) = (x / x_max) ** alpha   if x < x_max
    f(x) = 1                      otherwise

    Args:
        w_i, w_j: the two words; the tuple (w_i, w_j) is used as lookup key.
        X_ik: mapping from (word, word) tuples to co-occurrence counts.
        x_max: count from which the weight saturates at 1 (default 100).
        alpha: exponent applied to the scaled count (default 0.75).

    Returns:
        The weight of the pair. A pair without an entry in X_ik is treated as
        having a count of 1.
    """
    # Only a missing key means "no co-occurrence recorded"; any other error
    # (e.g. X_ik not being a mapping) is a real bug and must surface.
    try:
        x_ij = X_ik[(w_i, w_j)]
    except KeyError:
        x_ij = 1  # if does not exist, set it to 1

    # below the cap the count is scaled by alpha, at or above it the weight is 1
    if x_ij < x_max:
        return (x_ij / x_max) ** alpha
    return 1

In [None]:
import gc
from itertools import combinations_with_replacement

X_ik = {}  # for keeping the co-occurences
weighting_dic = {}  # scaling the percentage of sampling

# Walk only the pairs that were actually observed. Enumerating every possible
# pair of vocabulary words (and running gc.collect() for each of them) grows
# quadratically with the vocabulary size and does not finish in reasonable
# time or memory on the full corpus.
for (w_i, w_j), count in X_ik_skipgram.items():
    if (w_i, w_j) in X_ik:
        continue  # already filled in while handling the mirrored pair (w_j, w_i)

    # The two directions of a pair can have different counts because the first
    # and last word of a sequence are never used as targets. Taking the larger
    # one keeps X_ik symmetric and independent of the vocabulary order.
    co_occer = max(count, X_ik_skipgram.get((w_j, w_i), 0))
    X_ik[(w_i, w_j)] = co_occer + 1  # + 1 for stability issue
    X_ik[(w_j, w_i)] = co_occer + 1  # count also for the opposite

    weight = weighting(w_i, w_j, X_ik)
    weighting_dic[(w_i, w_j)] = weight
    weighting_dic[(w_j, w_i)] = weight

# Pairs that never co-occur get no entry in either dict. random_batch only
# looks up pairs drawn from skip_grams, and every one of those is present.

# print(f"{X_ik=}")
# print(f"{weighting_dic=}")


## 3. Prepare train data


In [None]:
# for c in corpus:
#     print(c)

In [None]:
import math


def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    """Sample a random mini-batch of skip-gram pairs for GloVe training.

    Words are converted to ids through the module-level `word2index`.

    Args:
        batch_size: number of pairs to draw. Sampling is without replacement,
            so it must not exceed len(skip_grams).
        word_sequence: unused; kept so that existing call sites keep working.
        skip_grams: list of (target_word, context_word) tuples.
        X_ik: mapping (word, word) -> co-occurrence count. A sampled pair
            without an entry is treated as having a count of 1.
        weighting_dic: mapping (word, word) -> weight. It must contain every
            sampled pair.

    Returns:
        Four numpy arrays with one single-element row per sampled pair:
        target ids, context ids, log co-occurrence counts and weights.
    """
    # randomly pick without replacement
    random_index = np.random.choice(len(skip_grams), batch_size, replace=False)

    random_inputs = []
    random_labels = []
    random_coocs = []
    random_weightings = []

    for i in random_index:
        pair = skip_grams[i]

        # Convert to ids only for the sampled pairs; translating the complete
        # skip-gram list on every call is wasted work.
        random_inputs.append([word2index[pair[0]]])  # target, e.g., 2
        random_labels.append([word2index[pair[1]]])  # context word, e.g., 3

        # get cooc; a pair without an entry counts as 1, i.e. log(1) = 0
        random_coocs.append([math.log(X_ik.get(pair, 1))])

        # get weighting (no local named `weighting`, so the module-level
        # weighting() function is not shadowed)
        random_weightings.append([weighting_dic[pair]])

    return (
        np.array(random_inputs),
        np.array(random_labels),
        np.array(random_coocs),
        np.array(random_weightings),
    )

### Testing the method


In [None]:
# quick smoke test of random_batch on a tiny batch
batch_size = 2  # mini-batch size
input_batch, target_batch, cooc_batch, weighting_batch = random_batch(
    batch_size, corpus, skip_grams, X_ik, weighting_dic
)

for label, batch in zip(
    ("Input: ", "Target: ", "Cooc: ", "Weighting: "),
    (input_batch, target_batch, cooc_batch, weighting_batch),
):
    print(label, batch)

# the arrays stay numpy here; they are turned into tensors during training

## 4. Model

<img src ="../figures/glove.png">


In [None]:
class GloVe(nn.Module):
    """GloVe model: center/context embedding tables plus one bias per word.

    `forward` returns the training loss directly: the weighted squared
    difference between (u . v + b_v + b_u) and the log co-occurrence,
    summed over the batch.
    """

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size)  # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size)  # out embedding

        # one scalar bias per word, stored as 1-dimensional embeddings
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)

    def forward(self, center_words, target_words, coocs, weighting):
        """Return the summed weighted squared error for one batch.

        The index tensors must be 2-D ([batch_size, 1]) because the looked-up
        embeddings are fed to a batched matrix product. `coocs` is expected to
        hold log co-occurrence counts already.
        """
        v = self.embedding_v(center_words)  # [batch_size, 1, emb_size]
        u = self.embedding_u(target_words)  # [batch_size, 1, emb_size]

        b_v = self.v_bias(center_words).squeeze(1)
        b_u = self.u_bias(target_words).squeeze(1)

        # [batch_size, 1, emb_size] @ [batch_size, emb_size, 1] -> [batch_size, 1, 1] -> [batch_size, 1]
        dot_uv = torch.bmm(u, v.transpose(1, 2)).squeeze(2)

        # note that coocs already got log
        residual = dot_uv + b_v + b_u - coocs
        return (weighting * residual.pow(2)).sum()


# Build the model and move it to the selected device.
# `embedding_size` has to exist before the model can be built; without this
# assignment a fresh "Restart & Run All" stops here with a NameError. The
# training section sets the same hyper-parameter again.
embedding_size = 100
model = GloVe(voc_size, embedding_size).to(device)

## 5. Training


In [None]:
batch_size = 256  # mini-batch size
embedding_size = 100  # embedding dimension (the plots further down show only the first two components)

# The training loop moves every batch to `device`, so the model parameters
# have to live on the same device; otherwise the forward pass fails with a
# device-mismatch error whenever `device` is not the CPU.
model = GloVe(voc_size, embedding_size).to(device)

# NOTE(review): `criterion` is not used by the training loop; GloVe.forward
# already returns the loss. Kept only so the name stays defined.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
import time

# Training
# Every "epoch" below is one optimisation step on a single random mini-batch.
num_epochs = 10
for epoch in range(num_epochs):

    start = time.time()

    # draw a mini-batch and turn each numpy array into a [batch_size, 1] tensor on `device`
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(
        batch_size, corpus, skip_grams, X_ik, weighting_dic
    )
    input_batch = torch.from_numpy(input_batch).long().to(device)
    target_batch = torch.from_numpy(target_batch).long().to(device)
    cooc_batch = torch.from_numpy(cooc_batch).float().to(device)
    weighting_batch = torch.from_numpy(weighting_batch).float().to(device)

    # the model returns the loss itself, so no separate criterion is involved
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    loss.backward()
    optimizer.step()

    end = time.time()
    epoch_mins, epoch_secs = epoch_time(start, end)

    # report every step (the modulus is the reporting interval)
    if not (epoch + 1) % 1:
        print(
            f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s"
        )

In [None]:
# Save the whole model object (not just its state_dict) to "glove.model" in the
# current working directory.
torch.save(model, "glove.model")

## 6. Plotting the embeddings


In [None]:
# peek at the first ten vocabulary entries
vocab[:10]

In [None]:
# take the first vocabulary entry as a worked example for the next cells
word = vocab[0]

In [None]:
# numericalization: look up the integer id of the example word
# NOTE(review): the name `id` shadows the Python builtin of the same name.
id = word2index[word]
# id

In [None]:
# wrap the id in a one-element LongTensor on `device` so it can index the embedding tables
id_tensor = torch.LongTensor([id]).to(device)
# id_tensor

In [None]:
# look the word up in both embedding tables (center and context);
# the two vectors are averaged in the next cell
v_embed = model.embedding_v(id_tensor)
u_embed = model.embedding_u(id_tensor)

# v_embed, u_embed

In [None]:
# average to get the word embedding
word_embed = (v_embed + u_embed) / 2
# display the second component of the averaged vector
word_embed[0][1]

In [None]:
# let's write a function to get embedding given a word
def get_embed(word):
    """Return the first two components of the averaged embedding of `word`.

    The center (`embedding_v`) and context (`embedding_u`) vectors of the word
    are averaged, and components 0 and 1 of that average are returned as plain
    Python floats, which makes them usable directly as plot coordinates.

    A word that is missing from `word2index` falls back to the "<UNK>" entry
    instead of raising KeyError.
    """
    index = word2index[word] if word in word2index else word2index["<UNK>"]
    id_tensor = torch.LongTensor([index]).to(device)

    with torch.no_grad():  # pure lookup, no autograd graph is needed
        v_embed = model.embedding_v(id_tensor)
        u_embed = model.embedding_u(id_tensor)
        word_embed = (v_embed + u_embed) / 2

    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

In [None]:
# scatter the first 20 vocabulary words using the two coordinates from get_embed
plt.figure(figsize=(6, 3))
for i, word in enumerate(vocab[:20]):  # loop each unique vocab
    x, y = coords = get_embed(word)
    plt.scatter(x, y)
    # label each point with its word, nudged slightly away from the marker
    plt.annotate(word, coords, xytext=(5, 2), textcoords="offset points")
plt.show()

## 7. Cosine similarity

Formally the [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) $s$ between two vectors $p$ and $q$ is defined as:

$$s = \frac{p \cdot q}{||p|| ||q||}, \textrm{ where } s \in [-1, 1] $$

If $p$ and $q$ point in the same direction the result is 1, if they are orthogonal it is 0, and if they point in opposite directions it is -1.


In [None]:
# show a sample only: the full list holds one entry per unique corpus word
# and is too long to be useful as cell output
vocab[:20]

In [None]:
# look up three hand-picked words for the similarity checks below
# NOTE(review): get_embed returns only the first two embedding components, so
# the similarities computed from these values compare 2-D slices, not the
# full embedding vectors.
cat = get_embed("cat")
fruit = get_embed("fruit")
animal = get_embed("animal")

In [None]:
# numpy version
from numpy import dot
from numpy.linalg import norm


def cos_sim(a, b):
    """Cosine similarity of a and b: dot(a, b) / (norm(a) * norm(b))."""
    return dot(a, b) / (norm(a) * norm(b))


# plain string labels: there is nothing to interpolate in them
print("cat vs. fruit: ", cos_sim(cat, fruit))
print("cat vs. animal: ", cos_sim(cat, animal))
print("cat vs. cat: ", cos_sim(cat, cat))

In [None]:
# scipy version
from scipy import spatial


def cos_sim(a, b):
    """Cosine similarity of a and b, derived from scipy's cosine distance.

    scipy only offers the distance, and distance = 1 - similarity.
    This definition replaces the numpy-based `cos_sim` defined earlier.
    """
    distance = spatial.distance.cosine(a, b)
    return 1 - distance


# plain string labels: there is nothing to interpolate in them
print("cat vs. fruit: ", cos_sim(cat, fruit))
print("cat vs. animal: ", cos_sim(cat, animal))
print("cat vs. cat: ", cos_sim(cat, cat))