<center>
    <img src="https://www.ucalgary.ca/themes/ucalgary/ucws_theme/images/UCalgary.svg" width='30%'>
</center>

[comment]: <> (The following line is for the TOPIC of the week)
<p style="text-align:left;"><font size='4'><b> Introduction to NLP </b></font></p>

---

# Word Embeddings | word2vec


In [1]:
import numpy as np

In [2]:
import os
if not os.path.exists("text8.zip"):
    !wget http://mattmahoney.net/dc/text8.zip
    !unzip text8.zip

--2023-10-01 19:19:49--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 34.198.1.81
Connecting to mattmahoney.net (mattmahoney.net)|34.198.1.81|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.zip’


2023-10-01 19:19:49 (269 MB/s) - ‘text8.zip’ saved [31344016/31344016]

Archive:  text8.zip
  inflating: text8                   


In [3]:
LIMIT = 100_000
with open('text8') as f:
    corpus = f.read().split()[:LIMIT]

# word co-occurrency

In [4]:
word2idx = {}
idx2word = []
for word in corpus:
    if word not in word2idx:
        word2idx[word] = len(word2idx)
        idx2word.append(word)
vocabulary_size = len(word2idx)
vocabulary_size

12023

In [5]:
co_occurrency_matrix = np.zeros((vocabulary_size, vocabulary_size), dtype=np.uint32)
window = 5
for i in range(len(corpus)):
    token = corpus[i]
    token_idx = word2idx[token]
    for j in range(max(i-window, 0), min(i+window, len(corpus))):
        if i == j:
            continue
        neighbor = corpus[j]
        neighbor_idx = word2idx[neighbor]
        co_occurrency_matrix[token_idx, neighbor_idx] += 1

co_occurrency_matrix

array([[ 43,   1,  14, ...,   0,   0,   0],
       [  1,   0,   1, ...,   0,   0,   0],
       [ 14,   1, 165, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   1,   0],
       [  0,   0,   0, ...,   1,   0,   0],
       [  0,   0,   0, ...,   0,   1,   0]], dtype=uint32)

In [6]:
def cosine_similarity(a,b):
    a = a / np.sqrt((a ** 2).sum(-1))
    b = b / np.sqrt((b ** 2).sum(-1))
    return np.dot(a, b.T)


In [7]:
cosine_similarity(co_occurrency_matrix[word2idx['toronto']], co_occurrency_matrix[word2idx['ottawa']])

0.13074409009212268

In [8]:
cosine_similarity(co_occurrency_matrix[word2idx['like']], co_occurrency_matrix[word2idx['love']])

0.5657035277517756

In [9]:
# choose a random word
widx = np.random.randint(vocabulary_size)
print(widx, idx2word[widx])

cosine_similarity(co_occurrency_matrix[word2idx['toronto']], co_occurrency_matrix[widx])

4624 ph


0.0

## From co-occurrency matrix to embeddings

Use PCA to compress the matrix.

In [10]:
from sklearn.decomposition import PCA

pca_co_occurrency_matrix = PCA(n_components=50).fit_transform(co_occurrency_matrix)

pca_co_occurrency_matrix

array([[ 6.30552646e+01, -1.69815162e+01, -1.61517171e+01, ...,
         2.17251888e+00,  1.57627993e+01,  2.55399765e+01],
       [-3.10055869e+00, -1.15508323e+00, -9.23283847e-01, ...,
         1.05221189e-01, -2.02608554e-01,  4.67699227e-01],
       [ 6.61249912e+02, -1.38696833e+02,  3.80169902e+01, ...,
        -6.91352478e-01,  1.18812189e+00, -1.15850425e+00],
       ...,
       [-6.77471712e+00,  7.92928563e-02, -3.89485414e-01, ...,
         2.92276088e-01,  1.91066177e-01, -2.45395094e-01],
       [-6.84542966e+00,  1.23103711e-01, -3.61984313e-01, ...,
         3.59188213e-01,  2.83413948e-01, -4.34373211e-01],
       [-6.02542216e+00, -1.69505187e-01,  2.47964510e-01, ...,
         2.78817440e-01,  1.92801560e-01, -4.12069765e-01]])

In [11]:
# sqrt(average(word_occurency)) / 128, 256, 1024, 768

In [12]:
pca_co_occurrency_matrix.shape

(12023, 50)

In [13]:
cosine_similarity(pca_co_occurrency_matrix[word2idx['like']], pca_co_occurrency_matrix[word2idx['love']])

0.38361762145970774

In [14]:
cosine_similarity(pca_co_occurrency_matrix[word2idx['like']], pca_co_occurrency_matrix[widx])

-0.8940410414439164

PCA cause loss of information

In [15]:
cosine_similarity(pca_co_occurrency_matrix[word2idx['toronto']], pca_co_occurrency_matrix[word2idx['ottawa']])

0.8576986916756436

# word2vec

## skip-gram with negative sampling

In [16]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

class SkipGram:
    def __init__(self,
                 dimension=50,
                 window=5,
                 epoch=10,
                 learning_rate=0.025,
                 negative_samples=5,
                 corpus=None,):
        self.word2idx = {}
        self.idx2word = []
        self.word_frequency = []
        self.vocabulary_size = 0
        self.dimension = dimension
        self.window = window
        self.epoch = epoch
        self.learning_rate = learning_rate
        self.negative_samples = negative_samples
        if corpus:
            self.build_vocab(corpus)
            self.init_weights()
            self.train(corpus, epoch=self.epoch, learning_rate=self.learning_rate)

    def build_vocab(self, corpus):
        for word in corpus:
            if word not in self.word2idx:
                self.word2idx[word] = len(self.word2idx)
                self.idx2word.append(word)
                self.word_frequency.append(1)
            else:
                self.word_frequency[self.word2idx[word]] += 1

        self.vocabulary_size = len(self.word2idx)

    def init_weights(self):
        # word_idx, dim of the word
        self.word_embeddings = (np.random.random(size=(self.vocabulary_size, self.dimension)) - 0.5) / self.dimension
        self.word_output_weights = np.zeros((self.vocabulary_size, self.dimension))
        word_frequency = np.array(self.word_frequency)
        # create a negative sample draw table
        word_frequency = word_frequency ** 0.75
        negative_sample_draw_table = word_frequency / word_frequency.sum()
        self.negative_sample_draw_table = negative_sample_draw_table.cumsum()


    def train(self, corpus, epoch=1, learning_rate=0.025):
        # convert word to word_idx
        corpus = [self.word2idx[word] for word in corpus if word in self.word2idx]
        corpus_size = len(corpus)

        samples_counts = 0

        for i in range(epoch):
            print(f"epoch {i+1}")
            print(self.word_embeddings[self.word2idx['like']])
            print(self.word_output_weights[self.word2idx['like']])
            print(
                cosine_similarity(
                    self.word_embeddings[self.word2idx['like']],
                    self.word_embeddings[self.word2idx['love']],
                )
            )
            for i, word_idx in enumerate(corpus):
                w = np.random.randint(self.window)
                for j in range(max(i-w, 0), min(i+w+1, corpus_size)):
                    samples_counts += 1
                    # if samples_counts >= 50:
                    #     break

                    neighbor_idx = corpus[j]
                    if i == j:
                        continue
                    # print(f"train window={w} {self.idx2word[word_idx]}, {self.idx2word[neighbor_idx]}")
                    self._train_single_pair(word_idx, neighbor_idx, learning_rate)
                # learning_rate -= samples_counts * delta

    def _train_single_pair(self, i, j, learning_rate):
        # vector of dim = d
        v_w_i = self.word_embeddings[i].copy()

        u_w_j = self.word_output_weights[j].copy()

        gridant_v_w_i = np.zeros(self.dimension)

        # processing negative sample
        for k in self._get_negative_sample():
            # skip if negative sample equals to true sample
            if k == j:
                continue
            u_w_k = self.word_output_weights[k].copy()
            # gridant from negative sample
            gridant_v_w_i += -sigmoid(np.dot(u_w_k,v_w_i))*u_w_k
            # update negative sample
            self.word_output_weights[k] = u_w_k + learning_rate * (-sigmoid(np.dot(u_w_k,v_w_i))*v_w_i)

        # gridant from positive sample
        gridant_v_w_i += (1-sigmoid(np.dot(u_w_j,v_w_i)))*u_w_j
        # update positive sample
        self.word_output_weights[j] = u_w_j + learning_rate * ((1-sigmoid(np.dot(u_w_j,v_w_i)))*v_w_i)

        # update word embedding
        self.word_embeddings[i] = v_w_i + learning_rate * gridant_v_w_i

    def _get_negative_sample(self):
        for i in range(self.negative_samples):
            yield self.negative_sample_draw_table.searchsorted(np.random.random())

    def __getitem__(self, word):
        assert word in self.word2idx, KeyError("word not found in vocabulary")

        return self.word_embeddings[self.word2idx[word]]


In [17]:
model = SkipGram(epoch=1, dimension=50, learning_rate=0.05, corpus=corpus)

epoch 1
[-0.00371853  0.00699322 -0.00612108  0.00866496  0.00634003 -0.00046832
 -0.00994657  0.0051821   0.00509176  0.00117535 -0.00250108 -0.00506595
 -0.00884773 -0.00231176  0.00038681 -0.00020532 -0.00906766 -0.00999556
 -0.00878635 -0.00515008 -0.00222579  0.00513445  0.00586812 -0.00553174
 -0.0064082   0.00143484 -0.00722414 -0.00851997 -0.00614972  0.00073323
  0.00923772  0.00729922 -0.0099276   0.00866795 -0.00901245 -0.00545791
  0.00894026  0.00684891 -0.00301517 -0.00090381  0.00627354  0.00794842
  0.00891131  0.00247643  0.00725883  0.00738247 -0.00529736  0.00022671
 -0.00050927  0.00834318]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
-0.12397147970395939


In [18]:
model.train(corpus, epoch=1, learning_rate=0.0005)

epoch 1
[ 0.21427154 -0.40711229 -0.48098662 -0.05796207 -0.18346259  0.4251024
 -0.53778189 -0.61815905  0.20067967  0.24324349 -0.06698374 -0.30328741
  0.23284905 -0.17497198  0.3223278   0.43442391  0.35162126 -0.49003714
  0.03665783 -0.55578769  0.2805594  -0.29531096  0.38731458  0.63898095
  0.18557924  0.10225986 -0.42208882 -0.29941452  0.20785154 -0.06483977
  0.02746589  0.33904502  0.58675311  0.29616159  0.14822049 -0.08173149
 -0.25489478 -0.02230743 -0.4118917   0.0652149  -0.06966938 -0.18134278
 -0.22621215 -0.10822873  0.27194882  0.16121671 -0.48745452 -0.27932258
  0.00101528 -0.34586833]
[ 0.02394186  0.19493628  0.14712533  0.16032663  0.06561548 -0.21021657
  0.30066553  0.37501663 -0.13518406 -0.08693984  0.0514937   0.22527309
 -0.07502714  0.21642611 -0.25513767 -0.14703731 -0.14151864  0.15333023
  0.08094464  0.23284287 -0.25141465  0.08682712 -0.26900121 -0.15295286
 -0.13643775  0.0356769   0.28069218  0.10145693 -0.15346338 -0.04908361
 -0.15617211 -0.11

In [19]:
neighbor_idx = np.random.randint(model.vocabulary_size)
neighbor = model.idx2word[neighbor_idx]
print(neighbor)
print(cosine_similarity(
    model['like'],
    model[neighbor],
))

microbiologist
0.04493874684759133


In [20]:
freq = np.array([1,2,3,4])
freq = freq / freq.sum()
freq, freq.cumsum()

(array([0.1, 0.2, 0.3, 0.4]), array([0.1, 0.3, 0.6, 1. ]))

In [21]:
model.word2idx['like']
idx = 100
model.idx2word[100]

# dim = np.mean(model.frequence) ** 2

'regarded'

In [22]:
corpus[:100]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the',
 'organization',
 'of',
 'society',
 'it',
 'has',
 'also',
 'been',
 'taken',
 'up',
 'as',
 'a',
 'positive',
 'label',
 'by',
 'self',
 'defined',
 'anarchists',
 'the',
 'word',
 'anarchism',
 'is',
 'derived',
 'from',
 'the',
 'greek',
 'without',
 'archons',
 'ruler',
 'chief',
 'king',
 'anarchism',
 'as',
 'a',
 'political',
 'philosophy',
 'is',
 'the',
 'belief',
 'that',
 'rulers',
 'are',
 'unnecessary',
 'and',
 'should',
 'be',
 'abolished',
 'although',
 'there',
 'are',
 'differing']