<a href="https://colab.research.google.com/github/smlra-kjsce/DL-in-NLP-101/blob/master/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#word2vec from Scratch

In [0]:
#Why are word vectors needed?

#Reference: http://bit.ly/2Q433Iz (Stanford 224n)
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

corpus = [
    'he is a fan of deep learning and natural language processing.',
    'however he does not like to go too deep into exploring all of them',
    'he does however keep a track of all progress made in natural language processing',
    'he asked her to take up a class on computer vision to gain more knowledge',
    'in case he has to implement any research paper he will need atleast a day',
    'both of them went on to become experts in natural language processing',
    'he gave a lecture on human anatomy but no one really understood',
    'is he really going to ask her out for a date',
    'trains fascinate me but not always'
]
cl = [len(i.split()) for i in corpus]
print(cl)

[11, 14, 14, 15, 15, 12, 12, 11, 6]


In [0]:
#In real implementation we would have to perform case normalization, removing some punctuation etc, but for simplicity let’s use this nice and clean data.
def tokenize_corpus(corpus):
  tokens = [x.split() for x in corpus]
  return tokens

tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)


[['he', 'is', 'a', 'fan', 'of', 'deep', 'learning', 'and', 'natural', 'language', 'processing.'], ['however', 'he', 'does', 'not', 'like', 'to', 'go', 'too', 'deep', 'into', 'exploring', 'all', 'of', 'them'], ['he', 'does', 'however', 'keep', 'a', 'track', 'of', 'all', 'progress', 'made', 'in', 'natural', 'language', 'processing'], ['he', 'asked', 'her', 'to', 'take', 'up', 'a', 'class', 'on', 'computer', 'vision', 'to', 'gain', 'more', 'knowledge'], ['in', 'case', 'he', 'has', 'to', 'implement', 'any', 'research', 'paper', 'he', 'will', 'need', 'atleast', 'a', 'day'], ['both', 'of', 'them', 'went', 'on', 'to', 'become', 'experts', 'in', 'natural', 'language', 'processing'], ['he', 'gave', 'a', 'lecture', 'on', 'human', 'anatomy', 'but', 'no', 'one', 'really', 'understood'], ['is', 'he', 'really', 'going', 'to', 'ask', 'her', 'out', 'for', 'a', 'date'], ['trains', 'fascinate', 'me', 'but', 'not', 'always']]


In [0]:
vocabulary = []
for sentence in tokenized_corpus:
  for token in sentence:
    if token not in vocabulary:
        vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)
print(vocabulary_size) #Shows total vocabulary length
print(word2idx)

71
{'he': 0, 'is': 1, 'a': 2, 'fan': 3, 'of': 4, 'deep': 5, 'learning': 6, 'and': 7, 'natural': 8, 'language': 9, 'processing.': 10, 'however': 11, 'does': 12, 'not': 13, 'like': 14, 'to': 15, 'go': 16, 'too': 17, 'into': 18, 'exploring': 19, 'all': 20, 'them': 21, 'keep': 22, 'track': 23, 'progress': 24, 'made': 25, 'in': 26, 'processing': 27, 'asked': 28, 'her': 29, 'take': 30, 'up': 31, 'class': 32, 'on': 33, 'computer': 34, 'vision': 35, 'gain': 36, 'more': 37, 'knowledge': 38, 'case': 39, 'has': 40, 'implement': 41, 'any': 42, 'research': 43, 'paper': 44, 'will': 45, 'need': 46, 'atleast': 47, 'day': 48, 'both': 49, 'went': 50, 'become': 51, 'experts': 52, 'gave': 53, 'lecture': 54, 'human': 55, 'anatomy': 56, 'but': 57, 'no': 58, 'one': 59, 'really': 60, 'understood': 61, 'going': 62, 'ask': 63, 'out': 64, 'for': 65, 'date': 66, 'trains': 67, 'fascinate': 68, 'me': 69, 'always': 70}


In [0]:
window_size =  2#@param {type:"number"}
#we predict 2 words on left and two words on right by default
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [0]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [0]:
embedding_dims = 16
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 1001
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 100 == 0:    
        print(f'Loss at epoch {epo}: {loss_val/len(idx_pairs)}')

Loss at epoch 0: 9.92307414604264
Loss at epoch 100: 4.307567926345712
Loss at epoch 200: 3.1552284231766516
Loss at epoch 300: 2.6611718442001493
Loss at epoch 400: 2.3851743377552133
Loss at epoch 500: 2.2153667294917327
Loss at epoch 600: 2.1053354962502118
Loss at epoch 700: 2.0321003308259145
Loss at epoch 800: 1.981416930343203
Loss at epoch 900: 1.9449286391389184
Loss at epoch 1000: 1.918187308805594


In [0]:
print(W1.size(), W2.size(), z2.size())

torch.Size([16, 71]) torch.Size([71, 16]) torch.Size([71])


In [0]:
center_word = "natural" #@param {type:"string"}
idx = word2idx[center_word]
v = W1[:,idx]
print(v)
context_word = center_word
u = W2[idx,:]
print(u)

mean_rep = (u+v)/2
print(mean_rep)

tensor([ 2.0985e-02, -7.0129e-01,  1.4114e+00,  1.3066e+00,  3.8581e-01,
        -9.7666e-01,  7.1514e-01,  3.7603e-01, -7.1757e-01,  1.9602e+00,
        -1.4978e-03,  5.7125e-01, -1.8649e+00,  7.0546e-02, -8.6657e-02,
         7.8621e-01], grad_fn=<SelectBackward>)
tensor([ 1.0267,  0.3793,  1.7766,  0.4816, -1.0844, -1.6682,  0.5463, -0.4591,
         0.3813,  1.7502,  0.0580, -1.3125,  1.2381, -1.0006, -0.0437,  0.3919],
       grad_fn=<SliceBackward>)
tensor([ 0.5238, -0.1610,  1.5940,  0.8941, -0.3493, -1.3224,  0.6307, -0.0415,
        -0.1681,  1.8552,  0.0283, -0.3706, -0.3134, -0.4650, -0.0652,  0.5890],
       grad_fn=<DivBackward0>)


#Using in built word2vec of torchtext

##Preprocessing
Credits: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb

Understand torchtext further from: https://torchtext.readthedocs.io/en/latest/data.html

In [0]:
# to build the model
import torch
from torchtext import data
from torchtext import datasets
from torch import nn

# for repeatable results
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# specifically handle NLP data
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [0]:
TEXT.build_vocab

<bound method Field.build_vocab of <torchtext.data.field.Field object at 0x7f472b5b7be0>>

#Playing with embeddings

In [0]:
from torchtext import datasets

# getting the IMDB datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [0]:
import random

# doing train-test split
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [0]:
MAX_VOCAB_SIZE = 25000

# building the vocabulary object with all properties
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [0]:
TEXT.vocab.freqs # observing word frequencies

Counter({'Serum': 8,
         'starts': 836,
         'as': 30346,
         'Eddie': 218,
         '(': 23163,
         'Derek': 93,
         'Phillips': 53,
         ')': 23041,
         'is': 76219,
         'delighted': 30,
         'to': 94046,
         'learn': 524,
         'he': 17379,
         'has': 11851,
         'been': 6581,
         'accepted': 95,
         'into': 6362,
         'medical': 113,
         'school': 1122,
         'carry': 201,
         'on': 23003,
         'the': 203655,
         'family': 2119,
         'tradition': 117,
         'of': 101177,
         'becoming': 251,
         'an': 14544,
         'MD': 6,
         'like': 13638,
         'his': 19471,
         'father': 1391,
         'Richard': 507,
         'Dennis': 114,
         "O'Neill": 22,
         '&': 2451,
         'uncle': 140,
         'David': 659,
         'H.': 50,
         'Hickey': 3,
         ',': 193439,
         'however': 1102,
         'joy': 182,
         'could': 6602,
       

In [0]:
len(TEXT.vocab.freqs.keys()) # len of the vocabulary

101955

In [0]:
TEXT.vocab.stoi #mapping that saves which word corresponds to which index

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             '.': 4,
             'and': 5,
             'a': 6,
             'of': 7,
             'to': 8,
             'is': 9,
             'in': 10,
             'I': 11,
             'it': 12,
             'that': 13,
             '"': 14,
             "'s": 15,
             'this': 16,
             '-': 17,
             '/><br': 18,
             'was': 19,
             'as': 20,
             'with': 21,
             'movie': 22,
             'for': 23,
             'film': 24,
             'The': 25,
             'but': 26,
             '(': 27,
             "n't": 28,
             ')': 29,
             'on': 30,
             'you': 31,
             'are': 32,
             'not': 33,
             'have': 34,
             'his': 35,
             'be': 36,
             'he': 37,
             'one': 38,
             'at': 39,
     

In [0]:
TEXT.vocab.itos # inverse mapping of which index contains which words

['<unk>',
 '<pad>',
 'the',
 ',',
 '.',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'I',
 'it',
 'that',
 '"',
 "'s",
 'this',
 '-',
 '/><br',
 'was',
 'as',
 'with',
 'movie',
 'for',
 'film',
 'The',
 'but',
 '(',
 "n't",
 ')',
 'on',
 'you',
 'are',
 'not',
 'have',
 'his',
 'be',
 'he',
 'one',
 'at',
 '!',
 'by',
 'all',
 'an',
 'who',
 'they',
 'from',
 'like',
 'so',
 'her',
 'about',
 'or',
 'has',
 'It',
 'out',
 "'",
 'just',
 'do',
 '?',
 'some',
 'good',
 'more',
 'very',
 'would',
 'up',
 'what',
 'This',
 'there',
 'time',
 'can',
 'which',
 'when',
 'had',
 'only',
 'if',
 'she',
 'story',
 'really',
 'were',
 'see',
 'their',
 'even',
 'no',
 'my',
 'does',
 'me',
 'did',
 'than',
 '...',
 'could',
 'been',
 ':',
 'much',
 'into',
 'get',
 'well',
 'will',
 'other',
 'bad',
 'we',
 'because',
 'him',
 'people',
 'great',
 'most',
 'made',
 'first',
 'make',
 'also',
 'them',
 'how',
 'way',
 'any',
 'its',
 'movies',
 '/>The',
 '<',
 'too',
 'br',
 'think',
 'characters'

In [0]:
TEXT.vocab.vectors # all word vectors being used

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.7551,  0.5348, -0.3787,  ...,  0.2816,  0.1977, -0.3319],
        [-0.3376,  0.0265,  0.1340,  ...,  0.2927,  0.1693, -0.6942],
        [-0.1272, -0.3361, -0.2779,  ..., -0.8548,  0.2575, -0.1269]])

In [0]:
TEXT.vocab.vectors.shape # finding number of words and vector size used to encode the context

torch.Size([25002, 100])

In [0]:
TEXT.vocab.vectors[TEXT.vocab.stoi['king']] # vector for king

tensor([-0.3231, -0.8762,  0.2198,  0.2527,  0.2298,  0.7388, -0.3795, -0.3531,
        -0.8437, -1.1113, -0.3027,  0.3318, -0.2511,  0.3045, -0.0775, -0.8982,
         0.0925, -1.1407, -0.5832,  0.6687, -0.2312, -0.9585,  0.2826, -0.0788,
         0.7531,  0.2658,  0.3422, -0.3395,  0.9561,  0.0656,  0.4575,  0.3984,
         0.5796,  0.3927, -0.2185,  0.5879, -0.5600,  0.6337, -0.0440, -0.6873,
        -0.3784,  0.3803,  0.6164, -0.8827, -0.1235, -0.3793, -0.3832,  0.2387,
         0.6685, -0.4332, -0.1107,  0.0817,  1.1569,  0.7896, -0.2122, -2.3211,
        -0.6781,  0.4456,  0.6571,  0.1045,  0.4622,  0.1991,  0.2580,  0.0572,
         0.5344, -0.4313, -0.3431,  0.5979, -0.5842,  0.0690,  0.2394, -0.8518,
         0.3038, -0.3418, -0.2575, -0.0311, -0.1629,  0.4517, -0.9163,  0.6452,
         0.7328, -0.2275,  0.3023,  0.0448, -0.8374,  0.5501, -0.5251, -1.7357,
         0.4751, -0.7049,  0.0569, -0.7132,  0.0896,  0.4139, -1.3363, -0.6191,
        -0.3309, -0.5288,  0.1648, -0.98

In [0]:
# KING - MAN + WOMAN ~ QUEEN 
(TEXT.vocab.vectors[TEXT.vocab.stoi['king']]-TEXT.vocab.vectors[TEXT.vocab.stoi['man']]+TEXT.vocab.vectors[TEXT.vocab.stoi['woman']]).dot(TEXT.vocab.vectors[TEXT.vocab.stoi['queen']])/((TEXT.vocab.vectors[TEXT.vocab.stoi['king']]-TEXT.vocab.vectors[TEXT.vocab.stoi['man']]+TEXT.vocab.vectors[TEXT.vocab.stoi['woman']]).norm()*(TEXT.vocab.vectors[TEXT.vocab.stoi['queen']]).norm())

tensor(0.7834)

In [0]:
TEXT.vocab.vectors[TEXT.vocab.stoi['earth']].dot(TEXT.vocab.vectors[TEXT.vocab.stoi['world']])/(TEXT.vocab.vectors[TEXT.vocab.stoi['earth']].norm()*TEXT.vocab.vectors[TEXT.vocab.stoi['world']].norm())

tensor(0.5213)

In [0]:
TEXT.vocab.vectors[TEXT.vocab.stoi['i']].dot(TEXT.vocab.vectors[TEXT.vocab.stoi['me']])/(TEXT.vocab.vectors[TEXT.vocab.stoi['i']].norm()*TEXT.vocab.vectors[TEXT.vocab.stoi['me']].norm())

tensor(0.8775)

In [0]:
TEXT.vocab.vectors[TEXT.vocab.stoi['life']].dot(TEXT.vocab.vectors[TEXT.vocab.stoi['death']])/(TEXT.vocab.vectors[TEXT.vocab.stoi['life']].norm()*TEXT.vocab.vectors[TEXT.vocab.stoi['death']].norm())

tensor(0.6755)

In [0]:
s1 = "I like this movie"
s2 = "A boring movie"
A = TEXT.vocab.vectors[TEXT.vocab.stoi['love']]
B = TEXT.vocab.vectors[TEXT.vocab.stoi['hate']]

In [0]:
sum1 = torch.zeros_like(TEXT.vocab.vectors[TEXT.vocab.stoi['I']])
for i in s1.split(' '):
  sum1 += TEXT.vocab.vectors[TEXT.vocab.stoi[i]]*0.1/(0.1+TEXT.vocab.freqs[i]/25e3)

sum2 = torch.zeros_like(TEXT.vocab.vectors[TEXT.vocab.stoi['I']])
for i in s2.split(' '):
  sum2 += TEXT.vocab.vectors[TEXT.vocab.stoi[i]]*0.1/(0.1+TEXT.vocab.freqs[i]/25e3)

sum1 = sum1/len(s1)
sum2 = sum2/len(s2)

In [0]:
sum1

tensor([-0.0015,  0.0117,  0.0077, -0.0095, -0.0033,  0.0021, -0.0117, -0.0025,
        -0.0032,  0.0009, -0.0009, -0.0058,  0.0045,  0.0050,  0.0054,  0.0005,
         0.0054,  0.0063,  0.0038,  0.0149,  0.0077,  0.0051,  0.0020, -0.0057,
         0.0064,  0.0049, -0.0002, -0.0110, -0.0085,  0.0012,  0.0032,  0.0086,
        -0.0008, -0.0007,  0.0081,  0.0051,  0.0020,  0.0050, -0.0002, -0.0045,
        -0.0023,  0.0010, -0.0064, -0.0043,  0.0002, -0.0018,  0.0011, -0.0090,
        -0.0017, -0.0176, -0.0013, -0.0005,  0.0028,  0.0185, -0.0062, -0.0490,
        -0.0057,  0.0090,  0.0270,  0.0103, -0.0048,  0.0225, -0.0014,  0.0026,
         0.0116,  0.0018,  0.0162,  0.0029, -0.0013, -0.0030, -0.0012, -0.0111,
        -0.0022, -0.0051, -0.0096,  0.0049, -0.0024, -0.0016, -0.0030, -0.0008,
         0.0034, -0.0024, -0.0083,  0.0039, -0.0250, -0.0025, -0.0063, -0.0068,
        -0.0089, -0.0183,  0.0038, -0.0046, -0.0037, -0.0032, -0.0091, -0.0066,
        -0.0097, -0.0017,  0.0095,  0.00

In [0]:
sum2

tensor([-0.0151,  0.0045,  0.0139, -0.0509, -0.0403, -0.0149, -0.0178,  0.0258,
         0.0453, -0.0429,  0.0226,  0.0197, -0.0351, -0.0502,  0.0018,  0.0107,
         0.0093,  0.0026,  0.0858,  0.0333,  0.0627,  0.0139,  0.0092, -0.0509,
         0.0459, -0.0026, -0.0041, -0.0243, -0.0564,  0.0060, -0.0182, -0.0203,
        -0.0270, -0.0915,  0.0211,  0.0336, -0.0501,  0.0063,  0.0317,  0.0676,
         0.0101, -0.0206, -0.0576, -0.0091, -0.0449, -0.0171,  0.0120,  0.0144,
        -0.0040, -0.0709, -0.0146, -0.0400,  0.0067,  0.0404,  0.0482, -0.0903,
         0.0019,  0.0583,  0.0158, -0.0045,  0.0223,  0.0735,  0.0028,  0.0018,
         0.0804,  0.0122,  0.0927, -0.0005, -0.0391, -0.0096, -0.0188, -0.0395,
        -0.0138, -0.0208,  0.0039, -0.0086,  0.0197,  0.0045,  0.0350, -0.0204,
         0.0048,  0.0127, -0.0222,  0.0147, -0.0962,  0.0208, -0.0461, -0.0228,
         0.0006, -0.0448,  0.0219,  0.0265,  0.0424,  0.0455,  0.0393, -0.0455,
         0.0214,  0.0299, -0.0014,  0.00

In [0]:
print(sum1.dot(A)/(sum1.norm()*A.norm()))
print(sum2.dot(A)/(sum2.norm()*A.norm()))

tensor(0.7026)
tensor(0.3269)


In [0]:
print(sum1.dot(B)/(sum1.norm()*B.norm()))
print(sum2.dot(B)/(sum2.norm()*B.norm()))

tensor(0.5309)
tensor(0.3318)


In [0]:
sum2.norm()

tensor(0.3706)