## Word2Vec

In [36]:
#importing necessary libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import spacy
import nltk

## 1. Try a real corpus

In [37]:
from nltk.corpus import abc
nltk.download('abc')
corpus = nltk.corpus.abc.words()

[nltk_data] Downloading package abc to C:\Users\Sirikit
[nltk_data]     Joshi\AppData\Roaming\nltk_data...
[nltk_data]   Package abc is already up-to-date!


In [38]:
#let's check the size of the corpus
print(len(corpus))
corpus_para = corpus[0:500]

corpus_lower = [[word.lower() for word in corpus_para]]
# print(corpus_lower)

766811


In [39]:
#tokenizing list of words
word_list = []
for sent in corpus_lower:
    for word in sent:
        word_list.append(word)

word_list[0:5]


['pm', 'denies', 'knowledge', 'of', 'awb']

In [40]:
#2. numericalize

#2.1 get all the unique words
#we want to flatten this (basically merge all list)
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs  = list(set(flatten(corpus_lower)))  #vocabs is a term defining all unique words your system know
print(vocabs[0:5])
print(len(vocabs))

['was', 'average', 'its', 'all', 'denied']
248


In [41]:
#2.2 assign id to all these vocabs
word2index = {v: idx+1 for idx, v in enumerate(vocabs)}

#add <UNK>, which is a very normal token exists in the world
vocabs.append('<UNK>') #chaky, can it be ##UNK, or UNKKKKKK, or anything

word2index['<UNK>'] = 0 

In [42]:
word2index

{'was': 1,
 'average': 2,
 'its': 3,
 'all': 4,
 'denied': 5,
 'valuable': 6,
 'australia': 7,
 ',"': 8,
 'prices': 9,
 'prove': 10,
 'although': 11,
 'put': 12,
 '."': 13,
 's': 14,
 'people': 15,
 'trade': 16,
 'market': 17,
 'south': 18,
 'company': 19,
 'hands': 20,
 'broadly': 21,
 'trading': 22,
 'grain': 23,
 'forwarded': 24,
 'been': 25,
 'from': 26,
 'too': 27,
 'approached': 28,
 'skimmed': 29,
 'multinationals': 30,
 'analyst': 31,
 'with': 32,
 'into': 33,
 'provide': 34,
 'desk': 35,
 'totalled': 36,
 'of': 37,
 'aside': 38,
 'knew': 39,
 'knowledge': 40,
 'show': 41,
 'things': 42,
 'another': 43,
 'biggest': 44,
 'coalition': 45,
 'minister': 46,
 'take': 47,
 'else': 48,
 'grains': 49,
 'stories': 50,
 'close': 51,
 'astonishing': 52,
 'wheat': 53,
 'but': 54,
 'i': 55,
 'released': 56,
 'did': 57,
 'not': 58,
 'to': 59,
 'mr': 60,
 'pretty': 61,
 'best': 62,
 'vaile': 63,
 'ploy': 64,
 'has': 65,
 'east': 66,
 'million': 67,
 'indicated': 68,
 'must': 69,
 'producers':

In [43]:
#create index2word dictionary
   
index2word = {v:k for k, v in word2index.items()}

index2word

{1: 'was',
 2: 'average',
 3: 'its',
 4: 'all',
 5: 'denied',
 6: 'valuable',
 7: 'australia',
 8: ',"',
 9: 'prices',
 10: 'prove',
 11: 'although',
 12: 'put',
 13: '."',
 14: 's',
 15: 'people',
 16: 'trade',
 17: 'market',
 18: 'south',
 19: 'company',
 20: 'hands',
 21: 'broadly',
 22: 'trading',
 23: 'grain',
 24: 'forwarded',
 25: 'been',
 26: 'from',
 27: 'too',
 28: 'approached',
 29: 'skimmed',
 30: 'multinationals',
 31: 'analyst',
 32: 'with',
 33: 'into',
 34: 'provide',
 35: 'desk',
 36: 'totalled',
 37: 'of',
 38: 'aside',
 39: 'knew',
 40: 'knowledge',
 41: 'show',
 42: 'things',
 43: 'another',
 44: 'biggest',
 45: 'coalition',
 46: 'minister',
 47: 'take',
 48: 'else',
 49: 'grains',
 50: 'stories',
 51: 'close',
 52: 'astonishing',
 53: 'wheat',
 54: 'but',
 55: 'i',
 56: 'released',
 57: 'did',
 58: 'not',
 59: 'to',
 60: 'mr',
 61: 'pretty',
 62: 'best',
 63: 'vaile',
 64: 'ploy',
 65: 'has',
 66: 'east',
 67: 'million',
 68: 'indicated',
 69: 'must',
 70: 'produce

In [44]:
vocabs  #checking vocab

['was',
 'average',
 'its',
 'all',
 'denied',
 'valuable',
 'australia',
 ',"',
 'prices',
 'prove',
 'although',
 'put',
 '."',
 's',
 'people',
 'trade',
 'market',
 'south',
 'company',
 'hands',
 'broadly',
 'trading',
 'grain',
 'forwarded',
 'been',
 'from',
 'too',
 'approached',
 'skimmed',
 'multinationals',
 'analyst',
 'with',
 'into',
 'provide',
 'desk',
 'totalled',
 'of',
 'aside',
 'knew',
 'knowledge',
 'show',
 'things',
 'another',
 'biggest',
 'coalition',
 'minister',
 'take',
 'else',
 'grains',
 'stories',
 'close',
 'astonishing',
 'wheat',
 'but',
 'i',
 'released',
 'did',
 'not',
 'to',
 'mr',
 'pretty',
 'best',
 'vaile',
 'ploy',
 'has',
 'east',
 'million',
 'indicated',
 'must',
 'producers',
 ',',
 'asks',
 'lindberg',
 'future',
 'today',
 'new',
 'there',
 'hadn',
 'two',
 '2002',
 'it',
 'growers',
 'for',
 'sent',
 'time',
 'they',
 'connor',
 '-',
 'think',
 'denies',
 'taking',
 'sales',
 'responsibility',
 'among',
 'as',
 'iraq',
 'oil',
 'he',


## 2. Prepare train data


In [45]:
def random_batch(batch_size, word_sequence, win_size=2):

    skip_grams = []

    for sent in corpus_lower:
        for i in range(1, len(sent) - 1): 
            target = word2index[sent[i]]
            
            context = []
            
            for j in range(2):
                
                if i - (j + 1) >= 0: 
                    context.append(word2index[sent[i - (j + 1)]])
                
                if i + (j + 1) < len(sent): 
                    context.append(word2index[sent[i + (j + 1)]])
    
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  
        random_labels.append([skip_grams[i][1]])  
            
    return np.array(random_inputs), np.array(random_labels)

## 3. Model

In [46]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center_word  = nn.Embedding(voc_size, emb_size)  #is a lookup table mapping all ids in voc_size, into some vector of size emb_size
        self.embedding_outside_word = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center_word, outside_word, all_vocabs):
        
        #convert them into embedding
        center_word_embed  = self.embedding_center_word(center_word)     #(batch_size, 1, emb_size)
        outside_word_embed = self.embedding_outside_word(outside_word)   #(batch_size, 1, emb_size)
        all_vocabs_embed   = self.embedding_outside_word(all_vocabs)     #(batch_size, voc_size, emb_size)
        
        #bmm is basically @ or .dot , but across batches (i.e., ignore the batch dimension)
        top_term = outside_word_embed.bmm(center_word_embed.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) ===> (batch_size, 1)
        
        top_term_exp = torch.exp(top_term)  #exp(uo vc)
        #(batch_size, 1)
        
        lower_term = all_vocabs_embed.bmm(center_word_embed.transpose(1, 2)).squeeze(2)
         #(batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size)
         
        lower_term_sum = torch.sum(torch.exp(lower_term), 1) #sum exp(uw vc)
        #(batch_size, 1)
        
        loss_fn = -torch.mean(torch.log(top_term_exp / lower_term_sum))
        #(batch_size, 1) / (batch_size, 1) ==mean==> scalar
        
        return loss_fn

In [47]:
#preparing all_vocabs

batch_size = 2
voc_size = len(vocabs)

def prepare_sequence(seq, word2index):
    #map(function, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs.shape

torch.Size([2, 249])