In [1]:
pip install torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [3]:
corpus = [      # Very first step is to create the vocabulary 
    'he is a king',  # list of unique words with assigned indices 
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',  #in practice would perform normalization
    'berlin is germany capital',
    'paris is france capital',   
]

In [4]:
def tokenize_corpus(corpus):  
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)
tokenized_corpus # this gives us our list of tokens
 #iterate over tokens in corpus, and generate list of unique words(tokens)

[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital']]

In [5]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)} #create two dictionaries for mapping between word 
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)} # and index

vocabulary_size = len(vocabulary) # size of our vocab (i.e number of words)
print(word2idx) # word
print(idx2word) #index

{'he': 0, 'is': 1, 'a': 2, 'king': 3, 'she': 4, 'queen': 5, 'man': 6, 'woman': 7, 'warsaw': 8, 'poland': 9, 'capital': 10, 'berlin': 11, 'germany': 12, 'paris': 13, 'france': 14}
{0: 'he', 1: 'is', 2: 'a', 3: 'king', 4: 'she', 5: 'queen', 6: 'man', 7: 'woman', 8: 'warsaw', 9: 'poland', 10: 'capital', 11: 'berlin', 12: 'germany', 13: 'paris', 14: 'france'}


In [6]:
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
idx_pairs[2] # paired words by index in array

array([1, 0])

In [7]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [8]:
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 4.6402668952941895
Loss at epo 10: 4.022881507873535
Loss at epo 20: 3.687854528427124
Loss at epo 30: 3.475494861602783
Loss at epo 40: 3.309539556503296
Loss at epo 50: 3.1698966026306152
Loss at epo 60: 3.0497453212738037
Loss at epo 70: 2.945136308670044
Loss at epo 80: 2.8530917167663574
Loss at epo 90: 2.771200656890869


In [9]:
def similarity(v,u):
    return torch.dot(v,u)/(torch.norm(v)*torch.norm(u))

<h2> Question 1:</h2>

In [15]:
similarity(W2[word2idx["she"]], W2[word2idx["king"]])

tensor(0.5850, grad_fn=<DivBackward0>)

In [16]:
similarity(W2[word2idx["she"]], W2[word2idx["queen"]])

tensor(0.9652, grad_fn=<DivBackward0>)

<b>Which pair is more similar? Does the model match your expectations?</b>

The pair of "she" and "queen" are more similar at .96 which intuitively makes sense as we know that Queen is a gendered term appropriate to the use of "she".

<h2> Question 2:</h2>

In [29]:
similarity(W2[word2idx["warsaw"]], W2[word2idx["poland"]])

tensor(-0.1659, grad_fn=<DivBackward0>)

In [30]:
similarity(W2[word2idx["warsaw"]], W2[word2idx["germany"]])

tensor(0.2215, grad_fn=<DivBackward0>)

<b>Which pair is more similar? Does the model match your expectations?</b>

In this case, Warsaw and Poland are not similar which does not make much sense considering that Warsaw is the capitol city of Poland.  Warsaw and Germany however show greater but still low similiarity.  That would make sense independent of the previous comparison with Poland. 

<h2>Question 3:</h2>

In [36]:
similarity(W2[word2idx["warsaw"]], W2[word2idx["capital"]])

tensor(0.5439, grad_fn=<DivBackward0>)

In [37]:
similarity(W2[word2idx["poland"]], W2[word2idx["capital"]])

tensor(-0.3610, grad_fn=<DivBackward0>)

<b>Which pair is more similar? Does the model match your expectations?</b>

The over 50% similarity makes sense for Warsaw and Capital given that Warsaw is a capital city.  Poland and capital makes sense as well because you would expect the word Capital to be before Poland.  

<h2>Question 4:</h2>

In [57]:
embedding_dims = 8
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 4.652128219604492
Loss at epo 10: 4.318161964416504
Loss at epo 20: 4.055862903594971
Loss at epo 30: 3.8413636684417725
Loss at epo 40: 3.6616740226745605
Loss at epo 50: 3.5086333751678467
Loss at epo 60: 3.3767175674438477
Loss at epo 70: 3.262006998062134
Loss at epo 80: 3.1616220474243164
Loss at epo 90: 3.07336163520813
Loss at epo 100: 2.995483160018921
Loss at epo 110: 2.926537036895752
Loss at epo 120: 2.8652613162994385
Loss at epo 130: 2.8105297088623047
Loss at epo 140: 2.7613260746002197
Loss at epo 150: 2.7167537212371826
Loss at epo 160: 2.6760380268096924
Loss at epo 170: 2.6385273933410645
Loss at epo 180: 2.6036932468414307
Loss at epo 190: 2.5711100101470947
Loss at epo 200: 2.5404438972473145
Loss at epo 0: 4.711775779724121
Loss at epo 10: 4.165737628936768
Loss at epo 20: 3.8739123344421387
Loss at epo 30: 3.6618266105651855
Loss at epo 40: 3.4994640350341797
Loss at epo 50: 3.370413064956665
Loss at epo 60: 3.264141321182251
Loss at epo 70: 3.17386

In [39]:
similarity(W2[word2idx["warsaw"]], W2[word2idx["poland"]])

tensor(0.0829, grad_fn=<DivBackward0>)

In [40]:
similarity(W2[word2idx["warsaw"]], W2[word2idx["germany"]])

tensor(-0.2058, grad_fn=<DivBackward0>)

In [41]:
similarity(W2[word2idx["warsaw"]], W2[word2idx["capital"]])

tensor(0.0290, grad_fn=<DivBackward0>)

In [42]:
similarity(W2[word2idx["warsaw"]], W2[word2idx["capital"]])

tensor(0.0290, grad_fn=<DivBackward0>)

The code doesn't appear to be doing better if I'm interpretting correctly.  Now Warsaw and Poland are .08 and Warsaw and Capital are .02 which both seem inherently wrong.  

<h2> Question 5:</h2>

In [66]:
corpus = [      # Very first step is to create the vocabulary 
    'skytta is the best',  # list of unique words with assigned indices 
    'she is a queen',
    'he is a man',
    'she is a woman',
    'washington is state',
    'seattle is capital city',  #in practice would perform normalization
    'portland is oregon capital',
    'paris is france capital',   
]

def tokenize_corpus(corpus):  
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)
tokenized_corpus # this gives us our list of tokens
 #iterate over tokens in corpus, and generate list of unique words(tokens)
    
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)} #create two dictionaries for mapping between word 
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)} # and index

vocabulary_size = len(vocabulary) # size of our vocab (i.e number of words)
print(word2idx) # word
print(idx2word) #index

window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
idx_pairs[2] # paired words by index in array

def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

embedding_dims = 10
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 201
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

{'jenny': 0, 'is': 1, 'the': 2, 'best': 3, 'she': 4, 'a': 5, 'queen': 6, 'he': 7, 'man': 8, 'woman': 9, 'washington': 10, 'state': 11, 'seattle': 12, 'capital': 13, 'city': 14, 'portland': 15, 'oregon': 16, 'paris': 17, 'france': 18}
{0: 'jenny', 1: 'is', 2: 'the', 3: 'best', 4: 'she', 5: 'a', 6: 'queen', 7: 'he', 8: 'man', 9: 'woman', 10: 'washington', 11: 'state', 12: 'seattle', 13: 'capital', 14: 'city', 15: 'portland', 16: 'oregon', 17: 'paris', 18: 'france'}
Loss at epo 0: 6.251488208770752
Loss at epo 10: 5.0549845695495605
Loss at epo 20: 4.600773334503174
Loss at epo 30: 4.28962516784668
Loss at epo 40: 4.028888702392578
Loss at epo 50: 3.8029868602752686
Loss at epo 60: 3.6071372032165527
Loss at epo 70: 3.4392237663269043
Loss at epo 80: 3.2968063354492188
Loss at epo 90: 3.176079511642456
Loss at epo 100: 3.0726959705352783
Loss at epo 110: 2.9828290939331055
Loss at epo 120: 2.9034786224365234
Loss at epo 130: 2.8323872089385986
Loss at epo 140: 2.7678864002227783
Loss at e

In [67]:
similarity(W2[word2idx["jenny"]], W2[word2idx["is"]])

tensor(0.1286, grad_fn=<DivBackward0>)

In [68]:
similarity(W2[word2idx["seattle"]], W2[word2idx["state"]])

tensor(-0.7810, grad_fn=<DivBackward0>)

In [69]:
similarity(W2[word2idx["seattle"]], W2[word2idx["capital"]])

tensor(-0.1094, grad_fn=<DivBackward0>)