# General Note 
In order to guide you through the homework, we put "...COMPLETE HERE..." as placeholder for you to complete the homework.

# Word-embeddings
Word embeddings are very important in Natural Language Processing (NLP). Ready-to-use solutions (e.g., *Glove* and many others) are useful and relatively efficient. Skip-gram works empirically worse than others types of word-embeddings. Moreover, it needs much time to fit.

In [5]:
#https://pythonspot.com/nltk-stop-words/

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import random

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
corpus = [
    'Venice is a city in Veneto',
    'Padua is a city in Veneto',
    'Vicenza is a city in Veneto',
    'Verona is a city in Veneto',
    'Treviso is a city in Veneto',
    'Rovigo is a city in Veneto',
    'Bassano is in Veneto',
    'Chioggia is a city in Veneto',
    'Veneto is a region',
    'Bologna is a city in Emilia-Romagna',
    'Reggio-Emilia is in Emilia-Romagna',
    'Parma is a city in Emilia-Romagna',
    'Rimini is a city in Emilia-Romagna',
    'Ravenna is a city in Emilia-Romagna',
    'Piacenza is a city in Emilia-Romagna',
    'Emilia-Romagna is a region'
]

In [6]:
# this is all the necessary code to set the seed
def set_seed(seed : int = 123):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

In [7]:
set_seed()

Skip-gram model tries to *predict context given a word*.

In [8]:
def w_to_i(text : list) -> dict:
  vocab = set()
  for l in text:
    words = l.split()
    vocab |= set(words)

  vocab = dict(zip(vocab, range(0,len(vocab))))
  return vocab

In [9]:
def prepare_set(corpus, n_gram = 2):
    '''Creates a dataset with Input column and Output column for neighboring words.'''
    columns = ['Input', 'Output']
    result = pd.DataFrame(columns = columns)
    for sentence in corpus:
      words = sentence.split()
      for i,w in enumerate(words):
          inp = w
          for n in range(1,n_gram+1):
              # look back
              if (i-n)>=0:
                  out = words[i-n]
                  row = pd.DataFrame([[inp,out]], columns = columns)
                  result = pd.concat([result,row], axis = 0, ignore_index = True)

              # look forward
              if (i+n)<len(words):
                  out = sentence.split()[i+n]
                  row = pd.DataFrame([[inp,out]], columns = columns)
                  result = pd.concat([result,row], axis = 0, ignore_index = True)
    return result

In [10]:
# download stopwords
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))# e.g.: a, in, is
print(stop_words)

def preprocess(corpus):
    result = []
    for i in corpus:
        out = nltk.word_tokenize(i)
        # make capital letters lowercase
        out = [x.lower() for x in out]
        out = [x for x in out if x not in stop_words]
        result.append(" ". join(out))
    return result

processed_corpus = preprocess(corpus)

print('Original corpus')
print(corpus[:10])
print('Processed corpus')
print(processed_corpus[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


{'their', 'more', 'because', 'had', "you'll", 'is', 'was', 'weren', 'him', 'hers', 'most', 'whom', 'when', 'yourselves', 'myself', 'once', 'but', "shan't", 'herself', 'i', 'd', 've', 'she', 'll', 'some', 'didn', 'than', "wouldn't", 'after', "you've", 'are', 'my', 'ours', 'other', 'those', 'mustn', 'needn', 'does', 'this', "weren't", "you'd", 'each', 'do', 'very', 'been', 'what', 'hasn', 'any', 'why', 's', 'into', 'theirs', 'under', "should've", "won't", 'be', 'in', 'until', 'now', "mustn't", 'only', 'me', 'himself', "it's", 'for', 'her', 'has', 'its', 'from', 'by', "that'll", 'our', 'so', 'we', 'before', 'aren', 'all', 'such', 'again', 'haven', 'above', 'during', 'of', "couldn't", 'ourselves', 'between', "shouldn't", 'should', "hadn't", 'at', 'while', 'ain', "isn't", 'wouldn', 'down', "doesn't", 'there', 'couldn', 'through', 'below', 'over', 'themselves', 'he', 'm', 'isn', 'y', "haven't", 'did', 'the', 'them', 'having', 'up', 'too', 'further', 'hadn', 'won', 'which', 'on', 'an', 'few',

In [11]:
vocabulary = w_to_i(processed_corpus)
dim_vocabulary = len(vocabulary)
print(f'The vocabulaty has dimension {dim_vocabulary}')
print(vocabulary)

The vocabulaty has dimension 18
{'veneto': 0, 'padua': 1, 'vicenza': 2, 'region': 3, 'city': 4, 'piacenza': 5, 'rimini': 6, 'rovigo': 7, 'treviso': 8, 'verona': 9, 'parma': 10, 'ravenna': 11, 'chioggia': 12, 'bassano': 13, 'venice': 14, 'emilia-romagna': 15, 'reggio-emilia': 16, 'bologna': 17}


# Question 1
Choose the more appropriate dimension for the context, that is the paramerer `n_gram` in the function `prepare_set()` and explain why you chose for that dimension.

In [None]:
# choose the more appropriate n_gram
n_gram = "...COMPLETE HERE..."
train_emb = prepare_set(processed_corpus, n_gram=n_gram)
train_emb.head(10)

In [14]:
# from words to indices
# function map to apply to Pandas DataFrames
train_emb.Input = train_emb.Input.map(vocabulary)
train_emb.Output = train_emb.Output.map(vocabulary)

In [15]:
print(train_emb.head(10))


for i,x in enumerate(DataLoader(train_emb.Input.values, batch_size=train_emb.shape[0])):
  print(x.shape)
  break

   Input  Output
0     14       4
1     14       0
2      4      14
3      4       0
4      0       4
5      0      14
6      1       4
7      1       0
8      4       1
9      4       0
torch.Size([80])


In this way, we have represented through numbers both the input and the output. Then, we have to transform this representation of the input and the output with the one-hot-encoding representation.

This means that the intput and the output have to be represented as vectors of the same dimension of the vocabulary, where all the elements in the vectors are zero, except for the one which represents the word.


In [16]:
def get_input_tensor(indices : torch.LongTensor, vocab_size : int):
    '''Transform 1D tensor of word indexes to one-hot encoded 2D tensor'''
    size = indices.size(0)
    # batch size, vocab size
    one_hot_encoded_input = torch.zeros(size, vocab_size).scatter_(1, indices.unsqueeze(1), 1.)
    # print(one_hot_encoded_input)
    return one_hot_encoded_input

In [17]:
for i,x in enumerate(DataLoader(train_emb.Input.values, batch_size=train_emb.shape[0])):
  tmp = get_input_tensor(x, vocab_size=dim_vocabulary)
  break
# transform a tensor to a list
print(tmp.tolist())

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In between the input and output, there is an hidden layer we choose. The length of the hidden layer gives the dimension of the embedding vectors.

The most interesting part of this network are the weights in between the hidden layer and the two other layers: the input and the output.

The multiplication of the one-hot encoded vector with the matrix of weights will activate the only row which corresponds to the $1$ in the input vector.


# Question 2
Try to do the appropriate modifications to this notebook to use the GPU. (Tip: see the introductory notebooks about PyTorch, specifically 2_Introduction_to_Pytorch.ipynb.

# Question 3
Implement SkipGram model by defining an appropriate class which extends `nn.Module` and train it on the data.

# Question 4
Variate the embedding dimension in the range `[2:5]` and try to choose the best embedding dimension. Explain why you chose that dimension.


In [None]:
dim_embedding = "...COMPLETE HERE..."

In [None]:
import torch.nn as nn

class SkipGram(nn.Module):
  def __init__(self, ...):
    super(SkipGram, self).__init__()

    # choose the correct class variables
    "...COMPLETE HERE..."

  def forward(self,x):
    # implement this function
    "...COMPLETE HERE..."
    return x

In [None]:
# make an instance of the model
model = SkipGram("...COMPLETE HERE...")
print(model)

In [None]:
# function to get all the parameters
for i,p in enumerate(model.parameters()):
  print(p.shape)

In [None]:
# function to get the parameters of the model that have a name
for i,p in enumerate(model.named_parameters()):
  print(p)

# Definition of the loss function
We want to predict the context given a word. Thus, we want to maximise the following equation:
$max \prod_{center}\prod_{context}P(context|center; \theta)$.

Thus, we want to minimise
$-min \prod_{center}\prod_{context}P(context|center; \theta)$.

By using the $log(\cdot)$, we can sum up the elements in the product:
$-min \sum_{center}\sum_{context}log\left(P(context|center; \theta)\right)$.


Now, let's define $P(context|center; \theta)$:

$P(context|center; \theta)=$
$\frac{exp(u^T_{context}v_{center})}{\sum_{\omega \in vocab.}exp(u^T_{\omega}v_{center})}$

In [None]:
num_epochs = 2000
lr = 1e-1
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = lr)

In [None]:
model.train()
for e in range(num_epochs):
    for x,y in zip(DataLoader(train_emb.Input.values, batch_size=train_emb.shape[0]), DataLoader(train_emb.Output.values, batch_size=train_emb.shape[0])):
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # one-hot encode input tensor
        input_tensor = get_input_tensor(x, vocab_size=dim_vocabulary) #shape N*V
        # compute the predictions
        y_pred = model(input_tensor)
        #compute loss
        loss = criterion(y_pred, y)
        # bakpropagation step
        loss.backward()
        optimizer.step()

    if e%100 == 0:
        print(f'Epoch {e}, loss = {loss}')

Let's have a look to our embeddings.

# Question 5
Plot the original embeddings and the trained ones and explain the difference between the two.

To do so use the function `svd = decomposition.TruncatedSVD(n_components=2)` of the `scikit-learn` package.

Using the SVD decomposition, for example using the Principal Component Analysis (PCA), we can see the embeddings in 2D vector space.


In [None]:
# transform the parameters to numpy vectors
# using numpy() function
W1 = "...COMPLETE HERE..."
W2 = "...COMPLETE HERE..."

# print the dimensions
print(W1.shape)
print(W2.shape)

In [None]:
# take the untrained matrix parameters (the untrained ones)
untrained_model = SkipGram("...COMPLETE HERE...")
W1_untrained = "...COMPLETE HERE..."
W2_untrained = "...COMPLETE HERE..."

In [None]:
from sklearn import decomposition
import seaborn as sns

svd = decomposition.TruncatedSVD(n_components=2)
W1_untrained_dec = svd.fit_transform(W1_untrained)
x = "...COMPLETE HERE..."
y = "...COMPLETE HERE..."
plot = sns.scatterplot(x=x, y=y)

for i in range(0,W1_untrained_dec.shape[0]):
     plot.text(x[i], y[i], list(vocabulary.keys())[i], horizontalalignment='center', size='small', color='black', weight='semibold');

In [None]:
from sklearn import decomposition
import seaborn as sns

svd = decomposition.TruncatedSVD(n_components=2)
W1_dec = svd.fit_transform(W1)
x = "...COMPLETE HERE..."
y = "...COMPLETE HERE..."
plot = sns.scatterplot(x=x, y=y)

for i in range(0,W1_dec.shape[0]):
     plot.text(x[i], y[i], list(vocabulary.keys())[i], horizontalalignment='center', size='small', color='black', weight='semibold');

In [None]:
W2_dec = svd.fit_transform(W2)
x = "...COMPLETE HERE..."
y = "...COMPLETE HERE..."
plot1 = sns.scatterplot(x=x, y=y)
for i in range(0,W2_dec.shape[0]):
     plot1.text(x[i], y[i], list(vocabulary.keys())[i], horizontalalignment='center', size='small', color='black', weight='semibold');