# General Note
To guide you through the homework, we have put "...COMPLETE HERE..." placeholders wherever you are expected to fill in the missing code.

# Word-embeddings
Word embeddings are very important in Natural Language Processing (NLP). Ready-to-use solutions (e.g., *Glove* and many others) are useful and relatively efficient.
*Skip-gram* works *empirically worse* than other types of word embeddings. Moreover, it takes a long time to fit.

In [None]:
#https://pythonspot.com/nltk-stop-words/

import torch
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
import string
import re

import seaborn as sns

# Cloning the git repository
We clone the Git repository mainly to get the data folder it contains.

In [None]:
!git clone https://github.com/sarafrr/pytorch-tutorial-nlp-public.git

After moving the `data` folder out of the cloned repository (the following cells expect it at `/content/data`), let's remove the repository.

In [None]:
!rm -r pytorch-tutorial-nlp-public

In [None]:
# from the folder containing the data, obtain the list of all the files
from glob import glob
file_list = glob( "/content/data/Shakespeare/*.txt")

print(file_list)

The skip-gram model tries to *predict the context given a word*.

In [None]:
# download stopwords
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
stop_words = set(stopwords.words('english'))# e.g.: a, in, is
print(stop_words)
punctuation = set(string.punctuation)
print(punctuation)

In [None]:
# import nltk.data
# text = '''
# Punkt knows that the periods in Mr. Smith and Johann S. Bach
# do not mark sentence boundaries.  And sometimes sentences
# can start with non-capitalized words.  i is a good variable
# name.
# '''
# sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# out = sent_detector.tokenize(text.strip())


In [None]:
# print(type(out))
# out[:3]

In [None]:
def prepare_corpus(file_list : list, min_size : int = 1):
  '''Read the text files and return the corpus as a list of sentences.

  Args:
    file_list: paths of the plain-text files to read.
    min_size: context size the corpus is meant for; only sentences with
      more than ``2 * min_size + 1`` whitespace-separated words are kept.

  Returns:
    A list of strings, one per retained sentence, in reading order.
  '''
  texts = []
  for path in file_list:
    with open(path, 'r') as f:
      texts.append(f.read())
  # Join the files with a newline (which is also a sentence delimiter below)
  # so that the last sentence of a file that lacks a trailing newline is not
  # fused with the first sentence of the next file.
  complete_text = '\n'.join(texts)

  # Cheap sentence splitting on punctuation marks and line breaks. nltk's
  # punkt sentence tokenizer would be the more accurate way to do it, but it
  # takes a lot of time on this corpus.
  sentences = re.split(r'[.,!?\n]', complete_text)

  # filter the sentences by their number of words
  min_words = min_size * 2 + 1
  return [s for s in sentences if len(s.split()) > min_words]

def preprocess(corpus):
  '''Lowercase every sentence and drop stopwords and punctuation tokens.

  Relies on the module-level sets ``stop_words`` and ``punctuation``.
  Note that ``punctuation`` holds single characters only, so a token is
  removed as punctuation only when it is exactly one of those characters.

  Args:
    corpus: iterable of sentences (strings).

  Returns:
    A list with one string per input sentence: its remaining tokens joined
    by single spaces.
  '''
  # Build the exclusion set once instead of once per token.
  excluded = stop_words | punctuation
  processed = []
  for sentence in corpus:
    # make capital letters lowercase before filtering
    tokens = [t.lower() for t in nltk.word_tokenize(sentence)]
    processed.append(" ".join(t for t in tokens if t not in excluded))
  return processed

corpus = prepare_corpus(file_list=file_list, min_size = 2)
processed_corpus = preprocess(corpus)

n_sentences = 2
print('Original corpus')
print(corpus[:n_sentences])
print('Processed corpus')
print(processed_corpus[:n_sentences])

Create the vocabulary on the processed corpus.

In [None]:
vocab = set()
for l in processed_corpus:
  words = l.split()
  vocab |= set(words)

In [None]:
def w_to_i(vocab : set) -> dict:
  '''Map every word of the vocabulary to a unique integer index.

  The words are sorted before being numbered: the iteration order of a set
  of strings can differ between interpreter runs, which would otherwise
  give a different word -> index mapping on every run.

  Args:
    vocab: collection of distinct words.

  Returns:
    A dict mapping each word to an index in ``0 .. len(vocab) - 1``.
  '''
  return {word: index for index, word in enumerate(sorted(vocab))}

In [None]:
# turn the set of words into a word -> index dictionary and report its size
vocab = w_to_i(vocab)
print(vocab)
dim_vocab = len(vocab)
print(f'The vocabulary has dimension {dim_vocab}')

# Question 1
Choose the most appropriate context size for building the dataset, i.e. the parameter `n_gram` of the function `prepare_set()`, and explain why you chose that value.

In [None]:
from tqdm import tqdm

def prepare_set(corpus, n_gram = 2):
    '''Create the (center word, context word) training pairs.

    For every word of every sentence, one row is emitted for each
    neighbouring word at distance 1 .. n_gram, first looking back and then
    looking forward for each distance.

    Args:
        corpus: iterable of sentences (strings of space-separated words).
        n_gram: context size, i.e. how many words to look at on each side.

    Returns:
        A pandas DataFrame with the columns 'Input' (center word) and
        'Output' (context word), one row per pair.
    '''
    columns = ['Input', 'Output']
    # Collect the pairs in a plain list and build the DataFrame once at the
    # end: concatenating a one-row DataFrame per pair copies the whole
    # result every time and becomes quadratic in the number of pairs.
    pairs = []
    for sentence in tqdm(corpus):
        words = sentence.split()
        for i, inp in enumerate(words):
            for n in range(1, n_gram + 1):
                # look back
                if i - n >= 0:
                    pairs.append((inp, words[i - n]))
                # look forward
                if i + n < len(words):
                    pairs.append((inp, words[i + n]))
    return pd.DataFrame(pairs, columns = columns)

In [None]:
n_gram = '''COMPLETE HERE''' # context size
train_emb = prepare_set(processed_corpus, n_gram)
train_emb.head(10)

In [None]:
# from words to indices
# function map to apply to Pandas DataFrames
train_emb.Input = train_emb.Input.map(vocab)
train_emb.Output = train_emb.Output.map(vocab)
print(train_emb.head(10))

In [None]:
batch_size = 10
train_loader_inp = DataLoader(train_emb.Input.values, batch_size=batch_size)
train_loader_out = DataLoader(train_emb.Output.values, batch_size=batch_size)

In [None]:
# check that the batches yielded by the input loader have the expected shape
# (only train_loader_inp / train_loader_out are defined above)
for x in train_loader_inp:
  print(x.shape)
  break

torch.Size([10])


In this way, both the input and the output are represented by integer indices. Next, we have to convert these indices into a one-hot encoded representation.

This means that the input and the output are represented as vectors with the same dimension as the vocabulary, in which all the elements are zero except for the one that corresponds to the word.


In [None]:
def get_input_tensor(indices : torch.Tensor, vocab_size : int):
    '''Transform a 1D tensor of word indices into a one-hot encoded 2D tensor.

    Args:
        indices: 1D integer tensor of word indices, each expected to lie in
            ``[0, vocab_size)``.
        vocab_size: number of columns of the result.

    Returns:
        A float tensor of shape ``(len(indices), vocab_size)`` on the same
        device as ``indices``; row ``k`` is all zeros except for a 1 in
        column ``indices[k]``.
    '''
    batch_size = indices.size(0)
    # Allocate directly on the device of the indices, whatever it is, so the
    # function works on the CPU and on any GPU rather than only on 'cuda:0'.
    one_hot_encoded_input = torch.zeros(batch_size, vocab_size,
                                        device=indices.device)
    # scatter_(dim, index, value): along dim=1 (the columns of each row),
    # write `value` at the positions given by `index`; unsqueeze(1) turns
    # the indices into a (batch_size, 1) column, i.e. one position per row.
    one_hot_encoded_input.scatter_(1, indices.unsqueeze(1), 1.)
    return one_hot_encoded_input

In [None]:
# check that the function computing the one-hot encoding works correctly
# and returns its result on the same device as its input
# (use the GPU when there is one, otherwise run the check on the CPU)
check_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
for x in train_loader_inp:
  print(x.size())
  x = x.to(check_device)
  print(x.device)
  tmp = get_input_tensor(x, vocab_size=dim_vocab)
  print(tmp.device)
  break

In [None]:
# another useful function: tolist() converts a tensor into (nested) Python lists
print(tmp.tolist())
tmp.size()
# item() returns the value of a one-element tensor as a plain Python number
# (it is generally used to read the value of the loss), whereas tolist()
# works on tensors of any shape and keeps their nesting
# NOTE: torch.Tensor(1) would allocate an *uninitialised* tensor with one
# element, not a tensor holding the value 1, hence torch.tensor([1.]) here
tmp1 = torch.tensor([1.])
elem = tmp1.item()
elem1 = tmp1.tolist()

print(elem)
print(elem1)

Between the input and the output there is a hidden layer whose size we choose. The *length of the hidden layer* gives the *dimension of the embedding vectors*.

The most interesting part of this network are the *weights in between the hidden layer and the two other layers*: the input and the output.

Multiplying the one-hot encoded vector by the matrix of weights selects the single row of the matrix that corresponds to the $1$ in the input vector.


# Question 2
Implement SkipGram model by defining an appropriate class which extends `nn.Module` and train it on the data.

# Question 3
Vary the embedding dimension over the values `{5,10,100}` and try to choose the best one. Explain why you chose that dimension.


In [None]:
dim_embedding = '''COMPLETE HERE'''

device = torch.device('cuda:0')

In [None]:
import torch.nn as nn

# Homework skeleton of the skip-gram network: every '''COMPLETE HERE''' marker
# below is a placeholder to be replaced with your own code (Question 2).
# A later cell instantiates the class as
# SkipGram(dim_voc=dim_vocab, emb_dim=dim_embedding), so the constructor is
# expected to accept the keyword arguments `dim_voc` and `emb_dim`.
class SkipGram(nn.Module):
  # constructor: declare here the layers (weights) of the network
  def __init__(self,'''COMPLETE HERE'''):
    super(SkipGram, self).__init__()
    '''COMPLETE HERE'''

  # forward pass: the training loop calls it on the one-hot encoded batch
  # and feeds the returned value, together with the target word indices, to
  # torch.nn.CrossEntropyLoss
  # NOTE(review): that loss presumably expects raw scores (one per
  # vocabulary word), i.e. no softmax applied here - confirm in the docs
  def forward(self,x):
    '''COMPLETE HERE'''
    return x

In [None]:
model = SkipGram('''COMPLETE HERE''')

print(model)

In [None]:
for i,p in enumerate(model.parameters()):
  print(p.shape)

# Definition of the loss function
We want to predict the context given a word. Thus, we want to maximise the following likelihood:
$\max_{\theta} \prod_{center}\prod_{context}P(context|center; \theta)$.

Equivalently, we want to minimise its negative:
$\min_{\theta} -\prod_{center}\prod_{context}P(context|center; \theta)$.

Since $\log(\cdot)$ is monotonically increasing, we can take the logarithm of the likelihood without changing the optimum, which turns the products into sums:
$\min_{\theta} -\sum_{center}\sum_{context}\log\left(P(context|center; \theta)\right)$.


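Now, let's define $P(context|center; \theta)$ as a softmax over the vocabulary, where $v_{center}$ is the embedding vector of the center word and $u_{\omega}$ is the output vector of the word $\omega$:

$P(context|center; \theta)=$
$\frac{\exp(u^T_{context}v_{center})}{\sum_{\omega \in vocab}\exp(u^T_{\omega}v_{center})}$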

In [None]:
# hyper-parameters, loss and optimizer used by the training loop
num_epochs = 10
lr = 1e-1
criterion = torch.nn.CrossEntropyLoss()
# move the model to the GPU *before* building the optimizer, so that the
# optimizer is created on the parameters that are actually trained
model = model.to('cuda:0')
optimizer = torch.optim.SGD(model.parameters(), lr = lr)

In [None]:
model.train()
# run the loop on whichever device the model currently lives on
train_device = next(model.parameters()).device
for e in range(num_epochs):
    epoch_loss = 0.0
    n_batches = 0
    # the two loaders are built without shuffling, so zipping them keeps
    # every input batch aligned with its target batch
    for x,y in zip(train_loader_inp, train_loader_out):
        x = x.to(train_device)
        y = y.to(train_device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # one-hot encode input tensor
        input_tensor = get_input_tensor(x, vocab_size=dim_vocab)

        # compute the predictions
        y_pred = model(input_tensor)
        # compute loss
        loss = criterion(y_pred, y)
        # backpropagation step
        loss.backward()
        optimizer.step()

        # item() extracts the plain Python number, detached from the graph
        epoch_loss += loss.item()
        n_batches += 1

    # report the mean loss over the whole epoch rather than the raw tensor
    # of the last mini-batch only
    print(f'Epoch {e}, loss = {epoch_loss / max(n_batches, 1):.4f}')

In [None]:
# function to get the parameters of the model that have a name
for i,p in enumerate(model.named_parameters()):
  print(p)

Let's have a look at our embeddings.

# Question 4
Plot the original embeddings and the trained ones and explain the difference between the two.

To do so, use the `TSNE` class of the `scikit-learn` package, e.g. `tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3)`.

Indeed, using T-distributed Stochastic Neighbor Embedding (or other similar techniques, such as PCA), we can visualise the embeddings in a 2D vector space. Explain the results considering the best plot you obtain with one of the two techniques.


In [None]:
W1 ='''COMPLETE HERE'''
W2 = '''COMPLETE HERE'''

print(W1.shape)
print(W2.shape)

In [None]:
orig_model = SkipGram(dim_voc=dim_vocab, emb_dim=dim_embedding)
# this model was never moved to the GPU, so its weight matrices are already
# on the CPU and do not have to be moved back
W1_orig = '''COMPLETE HERE'''
W2_orig = '''COMPLETE HERE'''

In [None]:
from sklearn.manifold import TSNE

# 2D t-SNE projection of the untrained W1, one labelled point per row
tsne = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3)
W1_orig_dec = tsne.fit_transform(W1_orig)
x = '''COMPLETE HERE'''
y = '''COMPLETE HERE'''
plot = sns.scatterplot(x=x, y=y)

# build the index -> word list once instead of once per plotted point
vocab_words = list(vocab)
for i in range(W1_orig_dec.shape[0]):
     plot.text(x[i], y[i], vocab_words[i], horizontalalignment='center', size='small', color='black', weight='semibold')


In [None]:
# 2D t-SNE projection of the trained W1, one labelled point per row
tsne = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3)
W1_dec = tsne.fit_transform(W1)
x = '''COMPLETE HERE'''
y = '''COMPLETE HERE'''
plot = sns.scatterplot(x=x, y=y)

# build the index -> word list once instead of once per plotted point
vocab_words = list(vocab)
# iterate over the rows of the array computed in this cell (W1_dec), not
# over the array of a previous cell
for i in range(W1_dec.shape[0]):
     plot.text(x[i], y[i], vocab_words[i], horizontalalignment='center', size='small', color='black', weight='semibold')

In [None]:
# 2D t-SNE projection of the trained W2, one labelled point per row
tsne = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3)
W2_dec = tsne.fit_transform(W2)
x = '''COMPLETE HERE'''
y = '''COMPLETE HERE'''
plot = sns.scatterplot(x=x, y=y)

# build the index -> word list once instead of once per plotted point
vocab_words = list(vocab)
# iterate over the rows of the array computed in this cell (W2_dec), not
# over the array of a previous cell
for i in range(W2_dec.shape[0]):
     plot.text(x[i], y[i], vocab_words[i], horizontalalignment='center', size='small', color='black', weight='semibold')

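Or using a linear projection: the cells below use `TruncatedSVD`, which is closely related to PCA but does not center the data.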

In [None]:
from sklearn import decomposition

# 2D truncated-SVD projection of the untrained W1, one labelled point per row
svd = decomposition.TruncatedSVD(n_components=2)
W1_orig_dec = svd.fit_transform(W1_orig)
x = '''COMPLETE HERE'''
y = '''COMPLETE HERE'''
plot = sns.scatterplot(x=x, y=y)

# build the index -> word list once instead of once per plotted point
vocab_words = list(vocab)
for i in range(W1_orig_dec.shape[0]):
     plot.text(x[i], y[i], vocab_words[i], horizontalalignment='center', size='small', color='black', weight='semibold')

In [None]:
from sklearn import decomposition
import seaborn as sns

# 2D truncated-SVD projection of the trained W1, one labelled point per row
svd = decomposition.TruncatedSVD(n_components=2)
W1_dec = svd.fit_transform(W1)
x = '''COMPLETE HERE'''
y = '''COMPLETE HERE'''
plot = sns.scatterplot(x=x, y=y)

# build the index -> word list once instead of once per plotted point
vocab_words = list(vocab)
for i in range(W1_dec.shape[0]):
     plot.text(x[i], y[i], vocab_words[i], horizontalalignment='center', size='small', color='black', weight='semibold')

In [None]:
# 2D truncated-SVD projection of the trained W2, one labelled point per row
# (reuses the `svd` object created in the previous cell)
W2_dec = svd.fit_transform(W2)
x = '''COMPLETE HERE'''
y = '''COMPLETE HERE'''
plot1 = sns.scatterplot(x=x, y=y)
# build the index -> word list once instead of once per plotted point
vocab_words = list(vocab)
for i in range(W2_dec.shape[0]):
     plot1.text(x[i], y[i], vocab_words[i], horizontalalignment='center', size='small', color='black', weight='semibold')