In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import matplotlib.pyplot as plt
%matplotlib inline

import collections
import random
import pandas as pd
import numpy as np

# INF554 - Machine and Deep Learning
## Lab 8 - Working with Text, Regularization and Recurrent Neural Networks

In this lab we will be making some *sentiment analysis* with textual data from the IMDB dataset. In its simplest form, the *sentiment analysis* task consists in predicting the polarity of a text as positive or negative.

First of all, let's take a look at the steps of text processing:

1. Tokenization: break sentence into individual words
    - Before: `"PyTorch seems really easy to use!"`
    - After: `["PyTorch", "seems", "really", "easy", "to", "use", "!"]`
2. Building vocabulary: build an index of words associated with unique numbers
    - Before: `["PyTorch", "seems", "really", "easy", "to", "use", "!"]`
    - After: `{"Pytorch: 0, "seems": 1, "really": 2, ...}`
3. Convert to numerals: map words to unique numbers (indices)
    - Before: `{"Pytorch: 0, "seems": 1, "really": 2, ...}`
    - After: `[0, 1, 2, ...]`
4. Embedding look-up: map sentences (indices now) to fixed matrices
    - ```[[0.1, 0.4, 0.3],
       [0.8, 0.1, 0.5],
       ...]```
     

First of all, let's download the IMDB dataset (~85MB) and create the train, test and validation sets:

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

data = pd.read_csv('./data/IMDB Dataset.csv')
data['label']=data['sentiment'].replace(['positive', 'negative'], [1,0])


from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.5, random_state=1337)

train, valid = train_test_split(train, test_size=0.3, random_state=1337)

print(len(train), len(test), len(valid))
train.head()

17500 25000 7500


Unnamed: 0,review,sentiment,label
39913,Domestic Import was a great movie. I laughed t...,positive,1
13622,Midnight Madness is a movie that is unfortunat...,positive,1
875,Let me first off say that I am a believer of g...,negative,0
38725,I think you would have to be from the USA to g...,positive,1
38077,This is no doubt one of the worst movies I hav...,negative,0


Now we build the *vocabulary* (step 2 above), mapping every word into a numerical id. We consider only the text in the training set for this step. We limit the vocabulary to the 1000 most frequent words.

In [3]:
import torchtext
from torchtext.data import get_tokenizer
from collections import Counter

tokenizer = get_tokenizer("basic_english")

# Build vocabulary
words=[]
num_words = 1000

for text in train['review']:
    tokens=tokenizer(text)
    words.extend(tokens)

top_1k = dict(Counter(words).most_common(1000))
vocab = torchtext.vocab.vocab(top_1k, specials = ['<unk>', '<pad>'])
#uncomment the following commented lines if you're using torchtext version 0.11
#older versions do not support this way to build a vocabulary
#vocab = torchtext.vocab.vocab(top_1k)
#vocab.append_token('<unk>')
#vocab.append_token('<pad>')
vocab.set_default_index(vocab['<unk>']) #default index used when an unknown words is found

print(train['review'].iloc[0])
print(vocab.forward(tokenizer(train['review'].iloc[0]))) #example of how a sentence is transformed into a sequence of numerical IDs

Domestic Import was a great movie. I laughed the whole time. It was funny on so many levels from the crazy outfits to the hilarious situations. The acting was great. Alla Korot, Larry Dorf, Howard Hesseman, and all the others did an awesome job. Because it is an independent film written by a first-time writer, it doesn't have the clichés that are expected of other comedies, which was such a relief. It was a unique and interesting and you fall in love with the characters and the heart-warming story. I heard it was based on a true story? If so, then that is hilarious (and amazing!). I highly recommend this movie.
[0, 0, 17, 6, 83, 21, 3, 13, 0, 2, 223, 68, 3, 11, 17, 164, 27, 43, 115, 0, 44, 2, 905, 0, 8, 2, 572, 0, 3, 2, 128, 17, 83, 3, 0, 0, 4, 0, 0, 4, 0, 0, 4, 5, 38, 2, 402, 121, 41, 0, 296, 3, 95, 11, 10, 41, 0, 23, 444, 40, 6, 0, 746, 4, 11, 159, 9, 28, 33, 2, 0, 15, 31, 844, 7, 84, 0, 4, 69, 17, 150, 6, 0, 3, 11, 17, 6, 983, 5, 229, 5, 26, 798, 12, 118, 19, 2, 111, 5, 2, 0, 73, 3,

We now transform all reviews to vectors of word IDs. We also need to make all reviews of the same length. We set max_length to 80, therefore we'll cut reviews that exceed 80 words and pad those that are shorter.

In [4]:
max_len=80

def vectorize_sentences(reviews, max_len):
    vectors=[]
    for text in reviews:
        tokens=tokenizer(text)
        v=vocab.forward(tokens)
        if len(v) > max_len : v = v[:max_len]
        if len(v) < max_len : #padding
            tmp = np.full(max_len, vocab['<pad>'])
            tmp[0:len(v)]=v 
            v = tmp
        vectors.append(np.array(v))
    return np.array(vectors)


train_X = vectorize_sentences(train['review'], max_len)
test_X = vectorize_sentences(test['review'], max_len)
val_X = vectorize_sentences(valid['review'], max_len)

train_y = np.array(train['label']).reshape(-1,1)
test_y = np.array(test['label']).reshape(-1,1)
val_y = np.array(valid['label']).reshape(-1,1)


Now, we create and load the batches for training:

In [5]:
# define batch size
batch_size = 64

# create tensor datasets
trainset = TensorDataset(torch.from_numpy(train_X).to(device), torch.from_numpy(train_y).float().to(device))
validset = TensorDataset(torch.from_numpy(val_X).to(device), torch.from_numpy(val_y).float().to(device))
testset = TensorDataset(torch.from_numpy(test_X).to(device), torch.from_numpy(test_y).float().to(device))

# create dataloaders
train_loader = DataLoader(trainset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(validset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(testset, shuffle=True, batch_size=batch_size)

We introduce now a fully connected NN for text classification

In [6]:
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, max_len):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        # Linear function
        self.fc1 = nn.Linear(max_len*embedding_dim, hidden_dim) 
        
        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # Embedding
        embedded = self.embedding(x)
        embedded = embedded.view(-1, max_len*embedding_dim)
        # Linear function
        out = self.fc1(embedded)

        # Non-linear activation function
        out = torch.relu(out)

        # Here we use a final sigmoid function as we are using BCE loss which does not implement it
        out = self.fc2(out)
        out = torch.sigmoid(out)
    
        return out

In [7]:
input_dim = num_words + 2 #add 2 for <unk> and <pad> symbols
embedding_dim = 100
hidden_dim = 32
output_dim = 1

# Instantiate model class and assign to object
model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim, max_len)

# Push model to CUDA device if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function
criterion = nn.BCELoss()

# Optimizer
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)

Natural Language Processing problems are affected by the problem of features that occur infrequently. E.g., it is a lot less likely that we will see the word *phantasmagorical* than the word *reading*. Parameters associated with infrequent features only receive meaningful updates whenever these features occur. Given a decreasing learning rate we might end up in a situation where the parameters for common features converge rather quickly to their optimal values, whereas for infrequent features we are still short of observing them sufficiently frequently before their optimal values can be determined. In other words, the learning rate either decreases too slowly for frequent features or too quickly for infrequent ones.

A solution to this problem are *adaptive learning rates*. For instance, **Adagrad** is an optimization technique based on the accumulation of squared gradients.

Here we use the notation $\mathbf{g}_t = \nabla_{\mathbf{w}} l(y_t, f(\mathbf{x}_t, \mathbf{w}))$ for the gradient of the loss function at time $t$. We use the variable $\mathbf{s}_t$ to accumulate past gradient variance.

$$\begin{aligned}
    \mathbf{s}_t & = \mathbf{s}_{t-1} + \mathbf{g}_t^2, \\
    \mathbf{w}_t & = \mathbf{w}_{t-1} - \frac{\eta}{\sqrt{\mathbf{s}_t + \epsilon}} \odot \mathbf{g}_t.
\end{aligned}$$

However Adagrad has some issues and for our case we are going to use an improved version called <a href="https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html">**RMSprop**</a>. It uses a moving average of squared gradients to normalize the gradient. This normalization balances the step size (momentum), decreasing the step for large gradients to avoid exploding and increasing the step for small gradients to avoid vanishing:

$$\begin{aligned}
    \mathbf{s}_t & \leftarrow \gamma \mathbf{s}_{t-1} + (1 - \gamma) \mathbf{g}_t^2, \\
    \mathbf{w}_t & \leftarrow \mathbf{w}_{t-1} - \frac{\eta}{\sqrt{\mathbf{s}_t + \epsilon}} \odot \mathbf{g}_t.
\end{aligned}$$

The constant $\epsilon > 0$ is typically set to $10^{-6}$ to ensure that we do not suffer from division by zero or overly large step sizes. $\gamma$ is usually set to 0.9.

The following block sets up the network with <a href="https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html">Binary Cross Entropy</a> loss and RMSprop optimizer.

In [8]:
# Number of groups of parameters
print('Number of groups of parameters {}'.format(len(list(model.parameters()))))
print('-'*50)
# Print parameters
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())
print('-'*50)

Number of groups of parameters 5
--------------------------------------------------
torch.Size([1002, 100])
torch.Size([32, 8000])
torch.Size([32])
torch.Size([1, 32])
torch.Size([1])
--------------------------------------------------


The following function carries out the training

In [9]:
def train_model(model, optizmizer, loss_criterion):
    iter = 0
    num_epochs = 10
    history_train_acc, history_val_acc, history_train_loss, history_val_loss = [], [], [], []
    best_accuracy = 0
    for epoch in range(num_epochs):
        for i, (samples, labels) in enumerate(train_loader):
            # Training mode
            model.train()

            # Load samples
            samples = samples.view(-1, max_len).to(device)
            labels = labels.view(-1, 1).to(device)

            # Clear gradients w.r.t. parameters
            optimizer.zero_grad()

            # Forward pass to get output/logits
            outputs = model(samples)

            # Calculate Loss: softmax --> cross entropy loss
            loss = loss_criterion(outputs, labels)

            # Getting gradients w.r.t. parameters
            loss.backward()

            # Updating parameters
            optimizer.step()

            iter += 1

            if iter % 100 == 0:
                # Get training statistics
                train_loss = loss.data.item()

                # Testing mode
                model.eval()
                # Calculate Accuracy         
                correct = 0
                total = 0
                # Iterate through test dataset
                for samples, labels in valid_loader:
                    # Load samples
                    samples = samples.view(-1, max_len).to(device)
                    labels = labels.view(-1).to(device)

                    # Forward pass only to get logits/output
                    outputs = model(samples)

                    # Val loss
                    val_loss = criterion(outputs.view(-1, 1), labels.view(-1, 1))

                    predicted = outputs.ge(0.5).view(-1)

                    # Total number of labels
                    total += labels.size(0)

                    # Total correct predictions
                    correct += (predicted.type(torch.FloatTensor).cpu() == labels.type(torch.FloatTensor)).sum().item()
                    # correct = (predicted == labels.byte()).int().sum().item()

                accuracy = 100. * correct / total

                # Print Loss
                print('Iter: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(iter, train_loss, val_loss.item(), round(accuracy, 2)))

                # Append to history
                history_val_loss.append(val_loss.data.item())
                history_val_acc.append(round(accuracy, 2))
                history_train_loss.append(train_loss)

                # Save model when accuracy beats best accuracy
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    # We can load this best model on the validation set later
                    torch.save(model.state_dict(), 'best_model.pth')
    return (history_train_acc, history_val_acc, history_train_loss, history_val_loss)



This function plots the losses for the testing and validation sets:

In [10]:
def plot_losses(history_train_loss, history_val_loss):
    # Set plotting style
    #plt.style.use(('dark_background', 'bmh'))
    plt.style.use('bmh')
    plt.rc('axes', facecolor='none')
    plt.rc('figure', figsize=(16, 4))

    # Plotting loss graph
    plt.plot(history_train_loss, label='Train')
    plt.plot(history_val_loss, label='Validation')
    plt.title('Loss Graph')
    plt.legend()
    plt.show()

Let's observe the behaviour of the model:

In [11]:
(train_acc, val_acc, train_loss, val_loss) = train_model(model, optimizer, criterion)

Iter: 100 | Train Loss: 0.6810848712921143 | Val Loss: 0.6830800771713257 | Val Accuracy: 51.77
Iter: 200 | Train Loss: 0.7062846422195435 | Val Loss: 0.7088586688041687 | Val Accuracy: 55.13
Iter: 300 | Train Loss: 0.5619038343429565 | Val Loss: 0.5862455368041992 | Val Accuracy: 56.16
Iter: 400 | Train Loss: 0.6861127018928528 | Val Loss: 0.8024255633354187 | Val Accuracy: 58.43
Iter: 500 | Train Loss: 0.5844285488128662 | Val Loss: 0.5503729581832886 | Val Accuracy: 59.95
Iter: 600 | Train Loss: 0.42505693435668945 | Val Loss: 0.506284236907959 | Val Accuracy: 60.69
Iter: 700 | Train Loss: 0.5001639127731323 | Val Loss: 1.0521844625473022 | Val Accuracy: 60.71
Iter: 800 | Train Loss: 0.3493354320526123 | Val Loss: 0.8583499789237976 | Val Accuracy: 61.29
Iter: 900 | Train Loss: 0.2788204848766327 | Val Loss: 0.5286469459533691 | Val Accuracy: 61.79
Iter: 1000 | Train Loss: 0.22440889477729797 | Val Loss: 0.4389880299568176 | Val Accuracy: 62.29
Iter: 1100 | Train Loss: 0.13953074812

In [12]:
plot_losses(train_loss, val_loss)

KeyboardInterrupt: 

> **Ques 1**: What can you deduce from this graph? Is the network working as expected?

Regularization techniques are required to reduce overfitting (i.e. the predictor fits too closely to the training data and does not generalize well to new data). Overfitting tends to occur when we have little data and a complex hypothesis class.

A first idea for regularization is to introduce a *penalty term*, which makes it harder for the optimizer to return an overly flexible predictor.

We will start by adding a regularization term to our (point-wise) loss:

$$ loss_R = loss + \frac{\lambda}{2}\left\Vert W\right\Vert _{2}^{2}$$

The gradient for the regularized loss is therefore:

$$\nabla_W loss_R = \nabla_W loss + \lambda W$$

With pytorch, adding a regularization term is done on the optimization function by adding the parameter *weight_decay* (corresponding to the above $\lambda$) to the call to the optimizer.

> **Task 1**: set the optimizer *weight_decay* to 0.005 and re-train the model, studying its behaviour

You can use the following function to take a look at the parameters of the model before and after regularization:

In [None]:
def look_parameters(mdl):
    weights = torch.Tensor().to(device)
    for param_group in list(mdl.parameters()):
        weights = torch.cat((param_group.view(-1), weights))
    ws = weights.detach().cpu().numpy()
    plt.hist(ws.reshape(-1), range=(-.5, .5), bins=501)

look_parameters(model)

In [None]:
# Instantiate model class and assign to object
model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim, max_len)
model.to(device)

#enter here the code for Task 1
optimizer = 

(train_acc, val_acc, train_loss, val_loss) = train_model(model, optimizer, criterion)

In [None]:
plot_losses(train_loss, val_loss)
look_parameters(model)

## Dropout

Another solution for regularization in neural network consists in applying **Dropout**: at each step of training, a new subnetwork is selected. As a result an adaptation appears in the final network only if it exists in a sufficient part of the training data.

<img src="figures/dropout.png" alt="dropout" width="400"/><font size="1">Image from (Srivastava et al. ,2014) Dropout: A Simple Way to Prevent Neural Networks from
Overfitting, JMLR</font>

We need to introduce a probability $p$ of deactivating a neuron in the hidden layers. So, if we have $n$ neurons in layer $l$, we expect to have $n * (1-p)$ active neurons.

Therefore the output of a generic layer becomes:

$$ z^l = \alpha(W_l^\top \delta z^{l-1} + b_l) $$

with $\delta \sim Bernoulli(p)$ and $\alpha(x)$ an activation function.

> **Task 2** : add <a href="https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html">dropout</a> to the neural network above, between the two fully connected layers, with p=0.8; re-train the new network and observe the result (note: use the optimizer without weight_decay).

In [None]:
class FeedforwardNeuralNetModelWithDropout(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, max_len):
        super(FeedforwardNeuralNetModelWithDropout, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        # Linear function
        self.fc1 = nn.Linear(max_len*embedding_dim, hidden_dim) 
        
        #insert here your code
        
        
        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # Embedding
        embedded = self.embedding(x)
        
        #print("embedding for word ", vocab.lookup_token(83))
        #print(self.embedding(torch.Tensor([83]).int()))
        
        embedded = embedded.view(-1, max_len*embedding_dim)
        # Linear function
        out = self.fc1(embedded)

        # Non-linearity
        out = torch.relu(out)
        
        #insert here your code
        
        
        # Take note here use a final sigmoid function so your loss should not go through sigmoid again as we are using BCE loss.
        out = self.fc2(out)
        out = torch.sigmoid(out)
    
        return out


In [None]:
#insert here your code
# Instantiate model class and assign to object
model_dropout = 

model_dropout.to(device)

# Optimizer
optimizer = 

(train_acc, val_acc, train_loss, val_loss) = train_model(model_dropout, optimizer, criterion)

In [None]:
#train your model here
plot_losses(train_loss, val_loss)

## Using LSTM to classify textual data

A fully connected neural network is not an ideal way of processing textual data. Recurrent Neural Networks are more useful for this task since they are more adequate to work on sequential data such as text.

A particular type of RNN is the Long Short-Term Memory, or <a href="https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html">LSTM</a>.

> **Task 3** : complete the following code to implement a single layer, unidirectional LSTM with a fully connected layer taking the final hidden states of the LSTM as input. Note that you have to set the parameter *batch_first*=True for the LSTM layer as our dataset comes in this format.


In [None]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        """
        vocab_size: (int) size of the vocabulary - required by embeddings
        embed_dim: (int) size of embeddings
        hidden_dim: (int) number of hidden units
        num_class: (int) number of classes
        """
        super().__init__()
        self.hidden_dim=hidden_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        #enter here your code
        
       

    def forward(self, text):
        r"""
        Arguments:
            text: 1-D tensor representing a bag of text tensors
        """
        #ENTER HERE YOUR CODE
        
        return out


> **Task 4**: run the LSTM model on the IMDB dataset and verify whether it is overfitting or not. 

In [None]:
#insert your code here

lstm = 
lstm.to(device)

optimizer = 

(train_acc, val_acc, train_loss, val_loss) = train_model(lstm, optimizer, criterion)

plot_losses(train_loss, val_loss)

> **Task 5**: implement a dropout layer between the LSTM and the fully connected layer 

### A look at Word Embeddings

The word embeddings calculated during training can be extracted from the models and they can be used to calculate a semantic similarity between words.

> **Task 6** : include the following code into the LSTM implementation

```
def get_embedding_for(self, w):
    idx = vocab.lookup_indices([w])
    return self.embedding(torch.Tensor(idx).int())
````
> and test the following block:

you should see that words that are intuitively semantically closer have higher similarity scores (according to cosine similarity)

In [None]:
v1=(lstm.get_embedding_for("great").detach().numpy()).reshape(1,-1)
v2=(lstm.get_embedding_for("bad").detach().numpy()).reshape(1,-1)
v3=(lstm.get_embedding_for("awful").detach().numpy()).reshape(1,-1)

from numpy import dot
from numpy.linalg import norm

cos_sim = dot(v1, v2.T)/(norm(v1)*norm(v2))

print("cosine sim between great and bad", cos_sim)

cos_sim = dot(v2, v3.T)/(norm(v2)*norm(v3))

print("cosine sim between bad and awful", cos_sim)

cos_sim = dot(v1, v3.T)/(norm(v1)*norm(v3))

print("cosine sim between great and awful", cos_sim)