Task 4: Train an LSTM Model (40 points)
----
1. Using PyTorch, implement a neural network that uses one or more LSTM cells to do sentiment analysis. Use the nn.Embedding, nn.LSTM and nn.Linear layers to construct your model.
2. Note that sequence processing works differently with the PyTorch Embedding layer as compared to my sample code from class. The model input expects a padded tensor of token indices from the vocabulary, instead of one-hot encodings. For evaluation, use a vocabulary size of 10000 (max_features = 10000).
3. The model should have a single output with the sigmoid activation function for classification. The dimensions of the embedding layer and the hidden layer(s) are up to you, but please make sure your model does not take more than ~3 minutes to train.
4. Evaluate the model using PyTorch functions for average accuracy, area under the ROC curve and F1 score (see [torcheval](https://pytorch.org/torcheval/stable/)).

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# and finally the CPU. The MPS check is only reached when CUDA is absent.
if torch.cuda.is_available():
    _backend, _message = "cuda", "GPU is available"
elif torch.backends.mps.is_available():
    _backend, _message = "mps", "MPS is available"
else:
    _backend, _message = "cpu", "CPU used"

device = torch.device(_backend)
print(_message)

CPU used


In [3]:
def _read_split(path):
    """Read a headerless tab-separated file and keep its review/label columns."""
    frame = pd.read_csv(path, sep='\t', header=None, names=['id', 'review', 'label'])
    return frame[['review', 'label']]


# Training split
train_data_file = 'movie_reviews_train.txt'
train_df = _read_split(train_data_file)
X_train = train_df['review'].values
y_train = train_df['label'].values

# Development split
dev_data_file = 'movie_reviews_dev.txt'
dev_df = _read_split(dev_data_file)
X_dev = dev_df['review'].values
y_dev = dev_df['label'].values

# Test split
test_data_file = 'movie_reviews_test.txt'
test_df = _read_split(test_data_file)
X_test = test_df['review'].values
y_test = test_df['label'].values

In [4]:
# Matches every character that is not an ASCII letter. Compiled once because
# the function is called for every token of every document.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def preprocess_token(s):
    """Strip everything except ASCII letters from a single token.

    This cleans one token, not a whole sequence. Punctuation, digits,
    whitespace and non-ASCII characters are all removed; letter case is left
    untouched.

    Args:
        s: The raw token string.

    Returns:
        The token reduced to its ASCII letters; may be the empty string.
    """
    # A single pass is equivalent to removing non-alphanumerics, then digits,
    # then whitespace: whitespace never survives the first of those steps.
    return _NON_LETTERS.sub('', s)

def _encode_docs(docs, vocabulary):
    """Map each document to the vocabulary indices of its in-vocabulary tokens."""
    encoded = []
    for doc in docs:
        # Clean each raw token exactly once, then drop anything unknown.
        cleaned = (preprocess_token(token) for token in doc.lower().split())
        encoded.append([vocabulary[token] for token in cleaned if token in vocabulary])
    return encoded


def _left_pad(sequences, length):
    """Left-pad every sequence with zeros to ``length``; return an int64 array."""
    padded = np.zeros((len(sequences), length), dtype=np.int64)
    for row, seq in enumerate(sequences):
        if seq:
            padded[row, length - len(seq):] = seq
    return padded


def tokenize(x_train, x_dev, x_test, vocab_size):
    """Build a vocabulary from the training text and encode all three splits.

    Documents are lower-cased and split on whitespace, and every token is
    cleaned with ``preprocess_token``. The vocabulary holds the ``vocab_size``
    most frequent cleaned training tokens that are neither empty nor English
    stop words. Each document becomes the list of vocabulary indices of its
    known tokens, left-padded with zeros to the longest sequence found in any
    of the three splits.

    Args:
        x_train: Iterable of training documents (strings).
        x_dev: Iterable of development documents (strings).
        x_test: Iterable of test documents (strings).
        vocab_size: Maximum number of words kept in the vocabulary.

    Returns:
        A tuple ``(train, dev, test, vocabulary)`` where the first three items
        are int64 arrays of shape ``(n_docs, max_seq_size)`` and ``vocabulary``
        maps each word to its index.
    """
    stop_words = set(stopwords.words('english'))

    # Count cleaned, non-stop-word tokens over the training documents only.
    counts = Counter()
    for doc in x_train:
        for raw in doc.lower().split():
            word = preprocess_token(raw)
            if word != '' and word not in stop_words:
                counts[word] += 1

    # Retain the 'vocab_size' most frequent words.
    most_frequent = sorted(counts, key=counts.get, reverse=True)[:vocab_size]

    # NOTE: indices start at 0, so the most frequent word shares index 0 with
    # the padding value. This is kept so that len(vocabulary) remains a valid
    # embedding size for callers.
    vocabulary = {word: index for index, word in enumerate(most_frequent)}

    seq_train = _encode_docs(x_train, vocabulary)
    seq_dev = _encode_docs(x_dev, vocabulary)
    seq_test = _encode_docs(x_test, vocabulary)

    # Use one common width for all splits; default=0 covers empty splits.
    max_seq_size = max(
        (len(seq) for split in (seq_train, seq_dev, seq_test) for seq in split),
        default=0,
    )

    # Each split is padded over its own length, so differently sized
    # development and test sets are handled correctly.
    return (
        _left_pad(seq_train, max_seq_size),
        _left_pad(seq_dev, max_seq_size),
        _left_pad(seq_test, max_seq_size),
        vocabulary,
    )

In [5]:
# Tokenize your train, development and test data.
# Arguments and results follow tokenize's (train, dev, test) order.
X_train, X_dev, X_test, vocabulary = tokenize(X_train, X_dev, X_test, vocab_size=10000)
### YOUR CODE HERE ###

In [6]:
# Pair each split's token-index array with its labels cast to float32
# (the dtype the binary cross-entropy loss expects for targets).
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train.astype(np.float32)))
dev_data = TensorDataset(torch.from_numpy(X_dev), torch.from_numpy(y_dev.astype(np.float32)))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test.astype(np.float32)))

# dataloaders
batch_size = 25

# Only the training data is shuffled. The evaluation loaders keep a fixed
# order so per-batch averages are reproducible from run to run.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
dev_loader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)

In [7]:
class SentimentRNN(nn.Module):
    """LSTM-based binary sentiment classifier.

    The network embeds a batch of token-index sequences, runs them through a
    (possibly stacked) LSTM, applies dropout to the output of the final
    timestep and maps it through a linear layer and a sigmoid to one
    probability per sequence.

    Args:
        num_layers: Number of stacked LSTM layers.
        vocab_size: Number of rows in the embedding table.
        hidden_dim: Size of the LSTM hidden state.
        embedding_dim: Size of each embedding vector.
        drop_prob: Dropout probability used between LSTM layers and before
            the linear layer.
    """

    def __init__(self, num_layers, vocab_size, hidden_dim, embedding_dim, drop_prob=0.5):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layers (batch_first: inputs are (batch, seq, feature))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout_layer = nn.Dropout(drop_prob)

        # linear and sigmoid layer
        self.linear_layer = nn.Linear(hidden_dim, 1)
        self.activation = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Compute one probability per input sequence.

        Args:
            x: Integer tensor of token indices with shape ``(batch, seq_len)``.
            hidden: Optional ``(h0, c0)`` tuple as returned by
                ``init_hidden``. When ``None`` the LSTM starts from zeros.

        Returns:
            A tuple ``(sig_out, hidden)``: ``sig_out`` has shape ``(batch,)``
            and holds the sigmoid outputs; ``hidden`` is the final
            ``(h_n, c_n)`` state of the LSTM.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)

        # Only the last timestep feeds the classifier, so select it before
        # the dropout/linear layers rather than projecting every timestep.
        # Indexing the tensor also makes the result independent of any
        # global batch size, so partial batches work.
        last_step = lstm_out[:, -1, :]

        out = self.linear_layer(self.dropout_layer(last_step))
        sig_out = self.activation(out).squeeze(-1)

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h0, c0)`` states for ``batch_size`` sequences.

        The states are created on the same device and with the same dtype as
        the model parameters, each with shape
        ``(num_layers, batch_size, hidden_dim)``.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        c0 = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        return (h0, c0)


In [8]:
# Model hyperparameters
num_layers = 4                 # stacked LSTM layers
vocab_size = len(vocabulary)   # one embedding row per vocabulary entry
embedding_dim = 32             # size of each word vector
output_dim = 1                 # single sigmoid output (not referenced by the code shown here)
hidden_dim = 256               # LSTM hidden state size

# Build the network and move it to the selected device.
model = SentimentRNN(
    num_layers=num_layers,
    vocab_size=vocab_size,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    drop_prob=0.3,
)
model = model.to(device)

In [9]:
# Learning rate for the optimizer
lr = 0.0001

# Binary cross-entropy is the loss function and Adam the optimizer for this task
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def acc(preds, labels, threshold=0.5):
    """Return the fraction of thresholded predictions that equal ``labels``.

    A prediction counts as positive when it is strictly greater than
    ``threshold``. The result is a 0-dimensional tensor.
    """
    hard_preds = (preds > threshold).float()
    n_correct = torch.sum(hard_preds == labels).item()
    fraction = n_correct / len(preds)
    return torch.tensor(fraction)

def mean(listt):
    """Return the arithmetic mean of the items in ``listt``."""
    total = sum(listt)
    count = len(listt)
    return total / count

  _torch_pytree._register_pytree_node(


In [None]:
clip = 5      # maximum gradient norm
epochs = 10
dev_loss_min = np.inf  # np.Inf was removed in NumPy 2.0
best_epoch = 0

# Per-epoch history of mean loss and accuracy
epoch_tr_loss, epoch_dv_loss = [], []
epoch_tr_acc, epoch_dv_acc = [], []

for epoch in range(epochs):  # Train your model
    train_loss, train_acc = [], []
    dev_loss, dev_acc = [], []

    # Training pass: dropout enabled.
    model.train()
    for features, target in train_loader:
        features = features.to(device)
        target = target.to(device)
        # Size the state from the batch itself so a smaller last batch works.
        hidden_state = model.init_hidden(features.size(0))
        optimizer.zero_grad()
        out, _ = model(features, hidden_state)
        loss = loss_func(out, target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        train_loss.append(loss.item())
        train_acc.append(acc(out, target).item())

    # Validation pass: dropout disabled and no gradient tracking.
    model.eval()
    with torch.no_grad():
        for features, target in dev_loader:
            features = features.to(device)
            target = target.to(device)
            hidden_state = model.init_hidden(features.size(0))
            out, _ = model(features, hidden_state)
            loss = loss_func(out, target)
            dev_loss.append(loss.item())
            dev_acc.append(acc(out, target).item())

    mean_dev_loss = mean(dev_loss)
    mean_train_loss = mean(train_loss)
    mean_train_acc = mean(train_acc)
    mean_dev_acc = mean(dev_acc)

    epoch_tr_loss.append(mean_train_loss)
    epoch_dv_loss.append(mean_dev_loss)
    epoch_tr_acc.append(mean_train_acc)
    epoch_dv_acc.append(mean_dev_acc)

    # Report on the first epoch and on every second epoch after that.
    if (epoch+1)%2==0 or epoch==0:
        print(f'Epoch {epoch+1}')
        print(f'train_loss : {mean_train_loss} dev_loss : {mean_dev_loss}')
        print(f'train_accuracy : {mean_train_acc} dev_accuracy : {mean_dev_acc}')
        print(25*'==')

    # When the dev loss drops below the best value seen so far, save the
    # model and update dev_loss_min.
    if mean_dev_loss < dev_loss_min:
        dev_loss_min = mean_dev_loss
        torch.save(model.state_dict(), 'best_model.pth')
        best_epoch = epoch+1

NOTE: your train loss should be smaller than 1 and your train accuracy should be over 75%

In [None]:
# Metric implementations used below (the torcheval package named in the task).
from torcheval.metrics import BinaryAccuracy, BinaryAUROC
from torcheval.metrics.functional import binary_f1_score

# Load the best model; map_location lets a checkpoint saved on another
# device load onto the current one.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()

predictions = []
true_labels = []

# Iterate over test data batches without tracking gradients
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        # Size the state from the batch itself so a smaller last batch works.
        test_h = model.init_hidden(inputs.size(0))
        outputs, _ = model(inputs, test_h)

        # Collect per-batch results on the CPU
        predictions.append(outputs.cpu())
        true_labels.append(labels.cpu())

# Join the per-batch tensors into one tensor each
predictions = torch.cat(predictions)
true_labels = torch.cat(true_labels)

#########
# Metrics: F1, area under the ROC curve and accuracy
print(f"F1 Score - {binary_f1_score(predictions, true_labels)}")
metric = BinaryAUROC()
metric.update(predictions, true_labels)
print(f"Area under ROC {metric.compute()}")
acc_metric = BinaryAccuracy()
acc_metric.update(predictions, true_labels)
# Report the accuracy metric itself, not the AUROC metric.
print(f"Accuracy {acc_metric.compute()}")

NOTE: your eval accuracy should be at least 60%.