Train an LSTM Model 
----
1. Using PyTorch, implement a neural network that uses one or more LSTM cells to do sentiment analysis. Use the nn.Embedding, nn.LSTM and nn.Linear layers to construct your model.
2. Note that sequence processing works differently with the PyTorch Embedding layer as compared to my sample code from class. The model input expects a padded tensor of token indices from the vocabulary, instead of one-hot encodings. For evaluation, use a vocabulary size of 10000 (max_features = 10000).
3. The model should have a single output with the sigmoid activation function for classification. The dimensions of the embedding layer and the hidden layer(s) are up to you, but please make sure your model does not take more than ~3 minutes to train.
4. Evaluate the model using PyTorch functions for average accuracy, area under the ROC curve and F1 scores (see [torchedev](https://pytorch.org/torcheval/stable/)).

In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
!pip install torchmetrics
from torchmetrics.functional import f1_score, auroc



In [15]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available")
else:
    device = torch.device("cpu")
    print("CPU used")

CPU used


In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
train_data_file = 'movie_reviews_train.txt'
train_df = pd.read_csv(train_data_file, sep='\t', header=None, names=['id', 'review', 'label'])[['review', 'label']]
X_train, y_train = train_df['review'].values, train_df['label'].values

dev_data_file = 'movie_reviews_dev.txt'
dev_df = pd.read_csv(dev_data_file, sep='\t', header=None, names=['id', 'review', 'label'])[['review', 'label']]
X_dev, y_dev = dev_df['review'].values, dev_df['label'].values

test_data_file = 'movie_reviews_test.txt'
test_df = pd.read_csv(test_data_file, sep='\t', header=None, names=['id', 'review', 'label'])[['review', 'label']]
X_test, y_test = test_df['review'].values, test_df['label'].values

In [18]:
def preprocess_token(s): # This function is for pre-processing each token, not the entire sequence
    # Retain only alphanumeric characters
    s = re.sub(r'[^\w\s]+', '', s)

    # replace digits with no space
    s = re.sub(r'\d', '', s)

    # Replace all whitespace sequences with no space
    s = re.sub(r'\s+', '', s)

    return s

def tokenize(x_train, x_dev, x_test, vocab_size): # This function is for pre-processing strings, which uses the above.

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    #Use this list to remove given stop words


    # Retain the 'vocab_size' most frequent words
    tokens = []
    for doc in x_train:
        for word in doc.split():
            word = preprocess_token(word.lower())
            if word not in stop_words and word != '':
                tokens.append(word)

    word_counts = Counter(tokens)
    # vocab = [word for word, count in word_counts.most_common(vocab_size) if word not in stop_words]
    vocab = sorted(word_counts, key=word_counts.get,reverse=True)[:vocab_size]
    word_to_idx = {word: i+1 for i, word in enumerate(vocab)}

    # Initialize empty lists to store padded sequences for training, development, and testing data
    x_train_padded = []
    x_dev_padded = []
    x_test_padded = []

    # Iterate through each document in the training data

    for doc in x_train:

        # Tokenize the document, convert tokens to lowercase, and preprocess each token
        # Then, convert tokens to their corresponding indices in the vocabulary if they exist
        tokens = [word_to_idx[preprocess_token(word.lower())] for word in doc.split() if preprocess_token(word.lower()) in word_to_idx.keys()]
        x_train_padded.append(tokens)


    # Iterate through each document in the development data

    for doc in x_dev:

        # Tokenize the document, convert tokens to lowercase, and preprocess each token
        # Then, convert tokens to their corresponding indices in the vocabulary if they exist

        tokens = [word_to_idx[preprocess_token(word.lower())] for word in doc.split() if preprocess_token(word.lower()) in word_to_idx.keys()]
        x_dev_padded.append(tokens)

    # Iterate through each document in the testing data

    for doc in x_test:

        # Tokenize the document, convert tokens to lowercase, and preprocess each token
        # Then, convert tokens to their corresponding indices in the vocabulary if they exist

        tokens = [word_to_idx[preprocess_token(word.lower())] for word in doc.split() if preprocess_token(word.lower()) in word_to_idx.keys()]
        x_test_padded.append(tokens)

    # Determine the maximum sequence size among all datasets (training, development, and testing)
    max_seq_len = max([
        max([len(seq) for seq in x_train_padded]),
        max([len(seq) for seq in x_dev_padded]),
        max([len(seq) for seq in x_test_padded])
    ])
    print("Max Seq Len:", max_seq_len)

    # Pad sequences in the training, testing and development data to ensure uniform length using zero-padding
    train_padded = np.zeros((len(x_train_padded), max_seq_len),dtype=int)
    for i, doc in enumerate(x_train_padded):
        if len(doc) != 0:
            train_padded[i, -len(doc):] = np.array(doc)[:max_seq_len]

    dev_padded = np.zeros((len(x_dev_padded), max_seq_len),dtype=int)
    for i, doc in enumerate(x_dev_padded):
        if len(doc) != 0:
            dev_padded[i, -len(doc):] = np.array(doc)[:max_seq_len]

    test_padded = np.zeros((len(x_test_padded), max_seq_len),dtype=int)
    for i, doc in enumerate(x_test_padded):
        if len(doc) != 0:
            test_padded[i, -len(doc):] = np.array(doc)[:max_seq_len]

    # Finally, return the padded sequences (train, development and test) and vocabulary

    return np.array(train_padded), np.array(dev_padded), np.array(test_padded), word_to_idx

In [19]:
# Tokenize your train, test and development data


vocab_size = 10000

X_train, X_dev, X_test, vocab = tokenize(X_train, X_dev, X_test, vocab_size)

Max Seq Len: 608


In [20]:
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
dev_data = TensorDataset(torch.from_numpy(X_dev), torch.from_numpy(y_dev))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)

In [21]:
class SentimentRNN(nn.Module):
    def __init__(self,num_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()


        # embedding and LSTM layers

        self.embedding_layer = nn.Embedding(vocab_size,embedding_dim)
        self.hidden_dim = hidden_dim
        self.no_layers = num_layers

        # lstm

        self.lstm = nn.LSTM(embedding_dim,hidden_dim,num_layers,dropout=drop_prob,batch_first=True)

        # dropout layer

        self.dropout = nn.Dropout(drop_prob)


        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim,1)
        self.sig = nn.Sigmoid()


    def forward(self,x,hidden):

        # embeddings and lstm_out
        embeds = self.embedding_layer(x)
        lstm_out,hidden = self.lstm(embeds,hidden)

        # stack up lstm outputs
        lstm_out = lstm_out[:,-1,:]

        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        return sig_out,hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # initialize hidden state(s) and cell state(s) of LSTM to zero with appropriate dimensions
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden


In [22]:
no_layers = 2
vocab_size = len(vocab) + 1
embedding_dim = 64
output_dim = 1
hidden_dim = 256

model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

#moving to gpu
model.to(device)

print(model)

SentimentRNN(
  (embedding_layer): Embedding(10001, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [23]:
lr=0.001

# you should use binary cross-entropy as your loss function and Adam optimizer

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# function to predict accuracy
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()


In [26]:
clip = 5
epochs = 5
dev_loss_min = np.Inf
# train for some number of epochs
epoch_tr_loss,epoch_dv_loss = [],[]
epoch_tr_acc,epoch_dv_acc = [],[]

for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)

        h = tuple([each.data for each in h])

        model.zero_grad()
        output,h = model(inputs,h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy
        accuracy = acc(output,labels)
        train_acc += accuracy
        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()



    dev_h = model.init_hidden(batch_size)
    dev_losses = []
    dev_acc = 0.0
    model.eval()
    for inputs, labels in dev_loader:
            dev_h = tuple([each.data for each in dev_h])

            inputs, labels = inputs.to(device), labels.to(device)

            output, dev_h = model(inputs,dev_h)
            val_loss = criterion(output.squeeze(), labels.float())

            dev_losses.append(val_loss.item())

            accuracy = acc(output,labels)
            dev_acc += accuracy

    epoch_train_loss = np.mean(train_losses)
    epoch_dev_loss = np.mean(dev_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_dev_acc = dev_acc/len(dev_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_dv_loss.append(epoch_dev_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_dv_acc.append(epoch_dev_acc)

    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_dev_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_dev_acc*100}')

    # if dev_loss goes less than or equal to dev_loss_min then save your model and update the dev_loss_min
    if epoch_dev_loss <= dev_loss_min:
        torch.save(model.state_dict(), 'sentiment_model.pt')
        #print('Dev loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_dev_loss))
        valid_loss_min = epoch_dev_loss
    print(25*'==')


Epoch 1
train_loss : 0.6070794276893139 val_loss : 0.6617118120193481
train_accuracy : 67.5625 val_accuracy : 60.5
Epoch 2
train_loss : 0.5303462790325284 val_loss : 0.6197794526815414
train_accuracy : 73.3125 val_accuracy : 68.0
Epoch 3
train_loss : 0.42751097306609154 val_loss : 0.6882550865411758
train_accuracy : 80.1875 val_accuracy : 66.0
Epoch 4
train_loss : 0.34107351722195745 val_loss : 0.8736861050128937
train_accuracy : 84.9375 val_accuracy : 66.0
Epoch 5
train_loss : 0.2371380286058411 val_loss : 0.9725182354450226
train_accuracy : 90.8125 val_accuracy : 66.5


NOTE: your train loss should be smaller than 1 and your train accuracy should be over 75%

In [27]:
model.eval()
test_h = model.init_hidden(batch_size)
test_acc = 0.0

# Evaluate model on test data and report the accuracy, AUROC, and F1 score
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output, _ = model(inputs, test_h)
        test_acc += acc(output, labels)

test_acc /= len(test_loader.dataset)

# Compute AUROC and F1 score
predictions = torch.sigmoid(output.squeeze())  # Convert logits to probabilities
auroc_score = auroc(predictions, labels, task="binary")
f1_value = f1_score(predictions > 0.5, labels, task="binary")  # Assuming threshold of 0.5 for F1 score


print(f'Test Accuracy: {test_acc:.4f}')
print(f'AUROC: {auroc_score:.4f}')
print(f'F1 Score: {f1_value:.4f}')

Test Accuracy: 0.6650
AUROC: 0.7094
F1 Score: 0.7179


NOTE: your eval accuracy should be of at least 60%.