Task 4: Train an LSTM Model (40 points)
----
1. Using PyTorch, implement a neural network that uses one or more LSTM cells to do sentiment analysis. Use the nn.Embedding, nn.LSTM and nn.Linear layers to construct your model.
2. Note that sequence processing works differently with the PyTorch Embedding layer as compared to my sample code from class. The model input expects a padded tensor of token indices from the vocabulary, instead of one-hot encodings. For evaluation, use a vocabulary size of 10000 (max_features = 10000).
3. The model should have a single output with the sigmoid activation function for classification. The dimensions of the embedding layer and the hidden layer(s) are up to you, but please make sure your model does not take more than ~3 minutes to train.
4. Evaluate the model using PyTorch functions for average accuracy, area under the ROC curve and F1 scores (see [torcheval](https://pytorch.org/torcheval/stable/)).

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
# The matching status message is printed once the device object exists.
if torch.cuda.is_available():
    _backend, _status = "cuda", "GPU is available"
elif torch.backends.mps.is_available():
    _backend, _status = "mps", "MPS is available"
else:
    _backend, _status = "cpu", "CPU used"

device = torch.device(_backend)
print(_status)

In [None]:
def _read_split(path):
    """Read one tab-separated, header-less split and keep the review/label columns."""
    frame = pd.read_csv(path, sep='\t', header=None, names=['id', 'review', 'label'])
    return frame[['review', 'label']]


# Training split
train_data_file = 'movie_reviews_train.txt'
train_df = _read_split(train_data_file)
X_train = train_df['review'].values
y_train = train_df['label'].values

# Development (validation) split
dev_data_file = 'movie_reviews_dev.txt'
dev_df = _read_split(dev_data_file)
X_dev = dev_df['review'].values
y_dev = dev_df['label'].values

# Test split
test_data_file = 'movie_reviews_test.txt'
test_df = _read_split(test_data_file)
X_test = test_df['review'].values
y_test = test_df['label'].values

In [None]:
def preprocess_token(s): # This function is for pre-processing each token, not the entire sequence
    """Clean a single token so that only letters remain.

    Args:
        s: One token (a string), not a whole document.

    Returns:
        The token with punctuation/symbols, digits and whitespace removed.
        The result may be the empty string (e.g. for "!!!" or "42").
    """
    # Retain only alphanumeric characters (``\w`` also matches "_", so drop it explicitly)
    s = re.sub(r"[^\w\s]|_", "", s)

    # replace digits with no space
    s = re.sub(r"\d+", "", s)

    # Replace all whitespace sequences with no space
    s = re.sub(r"\s+", "", s)

    return s

def tokenize(x_train, x_dev, x_test, vocab_size): # This function is for pre-processing strings, which uses the above.
    """Turn raw documents into zero-padded arrays of vocabulary indices.

    The vocabulary is built from the training documents only, so no
    information from the development or test data leaks into it.

    Args:
        x_train: Iterable of training documents (strings).
        x_dev: Iterable of development documents (strings).
        x_test: Iterable of test documents (strings).
        vocab_size: Number of most frequent non-stop-words to keep.

    Returns:
        A tuple ``(train, dev, test, vocab)``. The first three are
        ``numpy.int64`` arrays of shape ``(n_documents, max_len)`` where
        ``max_len`` is the longest encoded sequence across all three splits.
        Sequences are left-padded with 0 so the final position of every
        non-empty row is a real token. ``vocab`` maps each kept word to its
        index (starting at 1) and ``'<pad>'`` to 0, so ``len(vocab)`` is the
        number of rows an embedding table needs.

    Raises:
        LookupError: If the NLTK stop-word corpus has not been downloaded.
    """
    # Remove stop words. The cleaned forms are included too, so that e.g.
    # "don't" is still recognised after its apostrophe has been stripped.
    raw_stop_words = stopwords.words('english')
    stop_words = set(raw_stop_words) | {preprocess_token(word) for word in raw_stop_words}

    def _clean_tokens(doc):
        # Tokenize the document, convert tokens to lowercase, and preprocess each token.
        # Tokens that become empty after cleaning are dropped.
        cleaned = (preprocess_token(token) for token in doc.lower().split())
        return [token for token in cleaned if token]

    word_counts = Counter(
        token
        for doc in x_train
        for token in _clean_tokens(doc)
        if token not in stop_words
    )

    # Retain the 'vocab_size' most frequent words. Indices start at 1 because
    # index 0 is reserved for padding.
    word_to_index = {
        word: index
        for index, (word, _) in enumerate(word_counts.most_common(vocab_size), start=1)
    }

    def _encode(docs):
        # Convert tokens to their corresponding indices in the vocabulary if they exist.
        return [
            [word_to_index[token] for token in _clean_tokens(doc) if token in word_to_index]
            for doc in docs
        ]

    train_seqs = _encode(x_train)
    dev_seqs = _encode(x_dev)
    test_seqs = _encode(x_test)

    # Determine the maximum sequence size among all datasets (training, development, and testing)
    max_len = max(
        (len(seq) for split in (train_seqs, dev_seqs, test_seqs) for seq in split),
        default=0,
    )

    def _pad(seqs):
        # Pad sequences to a uniform length using zero-padding on the left.
        padded = np.zeros((len(seqs), max_len), dtype=np.int64)
        for row, seq in enumerate(seqs):
            if seq:  # an empty slice target (-0:) would address the whole row
                padded[row, -len(seq):] = seq
        return padded

    # Finally, return the padded sequences (train, development and test) and vocabulary
    vocab = {'<pad>': 0, **word_to_index}
    return _pad(train_seqs), _pad(dev_seqs), _pad(test_seqs), vocab

In [None]:
# Tokenize your train, test and development data

### YOUR CODE HERE ###

In [None]:
# Mini-batch size shared by all three loaders.
batch_size = 50

# Pair each split's feature array with its label array as tensors.
train_data = TensorDataset(*map(torch.from_numpy, (X_train, y_train)))
dev_data = TensorDataset(*map(torch.from_numpy, (X_dev, y_dev)))
test_data = TensorDataset(*map(torch.from_numpy, (X_test, y_test)))

# Every loader shuffles its dataset, as the assignment asks.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True)

In [None]:
class SentimentRNN(nn.Module):
    """LSTM-based binary sentiment classifier.

    Pipeline: token indices -> embedding -> stacked LSTM -> dropout ->
    linear layer -> sigmoid, producing one probability per input sequence.
    """

    def __init__(self,num_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        """Build the layers.

        Args:
            num_layers: Number of stacked LSTM layers.
            vocab_size: Number of rows in the embedding table; every token
                index fed to ``forward`` must be smaller than this.
            hidden_dim: Size of the LSTM hidden and cell state.
            embedding_dim: Size of each token embedding vector.
            drop_prob: Dropout probability applied before the linear layer.
        """
        super(SentimentRNN,self).__init__()

        # ``init_hidden`` reads ``no_layers`` and ``hidden_dim``; keep both
        # spellings of the layer count available.
        self.no_layers = num_layers
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # lstm (batch_first: inputs are shaped (batch, seq_len, embedding_dim))
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

        # dropout layer
        self.dropout = nn.Dropout(drop_prob)

        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run a batch of index sequences through the network.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.
            hidden: ``(h0, c0)`` tuple as returned by ``init_hidden``, or
                ``None`` to let the LSTM start from zeros.

        Returns:
            ``(probabilities, hidden)`` where ``probabilities`` has shape
            ``(batch,)`` with values in [0, 1], and ``hidden`` is the LSTM's
            final ``(h_n, c_n)`` state.
        """
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Classify from the top layer's output at the final time step only.
        last_step = lstm_out[:, -1, :]
        logits = self.fc(self.dropout(last_step))
        probabilities = self.sig(logits).squeeze(-1)

        return probabilities, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # initialize hidden state(s) and cell state(s) of LSTM to zero with appropriate dimensions.
        # The tensors are created on the same device (and dtype) as the model's
        # parameters, so they follow the model wherever it has been moved.
        reference = next(self.parameters())
        state_shape = (self.no_layers, batch_size, self.hidden_dim)
        h0 = reference.new_zeros(state_shape)
        c0 = reference.new_zeros(state_shape)
        hidden = (h0,c0)
        return hidden


In [6]:
# Model hyperparameters.
no_layers = 4
vocab_size = len(vocab)
embedding_dim = 64
output_dim = 1
hidden_dim = 256

# Build the network, naming each constructor argument explicitly.
model = SentimentRNN(
    num_layers=no_layers,
    vocab_size=vocab_size,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    drop_prob=0.5,
)

# Move the model to the selected device, then show its structure.
model.to(device)
print(model)

In [None]:
# Learning rate, intended for the optimizer to be configured below.
lr = 1e-3

# you should use binary cross-entropy as your loss function and Adam optimizer for this task

### YOUR CODE HERE ###

# function to predict accuracy
def acc(pred,label):
    """Return the fraction of predictions that match their labels.

    Args:
        pred: Tensor of predicted probabilities (e.g. sigmoid outputs). Any
            shape is accepted; it is flattened before comparison.
        label: Tensor of 0/1 ground-truth labels holding the same number of
            elements as ``pred``.

    Returns:
        A Python float in [0, 1]: the share of elements where the prediction,
        thresholded at 0.5, equals the label. Returns 0.0 for an empty batch.
    """
    pred = pred.reshape(-1)
    label = label.reshape(-1)

    # Guard against division by zero on an empty batch.
    if pred.numel() == 0:
        return 0.0

    # Probabilities of 0.5 or higher count as the positive class.
    predicted = (pred >= 0.5).to(label.dtype)
    correct = (predicted == label).sum().item()
    return correct / pred.numel()

In [7]:
# presumably the gradient-clipping threshold for the training step — confirm when the loop is filled in
clip = 5
# Number of passes over the training data.
epochs = 5
# Best (lowest) development loss seen so far. ``np.inf`` is used because the
# ``np.Inf`` alias was removed in NumPy 2.0.
dev_loss_min = np.inf

# Per-epoch history of loss and accuracy for the train and dev splits.
epoch_tr_loss,epoch_dv_loss = [],[]
epoch_tr_acc,epoch_dv_acc = [],[]

for epoch in range(epochs): # Train your model

    ### YOUR CODE HERE ###

    print(f'Epoch {epoch+1}')
    print(f'train_loss : YOUR_TRAIN_LOSS_HERE dev_loss : YOUR_DEV_LOSS_HERE')
    print(f'train_accuracy : YOUR_ACC_HERE dev_accuracy : YOUR_DEV_ACC_HERE')

    # if dev_loss goes less than or equal to dev_loss_min then save your model and update the dev_loss_min

    ### YOUR CODE HERE ###

    print(25*'==')

NOTE: your train loss should be smaller than 1 and your train accuracy should be over 75%

In [9]:
# Running accumulator for the test accuracy.
test_acc = 0.0

# Put the model in evaluation mode and create an initial hidden state
# sized for one batch.
model.eval()
test_h = model.init_hidden(batch_size)

# Evaluate model on your test data and report the accuracy

### YOUR CODE HERE ###


NOTE: your eval accuracy should be at least 60%.