# Customized RNN model for sequence classification

Your name: Nguyen Van A

Student ID: USTH001

**Due: March 27, 2023 (Hard Deadline)**

*This is an optional (bonus) assignment*

## How to submit

- Attach the notebook file (.ipynb) and submit your work to Google Classroom
- Name your file as YourName_StudentID_Assignment5.ipynb. E.g., Nguyen_Van_A_ST099834_Assignment5.ipynb
- Write your name and student ID into this notebook
- Copying others' assignments is strictly prohibited.

## Policy

- I only grade submissions which can run successfully without syntax and run-time errors
- Please run your notebook on Google Colab or Kaggle notebook.
- You can change epochs to 5 to save time


## Description

In this assignment, we are going to build a customized RNN model for text classification. This is an extended version of the notebook [RNN for Sequence Classification](https://colab.research.google.com/drive/1qkRmcd5PI5ISWScdD0BovDSvgYYkruVc?usp=sharing).

The tasks in this assignment are as follows.

- Initialize Embedding layer in the RNN model with pre-trained word embeddings
- Modify the training loop to print out averaged loss function value after each epoch.

## Set random seed

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix PyTorch's random seed so that random draws made later in the notebook
# are repeatable across runs.
torch.manual_seed(1)

<torch._C.Generator at 0x7f317c331710>

## Download & load data

In [None]:
%%capture
!rm -f titles-en-train.labeled
!rm -f titles-en-test.labeled

!wget https://raw.githubusercontent.com/neubig/nlptutorial/master/data/titles-en-train.labeled
!wget https://raw.githubusercontent.com/neubig/nlptutorial/master/data/titles-en-test.labeled

In [None]:
def load_data(file_path):
    """Load labeled titles from a tab-separated text file.

    Every non-empty line is expected to have the form
    ``<label><TAB><text>``, where ``<label>`` is an integer. Blank lines
    are skipped and undecodable bytes are dropped.

    Args:
        file_path (str): path of the file to read

    Returns:
        data (list): list of ``(text, label)`` tuples, in file order

    Raises:
        ValueError: if a non-empty line has no tab separator or its label
            cannot be parsed as an integer
    """
    data = []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split on the first tab only: a title that itself contains a
            # tab is kept whole instead of breaking the two-way unpacking.
            lb, text = line.split('\t', 1)
            data.append((text, int(lb)))

    return data

# Read both splits as lists of (text, label) tuples.
train_data = load_data('./titles-en-train.labeled')
test_data = load_data('./titles-en-test.labeled')

# Unzip each split into two parallel tuples: the texts and their labels.
train_docs, train_labels = zip(*train_data)
test_docs, test_labels = zip(*test_data)

## Data Transformation

### Vocabulary


In [None]:
from collections import defaultdict

class Vocabulary:
    """Two-way mapping between tokens and integer indices."""

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): an optional pre-existing token -> index map;
                it is stored as-is (not copied)
        """
        self._token_to_idx = {} if token_to_idx is None else token_to_idx

        # Reverse view of the same mapping, kept in sync by add_token().
        self._idx_to_token = dict(
            (idx, token) for token, idx in self._token_to_idx.items()
        )

        self.pad_index = 0
        self.unk_index = 1

    def lookup_token(self, token):
        """Return the index stored for a token.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the token's index, or `unk_index` when the token
              is unknown
        Notes:
            The UNK fallback is only used while `unk_index` is >= 0;
              with a negative `unk_index` an unknown token raises KeyError.
        """
        if self.unk_index < 0:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

    def lookup_index(self, index):
        """Return the token stored for an index.

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        try:
            return self._idx_to_token[index]
        except KeyError:
            raise KeyError(
                "the index (%d) is not in the Vocabulary" % index
            ) from None

    def add_token(self, token):
        """Register a token, reusing its index if it is already known.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        try:
            return self._token_to_idx[token]
        except KeyError:
            pass

        # New tokens get the next free index, i.e. the current size.
        new_index = len(self._token_to_idx)
        self._token_to_idx[token] = new_index
        self._idx_to_token[new_index] = token
        return new_index

    @classmethod
    def build_vocab(cls, sentences):
        """Create a Vocabulary from whitespace-tokenized sentences

        Arguments:
        ----------
            sentences (list): list of sentences, each sentence is a string

        Return:
        ----------
            vocab (Vocabulary): a Vocabulary object whose indices 0 and 1
                hold "<PAD>" and "<UNK>"
        """
        vocab = cls({"<PAD>": 0, "<UNK>": 1})

        words = (word for sentence in sentences for word in sentence.split())
        for word in words:
            vocab.add_token(word)
        return vocab

    def __str__(self):
        return f"<Vocabulary(size={len(self)})>"

    def __len__(self):
        return len(self._token_to_idx)

In [None]:
# The vocabulary is built from the training titles only; test_docs is not
# passed in here.
vocab = Vocabulary.build_vocab(train_docs)
print(vocab)

<Vocabulary(size=27192)>


### Data Vectorization

In [None]:
import torch
import numpy as np

def vectorize(vocab, title):
    """Convert a title into a 1-D tensor of vocabulary indices.

    The title is split on whitespace and each token is mapped through
    ``vocab.lookup_token``.

    Args:
        vocab (Vocabulary): provides ``lookup_token(token) -> int``
        title (str): the text to vectorize

    Returns:
        indices (torch.Tensor): int64 tensor with one entry per token
            (empty when the title has no tokens)
    """
    indices = [vocab.lookup_token(token) for token in title.split()]

    # Force an integer dtype: torch.tensor([]) would otherwise default to
    # float32 for an empty title, which cannot be used as embedding indices.
    return torch.tensor(indices, dtype=torch.long)

In [None]:
# Turn every title into a tensor of vocabulary indices.
# NOTE: this rebinds train_data / test_data, which until now held the
# (text, label) tuples returned by load_data.
train_data = [vectorize(vocab, t) for t in train_docs]
test_data = [vectorize(vocab, t) for t in test_docs]

### Label Mapping

In [None]:
# Map the raw labels -1 / 1 onto the class indices 0 / 1.
label2idx = {
    -1: 0, 1: 1
}
# A label other than -1 or 1 raises KeyError here.
train_y = [label2idx[lb] for lb in train_labels]
test_y = [label2idx[lb] for lb in test_labels]

### Dataset class

In order to put data into a DataLoader, we need to implement a custom Dataset class that inherits from the [Dataset class](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).

It is required to implement two functions `__len__` and `__getitem__`

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset that pairs each sequence with the label at the same position."""

    def __init__(self, sequences, labels):
        """
        Args:
            sequences: indexable collection of input sequences
            labels: indexable collection of labels, aligned with `sequences`
        """
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, index):
        """Return the (sequence, label) pair stored at `index`."""
        return self.sequences[index], self.labels[index]

In [None]:
# Pair the vectorized titles with their mapped labels for each split.
train_dataset = TextDataset(train_data, train_y)
test_dataset = TextDataset(test_data, test_y)

### Create DataLoader

We need to define a function for processing the batches generated by the DataLoader.

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Collate the samples of one batch into padded tensors

    Arguments:
    -----
        batch (list): (sequence, label) pairs collected by DataLoader, where
            each sequence is a tensor of token indices

    Return:
    -----
        tuple of (sequences padded with 0 to the longest one in the batch,
        original sequence lengths, labels as a float32 tensor)
    """
    sequences, labels = zip(*batch)

    lengths = torch.tensor([len(seq) for seq in sequences])
    targets = torch.tensor(labels, dtype=torch.float32)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)

    return padded, lengths, targets

## Part 1: Creating Embedding Matrix from Pre-trained Glove (50 points)

What you need to do is create an embedding matrix, which is a tensor of shape (vocab_size, embedding_size). Row $i$ of the matrix is the word vector for the word with index $i$ in the vocabulary. We are going to get those word vectors from pre-trained word vectors.

We will download the pre-trained word vectors using gensim.

In [None]:
import gensim.downloader as api
# Load the pre-trained GloVe word vectors through gensim's downloader
# (the model name suggests 300-dimensional vectors — confirm against emb_dim).
wv = api.load('glove-wiki-gigaword-300')



**You will need to implement the following function**

Note that words in the GloVe model are lowercase, so to get the vector for a word such as "King", we need to convert the word to lowercase first. For instance:

```
print( wv["King".lower()] )
```

For words that are in the Vocabulary but not in the pre-trained word vector model, we assign random values with:

```
torch.randn(emb_dim)
```

In [None]:
def create_embedding_matrix(wv, word_to_idx, emb_dim=300):
    """Create an embedding matrix from pre-trained word vectors

    Row ``i`` of the result holds the vector of the word whose index is
    ``i``. Words are looked up in lower case. A word that is missing from
    ``wv`` gets a random vector drawn with ``torch.randn(emb_dim)``.

    Args:
        wv: word vector model loaded from gensim; only ``word in wv`` and
            ``wv[word]`` are used
        word_to_idx (dict): Map from a word into index in the vocab; the
            indices are expected to lie in ``[0, len(word_to_idx))``
        emb_dim (int): Embedding size; must match the size of the vectors
            stored in ``wv``

    Returns:
        embedding_matrix: a Torch tensor with size (vocab_size, emb_dim)
    """
    vocab_size = len(word_to_idx)
    embedding_matrix = torch.zeros(vocab_size, emb_dim)

    for word, idx in word_to_idx.items():
        key = word.lower()
        if key in wv:
            # torch.tensor copies the data, so the (possibly read-only)
            # array owned by the gensim model is never shared or modified.
            embedding_matrix[idx] = torch.tensor(wv[key], dtype=torch.float32)
        else:
            embedding_matrix[idx] = torch.randn(emb_dim)

    return embedding_matrix

After that, create an embedding matrix

In [None]:
# Take the token -> index map straight from the vocabulary
# (note: _token_to_idx is an underscore-prefixed attribute of Vocabulary).
word_to_idx = vocab._token_to_idx
embedding_matrix = create_embedding_matrix(wv, word_to_idx)

## RNN Model

We only modify the model so that its word embedding layer is initialized with the pre-trained word vectors.

In [None]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class TextClassifier(nn.Module):
    """Bidirectional LSTM classifier on top of a pre-trained embedding layer.

    The embedding layer is initialized from `embedding_matrix` and is
    fine-tuned during training (`freeze=False`). The final forward and
    backward hidden states of the last LSTM layer are concatenated and fed
    to a linear layer followed by a sigmoid.
    """

    def __init__(self, embedding_matrix, rnn_hidden_size, num_classes,
                 num_layers=1, batch_first=True, padding_idx=0):
        """
        Args:
            embedding_matrix (torch.Tensor): initial embedding weights with
                shape (vocab_size, embedding_size)
            rnn_hidden_size (int): hidden size of each LSTM direction
            num_classes (int): number of output units
            num_layers (int): number of stacked LSTM layers
            batch_first (bool): whether inputs are (batch, seq) rather than
                (seq, batch)
            padding_idx (int): index of the padding token in the vocabulary
        """
        super(TextClassifier, self).__init__()

        _, embedding_size = embedding_matrix.shape

        # Remembered so that forward() packs sequences with the same layout
        # the LSTM was built for.
        self.batch_first = batch_first

        self.emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=False,
                                                padding_idx=padding_idx)
        # Use Bidirectional LSTM
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=rnn_hidden_size,
                          batch_first=batch_first, num_layers=num_layers, bidirectional=True)
        # Factor 2: forward and backward hidden states are concatenated.
        self.fc = nn.Linear(in_features=2*rnn_hidden_size, out_features=num_classes)

    def forward(self, x_in, x_lens):
        """Compute sigmoid outputs for a padded batch.

        Args:
            x_in (torch.Tensor): padded token indices
            x_lens (torch.Tensor): unpadded length of every sequence in the
                batch (need not be sorted)

        Returns:
            torch.Tensor: values in [0, 1] with shape (batch, num_classes)
        """
        x_embed = self.emb(x_in)
        x_packed = pack_padded_sequence(x_embed, x_lens,
                                        batch_first=self.batch_first,
                                        enforce_sorted=False)
        _, (hidden_state, _) = self.rnn(x_packed)

        # Concatenating the final forward and backward hidden states
        # (the last two entries belong to the top LSTM layer).
        hidden = torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), dim=1)

        # The sigmoid is applied here, so the returned values are
        # probabilities rather than raw logits.
        probs = torch.sigmoid(self.fc(hidden))
        return probs

## Part 2: Customized Training loop (50 points)

Modify the training loop to print out the averaged loss value after each epoch.

In [None]:
from tqdm.notebook import trange, tqdm

# Model hyper-parameters.
rnn_hidden_size = 128
num_layers=1
# A single output unit: the classifier produces one value per example.
num_classes = 1
batch_first = True

model = TextClassifier(embedding_matrix,
                       rnn_hidden_size=rnn_hidden_size,
                       num_classes=num_classes,
                       batch_first=batch_first, num_layers=num_layers)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Optimization hyper-parameters.
learning_rate = 1e-3
batch_size = 16

# You can change epochs to 5 to save time
epochs = 20

# Binary cross-entropy on probabilities; TextClassifier.forward already
# applies the sigmoid.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)

def train():
    """Train the global `model` and print the average loss of every epoch.

    Uses the module-level `train_dataset`, `collate_batch`, `batch_size`,
    `epochs`, `device`, `criterion` and `optimizer`. The reported value is
    the mean of the per-batch losses of the epoch.
    """
    train_dataloader = DataLoader(
        train_dataset,
        collate_fn=collate_batch,
        batch_size=batch_size,
    )
    model.train()
    train_iterator = trange(int(epochs), desc="Epoch")

    for epoch in train_iterator:
        epoch_loss = 0.0
        num_batches = 0

        for x_in, x_lens, y in train_dataloader:
            # x_lens is deliberately left where collate_batch created it;
            # only the inputs and the targets are moved to the device.
            x_in = x_in.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            # Squeeze only the last (class) dimension: a bare squeeze() would
            # also drop the batch dimension of a final batch of size 1 and
            # make the prediction shape differ from the target shape.
            pred = model(x_in, x_lens).squeeze(-1)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()

            # .item() extracts a Python float, so no autograd graph is kept
            # alive across batches.
            epoch_loss += loss.item()
            num_batches += 1

        # Guard against an empty dataloader instead of dividing by zero.
        avg_epoch_loss = epoch_loss / num_batches if num_batches else float("nan")
        print(f"Epoch [{epoch + 1}/{epochs}] - Average Loss: {avg_epoch_loss}")

train()

## Evaluation

If there are no bugs in your code, you should be able to calculate evaluation metrics on the test data.

In [None]:
from sklearn import metrics

def evaluate():
    """Evaluate the global `model` on the test set and print a report.

    Uses the module-level `test_dataset`, `collate_batch`, `batch_size` and
    `device`. Predictions are obtained by thresholding the model output at
    0.5, then passed with the true labels to
    `sklearn.metrics.classification_report`.
    """
    model.eval()
    test_dataloader = DataLoader(
        test_dataset,
        collate_fn=collate_batch,
        shuffle=False,
        batch_size=batch_size,
    )

    preds = []
    true_labels = []
    with torch.no_grad():
        for x_in, x_lens, y in tqdm(test_dataloader, desc="Evaluating"):
            x_in = x_in.to(device)

            # Squeeze only the last (class) dimension: with a bare squeeze()
            # a final batch of size 1 becomes a 0-d tensor, whose tolist()
            # is a scalar that cannot be appended with `+=`.
            probs = model(x_in, x_lens).squeeze(-1)
            _preds = (probs > 0.5).type(torch.long)
            preds += _preds.cpu().tolist()
            # The labels are only collected, so they are not moved to the
            # device first.
            true_labels += y.tolist()

    print(metrics.classification_report(true_labels, preds))

evaluate()