# CIS6930 Week 6a: Recurrent Neural Networks (Student version)

---

Preparation: Go to `Runtime > Change runtime type` and choose `GPU` for the hardware accelerator.



In [None]:
gpu_info = !nvidia-smi -L
gpu_info = "\n".join(gpu_info)
if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU")
else:
    print(gpu_info)

In [None]:
# Install PyTorch Lightning into the notebook runtime.
# NOTE(review): none of the cells shown here import pytorch_lightning — confirm it is still needed.
!pip install pytorch-lightning

## Whitespace Tokenizer

Let's take a look at this "naive" whitespace tokenizer class. In addition to the `fit()` method, which simply assigns token IDs to previously unseen tokens, it has an `encode()` method that tokenizes a text and returns the list of token IDs, and a `decode()` method that maps token IDs back to tokens.

In [None]:
class WhiteSpaceTokenizer:
    """Simple tokenizer based on whitespace splitting.

    ID 0 is reserved for the padding token and ID 1 for the unknown token.
    Tokens seen by `fit()` receive consecutive IDs starting at 2, so no
    real token ever shares an ID with a special token. `len(tokenizer)`
    counts the special tokens as well, which means every ID produced by
    `encode()` (and the padding ID) is strictly smaller than
    `len(tokenizer)`.
    """

    def __init__(self,
                 max_size: int = 30000,
                 unk_token: str = "[UNK]",
                 pad_token: str = "[PAD]"):
        """Create an unfitted tokenizer that only knows the special tokens.

        Args:
            max_size: Upper bound on the vocabulary size, special tokens
                included.
            unk_token: Surface form used for out-of-vocabulary tokens.
            pad_token: Surface form used for the padding ID.
        """
        self.max_size = max_size
        self.pad_token_id = 0  # 0 is reserved for padding token
        self.pad_token = pad_token
        self.unk_token_id = 1
        self.unk_token = unk_token
        self.token2id = {}
        self.id2token = {}
        self._reset_vocab()

    def _reset_vocab(self):
        """Reset both mappings so they contain only the special tokens."""
        self.token2id = {self.pad_token: self.pad_token_id,
                         self.unk_token: self.unk_token_id}
        self.id2token = {v: k for k, v in self.token2id.items()}

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.token2id)

    def tokenize(self, text):
        """Simple whitespace splitting after lower casing.
        To use more sophisticated option, you can overwrite this logic."""
        # split() without an argument splits on any run of whitespace
        # (spaces, tabs, newlines) and never yields empty tokens.
        return text.lower().split()

    def fit(self, text):
        """Build the vocabulary from `text`, discarding any previous one.

        Tokens are numbered in order of first appearance. Once the
        vocabulary holds `max_size` entries, remaining tokens are ignored.
        """
        self._reset_vocab()
        for token in self.tokenize(text):
            if len(self.token2id) >= self.max_size:
                break  # vocabulary is full; nothing more can be added
            if token not in self.token2id:
                # The two special tokens occupy IDs 0 and 1, so
                # len(token2id) is always the next free ID.
                self.token2id[token] = len(self.token2id)

        # id2token is reverse mapping
        self.id2token = {int(v): k for k, v in self.token2id.items()}

    def encode(self, text):
        """Return the token IDs of `text`; unknown tokens map to the unknown ID."""
        return [self.token2id.get(token, self.unk_token_id)
                for token in self.tokenize(text)]

    def decode(self, token_ids):
        """Map token IDs back to tokens.

        IDs missing from the vocabulary are reported with a warning and
        decoded as "N/A" instead of raising.
        """
        tokens = []
        for token_id in token_ids:
            if token_id in self.id2token:
                token = self.id2token[token_id]
            else:
                print("WARNING: token_id={} not found in the vocabulary.".format(
                    token_id))
                token = "N/A"
            tokens.append(token)
        return tokens

In [None]:
# Fit the tokenizer on a short paragraph, then round-trip a new sentence.
text = "In machine learning, the perceptron is an algorithm for supervised learning of binary classifiers. A binary classifier is a function which can decide whether or not an input, represented by a vector of numbers, belongs to some specific class.[1] It is a type of linear classifier, i.e. a classification algorithm that makes its predictions based on a linear predictor function combining a set of weights with the feature vector."
tokenizer = WhiteSpaceTokenizer()
tokenizer.fit(text)

# "fun" never occurs in `text`, so it is encoded as the unknown-token ID.
token_ids = tokenizer.encode("Machine learning is fun")
print(token_ids)

# Map the IDs back to tokens to see what the tokenizer recovered.
print(tokenizer.decode(token_ids))

# Number of entries in the token -> ID mapping.
print(len(tokenizer))

In [None]:
# Display the learned token -> ID mapping.
tokenizer.token2id

## Playing around with PyTorch components

- `nn.Embedding`
- `nn.RNN`
- `nn.utils.rnn.pad_sequence`


In [None]:
import copy
from time import time
from typing import Any, Dict
import random

import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, TensorDataset, DataLoader

### `nn.Embedding`

In [None]:
# Embedding layer
embedding_dim = 32

# One trainable vector of size embedding_dim per vocabulary entry.
emb_layer = nn.Embedding(num_embeddings=len(tokenizer),
                         embedding_dim=embedding_dim)

# Both sentences contain four tokens, so their ID lists stack into one tensor.
text_data1 = ["Machine learning is fun",
              "Deep learning is great"]

print([tokenizer.encode(text) for text in text_data1])

X1 = torch.LongTensor([tokenizer.encode(text) for text in text_data1])
out1 = emb_layer(X1)

print(out1.shape) # (2, 4, 32) = (sentences, tokens, embedding_dim)
print(out1)

In [None]:
# This cell should raise an error. Why?
# (Compare the number of tokens in the two sentences.)
text_data2 = ["Machine learning is really fun",
              "Deep learning is great"]

torch.LongTensor([tokenizer.encode(text) for text in text_data2])

In [None]:
# Inspect the encoded sentences: the two ID lists have different lengths.
[tokenizer.encode(text) for text in text_data2]

### `nn.utils.rnn.pad_sequence`

In [None]:
# https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html

from torch.nn.utils.rnn import pad_sequence
# One 1-D tensor per sentence; the tensors have different lengths.
seq = [torch.LongTensor(tokenizer.encode(text)) for text in text_data2]
print(seq)

# batch_first=True returns a (batch, longest_length) tensor; shorter
# sequences are filled up with the default padding value.
padded_seq = pad_sequence(seq, batch_first=True)
print(padded_seq)    

In [None]:
# Same embedding configuration as emb_layer, but index 0 is declared as padding.
emb_layer_ignore_pad = nn.Embedding(num_embeddings=len(tokenizer),
                                    embedding_dim=embedding_dim,
                                    padding_idx=0)

no_padidx_out = emb_layer(padded_seq)
padidx_out = emb_layer_ignore_pad(padded_seq)

# [-1][-1] is the last position of the last (shorter) sentence, i.e. a padding position.
print(no_padidx_out[-1][-1])  # Some values are assigned to padding tokens!!
print(padidx_out[-1][-1])     # Compare with the line above.

### nn.RNN

This example only covers `nn.RNN`. `nn.LSTM` and `nn.GRU` generally have the same interface.

`nn.LSTM` differs from `nn.RNN` and `nn.GRU` in one important respect. Recall from the lecture what that difference is.

In [None]:
hidden_size = 32
num_layers = 3

# A stacked RNN whose input size matches the embedding vectors produced above.
rnn_layer = nn.RNN(input_size=embedding_dim,
                    hidden_size=hidden_size,
                    num_layers=num_layers,
                    batch_first=True) # True: (B, L, V), False: (L, B, V)

# out2: outputs for every position; h_n: last hidden state of each layer
out2, h_n = rnn_layer(out1) # last hidden state of each layer

In [None]:
# B = batch size, L = sequence length, D = hidden_size
print(out2.shape)  # (B, L, D)
print(h_n.shape)     # (NumLayer, B, D)

In [None]:
# Output at the last position of every sequence in the batch: (B, D)
out2[:, -1, :].shape

In [None]:
# Linear head mapping a hidden state to two output scores.
fc_layer = nn.Linear(in_features=hidden_size,
                     out_features=2)

# Feed only the output at the last position of each sequence.
out3 = fc_layer(out2[:, -1, :])
print(out3.shape)
out3

## Putting It All Together!

Let's put it all together to implement an RNN classifier model. 


In [None]:
class SimpleRNN(nn.Module):
    """Sequence classifier: embedding -> RNN -> dropout -> linear layer.

    Token ID 0 is treated as padding. Inputs are expected to be padded on
    the right only (as `pad_sequence` does), and ID 0 must not occur inside
    a sequence.
    """

    def __init__(self,
                 vocab_size: int,
                 embedding_dim: int = 128,
                 hidden_size: int = 64,
                 num_layers: int = 1,
                 num_output: int = 3):
        """
        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_size: Size of the RNN hidden state.
            num_layers: Number of stacked RNN layers.
            num_output: Number of output scores (classes).
        """
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size,
                                embedding_dim=embedding_dim,
                                padding_idx=0)
        self.rnn = nn.RNN(input_size=embedding_dim,
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(hidden_size, num_output)

    def forward(self, x):
        """Classify a batch of token-ID sequences.

        Args:
            x: Integer tensor of shape (batch, length), right-padded with 0.

        Returns:
            A tuple `(out, last_hidden_state)`: the class scores of shape
            (batch, num_output) and the RNN output at the last non-padding
            position of each sequence, of shape (batch, hidden_size).
        """
        emb = self.emb(x)
        hidden_states, _ = self.rnn(emb)

        # Taking hidden_states[:, -1, :] would read the state *after* the
        # padding positions for every sequence shorter than the longest one
        # in the batch. Pick the last real token of each row instead.
        lengths = (x != self.emb.padding_idx).sum(dim=1)
        last_idx = (lengths - 1).clamp(min=0)  # all-padding rows fall back to step 0
        batch_idx = torch.arange(x.size(0), device=x.device)
        last_hidden_state = hidden_states[batch_idx, last_idx]

        out = self.linear(self.dropout(last_hidden_state))
        return out, last_hidden_state

### US Airline Sentiment Analysis dataset

In [None]:
# https://www.kaggle.com/crowdflower/twitter-airline-sentiment
# License CC BY-NC-SA 4.0
!gdown --id 1BS_TIqm7crkBRr8p6REZrMv4Uk9_-e6W

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the tweets CSV into a DataFrame
df = pd.read_csv("Tweets.csv")

# Tokenizer (Note: Technically, tokenizer should be fit only on training data)
# The vocabulary is built from all tweets joined into one string, then every
# tweet is converted to its list of token IDs.
tokenizer = WhiteSpaceTokenizer()
tokenizer.fit(" ".join(df["text"].tolist()))
df["token_ids"] = df["text"].apply(tokenizer.encode)

# Turn the sentiment strings into integer class labels
le = LabelEncoder()
y = le.fit_transform(df["airline_sentiment"].values)
df["label"] = y

# Show the columns used in the rest of the notebook
df[["airline_sentiment", "label", "text", "token_ids"]]

In [None]:
# Split into 60% train, 20% valid, 20% test
# First hold out 20% of all rows as the test set ...
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=1)

# ... then take 25% of the remaining 80% as the validation set.
train_df, valid_df = train_test_split(
    train_df, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2

print(len(train_df), len(valid_df), len(test_df))

### Design Custom Dataset

You can easily create a custom `Dataset` class. What you need to implement is

- 1) `__len__(self)` that returns the total number of samples.
- 2) `__getitem__(self, idx)` that returns the corresponding data for a given `idx`.

Pandas `DataFrame` makes the custom Dataset design simple. Let's take a look.

In [None]:
class TweetDataset(Dataset):
    """Dataset over a DataFrame that has "label" and "token_ids" columns."""

    def __init__(self,
                 df):
        # The frame this dataset serves; every sample is read from it.
        self.df = df

    def __len__(self):
        """Return the number of rows (samples) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at position `idx` of the wrapped DataFrame.

        Returns:
            A dict with "label" (LongTensor of shape (1,)) and "token_ids"
            (1-D LongTensor holding the row's token IDs).
        """
        # Read from self.df. Indexing a module-level `df` would make the
        # train, validation and test datasets all return rows of the same
        # full frame, regardless of the split passed to the constructor.
        row = self.df.iloc[idx]
        return {"label": torch.tensor([int(row["label"])], dtype=torch.long),
                "token_ids": torch.tensor(row["token_ids"], dtype=torch.long)}

### Collate function

The remaining challenge is that different tweets have different numbers of tokens. We have already learned how to use `pad_sequence` to handle this.

By passing a collate function to `DataLoader`, we can process batches on the fly.

In [None]:
def pad_collate(batch):
    """Merge variable-length samples into one padded batch.

    `batch` is a list of samples, each a dict holding a "label" tensor and
    a 1-D "token_ids" tensor, e.g.
        [{'label': tensor([0]), 'token_ids': tensor([1164, 122, 60, ...])},
         {'label': tensor([2]), 'token_ids': tensor([1164, 48, 1391, ...])},
         ...]

    Returns a dict with "token_ids" (the sequences padded to a common
    length, batch dimension first) and "label" (all labels concatenated
    into a single tensor).
    """
    sequences = []
    labels = []
    for sample in batch:
        sequences.append(sample["token_ids"])
        labels.append(sample["label"])

    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    return {"token_ids": padded, "label": torch.cat(labels)}

In [None]:
batch_size = 8 

# One dataset per split.
train_dataset = TweetDataset(train_df)
valid_dataset = TweetDataset(valid_df)
test_dataset = TweetDataset(test_df)

# pad_collate pads every batch on the fly so tweets of different lengths
# can be stacked into one tensor.
train_dl = DataLoader(train_dataset,
                      batch_size=batch_size,
                      shuffle=True,
                      drop_last=True,
                      collate_fn=pad_collate) # <=
# No batch_size is passed for the validation/test loaders, so the
# DataLoader default applies.
valid_dl = DataLoader(valid_dataset,
                      collate_fn=pad_collate) # <=
test_dl = DataLoader(test_dataset,
                     collate_fn=pad_collate) # <=

# Check
batch = next(iter(train_dl))
batch

In [None]:
def train(model: nn.Module,
          train_dataset: Dataset,
          valid_dataset: Dataset,
          config: Dict[str, Any],
          random_seed: int = 0):
    """Train `model` and keep the copy with the lowest validation loss.

    Args:
        model: Module whose forward pass returns `(scores, hidden_state)`.
        train_dataset: Samples used for the parameter updates.
        valid_dataset: Samples used for model selection after each epoch.
        config: Must provide "batch_size", "optimizer_cls", "lr" and
            "n_epochs".
        random_seed: Seed applied to torch, random and numpy.

    Returns:
        A dict with "model" (deep copy of the model at its best validation
        loss, or None if no epoch ran), "best_val" (that loss) and
        "eval_df" (one row per epoch with losses, accuracies and time).
    """
    # Random Seeds ===============
    torch.manual_seed(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Random Seeds ===============

    # GPU configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dl_train = DataLoader(train_dataset,
                          batch_size=config["batch_size"],
                          shuffle=True,
                          drop_last=True,
                          collate_fn=pad_collate)
    dl_valid = DataLoader(valid_dataset,
                          collate_fn=pad_collate)

    # Model, Optimizer, Loss function
    model = model.to(device)

    # Optimizer
    optimizer = config["optimizer_cls"](model.parameters(), lr=config["lr"])
    loss_fn = nn.CrossEntropyLoss()

    # For each epoch
    eval_list = []
    t0 = time()
    best_val = None
    best_model = None
    for n in range(config["n_epochs"]):
        t1 = time()
        print("Epoch {}".format(n))
        # Training
        train_loss = 0.
        train_pred_list = []
        train_true_list = []
        model.train()  # Switch to the training mode

        # For each batch
        for batch in dl_train:
            optimizer.zero_grad()              # Initialize gradient information
            X = batch["token_ids"].to(device)
            y = batch["label"].to(device)
            out, _ = model(X)                  # Call `forward()` function of the model
            loss = loss_fn(out, y)
            loss.backward()                    # Backpropagate the loss value
            optimizer.step()                   # Update the parameters
            train_loss += loss.item()
            train_pred_list += out.argmax(dim=1).detach().cpu().tolist()
            train_true_list += batch["label"].tolist()

        train_loss /= len(dl_train)
        train_acc = accuracy_score(train_true_list, train_pred_list)
        print("    Training loss: {:.4f}    Training acc: {:.4f}".format(train_loss,
                                                                         train_acc))

        # Validation
        valid_loss = 0.
        valid_pred_list = []
        valid_true_list = []

        model.eval()  # Switch to the evaluation mode
        # No parameter update happens here, so skip building the autograd
        # graph: this saves memory and time during evaluation.
        with torch.no_grad():
            for batch in dl_valid:
                X = batch["token_ids"].to(device)
                y = batch["label"].to(device)
                out, _ = model(X)  # Call `forward()` function of the model
                loss = loss_fn(out, y)
                valid_loss += loss.item()
                valid_pred_list += out.argmax(dim=1).cpu().tolist()
                valid_true_list += batch["label"].tolist()

        valid_loss /= len(dl_valid)
        valid_acc = accuracy_score(valid_true_list, valid_pred_list)
        print("  Validation loss: {:.4f}  Validation acc: {:.4f}".format(valid_loss,
                                                                         valid_acc))

        # Model selection
        if best_val is None or valid_loss < best_val:
            best_model = copy.deepcopy(model)
            best_val = valid_loss

        t2 = time()
        print("     Elapsed time: {:.1f} [sec]".format(t2 - t1))

        # Store train/validation loss values
        eval_list.append([n, train_loss, valid_loss, train_acc, valid_acc, t2 - t1])

    # "epoch" stays a regular column. An earlier `eval_df.set_index("epoch")`
    # call discarded its result and therefore never changed the frame; it
    # was removed so the returned layout is exactly what callers already get.
    eval_df = pd.DataFrame(eval_list, columns=["epoch",
                                               "train_loss", "valid_loss",
                                               "train_acc", "valid_acc",
                                               "time"])

    # Measured from t0 directly so this also works when no epoch was run.
    print("Total time: {:.1f} [sec]".format(time() - t0))

    # Return the best model and training/validation information
    return {"model": best_model,
            "best_val": best_val,
            "eval_df": eval_df}

In [None]:
# Training settings read by train(): optimizer class, learning rate,
# training batch size and number of epochs.
config = {"optimizer_cls": optim.Adam,          
          "lr": 0.001,
          "batch_size": 16,
          "n_epochs": 10}
# The embedding table is sized from the tokenizer's vocabulary.
model = SimpleRNN(vocab_size=len(tokenizer))
# output holds the best model, its validation loss and the per-epoch log.
output = train(model, train_dataset, valid_dataset, config)