In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# RNN implementation: Partition df_train into train, val, test. 
4th cell under Split Data - we make the train dataset in the form "parent_tweet_text -> tweet"

Number of epochs can be changed on the line right above the "Evaluation" section.

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df_dev = pd.read_csv('/content/drive/Shareddrives/CSE 573 Semantic Web Mining/Annotated/dev1.csv')
df_dev_body = pd.read_csv('/content/drive/Shareddrives/CSE 573 Semantic Web Mining/Annotated/dev_body1.csv')
df_dev_stance = pd.read_csv('/content/drive/Shareddrives/CSE 573 Semantic Web Mining/Annotated/dev_stance1.csv')
df_train = pd.read_csv('/content/drive/Shareddrives/CSE 573 Semantic Web Mining/Annotated/train1.csv')
df_train_body = pd.read_csv('/content/drive/Shareddrives/CSE 573 Semantic Web Mining/Annotated/train_body1.csv')
df_train_stance = pd.read_csv('/content/drive/Shareddrives/CSE 573 Semantic Web Mining/Annotated/train_stance1.csv')


# RNN code

# Set up

In [None]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn

In [None]:
SEED = 1234

In [None]:
def set_seeds(seed=1234):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # multi-GPU# Set seeds for reproducibility
set_seeds(seed=SEED)

In [None]:
# Set seeds for reproducibility
set_seeds(seed=SEED)

In [None]:
# Set device
cuda = True
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")
torch.set_default_tensor_type("torch.FloatTensor")
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
print (device)

cpu


## Load data

We will download the [AG News dataset](http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html), which consists of 120K text samples from 4 unique classes (`Business`, `Sci/Tech`, `Sports`, `World`)

In [None]:
import numpy as np
import pandas as pd
import re
import urllib

In [None]:
# Load data
df = df_train
df = df.sample(frac=1).reset_index(drop=True) # shuffle
df.head()

Unnamed: 0.1,Unnamed: 0,id,parent_id,parent_tweet,tweet,topic,classification
0,2409,500411534065950721,5.004100619742904e+17,Not sure 'shoot first &amp; uncover potential ...,Early reports said the store had not reported ...,ferguson,comment
1,1121,529716453792956416,,,Is Prince in Toronto? Rumours fly of surprise ...,prince-toronto,comment
2,2185,552803445711708160,5.5280265464122566e+17,Charlie Hebdo shooting latest: dead and gunme...,Dua belas. RT Charlie Hebdo shooting latest: ...,charliehebdo,support
3,3247,524931144476028928,5.249252152359117e+17,BREAKING news: Shots fired at Parliament Hill....,“globeandmail: BREAKING news: Shots fired at P...,ottawashooting,comment
4,1848,524953098201362432,5.249375421317939e+17,We are in full lock down until further notice ...,Stay safe my friends. RT We are in full lock d...,ottawashooting,comment


## Preprocessing

We're going to clean up our input data first by doing operations such as lower text, removing stop (filler) words, filters using regular expressions, etc.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [None]:
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5])
porter = PorterStemmer()

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocess(text, stopwords=STOPWORDS):
    """Conditional preprocessing on our text unique to our task."""
    if not isinstance(text, str):
      return str(text)
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    text = pattern.sub('', text)

    # Remove words in paranthesis
    text = re.sub(r'\([^)]*\)', '', text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric chars
    text = re.sub(' +', ' ', text)  # remove multiple spaces
    text = text.strip()

    return text

In [None]:
# Sample
text = 0
preprocess(text=text)

'0'

In [None]:
# Apply to dataframe
preprocessed_df = df.copy()
preprocessed_df.parent_tweet = preprocessed_df.parent_tweet.apply(preprocess)
preprocessed_df.tweet = preprocessed_df.tweet.apply(preprocess)

# Test if it's working
preprocessed_df.parent_tweet

0       sure shoot first amp uncover potential crime l...
1                                                     nan
2       charlie hebdo shooting latest dead gunmen stil...
3       breaking news shots fired parliament hill foll...
4                          full lock notice ottawa police
                              ...                        
4233    police say shots fired ottawa sites national w...
4234    tv channels chosen show videos hostages relayi...
4235    breaking news soldier shot national war memori...
4236    surveillance robbery release stills video ferg...
4237    police confirm sydneysiege finally two people ...
Name: parent_tweet, Length: 4238, dtype: object

## Split data

In [None]:
import collections
from sklearn.model_selection import train_test_split

In [None]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [None]:
def train_val_test_split(X, y, train_size):
    """Split dataset into data splits."""
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Data
X = (preprocessed_df["parent_tweet"] + " -> "+ preprocessed_df["tweet"]).values
y = preprocessed_df["classification"].values

In [None]:
# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X=X, y=y, train_size=TRAIN_SIZE)
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} → {y_train[0]}")

X_train: (2966,), y_train: (2966,)
X_val: (636,), y_val: (636,)
X_test: (636,), y_test: (636,)
Sample point: black islamic flag held window lindt chocolate store martin place sydney hostages inside -> still sure motivation attack → query


## LabelEncoder

Next we'll define a `LabelEncoder` to encode our text labels into unique indices

In [None]:
import itertools

In [None]:
class LabelEncoder(object):
    """Label encoder for tag labels."""
    def __init__(self, class_to_index={}):
        self.class_to_index = class_to_index or {}  # mutable defaults ;)
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        encoded = np.zeros((len(y)), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y):
        classes = []
        for i, item in enumerate(y):
            classes.append(self.index_to_class[item])
        return classes

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {'class_to_index': self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

In [None]:
# Encode
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'comment': 0, 'deny': 1, 'query': 2, 'support': 3}

In [None]:
# Convert labels to tokens
print (f"y_train[0]: {y_train[0]}")
y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)
print (f"y_train[0]: {y_train[0]}")

y_train[0]: query
y_train[0]: 2


In [None]:
# Class weights
counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"counts: {counts}\nweights: {class_weights}")

counts: [1913  233  231  589]
weights: {0: 0.0005227391531625719, 1: 0.004291845493562232, 2: 0.004329004329004329, 3: 0.001697792869269949}


## Tokenizer

We'll define a `Tokenizer` to convert our text input data into token indices.

In [None]:
import json
from collections import Counter
from more_itertools import take

In [None]:
class Tokenizer(object):
    def __init__(self, char_level, num_tokens=None, 
                 pad_token='<PAD>', oov_token='<UNK>',
                 token_to_index=None):
        self.char_level = char_level
        self.separator = '' if self.char_level else ' '
        if num_tokens: num_tokens -= 2 # pad + unk tokens
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token: 0, oov_token: 1}
        self.token_to_index = token_to_index
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}

    def __len__(self):
        return len(self.token_to_index)

    def __str__(self):
        return f"<Tokenizer(num_tokens={len(self)})>"

    def fit_on_texts(self, texts):
        if not self.char_level:
            texts = [text.split(" ") for text in texts]
        all_tokens = [token for text in texts for token in text]
        counts = Counter(all_tokens).most_common(self.num_tokens)
        self.min_token_freq = counts[-1][1]
        for token, count in counts:
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            if not self.char_level:
                text = text.split(' ')
            sequence = []
            for token in text:
                sequence.append(self.token_to_index.get(
                    token, self.token_to_index[self.oov_token]))
            sequences.append(np.asarray(sequence))
        return sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            text = []
            for index in sequence:
                text.append(self.index_to_token.get(index, self.oov_token))
            texts.append(self.separator.join([token for token in text]))
        return texts

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {
                "char_level": self.char_level,
                "oov_token": self.oov_token,
                "token_to_index": self.token_to_index
            }
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

In [None]:
# Tokenize
tokenizer = Tokenizer(char_level=False, num_tokens=5000)
tokenizer.fit_on_texts(texts=X_train)
VOCAB_SIZE = len(tokenizer)
print (tokenizer)

<Tokenizer(num_tokens=5000)>


In [None]:
# Sample of tokens
print (take(5, tokenizer.token_to_index.items()))
print (f"least freq token's freq: {tokenizer.min_token_freq}") # use this to adjust num_tokens

[('<PAD>', 0), ('<UNK>', 1), ('->', 2), ('police', 3), ('ferguson', 4)]
least freq token's freq: 1


In [None]:
# Convert texts to sequences of indices
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)
preprocessed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
print ("Text to indices:\n"
    f"  (preprocessed) → {preprocessed_text}\n"
    f"  (tokenized) → {X_train[0]}")

Text to indices:
  (preprocessed) → black islamic flag held window lindt chocolate store martin place sydney hostages inside -> still sure motivation attack
  (tokenized) → [  75   76   33   32  121   97  727   55  175   70    5    9   58    2
   38  185 3271   27]


## Padding

We'll need to do 2D padding to our tokenized text.

In [None]:
def pad_sequences(sequences, max_seq_len=0):
    """Pad sequences to max length in sequence."""
    max_seq_len = max(max_seq_len, max(len(sequence) for sequence in sequences))
    padded_sequences = np.zeros((len(sequences), max_seq_len))
    for i, sequence in enumerate(sequences):
        padded_sequences[i][:len(sequence)] = sequence
    return padded_sequences

In [None]:
# 2D sequences
padded = pad_sequences(X_train[0:3])
print (padded.shape)
print (padded)

(3, 18)
[[7.500e+01 7.600e+01 3.300e+01 3.200e+01 1.210e+02 9.700e+01 7.270e+02
  5.500e+01 1.750e+02 7.000e+01 5.000e+00 9.000e+00 5.800e+01 2.000e+00
  3.800e+01 1.850e+02 3.271e+03 2.700e+01]
 [1.500e+01 2.000e+00 2.560e+02 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [3.000e+00 1.076e+03 5.600e+01 1.560e+02 1.900e+01 7.300e+01 2.000e+02
  4.900e+01 6.200e+01 2.300e+01 3.700e+01 2.000e+00 2.500e+01 8.800e+01
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]]


## Datasets

We're going to create Datasets and DataLoaders to be able to efficiently create batches with our data splits.

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y,):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index]
        return [X, len(X), y]

    def collate_fn(self, batch):
        """Processing on a batch."""
        # Get inputs
        batch = np.array(batch, dtype=object)
        X = batch[:, 0]
        seq_lens = batch[:, 1]
        y = np.stack(batch[:, 2], axis=0)

        # Pad inputs
        X = pad_sequences(sequences=X)

        # Cast
        X = torch.LongTensor(X.astype(np.int32))
        seq_lens = torch.LongTensor(seq_lens.astype(np.int32))
        y = torch.LongTensor(y.astype(np.int32))

        return X, seq_lens, y

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        return torch.utils.data.DataLoader(
            dataset=self, batch_size=batch_size, collate_fn=self.collate_fn,
            shuffle=shuffle, drop_last=drop_last, pin_memory=True)

In [None]:
# Create datasets
train_dataset = Dataset(X=X_train, y=y_train)
val_dataset = Dataset(X=X_val, y=y_val)
test_dataset = Dataset(X=X_test, y=y_test)
print ("Datasets:\n"
    f"  Train dataset:{train_dataset.__str__()}\n"
    f"  Val dataset: {val_dataset.__str__()}\n"
    f"  Test dataset: {test_dataset.__str__()}\n"
    "Sample point:\n"
    f"  X: {train_dataset[0][0]}\n"
    f"  seq_len: {train_dataset[0][1]}\n"
    f"  y: {train_dataset[0][2]}")

Datasets:
  Train dataset:<Dataset(N=2966)>
  Val dataset: <Dataset(N=636)>
  Test dataset: <Dataset(N=636)>
Sample point:
  X: [  75   76   33   32  121   97  727   55  175   70    5    9   58    2
   38  185 3271   27]
  seq_len: 18
  y: 2


In [None]:
# Create dataloaders
batch_size = 64
train_dataloader = train_dataset.create_dataloader(
    batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(
    batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(
    batch_size=batch_size)
batch_X, batch_seq_lens, batch_y = next(iter(train_dataloader))
print ("Sample batch:\n"
    f"  X: {list(batch_X.size())}\n"
    f"  seq_lens: {list(batch_seq_lens.size())}\n"
    f"  y: {list(batch_y.size())}\n"
    "Sample point:\n"
    f"  X: {batch_X[0]}\n"
    f" seq_len: {batch_seq_lens[0]}\n"
    f"  y: {batch_y[0]}")

Sample batch:
  X: [64, 30]
  seq_lens: [64]
  y: [64]
Sample point:
  X: tensor([  75,   76,   33,   32,  121,   97,  727,   55,  175,   70,    5,    9,
          58,    2,   38,  185, 3271,   27,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])
 seq_len: 18
  y: 2


## Trainer

Let's create the `Trainer` class that we'll use to facilitate training for our experiments.

In [None]:
class Trainer(object):
    def __init__(self, model, device, loss_fn=None, optimizer=None, scheduler=None):

        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self, dataloader):
        """Train step."""
        # Set model to train mode
        self.model.train()
        loss = 0.0

        # Iterate over train batches
        for i, batch in enumerate(dataloader):

            # Step
            batch = [item.to(self.device) for item in batch]  # Set device
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad()  # Reset gradients
            z = self.model(inputs)  # Forward pass
            J = self.loss_fn(z, targets)  # Define loss
            J.backward()  # Backward pass
            self.optimizer.step()  # Update weights

            # Cumulative Metrics
            loss += (J.detach().item() - loss) / (i + 1)

        return loss

    def eval_step(self, dataloader):
        """Validation or test step."""
        # Set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Step
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs, y_true = batch[:-1], batch[-1]
                z = self.model(inputs)  # Forward pass
                J = self.loss_fn(z, y_true).item()

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Prediction step."""
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs
                inputs, targets = batch[:-1], batch[-1]
                z = self.model(inputs)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)
    
    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        best_val_loss = np.inf
        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = self.model
                _patience = patience  # reset _patience
            else:
                _patience -= 1
            if not _patience:  # 0
                print("Stopping early!")
                break

            # Logging
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )
        return best_model

# Vanilla RNN

Inputs to RNNs are sequential like text or time-series.

In [None]:
BATCH_SIZE = 64
EMBEDDING_DIM = 100

In [None]:
# Input
sequence_size = 8 # words per input
x = torch.rand((BATCH_SIZE, sequence_size, EMBEDDING_DIM))
seq_lens = torch.randint(high=sequence_size, size=(1, BATCH_SIZE))
print (x.shape)
print (seq_lens.shape)

torch.Size([64, 8, 100])
torch.Size([1, 64])


In [None]:
RNN_HIDDEN_DIM = 128
DROPOUT_P = 0.1
RNN_DROPOUT_P = 0.1

In [None]:
# Initialize hidden state
hidden_t = torch.zeros((BATCH_SIZE, RNN_HIDDEN_DIM))
print (hidden_t.size())

torch.Size([64, 128])


In [None]:
# Initialize RNN cell
rnn_cell = nn.RNNCell(EMBEDDING_DIM, RNN_HIDDEN_DIM)
print (rnn_cell)

RNNCell(100, 128)


In [None]:
# Forward pass through RNN
x = x.permute(1, 0, 2) # RNN needs batch_size to be at dim 1

# Loop through the inputs time steps
hiddens = []
for t in range(sequence_size):
    hidden_t = rnn_cell(x[t], hidden_t)
    hiddens.append(hidden_t)
hiddens = torch.stack(hiddens)
hiddens = hiddens.permute(1, 0, 2) # bring batch_size back to dim 0
print (hiddens.size())

torch.Size([64, 8, 128])


In [None]:
# We also could've used a more abstracted layer
x = torch.rand((BATCH_SIZE, sequence_size, EMBEDDING_DIM))
rnn = nn.RNN(EMBEDDING_DIM, RNN_HIDDEN_DIM, batch_first=True)
out, h_n = rnn(x) # h_n is the last hidden state
print ("out: ", out.shape)
print ("h_n: ", h_n.shape)

out:  torch.Size([64, 8, 128])
h_n:  torch.Size([1, 64, 128])


In [None]:
# The same tensors
print (out[:,-1,:])
print (h_n.squeeze(0))

tensor([[-0.0359, -0.3819,  0.2162,  ..., -0.3397,  0.0468,  0.1937],
        [-0.4914, -0.3056, -0.0837,  ..., -0.3507, -0.4320,  0.3593],
        [-0.0989, -0.2852,  0.1170,  ..., -0.0805, -0.0786,  0.3922],
        ...,
        [-0.3115, -0.4169,  0.2611,  ..., -0.3214,  0.0620,  0.0338],
        [-0.2455, -0.3380,  0.2048,  ..., -0.4198, -0.0075,  0.0372],
        [-0.2092, -0.4594,  0.1654,  ..., -0.5397, -0.1709,  0.0023]],
       grad_fn=<SliceBackward0>)
tensor([[-0.0359, -0.3819,  0.2162,  ..., -0.3397,  0.0468,  0.1937],
        [-0.4914, -0.3056, -0.0837,  ..., -0.3507, -0.4320,  0.3593],
        [-0.0989, -0.2852,  0.1170,  ..., -0.0805, -0.0786,  0.3922],
        ...,
        [-0.3115, -0.4169,  0.2611,  ..., -0.3214,  0.0620,  0.0338],
        [-0.2455, -0.3380,  0.2048,  ..., -0.4198, -0.0075,  0.0372],
        [-0.2092, -0.4594,  0.1654,  ..., -0.5397, -0.1709,  0.0023]],
       grad_fn=<SqueezeBackward1>)


In our model, we want to use the RNN's output after the last relevant token in the sentence is processed. The last relevant token doesn't refer the `<PAD>` tokens but to the last actual word in the sentence and its index is different for each input in the batch. This is why we included a `seq_lens` tensor in our batches.

In [None]:
def gather_last_relevant_hidden(hiddens, seq_lens):
    """Extract and collect the last relevant 
    hidden state based on the sequence length."""
    seq_lens = seq_lens.long().detach().cpu().numpy() - 1
    out = []
    for batch_index, column_index in enumerate(seq_lens):
        out.append(hiddens[batch_index, column_index])
    return torch.stack(out)

In [None]:
# Get the last relevant hidden state
gather_last_relevant_hidden(hiddens=out, seq_lens=seq_lens).squeeze(0).shape

torch.Size([64, 128])

## Model

In [None]:
import torch.nn.functional as F

In [None]:
HIDDEN_DIM = 100

In [None]:
class RNN(nn.Module):
    def __init__(self, embedding_dim, vocab_size, rnn_hidden_dim,
                 hidden_dim, dropout_p, num_classes, padding_idx=0):
        super(RNN, self).__init__()
        
        # Initialize embeddings
        self.embeddings = nn.Embedding(
            embedding_dim=embedding_dim, num_embeddings=vocab_size,
            padding_idx=padding_idx)
        
        # RNN
        self.rnn = nn.RNN(embedding_dim, rnn_hidden_dim, batch_first=True)
     
        # FC weights
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(rnn_hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs):
        # Embed
        x_in, seq_lens = inputs
        x_in = self.embeddings(x_in)
            
        # Rnn outputs
        out, h_n = self.rnn(x_in)
        z = gather_last_relevant_hidden(hiddens=out, seq_lens=seq_lens)

        # FC layers
        z = self.fc1(z)
        z = self.dropout(z)
        z = self.fc2(z)
        return z

In [None]:
# Simple RNN cell
model = RNN(
    embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE, 
    rnn_hidden_dim=RNN_HIDDEN_DIM, hidden_dim=HIDDEN_DIM, 
    dropout_p=DROPOUT_P, num_classes=NUM_CLASSES)
model = model.to(device) # set device
print (model.named_parameters)

<bound method Module.named_parameters of RNN(
  (embeddings): Embedding(5000, 100, padding_idx=0)
  (rnn): RNN(100, 128, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=128, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)>


## Training

In [None]:
from torch.optim import Adam

In [None]:
NUM_LAYERS = 1
LEARNING_RATE = 1e-4
PATIENCE = 50
NUM_EPOCHS = 50

In [None]:
# Define Loss
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

In [None]:
# Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=LEARNING_RATE) 
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)

In [None]:
# Trainer module
trainer = Trainer(
    model=model, device=device, loss_fn=loss_fn, 
    optimizer=optimizer, scheduler=scheduler)

In [None]:
# Train
best_model = trainer.train(
    NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)



Epoch: 1 | train_loss: 1.39516, val_loss: 1.39864, lr: 1.00E-04, _patience: 50
Epoch: 2 | train_loss: 1.37513, val_loss: 1.39239, lr: 1.00E-04, _patience: 50
Epoch: 3 | train_loss: 1.36242, val_loss: 1.38752, lr: 1.00E-04, _patience: 50
Epoch: 4 | train_loss: 1.34279, val_loss: 1.38350, lr: 1.00E-04, _patience: 50
Epoch: 5 | train_loss: 1.32747, val_loss: 1.37985, lr: 1.00E-04, _patience: 50
Epoch: 6 | train_loss: 1.31053, val_loss: 1.37653, lr: 1.00E-04, _patience: 50
Epoch: 7 | train_loss: 1.28917, val_loss: 1.37359, lr: 1.00E-04, _patience: 50
Epoch: 8 | train_loss: 1.26294, val_loss: 1.37171, lr: 1.00E-04, _patience: 50
Epoch: 9 | train_loss: 1.23258, val_loss: 1.37234, lr: 1.00E-04, _patience: 49
Epoch: 10 | train_loss: 1.20050, val_loss: 1.37794, lr: 1.00E-04, _patience: 48
Epoch: 11 | train_loss: 1.16603, val_loss: 1.38896, lr: 1.00E-04, _patience: 47
Epoch: 12 | train_loss: 1.13184, val_loss: 1.40334, lr: 1.00E-05, _patience: 46
Epoch: 13 | train_loss: 1.10126, val_loss: 1.4051

## Evaluation

In [None]:
import json
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
def get_performance(y_true, y_pred, classes):
    """Per-class performance metrics."""
    # Performance
    performance = {"overall": {}, "class": {}}

    # Overall performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    performance["overall"]["precision"] = metrics[0]
    performance["overall"]["recall"] = metrics[1]
    performance["overall"]["f1"] = metrics[2]
    performance["overall"]["num_samples"] = np.float64(len(y_true))
    performance["overall"]["accuracy"] = accuracy_score(y_true, y_pred)
   
    # Per-class performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        performance["class"][classes[i]] = {
            "precision": metrics[0][i],
            "recall": metrics[1][i],
            "f1": metrics[2][i],
            "num_samples": np.float64(metrics[3][i]),
            "accuracy": accuracy_score(y_true, y_pred),
        }

    return performance

In [None]:
# Get predictions
test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
y_pred = np.argmax(y_prob, axis=1)



In [None]:
# Determine performance
performance = get_performance(
    y_true=y_test, y_pred=y_pred, classes=label_encoder.classes)
print (json.dumps(performance["overall"], indent=2))

{
  "precision": 0.5073093558255758,
  "recall": 0.27672955974842767,
  "f1": 0.2957695789042687,
  "num_samples": 636.0,
  "accuracy": 0.27672955974842767
}
