In [3]:
import json
import os
from collections import Counter

import gensim.downloader as api
import nltk
import numpy as np
import torch
from datasets import load_dataset
from nltk.tokenize import word_tokenize

import sys
sys.path.append('../tasks/classification/models')
sys.path.append('../tasks/classification/')


# Loading dataset and data processing

In [4]:
# Load the `rotten_tomatoes` dataset and bind each of its three splits to a name.
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [5]:
# Load the pretrained `word2vec-google-news-300` vectors via gensim's downloader.
word2vec_model = api.load('word2vec-google-news-300')
# Embedding matrix; passed to the model below as `pretrained_embeddings`.
embeddings = word2vec_model.vectors
# Token -> index mapping; used below as the tokenizer vocabulary.
w2v_vocab = word2vec_model.key_to_index

In [6]:
w2v_vocab['UNK'], w2v_vocab['PAD']

(98307, 35636)

In [7]:
w2v_vocab['pad']

11380

## Tokenizing

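Words that are missing from the word2vec vocabulary can be handled in two ways:

1. We ignore the words.
2. We map them to `UNK`, which is available in the word2vec vocabulary (this is what the tokenizer below does).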

In [8]:
def tokenize(vocab, text):
    """Tokenize ``text`` with NLTK and map each token to its id in ``vocab``.

    Args:
        vocab: Mapping from token string to integer id. It needs an ``"UNK"``
            entry only if ``text`` can contain out-of-vocabulary tokens.
        text: Raw input string; it is lower-cased before tokenization.

    Returns:
        dict with ``"tokens"`` (list of token strings) and ``"ids"`` (list of
        vocabulary ids, using ``vocab["UNK"]`` for unknown tokens).

    Raises:
        KeyError: If a token is not in ``vocab`` and ``vocab`` has no
            ``"UNK"`` entry.
    """
    tokens = word_tokenize(text.lower())
    # Looked up once and lazily: a vocabulary without "UNK" is fine as long as
    # every token is known.
    unk_id = vocab.get("UNK")
    token_ids = []
    for token in tokens:
        token_id = vocab.get(token, unk_id)
        if token_id is None:
            raise KeyError(
                f"token {token!r} is not in the vocabulary and no 'UNK' entry is available"
            )
        token_ids.append(token_id)
    return {"tokens": tokens, "ids": token_ids}

In [9]:
# Fetch the NLTK 'punkt' data; presumably what `word_tokenize` needs — confirm for the installed NLTK version.
nltk.download('punkt')

class NLTKTokenizer:
    """Word-level tokenizer: lower-cases text, splits it with NLTK and maps tokens to vocabulary ids."""

    def __init__(self, config=None):
        """Create a tokenizer with an empty vocabulary.

        Args:
            config: Optional settings dict; ``build_vocab`` reads
                ``config["dataset"]``.
        """
        self.vocab = {}
        self.config = config or {}
        self.pad_id = None  # only set by from_pretrained; None otherwise

    @classmethod
    def from_pretrained(cls, vocab):
        """Create a tokenizer around an existing token -> id mapping.

        Args:
            vocab: Mapping from token string to integer id.

        Returns:
            A tokenizer whose ``pad_id`` is ``vocab["PAD"]`` when that entry
            exists and ``1`` otherwise.
        """
        tokenizer = cls()
        tokenizer.vocab = vocab
        tokenizer.pad_id = vocab.get("PAD", 1)
        return tokenizer

    def build_vocab(self):
        """Build the vocabulary from the train split of ``config["dataset"]``.

        Every distinct lower-cased token (plus any key already in
        ``self.vocab``) receives an id, starting at 1.

        NOTE(review): no "UNK"/"PAD" entries are added here and ``pad_id`` is
        left untouched, so a tokenizer built this way cannot map unknown words.

        Raises:
            KeyError: If ``config`` has no ``"dataset"`` entry.
        """
        from collections import Counter
        from datasets import load_dataset

        dataset = load_dataset(self.config["dataset"])
        train_dataset = dataset['train']
        # Seeding the counter with the current vocab keeps its keys; the counts
        # themselves are discarded below.
        vocab = Counter(self.vocab)
        for item in train_dataset:
            vocab.update(word_tokenize(item['text'].lower()))
        self.vocab = {word: idx for idx, word in enumerate(vocab, 1)}  # Index starts at 1

    def tokenize(self, text):
        """Tokenize ``text`` and map each token to its vocabulary id.

        Args:
            text: Raw input string; it is lower-cased before tokenization.

        Returns:
            dict with ``"tokens"`` (list of token strings) and ``"ids"`` (list
            of vocabulary ids, using the ``"UNK"`` id for unknown tokens).

        Raises:
            KeyError: If a token is unknown and the vocabulary has no
                ``"UNK"`` entry.
        """
        tokens = word_tokenize(text.lower())
        # Resolve the fallback once and lazily, so a vocabulary without "UNK"
        # still works when every token is known.
        unk_id = self.vocab.get("UNK")
        token_ids = []
        for token in tokens:
            token_id = self.vocab.get(token, unk_id)
            if token_id is None:
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and no 'UNK' entry is available"
                )
            token_ids.append(token_id)
        return {"tokens": tokens, "ids": token_ids}

    def save(self, folder_path):
        """Write the vocabulary to ``<folder_path>/vocab.json``, creating the folder if needed."""
        os.makedirs(folder_path, exist_ok=True)
        with open(os.path.join(folder_path, "vocab.json"), "w") as f:
            json.dump(self.vocab, f)

[nltk_data] Downloading package punkt to /home/bach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Wrap the word2vec token -> index mapping in the tokenizer; `pad_id` becomes
# the index of the mapping's "PAD" entry.
tokenizer = NLTKTokenizer.from_pretrained(w2v_vocab)

## Dataloader

In [11]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [12]:
class ClassificationDataset(Dataset):
    """Torch ``Dataset`` that tokenizes text-classification examples on access.

    Args:
        dataset: Indexable collection of items with ``"text"`` and ``"label"``
            entries.
        tokenizer: Object whose ``tokenize(text)`` returns a dict with an
            ``"ids"`` list.
    """

    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(ids, length, label)`` for the example at ``idx``.

        ``ids`` is a 1-D ``torch.long`` tensor of token ids, ``length`` is its
        number of tokens and ``label`` is passed through unchanged.
        """
        item = self.dataset[idx]
        ids = self.tokenizer.tokenize(item["text"])["ids"]
        # Explicit dtype: torch.tensor([]) would otherwise be float32, which
        # cannot be used as embedding indices for an empty text.
        return torch.tensor(ids, dtype=torch.long), len(ids), item["label"]

In [13]:
def get_dataloaders(
    tokenizer,
    dataset,
    training_bs,
    val_bs,
):
    """Build padded mini-batch loaders for the train/validation/test splits.

    Args:
        tokenizer: Tokenizer handed to ``ClassificationDataset``; its ``pad_id``
            is used as the padding value.
        dataset: Mapping with ``"train"``, ``"validation"`` and ``"test"`` splits.
        training_bs: Batch size of the training loader.
        val_bs: Batch size of the validation and test loaders.

    Returns:
        ``(train_loader, val_loader, test_loader)``. Each batch is a tuple
        ``(padded_ids, lengths, labels)``. Only the training loader shuffles;
        validation and test keep dataset order so evaluation is reproducible.
    """
    train_dataset = ClassificationDataset(dataset["train"], tokenizer)
    validation_dataset = ClassificationDataset(dataset["validation"], tokenizer)
    test_dataset = ClassificationDataset(dataset["test"], tokenizer)

    # Collate function (a closure over `tokenizer`) used by every DataLoader.
    def padding_fn(batch):
        # Pad the sequences of one batch to the length of its longest member.
        xx, lengths, yy = zip(*batch)
        xx_pad = pad_sequence(xx, batch_first=True, padding_value=tokenizer.pad_id)
        return xx_pad, torch.tensor(lengths), torch.tensor(yy)

    train_loader = DataLoader(train_dataset, batch_size=training_bs, shuffle=True, collate_fn=padding_fn)
    val_loader = DataLoader(validation_dataset, batch_size=val_bs, shuffle=False, collate_fn=padding_fn)
    test_loader = DataLoader(test_dataset, batch_size=val_bs, shuffle=False, collate_fn=padding_fn)

    return train_loader, val_loader, test_loader

In [14]:
# Build the three loaders; training and evaluation both use a batch size of 32.
train_loader, val_loader, test_loader = get_dataloaders(
    tokenizer=tokenizer, 
    dataset=dataset, 
    training_bs=32,
    val_bs=32,
)

# Initializing the model

In [15]:
import torch
import torch.nn as nn
# `F` is the conventional alias for torch.nn.functional (activations, losses);
# torch.functional is a different module and offers none of those.
import torch.nn.functional as F
import json

In [16]:
import sys
sys.path.append('../tasks/classification/models')

In [17]:
class RNNLayer(nn.Module):
    """Hand-rolled recurrent layer that scans a batch of sequences step by step.

    At every step the current input vector is concatenated with the previous
    hidden state. Two linear maps of that concatenation produce the next
    hidden state (``i2h``) and the step output (``i2o``).

    NOTE(review): no non-linearity is applied to the hidden update, so the
    recurrence is purely linear — confirm this is intended.

    Args:
        dim_input: Size of each input vector.
        dim_hidden: Size of the hidden state.
        dim_output: Size of each step output.
        direction: ``1`` scans left-to-right; any other value scans
            right-to-left.
    """

    def __init__(self, dim_input, dim_hidden, dim_output, direction=1):
        super(RNNLayer, self).__init__()
        self.dim_input = dim_input
        self.dim_hidden = dim_hidden

        self.i2h = nn.Linear(dim_input + dim_hidden, dim_hidden)
        self.i2o = nn.Linear(dim_input + dim_hidden, dim_output)
        self.direction = direction

    def forward(self, input, hidden):
        """Run the recurrence over ``input``.

        Args:
            input: Tensor of shape ``(batch_size, seq_len, dim_input)``.
            hidden: Initial hidden state of shape ``(batch_size, dim_hidden)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, dim_output)`` whose position
            ``i`` is the output computed when reading input position ``i``,
            for both scan directions.
        """
        seq_len = input.size(1)
        if seq_len == 0:
            # torch.stack cannot handle an empty list; return an empty result.
            return input.new_zeros(input.size(0), 0, self.i2o.out_features)

        left_to_right = self.direction == 1
        positions = range(seq_len) if left_to_right else range(seq_len - 1, -1, -1)

        outputs = []
        for i in positions:
            combined = torch.cat((input[:, i, :], hidden), dim=1)
            hidden = self.i2h(combined)
            outputs.append(self.i2o(combined))

        if not left_to_right:
            # Outputs were collected last-position-first; restore input order.
            outputs.reverse()
        return torch.stack(outputs, dim=1)  # (batch_size, seq_len, dim_output)

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape ``(batch_size, dim_hidden)``.

        The tensor is created with the dtype and device of the layer weights.
        """
        return self.i2h.weight.new_zeros(batch_size, self.dim_hidden)

class RNN(nn.Module):
    """Token-level classifier: embedding lookup, recurrent layer, log-softmax.

    Args:
        vocab_size: Number of embedding rows; only used when no pretrained
            embeddings are given.
        dim_input: Embedding width, i.e. the input size of the recurrent layer.
        dim_hidden: Hidden-state size of the recurrent layer.
        dim_output: Number of classes.
        pretrained_embeddings: Optional array-like embedding matrix used to
            initialise the embedding table.
        freeze_embeddings: If True, pretrained embeddings are not trained.
    """

    def __init__(self, vocab_size, dim_input, dim_hidden, dim_output, pretrained_embeddings=None, freeze_embeddings=True):
        super(RNN, self).__init__()

        # Remembered so initialize() can leave pretrained weights alone.
        self._has_pretrained_embeddings = pretrained_embeddings is not None
        if self._has_pretrained_embeddings:
            print("Loading pretrained word embeddings")
            self.token_embedding = nn.Embedding.from_pretrained(
                torch.tensor(pretrained_embeddings, dtype=torch.float),
                freeze=freeze_embeddings
            )
        else:
            self.token_embedding = nn.Embedding(vocab_size, dim_input)

        self.dim_input = dim_input
        self.dim_hidden = dim_hidden
        self.dim_output = dim_output

        self.rnn_layer = RNNLayer(dim_input, dim_hidden, dim_output)
        self.softmax = nn.LogSoftmax(dim=-1)

    def initialize(self):
        """Xavier-initialise every weight matrix except pretrained embeddings."""
        for p in self.parameters():
            if self._has_pretrained_embeddings and p is self.token_embedding.weight:
                # Re-initialising would throw the pretrained vectors away.
                continue
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, input):
        """Compute per-token log-probabilities.

        Args:
            input: Integer tensor of token ids, shape ``(batch_size, seq_len)``.

        Returns:
            The recurrent layer's output with log-softmax applied over the last
            dimension. No aggregation over the sequence happens here, and
            padded positions are processed like any other token.
        """
        embedded = self.token_embedding(input)
        # Match the hidden state to the embeddings' device and dtype so the
        # model also works after .to(device) / .double().
        hidden = self.rnn_layer.init_hidden(input.size(0)).to(embedded)
        outputs = self.rnn_layer(embedded, hidden)
        return self.softmax(outputs)

    def init_hidden(self, batch_size):
        """Return the recurrent layer's initial hidden state for ``batch_size`` sequences."""
        return self.rnn_layer.init_hidden(batch_size)

In [18]:
len(w2v_vocab)

3000000

In [19]:
# Model sizes: the input width is the column count of the pretrained embedding matrix.
vocab_size = len(w2v_vocab)
dim_input = embeddings.shape[1]
dim_hidden = 300

In [20]:
# Two-class model on top of the frozen word2vec embeddings.
model = RNN(
    vocab_size=vocab_size,
    dim_input=dim_input,
    dim_hidden=dim_hidden,
    dim_output=2,
    pretrained_embeddings=embeddings,
    freeze_embeddings=True,
)

Loading pretrained word embeddings


# Training loop

Use the pretrained word embeddings from Part 1 as inputs; do not update them during training (they are “frozen”).

Design a simple recurrent neural network (RNN), taking the input word embeddings, and predicting a sentiment label for each sentence. To do that, you need to consider how to aggregate the word representations to represent a sentence.

Use the validation set to gauge the performance of the model for each epoch during training. You are required to use accuracy as the performance metric during validation and evaluation.

Use the mini-batch strategy during training. You may choose any preferred optimizer (e.g., SGD, Adagrad, Adam, RMSprop). Be careful when you choose your initial learning rate and mini-batch size. (You should use the validation set to determine the optimal configuration.) Train the model until the accuracy score on the validation set has not increased for a few epochs.

Evaluate your trained model on the test dataset, observing the accuracy score.

In [21]:
from torchmetrics.classification import BinaryAccuracy

In [23]:
# Training configuration.
# Accuracy metric for the binary sentiment labels.
metric = BinaryAccuracy()
# Learning rate handed to the optimizer in the next cell.
learning_rate = 0.001
# NOTE(review): very large epoch count — presumably an upper bound with early stopping; confirm against the training loop.
num_epochs = 10000
# NOTE(review): the units of the two intervals (steps vs. epochs) are not visible here — confirm.
metric_log_interval = 2000
eval_interval = 100

In [21]:
# Adam over all model parameters, using `learning_rate` from the configuration cell.
# NOTE(review): this cell carries execution count 21 although it follows In [23];
# it needs `learning_rate` from that cell, so run the cells in file order.
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=learning_rate)

In [22]:
model.train()
train_loss = 0