In [1]:
from collections import Counter
import os
import random
import tarfile
import tempfile
import urllib.request

import nltk
nltk.download('punkt')

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import pickle
from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Sentiment Classification on Pretrained GloVe Embeddings

The following trains an RNN to classify sentiment of text, using a pretrained embedding layer. This code is largely the same as the example notebook given in class: https://colab.research.google.com/drive/14GAMb7c6FbDnhWvqcliCZ8KYNvqdnQz7?usp=sharing#scrollTo=uofVxyhMbVtx

Key changes are highlighted in the text.

**Here, we generate the embedding matrix by using GloVe embeddings for words from the sentiment dataset that were also in the GloVe vocabulary, while using random weights for the other words.**

In [2]:
! git clone https://github.com/ronakdm/nlp-hw3.git

fatal: destination path 'nlp-hw3' already exists and is not an empty directory.


In [16]:
def create_embedding_weights(token_to_idx):
    embeddings = pickle.load(open("nlp-hw3/embeddings.p", "rb")).numpy()
    pretrain_word_list = pickle.load(open("nlp-hw3/vocab_list.p", "rb"))

    pretrain_size, embed_dim = embeddings.shape

    vocab_size = len(token_to_idx)

    # Weight matrix of new embedding.
    weights = torch.rand(vocab_size, embed_dim)

    # Find out which of the GloVe words are in the IMDB vocabulary.
    # Find their indices, and replace the indices of the matrix with those embeddings.
    count = 0
    in_vocab_indices = []
    for i, word in enumerate(pretrain_word_list):
        if word in token_to_idx:
            idx = token_to_idx[word]
            weights[idx, :] = torch.tensor(embeddings[i, :])
            in_vocab_indices.append(idx)
            count += 1
    in_vocab_indices = torch.tensor(in_vocab_indices)

    print("%d word types from the GloVe embeddings were found in the %d size IMDB review vocabulary." % (count, vocab_size))

    return weights, in_vocab_indices

**Hyperparameters below. Note that these are picked to prioritize computational cheapness and performance can definitely be increased with a larger network and more epochs of training.**

In [5]:
MAX_SEQ_LEN = -1  # -1 for no truncation
UNK_THRESHOLD = 5
BATCH_SIZE = 64
N_EPOCHS = 3
LEARNING_RATE = 1e-3
HIDDEN_DIM = 128
N_RNN_LAYERS = 2

PAD = "@@PAD@@"
UNK = "@@UNK@@"

In [6]:
def seed_everything(seed=1):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

In [7]:
def download_data():
    """
    A function to download and uncompress the imdb data. You don't have to understand anything here.
    """

    def extract_data(dir, split):
        data = []
        for label in ("pos", "neg"):
            label_dir = os.path.join(dir, "aclImdb", split, label)
            files = sorted(os.listdir(label_dir))
            for file in files:
                filepath = os.path.join(label_dir, file)
                with open(filepath, encoding="UTF-8") as f:
                    data.append({"raw": f.read(), "label": label})
        return data

    url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    stream = urllib.request.urlopen(url)
    tar = tarfile.open(fileobj=stream, mode="r|gz")
    with tempfile.TemporaryDirectory() as td:
        tar.extractall(path=td)
        train_data = extract_data(td, "train")
        test_data = extract_data(td, "test")
        return train_data, test_data


def split_data(train_data, num_split=2000):
    """Splits the training data into training and development sets."""
    random.shuffle(train_data)
    return train_data[:-num_split], train_data[-num_split:]

In [8]:
def tokenize(data, max_seq_len=MAX_SEQ_LEN):
    """
    Here we use nltk to tokenize data. There are many othe possibilities. We also truncate the
    sequences so that the training time and memory is more manageable. You can think of truncation
    as making a decision only looking at the first X words.
    """
    for example in data:
        example["text"] = []
        for sent in nltk.sent_tokenize(example["raw"]):
            example["text"].extend(nltk.word_tokenize(sent))
        if max_seq_len >= 0:
            example["text"] = example["text"][:max_seq_len]


def create_vocab(data, unk_threshold=UNK_THRESHOLD):
    """
    Creates a vocabulary with tokens that have frequency above unk_threshold and assigns each token
    a unique index, including the special tokens.
    """
    counter = Counter(token for example in data for token in example["text"])
    vocab = {token for token in counter if counter[token] > unk_threshold}
    print(f"Vocab size: {len(vocab) + 2}")  # add the special tokens
    print(f"Most common tokens: {counter.most_common(10)}")
    token_to_idx = {PAD: 0, UNK: 1}
    for token in vocab:
        token_to_idx[token] = len(token_to_idx)
    return token_to_idx


def apply_vocab(data, token_to_idx):
    """
    Applies the vocabulary to the data and maps the tokenized sentences to vocab indices as the
    model input.
    """
    for example in data:
        example["text"] = [token_to_idx.get(token, token_to_idx[UNK]) for token in example["text"]]


def apply_label_map(data, label_to_idx):
    """Converts string labels to indices."""
    for example in data:
        example["label"] = label_to_idx[example["label"]]

In [9]:
class SentimentDataset(Dataset):
    def __init__(self, data, pad_idx):
        data = sorted(data, key=lambda example: len(example["text"]))
        self.texts = [example["text"] for example in data]
        self.labels = [example["label"] for example in data]
        self.pad_idx = pad_idx

    def __getitem__(self, index):
        return [self.texts[index], self.labels[index]]

    def __len__(self):
        return len(self.texts)

    def collate_fn(self, batch):
        def tensorize(elements, dtype):
            return [torch.tensor(element, dtype=dtype) for element in elements]

        def pad(tensors):
            """Assumes 1-d tensors."""
            max_len = max(len(tensor) for tensor in tensors)
            padded_tensors = [
                F.pad(tensor, (0, max_len - len(tensor)), value=self.pad_idx) for tensor in tensors
            ]
            return padded_tensors

        texts, labels = zip(*batch)
        return [
            torch.stack(pad(tensorize(texts, torch.long)), dim=0),
            torch.stack(tensorize(labels, torch.long), dim=0),
        ]

**The `SequenceClassifier` now takes `weights` upon initialization instead of an embedding dimension.**.

In [10]:
class SequenceClassifier(nn.Module):
    def __init__(self, vocab_size, weights, hidden_dim, n_labels, n_rnn_layers, pad_idx):
        super().__init__()

        self.pad_idx = pad_idx

        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)

        embedding_dim = weights.shape[1]
        
        self.rnn = nn.GRU(
            embedding_dim, hidden_dim, num_layers=n_rnn_layers, batch_first=True, bidirectional=True
        )
        # We take the final hidden state at all GRU layers as the sequence representation.
        # 2 because bidirectional.
        layered_hidden_dim = hidden_dim * n_rnn_layers * 2
        self.output = nn.Linear(layered_hidden_dim, n_labels)

    def forward(self, text):
        # text shape: (batch_size, max_seq_len) where max_seq_len is the max length *in this batch*
        # lens shape: (batch_size,)
        non_padded_positions = text != self.pad_idx
        lens = non_padded_positions.sum(dim=1)

        # embedded shape: (batch_size, max_seq_len, embedding_dim)
        embedded = self.embedding(text)
        # You can pass the embeddings directly to the RNN, but as the input potentially has
        # different lengths, how do you know when to stop unrolling the recurrence for each example?
        # pytorch provides a util function pack_padded_sequence that converts padded sequences with
        # potentially different lengths into a special PackedSequence object that keeps track of
        # these things. When passing a PackedSequence object into the RNN, the output will be a
        # PackedSequence too (but not the hidden state as that always has a length of 1). Since we
        # do not use the per-token output, we do not unpack it. But if you need it, e.g. for
        # token-level classification such as POS tagging, you can use pad_packed_sequence to convert
        # it back to a regular tensor.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lens.cpu(), batch_first=True, enforce_sorted=False
        )
        # nn.GRU produces two outputs: one is the per-token output and the other is per-sequence.
        # The pers-sequence output is simiar to the last per-token output, except that it is taken
        # at all layers.
        # output (after unpacking) shape: (batch_size, max_seq_len, hidden_dim)
        # hidden shape: (n_layers * n_directions, batch_size, hidden_dim)
        packed_output, hidden = self.rnn(packed_embedded)
        # shape: (batch_size, n_layers * n_directions * hidden_dim)
        hidden = hidden.transpose(0, 1).reshape(hidden.shape[1], -1)
        # Here we directly output the raw scores without softmax normalization which would produce
        # a valid probability distribution. This is because:
        # (1) during training, pytorch provides a loss function "F.cross_entropy" that combines
        # "log_softmax + F.nll_loss" in one step. See the `train` function below.
        # (2) during evaluation, we usually only care about the class with the highest score, but
        # not the actual probablity distribution.
        # shape: (batch_size, n_labels)
        return self.output(hidden)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

**Because the computational requirements of this model are not too prohibitive, we employ a hack to "freeze" the embeddings that were taken from the GloVe vocabulary. After backward propagation, we zero out the gradients of those weights, so the optimizer makes no step in those dimensions. This is if a tensor indicating which indices are in the GloVe vocabulary is passed to `train`. Additionally, `evaluate` outputs Macro F1 Score as well.**

In [19]:
def train(model, dataloader, optimizer, device, in_vocab_indices = None):
    for texts, labels in tqdm(dataloader):
        texts, labels = texts.to(device), labels.to(device)
        output = model(texts)
        loss = F.cross_entropy(output, labels)
        model.zero_grad()
        loss.backward()

        if in_vocab_indices is not None:
            # We would like to freeze the embeddings of the pretrained vectors.
            model.embedding.weight.grad[in_vocab_indices] = 0
        optimizer.step()


def evaluate(model, dataloader, device):
    count = correct = 0.0
    with torch.no_grad():
        for texts, labels in tqdm(dataloader):
            texts, labels = texts.to(device), labels.to(device)
            # shape: (batch_size, n_labels)
            output = model(texts)
            # shape: (batch_size,)
            predicted = output.argmax(dim=-1)
            count += len(predicted)
            correct += (predicted == labels).sum().item()
    print(f"Accuracy: {correct / count}")
    print(f"Macro F1 Score: {f1_score(predicted.cpu(), labels.cpu())}")

In [14]:
seed_everything()

print("Downloading data")
train_data, test_data = download_data()
train_data, dev_data = split_data(train_data)
print(f"Data sample: {train_data[:3]}")
print(f"train {len(train_data)}, dev {len(dev_data)}, test {len(test_data)}")

print("Processing data")
for data in (train_data, dev_data, test_data):
    tokenize(data)

# Here we only use the training data to create the vocabulary because
# (1) we shouldn't look at the test set; and
# (2) we want the dev set to accurately reflect the test set performance.
# There are people who do other things.
token_to_idx = create_vocab(train_data)
label_to_idx = {"neg": 0, "pos": 1}
for data in (train_data, dev_data, test_data):
    apply_vocab(data, token_to_idx)
    apply_label_map(data, label_to_idx)

pad_idx = token_to_idx[PAD]
train_dataset = SentimentDataset(train_data, pad_idx)
dev_dataset = SentimentDataset(dev_data, pad_idx)
test_dataset = SentimentDataset(test_data, pad_idx)
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate_fn
)
dev_dataloader = DataLoader(
    dev_dataset, batch_size=BATCH_SIZE, collate_fn=dev_dataset.collate_fn
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, collate_fn=test_dataset.collate_fn
)

Downloading data
Data sample: [{'raw': "This film is quite boring. There are snippets of naked flesh tossed around in a lame attempt to keep the viewer awake but they don't succeed.<br /><br />The best thing about the movie is Lena Olin--she does a masterful job handling her character, but Day-Lewis garbles most of his lines.<br /><br />Kaufman clearly had no idea how to film this. The incongruities in bouncing between domestic household/marriage issues and political crises are badly matched. Character attitudes change without explanation throughout. Badly disjointed.", 'label': 'neg'}, {'raw': 'How can you tell that a horror movie is terrible? when you can\'t stop laughing about it of course! The plot has been well covered by other reviewers, so I\'ll just add a few things on the hilarity of it all.<br /><br />Some reviews have placed the location in South America, others in Africa, I thought it was in some random island in the Pacific. Where exactly does this take place, seems to be 

In [17]:
weights, in_vocab_indices = create_embedding_weights(token_to_idx)

model = SequenceClassifier(
    len(token_to_idx), weights, HIDDEN_DIM, len(label_to_idx), N_RNN_LAYERS, pad_idx
)
print(f"Model has {count_parameters(model)} parameters.")

# Adam is just a fancier version of SGD.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

18741 word types from the GloVe embeddings were found in the 30595 size IMDB review vocabulary.
Model has 1965464 parameters.


SequenceClassifier(
  (embedding): Embedding(30595, 50)
  (rnn): GRU(50, 128, num_layers=2, batch_first=True, bidirectional=True)
  (output): Linear(in_features=512, out_features=2, bias=True)
)

**Full training loop.**

In [20]:
try:
  print(f"Random baseline")
  evaluate(model, dev_dataloader, device)
  for epoch in range(N_EPOCHS):
      print(f"Epoch {epoch + 1}")  # 0-based -> 1-based
      train(model, train_dataloader, optimizer, device, in_vocab_indices = in_vocab_indices)
      evaluate(model, dev_dataloader, device)
  print(f"Test set performance")
  evaluate(model, test_dataloader, device)
except KeyboardInterrupt:
  print('Graceful Exit')

 25%|██▌       | 8/32 [00:00<00:00, 75.97it/s]

Random baseline


100%|██████████| 32/32 [00:00<00:00, 36.00it/s]
  0%|          | 1/360 [00:00<00:44,  8.00it/s]

Accuracy: 0.491
Macro F1 Score: 0.26666666666666666
Epoch 1


100%|██████████| 360/360 [00:58<00:00,  6.14it/s]
100%|██████████| 32/32 [00:00<00:00, 37.11it/s]
  0%|          | 1/360 [00:00<01:02,  5.79it/s]

Accuracy: 0.8275
Macro F1 Score: 0.888888888888889
Epoch 2


100%|██████████| 360/360 [00:59<00:00,  6.07it/s]
100%|██████████| 32/32 [00:00<00:00, 37.27it/s]
  0%|          | 0/360 [00:00<?, ?it/s]

Accuracy: 0.8705
Macro F1 Score: 0.888888888888889
Epoch 3


100%|██████████| 360/360 [00:59<00:00,  6.03it/s]
100%|██████████| 32/32 [00:00<00:00, 36.16it/s]
  4%|▍         | 16/391 [00:00<00:02, 152.54it/s]

Accuracy: 0.8835
Macro F1 Score: 0.9411764705882353
Test set performance


100%|██████████| 391/391 [00:08<00:00, 44.37it/s]

Accuracy: 0.8588
Macro F1 Score: 0.85



