In [7]:
import inspect

# README

### How to run the code

```bash
conda env create -f nlp_gpu.yml
conda activate nlp_gpu
```

Example training command:
```bash
python train.py --model TreeLSTM --word_embeddings glove --trainable_embeddings --supervise_nodes --batch_size 128 --patience 10 --max_epochs 100
```

Results are saved in the `results` folder in JSON format.

### Analysis

Our results can be found in the `results` folder.
To run the analysis, use the `analysis.ipynb` notebook; it generates the plots and tables in the report.

# Data Handling

In [6]:
import data
# Show the source code of the imported `data` module as plain text.
print(inspect.getsource(data))

import requests
import os
import zipfile
import re
from collections import Counter, OrderedDict, namedtuple
from nltk import Tree
from torch.utils.data import Dataset


class OrderedCounter(Counter, OrderedDict):
    """A Counter whose keys keep the order in which they were first seen.

    Counting comes from ``Counter`` and ordering from ``OrderedDict``; only
    the repr and the pickling hook are customised here.
    """

    def __repr__(self):
        # Render the contents as an OrderedDict so the ordering is visible.
        ordered_items = OrderedDict(self)
        return f"{self.__class__.__name__}({ordered_items!r})"

    def __reduce__(self):
        # (callable, args) pair: rebuild the instance from an OrderedDict copy.
        return (self.__class__, (OrderedDict(self),))


class Vocabulary:
    """A vocabulary, assigns IDs to tokens"""

    def __init__(self):
        # Token counts and the token <-> id lookup tables all start out empty.
        self.freqs = OrderedCounter()
        self.w2i = {}
        self.i2w = []
        # Sentiment label names; a label's position in this list is its id.
        self.i2t = [
            "very negative",
            "negative",
            "neutral",
            "positive",
            "very positive",
        ]
        # Inverse lookup: label name -> id, in the same order as i2t.
        self.t2i = OrderedDict((label, idx) for idx, label in enumerate(self.i2t))

    def count_token(self, t):
        self.freqs[t] += 1

    def add_token(self, t):
        self.w2i[t] = len(self.w2i)
        self.i2w.append(t)

    def build(self, min_freq=0):
        """

# Models

## Bag of Words Models

In [8]:
from models.bow import BOW, CBOW, Deep_CBOW, PTDeepCBOW
# Show the source of each bag-of-words model, one after another.
print(
    "\n".join(
        inspect.getsource(model_cls)
        for model_cls in (BOW, CBOW, Deep_CBOW, PTDeepCBOW)
    )
)

class BOW(nn.Module):
    """Bag-of-words model: sums the word embeddings and adds a bias.

    The summed embedding is returned directly as the logits, so
    ``embedding_dim`` is also the size of the model output.
    """

    def __init__(self, vocab_size, embedding_dim, vocab):
        super().__init__()
        self.vocab = vocab

        # Trainable lookup table with one vector per word id.
        self.embed = nn.Embedding(vocab_size, embedding_dim)

        # Trainable bias, initialised to zeros, one entry per embedding dimension.
        self.bias = nn.Parameter(torch.zeros(embedding_dim), requires_grad=True)

    def forward(self, inputs):
        """Return the embeddings of ``inputs`` summed over dimension 1, plus the bias."""
        # Look up one embedding per word id in the input.
        word_vectors = self.embed(inputs)

        # Collapse dimension 1 (described in the original as the time dimension).
        summed = torch.sum(word_vectors, dim=1)

        return summed + self.bias

class CBOW(nn.Module):
    """A continuous bag-of-words model"""

    def __init__(self, v

## LSTM Models

### LSTM

In [10]:
from models.lstm import LSTMClassifier, MyLSTMCell
# Show the source of the LSTM classifier and of the hand-written LSTM cell.
print(
    "\n".join(
        inspect.getsource(lstm_cls) for lstm_cls in (LSTMClassifier, MyLSTMCell)
    )
)

class LSTMClassifier(nn.Module):
    """Encodes sentence with an LSTM and projects final hidden state"""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, vocab):
        """Build the embedding table, the LSTM cell and the output projection."""
        super(LSTMClassifier, self).__init__()
        self.vocab = vocab
        self.hidden_dim = hidden_dim

        # Word-id lookup table; index 1 is registered as the padding index.
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
        self.rnn = MyLSTMCell(embedding_dim, hidden_dim)

        # Classifier head: dropout (p=0.5) followed by a linear projection.
        dropout = nn.Dropout(p=0.5)
        projection = nn.Linear(hidden_dim, output_dim)
        self.output_layer = nn.Sequential(dropout, projection)

    def forward(self, x):

        B = x.size(0)  # batch size (this is 1 for now, i.e. 1 single example)
        T = x.size(1)  # timesteps (the number of words in the sentence)

        input_ = self.embed(x)

        # here we create initial hidden states containing zeros
        # we use a trick here so that, if input is on the GPU, then so are hx and cx
        hx = input_.new_zeros(B, self.rnn.hidden_size)
        cx = 

### TreeLSTM N-ary

In [11]:
from models.lstm import TreeLSTMCell, TreeLSTM, TreeLSTMClassifier
# Show the TreeLSTM sources from the outermost wrapper down to the cell.
print(
    "\n".join(
        inspect.getsource(tree_cls)
        for tree_cls in (TreeLSTMClassifier, TreeLSTM, TreeLSTMCell)
    )
)

class TreeLSTMClassifier(nn.Module):
    """Encodes sentence with a TreeLSTM and projects final hidden state"""

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        vocab,
        reduce_fn=TreeLSTMCell,
    ):
        """Build the embedding table, the TreeLSTM encoder and the output head.

        ``reduce_fn`` is forwarded unchanged to ``TreeLSTM``.
        """
        super(TreeLSTMClassifier, self).__init__()
        self.vocab = vocab
        self.hidden_dim = hidden_dim

        # Word-id lookup table; index 1 is registered as the padding index.
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
        self.treelstm = TreeLSTM(embedding_dim, hidden_dim, reduce_fn=reduce_fn)

        # Classifier head: dropout (p=0.5), then a linear layer with a bias term.
        dropout = nn.Dropout(p=0.5)
        projection = nn.Linear(hidden_dim, output_dim, bias=True)
        self.output_layer = nn.Sequential(dropout, projection)

    def forward(self, x):

        # x is a pair here of words and transitions; we unpack it here.
        # x is batch-major: [B, T], transitions is time major [2T-1, B]
        x, transitions = x
        emb = self.embed(x)

        # we use the root/top state of the Tree LSTM to classify the sentence
        roo

### TreeLSTM Child-Sum

In [12]:
from models.lstm import ChildSumTreeLSTMCell
# Show the source code of the child-sum Tree LSTM cell as plain text.
print(inspect.getsource(ChildSumTreeLSTMCell))

class ChildSumTreeLSTMCell(nn.Module):
    """A child-sum Tree LSTM cell"""

    def __init__(self, input_size, hidden_size, bias=True):
        """Creates the weights for this LSTM"""
        super(ChildSumTreeLSTMCell, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        self.reduce_layer_1 = nn.Linear(hidden_size, 3 * hidden_size)
        self.reduce_layer_forget = nn.Linear(hidden_size, hidden_size)
        self.dropout_layer = nn.Dropout(p=0.25)

        self.reset_parameters()

    def reset_parameters(self):
        """This is PyTorch's default initialization method"""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, hx_l, hx_r, mask=None):
        """
        hx_l is ((batch, hidden_size), (batch, hidden_size))
        hx_r is ((batch, hidden_size), (batch, hidden_size))
        """
        prev_

# Utilities

In [14]:
from utils import (
    train_model,
    evaluate_metrics_extended_batch,
    print_parameters,
    get_minibatch,
    prepare_minibatch,
    prepare_treelstm_minibatch,
    pad,
    batch,
    unbatch,
    download_file,
    load_pretrained_embeddings,
    create_vocabulary_and_embeddings,
)

# Only the two most important utilities are printed: the training loop and the evaluation routine.
print(
    "\n".join(
        inspect.getsource(util_fn)
        for util_fn in (train_model, evaluate_metrics_extended_batch)
    )
)

def train_model(
    model,
    optimizer,
    train_data,
    dev_data,
    test_data,
    device,
    max_epochs=100,
    eval_every=1,
    batch_fn=None,
    prep_fn=None,
    eval_fn=None,
    batch_size=None,
    patience=5,
):

    criterion = nn.CrossEntropyLoss()
    train_epoch_losses = []
    val_epoch_metrics = []

    # early stopping
    best_val_accuracy = float("-inf")
    best_epoch = 0
    patience_counter = 0
    best_model = None
    try:
        for epoch in range(max_epochs):
            train_loss = 0
            num_batches = 0
            for batch in tqdm(
                batch_fn(train_data.data, batch_size=batch_size, shuffle=True)
            ):
                num_batches += 1

                # Forward pass
                model.train()
                x, targets = prep_fn(batch, model.vocab, device=device)

                logits = model(x)

                B = targets.size(0)  # Batch size

                # Compute cross-entropy loss
                los

# Main (Training Pipeline)

In [15]:
from train import main
# Show the source code of the training entry point `main` as plain text.
print(inspect.getsource(main))

def main(args):

    seeds = [1, 42, 1337]

    # load data
    train_dataset = SentimentDataset(
        split="train", lower=False, supervise_nodes=args.supervise_nodes
    )
    dev_dataset = SentimentDataset(
        split="dev", lower=False, supervise_nodes=False
    )  # keep dev and test as is
    test_dataset = SentimentDataset(split="test", lower=False, supervise_nodes=False)

    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)

    train_epoch_losses_list = []
    val_epoch_metrics_list = []
    test_metrics_list = []
    binned_metrics_list = []
    max_epochs = []
    for seed in seeds:

        # Set random seeds for reproducibility
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # initialize model
        if args.model == "BOW":
            vocab_size = len(train_dataset.vocab.w2i)
            n_classes = len(train_dataset.vocab.t2i)
            model = bow.BOW(vocab_s