### For this lab session we are going to build a character-based model that learns to classify which language a word belongs to

### We have prepared a dataset (data.p) of roughly 8k words for each language (English, German, French, Bulgarian and Russian). If you are interested in how we created the dataset, take a look at the lab4_data notebook

### We will show how to implement Recurrent Neural Net (RNN) and Convolutional Neural Net (CNN) based models and compare them with Bag-of-Words

### Important Note: Make sure to Restart and Run All (Kernel -> Restart and Run All) every time you modify your network before training it: Jupyter Notebook keeps the network weights in memory and resumes training instead of starting from scratch again

In [1]:
# First, let's import the libraries that are going to be used in this lab session
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
# Seed every random number generator the notebook relies on so that runs are
# reproducible: the stdlib `random` module, NumPy, and PyTorch (whose generator
# drives parameter initialisation and DataLoader shuffling).
SEED = 134
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

PAD_IDX = 0  # vocabulary index reserved for the '<pad>' token
UNK_IDX = 1  # vocabulary index reserved for the '<unk>' token
BATCH_SIZE = 32  # number of examples per mini-batch

In [2]:
def build_vocab(data):
    """Build the character vocabulary from a list of samples.

    @param data: iterable of samples; ``sample[0]`` is the sequence of
        characters of one word (only ``sample[0]`` is read here)
    @return: tuple ``(char2id, id2char, max_len)`` where
        char2id - dict mapping each character (plus '<pad>' and '<unk>') to its index
        id2char - list where id2char[i] is the character with index i
        max_len - length of the longest word in ``data`` (0 if ``data`` is empty)
    """
    # default=0 keeps an empty dataset from raising inside max()
    max_len = max((len(word[0]) for word in data), default=0)

    # Sort the unique characters so the index assignment is identical on every
    # run; iterating a bare set of strings gives an order that can change
    # between interpreter sessions.
    unique_chars = sorted({char for word in data for char in word[0]})

    # Indices 0 and 1 are reserved for the special tokens, so real characters
    # start at 2. The literal layout of id2char assumes PAD_IDX == 0 and
    # UNK_IDX == 1.
    id2char = ['<pad>', '<unk>'] + unique_chars
    char2id = {char: idx for idx, char in enumerate(unique_chars, start=2)}
    char2id['<pad>'] = PAD_IDX
    char2id['<unk>'] = UNK_IDX

    return char2id, id2char, max_len

def convert_to_chars(data):
    """Split the first field of every sample into a list of its elements.

    @param data: iterable of samples indexed as ``sample[0]`` (the sequence
        to split, e.g. a word string) and ``sample[1]`` (kept unchanged)
    @return: new list of ``(list(sample[0]), sample[1])`` tuples
    """
    converted = []
    for sample in data:
        converted.append((list(sample[0]), sample[1]))
    return converted

def read_data():
    """Load the pickled dataset and prepare it for training.

    Reads ``data.p`` from the current working directory, takes its 'train',
    'valid' and 'test' entries, converts each split with convert_to_chars and
    builds the vocabulary from the training split only.

    @return: tuple ``(train_data, val_data, test_data, char2id, id2char, max_len)``
    """
    # NOTE: unpickling can execute arbitrary code; only load a data.p you trust.
    # The context manager guarantees the file handle is closed after loading.
    with open("data.p", "rb") as data_file:
        data = pkl.load(data_file)

    train_data = convert_to_chars(data['train'])
    val_data = convert_to_chars(data['valid'])
    test_data = convert_to_chars(data['test'])

    # The vocabulary (and maximum word length) comes from the training split only.
    char2id, id2char, max_len = build_vocab(train_data)
    return train_data, val_data, test_data, char2id, id2char, max_len


In [3]:
# Load the three splits together with the vocabulary returned by read_data().
(train_data, val_data, test_data,
 char2id, id2char, MAX_WORD_LENGTH) = read_data()

# Report basic statistics about the loaded data.
print("Maximum word length of dataset is {}".format(MAX_WORD_LENGTH))
print("Number of characters in dataset is {}".format(len(id2char)))
print("Characters:")
print(char2id.keys())

Maximum word length of dataset is 23
Number of characters in dataset is 63
Characters:
dict_keys(['l', 'm', 'n', 'q', 'ï', '2', 'p', '■', 'x', 'r', 'o', '1', 'œ', 'v', ',', 'à', '♪', '4', 'y', '.', 's', 'z', 'ë', 'h', 'î', '0', 'c', 'j', 'ј', 'ü', 'ѝ', 'ç', 'û', 'é', 'w', 'a', '<unk>', 'i', 'ô', '<pad>', 't', 'ö', 'ê', 'd', '♫', '9', '►', "'", 'э', 'ù', '5', 'e', 'ä', 'u', 'è', '8', 'â', 'b', '6', 'k', 'f', 'g', '3'])


### Now lets build the PyTorch DataLoader as we did in previous lab

In [4]:
class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_tuple, char2id):
        """
        @param data_tuple: iterable of (characters, target) pairs
        @param char2id: dict mapping a character to its vocabulary index
        """
        samples = list(data_tuple)
        if samples:
            # Transpose [(chars, target), ...] into (all chars), (all targets).
            self.data_list, self.target_list = zip(*samples)
        else:
            # zip(*[]) yields nothing to unpack, so handle an empty split explicitly.
            self.data_list, self.target_list = (), ()
        self.char2id = char2id

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: list ``[char_idx, length, label]`` where char_idx holds the
            vocabulary indices of the (truncated) word and length is len(char_idx)
        """
        # Truncate to MAX_WORD_LENGTH and map characters missing from the
        # vocabulary to UNK_IDX.
        word = self.data_list[key][:MAX_WORD_LENGTH]
        char_idx = [self.char2id.get(c, UNK_IDX) for c in word]
        label = self.target_list[key]
        return [char_idx, len(char_idx), label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that pads every sample in the batch to
    MAX_WORD_LENGTH with PAD_IDX and orders the batch by decreasing length.

    @param batch: list of ``[char_idx, length, label]`` items
    @return: list ``[data, lengths, labels]`` of tensors, all ordered by
        decreasing length; data has shape (len(batch), MAX_WORD_LENGTH)
    """
    length_list = [datum[1] for datum in batch]
    label_list = [datum[2] for datum in batch]

    # Pre-fill with the padding index and copy each word into the front of its
    # row. The explicit int64 dtype keeps the indices integral even when a
    # sample is empty (an empty list would otherwise become a float array).
    data = np.full((len(batch), MAX_WORD_LENGTH), PAD_IDX, dtype=np.int64)
    for row, datum in zip(data, batch):
        row[:datum[1]] = datum[0]

    # Order by decreasing length, as required before packing the sequences.
    ind_dec_order = np.argsort(length_list)[::-1]
    data = data[ind_dec_order]
    length_list = np.array(length_list)[ind_dec_order]
    label_list = np.array(label_list)[ind_dec_order]
    return [torch.from_numpy(data), torch.LongTensor(length_list), torch.LongTensor(label_list)]


In [5]:
# Build train, valid and test dataloaders

# Settings shared by all three loaders; only the shuffle flag differs per split.
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=vocab_collate_func)

train_dataset = VocabDataset(train_data, char2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)

val_dataset = VocabDataset(val_data, char2id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, shuffle=True, **loader_kwargs)

test_dataset = VocabDataset(test_data, char2id)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

### Now lets implement basic Recurrent Neural Net model

In [6]:
class RNN(nn.Module):
    """Character-level recurrent classifier.

    Embeds the character indices, runs them through a multi-layer nn.RNN,
    sums the RNN outputs over time and maps the result to class logits.
    """

    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_classes)

    def init_hidden(self, batch_size, device=None):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        # Zeros (rather than random noise) make repeated forward passes on the
        # same input produce the same output, e.g. during evaluation.
        # device: where to allocate the state (None uses the default device)
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)

        return hidden

    def forward(self, x, lengths):
        # x: (batch_size, seq_len) tensor of character indices, ordered by decreasing length
        # lengths: tensor holding the true length of each sequence in x
        # returns: (batch_size, num_classes) logits
        batch_size, _ = x.size()

        # reset hidden state, allocated on the same device as the input
        self.hidden = self.init_hidden(batch_size, device=x.device)

        # get embedding of characters
        embed = self.embedding(x)
        # pack padded sequence (the lengths are expected on the CPU)
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths.cpu(), batch_first=True)
        # fprop though RNN
        rnn_out, self.hidden = self.rnn(embed, self.hidden)
        # undo packing
        rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        # sum hidden activations of RNN across time
        rnn_out = torch.sum(rnn_out, dim=1)

        logits = self.linear(rnn_out)
        return logits


## Important things to keep in mind when using variable sized sequences in RNN in Pytorch

### RNN modules accept packed sequences as inputs
* The pack_padded_sequence function packs a tensor containing padded sequences of variable length. **IMPORTANT: the sequences should be sorted by length in decreasing order before being passed to this function**

* The pad_packed_sequence function is the inverse operation of pack_padded_sequence. It transforms a packed sequence back into a padded tensor (and also returns the length of each sequence)

In [7]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against;
            yields (data, lengths, labels) batches
    @param: model - model called as model(data, lengths), returning one row
            of class scores per example
    @return: accuracy in percent (raises ZeroDivisionError if the loader
             yields no examples)
    """
    correct = 0
    total = 0
    model.eval()
    # Evaluation needs no gradients; disabling them avoids building the
    # autograd graph for every validation batch.
    with torch.no_grad():
        for data, lengths, labels in loader:
            # The arg-max of the raw scores equals the arg-max of their
            # softmax, so the softmax is skipped.
            predicted = model(data, lengths).argmax(dim=1)

            total += labels.size(0)
            correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


# Hyperparameters
learning_rate = 3e-4
num_epochs = 10  # number of passes over the training set

# Model, loss and optimizer
model = RNN(emb_size=100, hidden_size=200, num_layers=2, num_classes=5, vocab_size=len(id2char))
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Number of mini-batches per epoch, used in the progress message
total_step = len(train_loader)

# Training loop
for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        data, lengths, labels = batch

        # test_model() leaves the model in eval mode, so re-enable training
        # mode at every step
        model.train()
        optimizer.zero_grad()

        # Forward pass, loss, backward pass and parameter update
        outputs = model(data, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Only validate every 100 iterations (and never on the very first one)
        if i == 0 or i % 100 != 0:
            continue
        val_acc = test_model(val_loader, model)
        print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                   epoch+1, num_epochs, i+1, total_step, val_acc))


Epoch: [1/10], Step: [101/924], Validation Acc: 58.46
Epoch: [1/10], Step: [201/924], Validation Acc: 61.64
Epoch: [1/10], Step: [301/924], Validation Acc: 65.98
Epoch: [1/10], Step: [401/924], Validation Acc: 67.02
Epoch: [1/10], Step: [501/924], Validation Acc: 68.86
Epoch: [1/10], Step: [601/924], Validation Acc: 69.32
Epoch: [1/10], Step: [701/924], Validation Acc: 70.64
Epoch: [1/10], Step: [801/924], Validation Acc: 70.76
Epoch: [1/10], Step: [901/924], Validation Acc: 72.48
Epoch: [2/10], Step: [101/924], Validation Acc: 72.74
Epoch: [2/10], Step: [201/924], Validation Acc: 73.84
Epoch: [2/10], Step: [301/924], Validation Acc: 74.44
Epoch: [2/10], Step: [401/924], Validation Acc: 75.14
Epoch: [2/10], Step: [501/924], Validation Acc: 74.84
Epoch: [2/10], Step: [601/924], Validation Acc: 75.28
Epoch: [2/10], Step: [701/924], Validation Acc: 75.84
Epoch: [2/10], Step: [801/924], Validation Acc: 76.24
Epoch: [2/10], Step: [901/924], Validation Acc: 76.54
Epoch: [3/10], Step: [101/92

## Exercise 1:
### Implement LSTM cell instead of RNN cell. Train the model and compare the results.
### Hint (modify init_hidden function and cell in __init__) 

## Exercise 2:
### Implement Bidirectional LSTM. You can do it very easily by adding one argument to cell when you create it.
### For better understanding, we recommend that you implement it yourself by reversing the sequence and passing it to another cell.

## Exercise 3:

### Add max-pooling (over time) after passing through the RNN, instead of summing the hidden activations over time

### Now lets implement basic Convolutional Neural Net model for text


In [11]:
class CNN(nn.Module):
    """Character-level convolutional classifier.

    Embeds the character indices, applies two Conv1d + ReLU layers
    (kernel size 3, padding 1), sums the activations over the sequence
    dimension and maps the result to class logits.
    """

    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # emb_size: Embedding Size
        # hidden_size: number of output channels of each convolution
        # num_layers: stored on the module; the network always has two conv layers
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super().__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)
        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        # x: (batch_size, seq_len) tensor of character indices
        # lengths: accepted for interface parity with RNN; not used here
        batch_size, seq_len = x.size()

        activations = self.embedding(x)
        for conv in (self.conv1, self.conv2):
            # Conv1d works on (batch, channels, length): transpose in and back out
            activations = conv(activations.transpose(1, 2)).transpose(1, 2)
            channels = activations.size(-1)
            # flatten to 2D for the ReLU, then restore (batch, length, channels)
            flat = activations.contiguous().view(-1, channels)
            activations = F.relu(flat).view(batch_size, seq_len, channels)

        # sum activations over the sequence dimension
        pooled = torch.sum(activations, dim=1)
        return self.linear(pooled)

## Important things to keep in mind when using Convolutional Nets for Language Tasks in Pytorch

### The Conv1d module expects input of size (batch_size, num_channels, length), whereas in our case the input has size (batch_size, length, num_channels). Hence it is important to call transpose(1,2) before passing it to the convolutional layer, and then to reshape it back to (batch_size, length, num_channels) by calling transpose(1,2) again

### Additionally, we reshape the hidden activations into a 2D tensor before passing them to the ReLU layer by calling view(-1, hidden.size(-1)). (ReLU is applied elementwise, so this reshape is not strictly required.)

In [12]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against;
            yields (data, lengths, labels) batches
    @param: model - model called as model(data, lengths), returning one row
            of class scores per example
    @return: accuracy in percent (raises ZeroDivisionError if the loader
             yields no examples)
    """
    correct = 0
    total = 0
    model.eval()
    # Evaluation needs no gradients; disabling them avoids building the
    # autograd graph for every validation batch.
    with torch.no_grad():
        for data, lengths, labels in loader:
            # The arg-max of the raw scores equals the arg-max of their
            # softmax, so the softmax is skipped.
            predicted = model(data, lengths).argmax(dim=1)

            total += labels.size(0)
            correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


# Hyperparameters
learning_rate = 3e-4
num_epochs = 10  # number of passes over the training set

# Model, loss and optimizer
model = CNN(emb_size=100, hidden_size=200, num_layers=2, num_classes=5, vocab_size=len(id2char))
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Number of mini-batches per epoch, used in the progress message
total_step = len(train_loader)

# Training loop
for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        data, lengths, labels = batch

        # test_model() leaves the model in eval mode, so re-enable training
        # mode at every step
        model.train()
        optimizer.zero_grad()

        # Forward pass, loss, backward pass and parameter update
        outputs = model(data, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Only validate every 100 iterations (and never on the very first one)
        if i == 0 or i % 100 != 0:
            continue
        val_acc = test_model(val_loader, model)
        print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                   epoch+1, num_epochs, i+1, total_step, val_acc))


Epoch: [1/10], Step: [101/924], Validation Acc: 62.3
Epoch: [1/10], Step: [201/924], Validation Acc: 69.82
Epoch: [1/10], Step: [301/924], Validation Acc: 72.36
Epoch: [1/10], Step: [401/924], Validation Acc: 74.48
Epoch: [1/10], Step: [501/924], Validation Acc: 75.9
Epoch: [1/10], Step: [601/924], Validation Acc: 76.76
Epoch: [1/10], Step: [701/924], Validation Acc: 74.64
Epoch: [1/10], Step: [801/924], Validation Acc: 78.46
Epoch: [1/10], Step: [901/924], Validation Acc: 77.56
Epoch: [2/10], Step: [101/924], Validation Acc: 79.22
Epoch: [2/10], Step: [201/924], Validation Acc: 80.24
Epoch: [2/10], Step: [301/924], Validation Acc: 80.04
Epoch: [2/10], Step: [401/924], Validation Acc: 79.96
Epoch: [2/10], Step: [501/924], Validation Acc: 80.34
Epoch: [2/10], Step: [601/924], Validation Acc: 78.84
Epoch: [2/10], Step: [701/924], Validation Acc: 80.48
Epoch: [2/10], Step: [801/924], Validation Acc: 82.04
Epoch: [2/10], Step: [901/924], Validation Acc: 81.24
Epoch: [3/10], Step: [101/924]

## Exercise 4:
### Implement Gated Relu activations as well as Gated Linear activations and compare them with Relu (reference: https://arxiv.org/pdf/1612.08083.pdf )
### Hint: Gated Relu activations are sigmoid(conv1_1(x)) * relu(conv1_2(x))
### Hint: Gated Linear activations are sigmoid(conv1_1(x)) * conv1_2(x)

### Feel free to play with other variants of gating


## Exercise 5:

### Add max-pooling (over time) after passing through conv as well as add non-linear fully connected layer

## Exercise 6:

### Use Bag-of-Words and Bag-of-NGrams model for this task and compare it with RNN and CNN

## Exercise 7:

### Use FastText for this task