# Model Training - Different Models

In this notebook, I will experiment with different types of models, and see if they show improvement over the basic model.

First, let's bring in all the code from the previous notebook as the starter code:

In [None]:
import pandas as pd;
import numpy as np;

import torch;
from torch import nn;
from torch.utils.data import Dataset, DataLoader;
import torch.nn.functional as F;
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score;

import math;
from numpy import save, load;
import keras_sequence_preprocessing as seq_preprocessing;
import keras_text_preprocessing as text_preprocessing;

import matplotlib.pyplot as plt;

import time;

from PyTorchTools import EarlyStopping;

quora_train_text = pd.read_csv('data/augmented_quora_text.txt');
quora_train_text = quora_train_text.dropna()

embed_size = 300;

# FASTTEXT Embeddings

embeddings_dict = {};
with open('../Embeddings/crawl-%dd-2M.vec'%(embed_size), 'rb') as f:
    for line in f:
        splits = line.split();
        word = splits[0];
        vec = np.asarray(splits[1:], dtype='float32')
        
        embeddings_dict[word.decode()] = vec;
        
word_index = {};

token_num = 0;
for row in quora_train_text[['cleaned_text', 'target']].iterrows():
    text, label = row[1]
    
    tokens = [token for token in text.split(' ')];
    
    for token in tokens:
        if token not in word_index:
            word_index[token] = token_num;
            token_num = token_num + 1;

MAX_WORDS = 200000
MAX_LEN = 70

def encode_sentences(sentence, word_index=word_index, max_words=MAX_WORDS):
    output = [];
    for token in sentence.split(' '):
        if (token in word_index) and (word_index[token] < max_words):
            output.append(word_index[token]);
    return output;

encoded_sentences = [encode_sentences(sent) for sent in quora_train_text['cleaned_text']]
encoded_lengths = [len(x) for x in encoded_sentences]
padded_sequences = seq_preprocessing.pad_sequences(encoded_sentences, maxlen=MAX_LEN, padding='post', truncating='post');

val_split = int(0.85 * len(quora_train_text));

train_ds = padded_sequences[:val_split];
val_ds = padded_sequences[val_split:];

train_y = quora_train_text.iloc[:val_split]['target'].values;
val_y = quora_train_text.iloc[val_split:]['target'].values;

train_lens = encoded_lengths[:val_split];
val_lens = encoded_lengths[val_split:];

len(train_ds), len(val_ds)

vocab_size = min(MAX_WORDS, len(word_index))+1;
embeddings_matrix = np.zeros((vocab_size, embed_size));

for word, posit in word_index.items():
    if posit >= vocab_size:
        break;
        
    vec = embeddings_dict.get(word);
    if vec is None:
        vec = np.random.sample(embed_size);
        embeddings_dict[word] = vec;
    
    embeddings_matrix[posit] = vec;
    
class QuoraDataset(Dataset):
    def __init__(self, encoded_sentences, labels, lengths):
        self.encoded_sentences = encoded_sentences;
        self.labels = labels;
        self.lengths = lengths;
        
    def __len__(self):
        return len(self.encoded_sentences);
    
    def __getitem__(self, index):
        x = self.encoded_sentences[index, :];
        x = torch.LongTensor(x);
        
        y = self.labels[index];
        y = torch.Tensor([y]);
        
        length = self.lengths[index];
        length = torch.Tensor([length]);
        
        return x, y, length;
    
train_dataset = QuoraDataset(train_ds, train_y, train_lens);
val_dataset = QuoraDataset(val_ds, val_y, val_lens);

batch_size = 512;

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True);
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True);

def train(nn_model, nn_optimizer, nn_criterion, data_loader, val_loader = None, num_epochs = 5, print_ratio = 0.1, verbose=True):
    
    print_every_step = int(print_ratio * len(train_loader));
    
    if verbose:
        print('Training with model: ');
        print(nn_model);
    
    for epoch in range(num_epochs):

        epoch_time = time.time();    

        f1_scores_train = []

        # Enable Training for the model
        nn_model.train()
        running_loss = 0;

        all_ys = torch.tensor(data=[]).to(device);
        all_preds = torch.tensor(data=[]).to(device);

        for ite, (x, y, l) in enumerate(data_loader):
            init_time = time.time();

            # Convert our tensors to GPU tensors
            x = x.cuda()
            y = y.cuda()

            # Clear gradients
            nn_optimizer.zero_grad()

            # Forward Propagation and compute predictions
            preds = nn_model.forward(x, l)

            # Compute loss against actual values
            loss = nn_criterion(preds, y)

            # Add predictions and actuals into larger list for scoring
            all_preds = torch.cat([all_preds, preds]);
            all_ys = torch.cat([all_ys, y]);

            # Back Propagation and Updating weights
            loss.backward()
            nn_optimizer.step()

            running_loss = running_loss + loss.item();

            if ite % print_every_step == print_every_step-1:
                
                # Compute Sigmoid Activation and Prediction Probabilities
                preds_sigmoid = torch.sigmoid(all_preds).cpu().detach().numpy();
                
                # Compute Predictions over the Sigmoid base line
                all_preds = (preds_sigmoid > 0.5).astype(int);

                # Compute Metrics
                all_ys = all_ys.detach().cpu().numpy();

                f_score = f1_score(all_ys, all_preds);
                precision = precision_score(all_ys, all_preds);
                recall = recall_score(all_ys, all_preds);
                accuracy = accuracy_score(all_ys, all_preds);

                print('\t[%d %5d %.2f sec] loss: %.3f acc: %.3f prec: %.3f rec: %.3f f1: %.3f'%(epoch+1, ite+1, time.time() - init_time, running_loss / 2000, accuracy, precision, recall, f_score))

                all_ys = torch.tensor(data=[]).to(device);
                all_preds = torch.tensor(data=[]).to(device);
        
        print('Epoch %d done in %.2f min'%(epoch+1, (time.time() - epoch_time)/60 ));

        if val_loader is not None:
            eval(nn_model, nn_criterion, val_loader);
        
        running_loss = 0.0;
        
def eval(nn_model, nn_criterion, data_loader):

    # Disable weight updates
    with torch.no_grad():

        # Enable Model Evaluation
        nn_model.eval()
        running_loss = 0;
        
        all_ys = torch.tensor(data=[]).to(device);
        all_preds = torch.tensor(data=[]).to(device);

        init_time = time.time();

        for ite, (x, y, l) in enumerate(data_loader):

            # Convert tensors to GPU tensors
            x = x.cuda()
            y = y.cuda()

            # Forward propagation to compute predictions
            preds = nn_model.forward(x, l)

            # Compute loss on these predictions
            loss = nn_criterion(preds, y)

            all_preds = torch.cat([all_preds, preds]);
            all_ys = torch.cat([all_ys, y]);

            running_loss = running_loss + loss.item();

        # Compute Sigmoid activation on the predictions, and derive predictions over the Sigmoid base line
        preds_sigmoid = torch.sigmoid(all_preds).cpu().detach().numpy();
        all_preds = (preds_sigmoid > 0.5).astype(int);

        # Compute metrics
        all_ys = all_ys.detach().cpu().numpy();
        f_score = f1_score(all_ys, all_preds);

        precision = precision_score(all_ys, all_preds);
        recall = recall_score(all_ys, all_preds);
        accuracy = accuracy_score(all_ys, all_preds);

        print('\tEVAL: [%5d %.2f sec] loss: %.3f acc: %.3f prec: %.3f rec: %.3f f1: %.3f'%(ite+1, time.time() - init_time, running_loss / 2000, accuracy, precision, recall, f_score))

See the Notebook "Model Training - Basic Model" for comments and documentation on the starter code.

## Model Training with 128 Hidden Units in the first LSTM Layer:

In [108]:
model = Model(embeddings_tensor, 128);
model = model.to(device);

train(model, optimizer, criterion, train_loader)

Training with model: 
Model(
  (embedding_layer): Embedding(100001, 300)
  (lstm_1): LSTM(300, 128, bidirectional=True)
  (fc_1): Linear(in_features=256, out_features=256, bias=True)
  (lstm_2): LSTM(256, 128, bidirectional=True)
  (fc_2): Linear(in_features=512, out_features=1, bias=True)
)
	[1   356 0.16 sec] loss: 0.037 acc: 0.938 prec: 0.540 rec: 0.013 f1: 0.025
	[1   712 0.16 sec] loss: 0.063 acc: 0.945 prec: 0.634 rec: 0.290 f1: 0.398
	[1  1068 0.16 sec] loss: 0.087 acc: 0.948 prec: 0.631 rec: 0.349 f1: 0.449
	[1  1424 0.16 sec] loss: 0.111 acc: 0.950 prec: 0.659 rec: 0.382 f1: 0.484
	[1  1780 0.16 sec] loss: 0.134 acc: 0.950 prec: 0.657 rec: 0.398 f1: 0.495
	[1  2136 0.17 sec] loss: 0.156 acc: 0.951 prec: 0.674 rec: 0.428 f1: 0.524
	[1  2492 0.16 sec] loss: 0.178 acc: 0.951 prec: 0.669 rec: 0.422 f1: 0.517
	[1  2848 0.16 sec] loss: 0.199 acc: 0.953 prec: 0.672 rec: 0.419 f1: 0.516
	[1  3204 0.16 sec] loss: 0.221 acc: 0.952 prec: 0.682 rec: 0.435 f1: 0.531
	[1  3560 0.16 sec] los

In [10]:
eval(model, optimizer, val_loader)

	EVAL: [  764 18.49 sec] loss: 0.036 acc: 0.966 prec: 0.634 rec: 0.512 f1: 0.567


In [None]:
print('\tEVAL: [  764 18.49 sec] loss: 0.046 acc: 0.953 prec: 0.634 rec: 0.512 f1: 0.567')

### With 128 Hidden Units in first LSTM Layer: F1 score over 5 epochs:

The model shows slight improvement over a model with 64 hidden units.

##### Training: 0.625
##### Validation: 0.567

In [109]:
train(model, optimizer, criterion, train_loader)

Training with model: 
Model(
  (embedding_layer): Embedding(100001, 300)
  (lstm_1): LSTM(300, 128, bidirectional=True)
  (fc_1): Linear(in_features=256, out_features=256, bias=True)
  (lstm_2): LSTM(256, 128, bidirectional=True)
  (fc_2): Linear(in_features=512, out_features=1, bias=True)
)
	[1   356 0.16 sec] loss: 0.017 acc: 0.962 prec: 0.732 rec: 0.575 f1: 0.644
	[1   712 0.16 sec] loss: 0.034 acc: 0.961 prec: 0.728 rec: 0.569 f1: 0.639
	[1  1068 0.16 sec] loss: 0.052 acc: 0.961 prec: 0.734 rec: 0.585 f1: 0.651
	[1  1424 0.16 sec] loss: 0.069 acc: 0.960 prec: 0.729 rec: 0.570 f1: 0.640
	[1  1780 0.16 sec] loss: 0.087 acc: 0.961 prec: 0.729 rec: 0.579 f1: 0.646
	[1  2136 0.16 sec] loss: 0.104 acc: 0.961 prec: 0.732 rec: 0.584 f1: 0.650
	[1  2492 0.16 sec] loss: 0.122 acc: 0.961 prec: 0.735 rec: 0.576 f1: 0.646
	[1  2848 0.16 sec] loss: 0.139 acc: 0.961 prec: 0.737 rec: 0.590 f1: 0.656
	[1  3204 0.16 sec] loss: 0.157 acc: 0.960 prec: 0.726 rec: 0.574 f1: 0.641
	[1  3560 0.16 sec] los

In [11]:
eval(model, optimizer, val_loader)

	EVAL: [  764 18.01 sec] loss: 0.030 acc: 0.983 prec: 0.615 rec: 0.510 f1: 0.558


### With 128 Hidden Units in first LSTM Layer: F1 score over 10 epochs:

The model starts overfitting after additional epochs, with good improvement in training, but a small dip in validation.

##### Training: 0.661
##### Validation: 0.558

In [2]:
def ffscore(prec, recall):
    return (2 * prec * recall) / (prec + recall)

In [8]:
ffscore(0.615, 0.510)

0.5576

## Attention

Next, we will add an Attention layer, which will allow the model to learn deeper context between words

I borrowed implementation of an Attention Layer from the following Gist:
https://gist.github.com/thomwolf/dec72992ea6817290273d42f6b95c04c

In [21]:
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, batch_first=False):
        super(SelfAttention, self).__init__()

        self.hidden_size = hidden_size
        self.batch_first = batch_first

        self.att_weights = nn.Parameter(torch.Tensor(1, hidden_size), requires_grad=True)

        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.att_weights:
            nn.init.uniform_(weight, -stdv, stdv)

    def get_mask(self):
        pass

    def forward(self, inputs, lengths):
        if self.batch_first:
            batch_size, max_len = inputs.size()[:2]
        else:
            max_len, batch_size = inputs.size()[:2]
            
        # apply attention layer
        weights = torch.bmm(inputs,
                            self.att_weights  # (1, hidden_size)
                            .permute(1, 0)  # (hidden_size, 1)
                            .unsqueeze(0)  # (1, hidden_size, 1)
                            .repeat(batch_size, 1, 1) # (batch_size, hidden_size, 1)
                            )
    
        attentions = torch.softmax(F.relu(weights.squeeze()), dim=-1)

        # create mask based on the sentence lengths
        mask = torch.ones(attentions.size(), requires_grad=True).cuda()
        
        lengths = lengths.to(dtype=torch.long, device=device)
        for i, l in enumerate(lengths):  # skip the first sentence
            if l < max_len:
                mask[i, l:] = 0

        # apply mask and renormalize attention scores (weights)
        masked = attentions * mask
        _sums = masked.sum(-1).unsqueeze(-1)  # sums per row
        
        attentions = masked.div(_sums)

        # apply attention weights
        weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))

        # get the final fixed vector representations of the sentences
        representations = weighted.sum(1).squeeze()

        return representations, attentions

In [22]:
class AttentionModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_unit = 64):
        super(AttentionModel, self).__init__();
        vocab_size = embeddings_tensor.shape[0];
        embedding_dim = embeddings_tensor.shape[1];
        
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim);
        self.embedding_layer.weight = nn.Parameter(embeddings_tensor);
        self.embedding_layer.weight.requires_grad = True;
        
        self.lstm_1 = nn.LSTM(embedding_dim, hidden_unit, bidirectional=True);
        
        self.attn_1 = SelfAttention(hidden_unit*2, batch_first=True);
        
        self.fc_1 = nn.Linear(hidden_unit*2, hidden_unit*2);
        
        self.fc_2 = nn.Linear(hidden_unit * 2, 1);
        
    def forward(self, x, l):
        out = self.embedding_layer(x);
        
        out, _ = self.lstm_1(out);
        
        out, _ = self.attn_1(out, l);
        
        out = self.fc_1(out);
        
        out = torch.relu(out);
        
        out = self.fc_2(out);
        return out;

In [27]:
attn_model = AttentionModel(embeddings_tensor, 128)
attn_model = attn_model.to(device);
attn_model

AttentionModel(
  (embedding_layer): Embedding(200001, 300)
  (lstm_1): LSTM(300, 128, bidirectional=True)
  (attn_1): SelfAttention()
  (fc_1): Linear(in_features=256, out_features=256, bias=True)
  (fc_2): Linear(in_features=256, out_features=1, bias=True)
)

In [28]:
criterion = nn.BCEWithLogitsLoss();
optimizer = torch.optim.Adam(lr=0.003, params = attn_model.parameters());


We also make some changes to the model to accomodate the LSTM layer. We have a single LSTM Layer, that feeds into the Attention Layer, following by two Fully Connected layers.

In [112]:
train(attn_model, optimizer, criterion, train_loader, num_epochs=5)

Training with model: 
AttentionModel(
  (embedding_layer): Embedding(200001, 300)
  (lstm_1): LSTM(300, 128, bidirectional=True)
  (attn_1): SelfAttention()
  (fc_1): Linear(in_features=256, out_features=256, bias=True)
  (fc_2): Linear(in_features=256, out_features=1, bias=True)
)
	[1   356 0.37 sec] loss: 0.026 acc: 0.946 prec: 0.631 rec: 0.326 f1: 0.430
	[1   712 0.40 sec] loss: 0.047 acc: 0.952 prec: 0.660 rec: 0.467 f1: 0.547
	[1  1068 0.37 sec] loss: 0.068 acc: 0.952 prec: 0.662 rec: 0.481 f1: 0.557
	[1  1424 0.39 sec] loss: 0.089 acc: 0.955 prec: 0.684 rec: 0.521 f1: 0.592
	[1  1780 0.36 sec] loss: 0.109 acc: 0.954 prec: 0.683 rec: 0.496 f1: 0.575
	[1  2136 0.36 sec] loss: 0.129 acc: 0.956 prec: 0.685 rec: 0.514 f1: 0.587
	[1  2492 0.37 sec] loss: 0.149 acc: 0.954 prec: 0.681 rec: 0.499 f1: 0.576
	[1  2848 0.37 sec] loss: 0.169 acc: 0.956 prec: 0.682 rec: 0.513 f1: 0.585
	[1  3204 0.36 sec] loss: 0.188 acc: 0.957 prec: 0.696 rec: 0.530 f1: 0.602
	[1  3560 0.36 sec] loss: 0.207 a

In [115]:
eval(attn_model, criterion, val_loader)



	EVAL: [  764 83.75 sec] loss: nan acc: 0.951 prec: 0.610 rec: 0.592 f1: 0.601


We can already see a Drastic improvement in the model with a single Attention layer.

#### Training F1 score is 0.771
#### Validation F1 score is 0.601

The improvement in validation is not significant, but still noticable with a jump of 5%.

Let's try training on additional Epochs and see if there is any improvement:

In [117]:
train(attn_model, optimizer, criterion, train_loader, num_epochs=5)

Training with model: 
AttentionModel(
  (embedding_layer): Embedding(200001, 300)
  (lstm_1): LSTM(300, 128, bidirectional=True)
  (attn_1): SelfAttention()
  (fc_1): Linear(in_features=256, out_features=256, bias=True)
  (fc_2): Linear(in_features=256, out_features=1, bias=True)
)
	[1   356 0.37 sec] loss: 0.009 acc: 0.978 prec: 0.848 rec: 0.782 f1: 0.814
	[1   712 0.36 sec] loss: 0.019 acc: 0.978 prec: 0.848 rec: 0.774 f1: 0.809
	[1  1068 0.38 sec] loss: 0.029 acc: 0.977 prec: 0.839 rec: 0.782 f1: 0.810
	[1  1424 0.37 sec] loss: 0.039 acc: 0.977 prec: 0.840 rec: 0.769 f1: 0.803
	[1  1780 0.37 sec] loss: 0.049 acc: 0.976 prec: 0.840 rec: 0.777 f1: 0.807
	[1  2136 0.37 sec] loss: 0.060 acc: 0.976 prec: 0.837 rec: 0.761 f1: 0.797
	[1  2492 0.36 sec] loss: 0.071 acc: 0.975 prec: 0.833 rec: 0.756 f1: 0.792
	[1  2848 0.37 sec] loss: 0.082 acc: 0.975 prec: 0.824 rec: 0.751 f1: 0.786
	[1  3204 0.37 sec] loss: 0.093 acc: 0.975 prec: 0.826 rec: 0.747 f1: 0.784
	[1  3560 0.37 sec] loss: 0.104 a

In [12]:
eval(attn_model, criterion, val_loader)

EVAL: [  812 88.90 sec] loss: nan acc: 0.948 prec: 0.583 rec: 0.569 f1: 0.576


After 15 Epochs trained on the model with an Attention Layer (only last 5 shown above), we see

#### Training F1 score is 0.815
#### Validation F1 score is 0.576

The model overfit quickly, and although training scoring is much higher, there is also a dip in Validation.

After trying different configurations, as well as others such as an optimizer with a Weight decay, additional layers to the Attention, and other types of Word Embeddings, the final best model trained was:

1. Neural Model with a 128-dim LSTM Layer, Attention Layer, and 2 Fully Connected Layers
2. 300 Dimensional Fast Text word Embeddings
3. Adam Optimizer with a 0.003 Learning Rate and no weight decay
4. Trained on 5 Epochs

To obtain a **0.771** Training F1 score and a **0.601** Validation F1 score

After reviewing other Notebooks and other submissions on the Kaggle Project, good validation scores on the leaderboard are in the range of **0.70**. Most such implementations use enemble training, transfer learning, data augmentation by pulling in data from answers, and Transformers. I did not use any of these methods because my hardware resources are more limited. Transformers can be used, but I have not explored the concept enough to be able to implement them yet.