In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: show what is available under ../input, the directory the data
# and embedding paths below are relative to.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# import keras libraries
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
# import pytorch
import torch
from torch.utils import data
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

# other libraries
import time
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from fastai.basics import *
from fastai.basic_train import Learner
from fastai.callbacks.general_sched import *
import gc


# Summary

This code is based on the scripts from the following kernels: 
* [https://www.kaggle.com/thousandvoices/simple-lstm](https://www.kaggle.com/thousandvoices/simple-lstm)
 * Author: @thousandvoices
 * Version 7
* [https://www.kaggle.com/bminixhofer/speed-up-your-rnn-with-sequence-bucketing](https://www.kaggle.com/bminixhofer/speed-up-your-rnn-with-sequence-bucketing)
 * Author: Benjamin Minixhofer
 * Version 6

I just added my own notes and changed some of the code to make it clearer.

Here we are building a Long Short-Term Memory (LSTM) network, a type of recurrent neural network that is well explained on the following websites: 
* https://colah.github.io/posts/2015-08-Understanding-LSTMs/
* https://adventuresinmachinelearning.com/keras-lstm-tutorial/

I've been seeing LSTMs pop up a lot in Kaggle competitions, so it's good to become familiar with them.

# Universal Parameters
First we set up the parameters to identify basic parts of the input data

In [None]:
# universal parameter settings

# Identity columns (per the competition data description). In this notebook
# they are binarised at 0.5 and used to build the per-comment sample weights.
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]

# The target plus the auxiliary toxicity sub-type columns that describe the comment.
# NOTE(review): in the visible cells this is only referenced from commented-out code.
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

# column with text data that will need to be converted for processing
TEXT_COLUMN = 'comment_text'

# column we eventually need to predict
TARGET_COLUMN = 'target'

parameters for text processing

In [None]:
# Characters that are stripped from TEXT_COLUMN when tokenizing (passed to the
# Keras Tokenizer as its `filters` argument).
# The backslash is listed once, escaped as `\\`. The former `\×` was an invalid
# escape sequence (it only worked because Python keeps unknown escapes verbatim,
# with a warning); it is now a plain `×`, so the set of filtered characters is
# unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€×™√²—'

Parameters for building the neural network model

In [None]:
# Dropout probability applied to the embedding output: it is passed to
# NeuralNet as `dropout_rate`, which configures the SpatialDropout layer.
# too high can underfit
# too low can overfit
DROPOUT_RATE = 0.2

# NUMBER OF EPOCHS
# Passed to train_model as `n_epochs`, i.e. the number of fit/predict rounds.
# NOTE(review): train_model calls learn.fit(3) in every round, so this is not
# necessarily the number of passes over the training data - confirm.
EPOCHS = 2

# Hidden size of the LSTM (a single integer, used for each direction of the
# bidirectional LSTM in NeuralNet).
# Too high can overfit
# Too low can underfit
LSTM_UNITS = 128


# Width of the dense layers in NeuralNet (a single integer).
# It is 4 * LSTM_UNITS because those layers consume the concatenation of the
# max-pooled and mean-pooled bidirectional LSTM outputs (2 * LSTM_UNITS each).
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS


# Import Data
Open the testing and training datasets into data frames

In [None]:
# Load the competition's training and test CSVs into data frames
train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

Organize competition data

In [None]:
# Binarise the identity columns and the target in place: values >= 0.5 become 1,
# everything else becomes 0 (a NaN compares False against 0.5, so it also maps to 0).
for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= 0.5, 1, 0)

In [None]:
# Raw comment text of the training set, forced to str
x_train = train_df[TEXT_COLUMN].astype(str)
# Target (already binarised above) as an (n_samples, 1) column vector
y_train = train_df[TARGET_COLUMN].values[:,np.newaxis]
# Raw comment text of the test set, forced to str
x_test = test_df[TEXT_COLUMN].astype(str)

The cells above make the target (and the identity columns) binary rather than continuous by thresholding them at 0.5.

# Text Processing

## Get unique sequences for each comment

We base the keras token vocabulary on the comments in both the training and testing data.

In [None]:
# Build the vocabulary from the training AND test comments, dropping the
# characters listed in CHARS_TO_REMOVE.
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

Once the comments have been fit to the tokenizer we could get:
* word_counts: A dictionary of words and their counts across all the comments.
* word_docs: A dictionary of words and how many comments each appeared in.
* word_index: A dictionary of words and their uniquely assigned integers.
* document_count:An integer count of the total number of documents that were used to fit the Tokenizer.

Note that the word_index assigned to a word carries no semantic meaning. If two words have word_index values that are close together, that does NOT mean the words are closely related; the values only distinguish one word from another. However, no word is ever assigned the index 0. Index 0 is reserved, so it can be used to pad comments when we need them to be of equal length.

Using the tokenizer, we translate the comments in the training and testing set respectively to lists of each word's word_index in each comment. For example if the comment was "Hello World" and the word index for "Hello" is 5 and "World" is 202 then we would translate the comment to [5,202]. In other words, we can now identify each comment by the order of the unique indexes.

In [None]:
# Replace each comment by the list of word_index integers of its words.
# Note: x_train / x_test are rebound from text to lists of integer sequences.
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

## Create matrix containing the integer vector for each comment with sequence bucketing

In the original [Simple LSTM](https://www.kaggle.com/thousandvoices/simple-lstm) kernel, they created a matrix by making each comment a uniform length. If the comment was shorter than MAX_LEN, then they padded it with 0's. If the comment was longer than MAX_LEN, then they trimmed it. They set MAX_LEN to 220.

However, in the [Speed up your RNN with Sequence Bucketing](https://www.kaggle.com/bminixhofer/speed-up-your-rnn-with-sequence-bucketing) kernel, they point out that this is "suboptimal because when iterating over the dataset in batches, there will be some batches where the length of all samples is smaller than `MAX_LEN`. So there will be tokens which are zero everywhere in the batch but are still processed by the RNN." Therefore they optimize a process they call "sequence bucketing".

Two ideas came from this analysis.
1. Independent of the batch size, the 95th percentile of sequence lengths is about 163. We could use this number as a static pad, but we would still be losing information from the comments that are longer than 163 words.
2. They found method 3 to be more elegant than, and at least as good as, the other sequence bucketing methods that were tried. Therefore, I am also going to use method 3, which they titled "Default TensorDataset with custom collate_fn".

### Method 3: Default TensorDataset with custom collate_fn

Based on @Benjamin_Minixhofer's analysis, he found that when he used a batch size of about 512, the maximum lengths of the batches were mostly around ~190 words (only a little higher than the 95th percentile of sequence lengths, which makes sense) and there were relatively few outliers. Therefore, he uses this batch size. Note that the cell below sets the batch size to 1024 for this notebook.

In [None]:
# Number of comments per batch; used for the train, validation and test
# DataLoaders and passed to train_model.
batch_size = 1024

we put the target values in pytorch format

In [None]:
#y_train_torch = torch.from_numpy(y_train).float()
#y_aux_train_torch = torch.from_numpy(y_aux_train).float()
#y_train_torch = torch.cat([y_train_torch.unsqueeze(1),y_aux_train_torch],1)


In [None]:
# Target column vector as a float tensor (the loss casts to float anyway)
y_train_torch = torch.from_numpy(y_train)
y_train_torch = y_train_torch.float()

we get a list of the lengths of all the comments and put them into pytorch format

In [None]:
# Number of tokens in each comment, in the same order as the comments
train_lengths = torch.from_numpy(np.array([len(x) for x in x_train]))
test_lengths = torch.from_numpy(np.array([len(x) for x in x_test]))

collect the length of the longest comment.


In [None]:
# Length of the longest *training* comment (a 0-dim tensor, not a Python int).
# Used below as the padded width for both the training and the test matrix, so
# test comments longer than this are truncated when padded.
maxlen = train_lengths.max() # length of longest comment

Pad the sequences so that they are all as long as the longest training comment. Keras' `pad_sequences` pads at the front by default, so 0's are added to the beginning of comments that are shorter than the longest one.

In [None]:
# Pad/truncate every tokenised comment to `maxlen` entries and wrap the result
# in a tensor of shape (n_comments, maxlen).
# NOTE(review): pad_sequences is called with its defaults; in Keras those pad
# and truncate at the front of the sequence - confirm for the installed version.
x_train_padded = torch.from_numpy(sequence.pad_sequences(x_train, maxlen=maxlen))
print("x_train_padded size:")
print(x_train_padded.shape)

x_test_padded = torch.from_numpy(sequence.pad_sequences(x_test, maxlen=maxlen))
print("x_test_padded size:")
print(x_test_padded.shape)
# The token lists are not needed once padded; delete them to free RAM.
# (Re-running the cells above requires re-creating them first.)
del x_train, x_test

Next we create a python object called `SequenceBucketCollator` which is used to create a matrix for each batch with the sequences padded based on the longest comment in each of the batches.

In [None]:
class SequenceBucketCollator():
    """Collate function that trims front-padded sequences to a per-batch length.

    Intended to be passed as `collate_fn` to a DataLoader over a TensorDataset
    whose samples contain, at configurable positions:

    1. Mandatory - the padded word-index sequence of a comment
       (e.g. a row of x_train_padded). Padding is expected at the FRONT,
       because the trailing columns are the ones that are kept.
    2. Mandatory - the true length of that comment (e.g. train_lengths).
    3. Optional - the target value(s) of the comment (e.g. y_train_torch).
    4. Optional - the sample weight of the comment.

    Args:
        choose_length: callable that receives the tensor of lengths in the
            batch and returns the uniform length to trim the batch to.
        sequence_index: position of the padded sequences in each sample.
        length_index: position of the lengths in each sample.
        label_index: position of the targets in each sample (optional).
        weight_index: position of the sample weights in each sample (optional,
            only used when `label_index` is given).
    """

    def __init__(self, choose_length, sequence_index, length_index, label_index=None, weight_index = None):
        self.choose_length = choose_length
        self.sequence_index = sequence_index
        self.length_index = length_index
        self.weight_index = weight_index
        self.label_index = label_index

    def __call__(self, batch):
        """Collate a list of samples into stacked, trimmed tensors.

        Returns:
            * without `label_index`: the list of stacked fields, with the
              sequences trimmed;
            * with `label_index` and `weight_index`:
              `(inputs, [labels, weights])`;
            * with `label_index` only: `(inputs, labels)`.
            `inputs` is the list of all fields that are neither labels nor
            weights (i.e. the trimmed sequences and the lengths).
        """
        # Transpose the list of samples into one stacked tensor per field,
        # e.g. [x_train_padded rows, train_lengths, y_train_torch, ...]
        batch = [torch.stack(field) for field in zip(*batch)]

        sequences = batch[self.sequence_index]
        lengths = batch[self.length_index]

        # Uniform length for this batch; keep at least one column so the model
        # never receives an empty sequence (e.g. a batch of empty comments).
        length = max(int(self.choose_length(lengths)), 1)

        # Column j of a front-padded matrix of width W holds the token that is
        # (W - j) positions from the end. Keeping the columns whose distance is
        # <= length keeps exactly the last `length` columns. (The previous `<`
        # kept only length - 1 columns and so dropped the first token of the
        # longest comment in every batch.) The width is taken from the tensor
        # itself instead of a notebook-global `maxlen`.
        padded_width = sequences.shape[1]
        distance_from_end = torch.arange(start=padded_width, end=0, step=-1,
                                         device=sequences.device)
        mask = distance_from_end <= length
        batch[self.sequence_index] = sequences[:, mask]

        if self.label_index is None:
            return batch

        inputs = [field for i, field in enumerate(batch)
                  if i not in (self.label_index, self.weight_index)]
        if self.weight_index is None:
            # Labels without weights: previously this indexed batch[None] and crashed
            return inputs, batch[self.label_index]
        return inputs, [batch[self.label_index], batch[self.weight_index]]

First we make a training TensorDataset which contains
1. x_train_padded - a torch object with a matrix where each row represents a comment in the form of a sequence of numbers where each number represents a specific word. Each comment is as long as the longest comment (if it was shorter than the longest comment, it was padded with 0's to lengthen it)
2. train_lengths - a torch object that contains a list of the lengths of each of these comments in the same order as the matrix
3. y_train_torch - a torch object that contains a list of the target values for each comment


In [None]:
# Per-comment sample weights. They up-weight comments that mention identities,
# toxic comments that mention few identities, and (most strongly) non-toxic
# comments that mention identities.

# `bool` replaces the deprecated `np.bool8` alias.
train_df[TARGET_COLUMN] = train_df[TARGET_COLUMN].astype(bool)
train_df[IDENTITY_COLUMNS] = train_df[IDENTITY_COLUMNS].astype(bool)

# Work on plain NumPy arrays so that `sample_weights` stays an ndarray the
# whole way through (mixing ndarray and Series in `+=` made the final
# conversion depend on how pandas dispatches the in-place operator).
is_toxic = train_df[TARGET_COLUMN].values
identity_flags = train_df[IDENTITY_COLUMNS].values
n_identities = identity_flags.sum(axis=1)
n_missing_identities = (~identity_flags).sum(axis=1)

# first we make all comments equal weights of 1
# this makes the math easier later
sample_weights = np.ones(len(x_train_padded), dtype=np.float32)
# then we add weight to comments with identity labels:
# the more identity labels a comment has, the more weight it gets
sample_weights += n_identities
# if the comment is labeled as toxic, then we add the number of identity
# columns that are NOT labeled.
# NOTE(review): this term used to be added twice by two identical statements,
# which contradicted the description above; it is now added once. Restore the
# doubling explicitly (2 * ...) if it was intentional.
sample_weights += is_toxic * n_missing_identities
# if the comment is NOT labeled as toxic then we add
# 5 times the number of identity columns that are labeled
sample_weights += ~is_toxic * n_identities * 5
# then we normalize the weights by dividing them all by the mean
sample_weights /= sample_weights.mean()
# column vector, same layout as y_train_torch
sample_weights = torch.from_numpy(sample_weights[:, np.newaxis]).float()


In [None]:
# Field order matters: the train collator is configured with
# sequence_index=0, length_index=1, label_index=2, weight_index=3.
train_dataset = data.TensorDataset(x_train_padded, train_lengths, y_train_torch, sample_weights)

and a testing TensorDataset which contains
1. x_test_padded - a torch object with a matrix where each row represents a comment in the form of a sequence of numbers where each number represents a specific word. Each comment is as long as the longest comment (if it was shorter than the longest comment, it was padded with 0's to lengthen it)
2. test_lengths - a torch object that contains a list of the lengths of each of these comments in the same order as the matrix


In [None]:
# Test samples carry only the padded sequence (index 0) and its length (index 1)
test_dataset = data.TensorDataset(x_test_padded, test_lengths)

Note we also make a validation Tensor Dataset which is just 2 rows of the train_dataset. The validation dataset is only added so that the fast.ai DataBunch works as expected.

In [None]:
# Placeholder validation set: just the first two training samples
valid_dataset = data.Subset(train_dataset, indices=[0,1])

Then we load the datasets into data.DataLoader, which splits them into batches of size `batch_size`. The collator trims each batch so that its comments are no longer as long as the longest comment overall, but only as long as the longest comment in that batch. Shuffle is true only for the `train_loader`: reshuffling the training comments each epoch keeps the model from seeing the batches in the same order every time, whereas the validation and test loaders keep their original order so that predictions stay aligned with the rows of the data.

In [None]:
# initialize the SequenceBucketCollator objects; both trim each batch to the
# longest comment in that batch. The indices match the field order of
# train_dataset / test_dataset.
train_collator = SequenceBucketCollator(lambda lenghts: lenghts.max(), 
                                        sequence_index=0, 
                                        length_index=1, 
                                        label_index=2,
                                        weight_index = 3)
test_collator = SequenceBucketCollator(lambda lenghts: lenghts.max(), sequence_index=0, length_index=1)

# DataLoaders that apply the collators batch by batch. Only the training data
# is shuffled.
# NOTE(review): train_model builds its own DataLoader from the test dataset
# without a collate_fn, so `test_loader` defined here does not appear to be
# used by the visible training code.
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_collator)
valid_loader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=train_collator)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collator)

# fast.ai container for the train/validation loaders, consumed by Learner below
databunch = DataBunch(train_dl=train_loader, valid_dl=valid_loader, collate_fn=train_collator)

## Word Embeddings Dictionaries

We added external pretrained word embedding files (fastText and two GloVe sets, also known as word vectors) that were not given in the original data. According to the rules of the competition, "External data, freely & publicly available, is allowed, including pre-trained models". Word embeddings represent each word as a vector in such a way that words with similar meanings have similar vectors: words that are different should have a greater distance between them, and vice versa.

In [None]:
# Paths of the pretrained embedding files (fastText 300d, GloVe 840B 300d,
# GloVe Twitter 200d).
# NOTE: `glove_twitter` holds a path here but is later rebound to the matrix
# built from that file, so this cell must be re-run before re-building it.
fastText_wordEmbedder_f='../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
glove_wordEmbedder_f='../input/glove840b300dtxt/glove.840B.300d.txt'
glove_twitter = '../input/glove-global-vectors-for-word-representation/glove.twitter.27B.200d.txt'

Let's create a dictionary of words with their respective vectors (AKA the embedding_index) for each of the word Embedding files.

In [None]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into its word and its vector.

    Args:
        word: the first field of the line.
        *arr: the remaining fields (the vector components, typically strings).

    Returns:
        A `(word, vector)` tuple where `vector` is a float16 NumPy array.
    """
    vector = np.asarray(arr, dtype='float16')
    return word, vector

def load_embeddings(path):
    """Read a text embedding file into a `{word: vector}` dictionary.

    Every line is expected to hold a word followed by its space-separated
    vector components. Files in the fastText text format start with a
    "<vocabulary size> <dimension>" header; that first line is skipped so it is
    not stored as a word whose one-element "vector" would later be broadcast
    across a whole embedding row.

    Args:
        path: path of the embedding file (read as UTF-8).

    Returns:
        dict mapping each word to the vector produced by `get_coefs`. If a
        word occurs more than once, the last occurrence wins.
    """
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            fields = line.strip().split(' ')
            # A two-field first line is the fastText header, not a word vector
            if line_no == 0 and len(fields) == 2:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

def build_matrix(word_index, path,dim = 300):
    """Build the embedding matrix for a vocabulary from one embedding file.

    Args:
        word_index: dict mapping each vocabulary word to its integer index
            (row 0 is left for the padding index).
        path: path of the embedding file, loaded with `load_embeddings`.
        dim: dimensionality of the vectors in the file.

    Returns:
        `(embedding_matrix, unknown_words)`: a `(len(word_index) + 1, dim)`
        float32 array whose row `i` is the vector of the word with index `i`
        (all zeros when the word has no vector), and the list of words that
        were not found in the file.
    """
    embedding_index = load_embeddings(path)
    # float32 instead of NumPy's float64 default: the vectors are loaded as
    # float16 and the model casts the matrix to float32 anyway, so this halves
    # the memory of the matrix without changing any value the model sees.
    embedding_matrix = np.zeros((len(word_index) + 1, dim), dtype=np.float32)
    unknown_words = []

    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is None:
            unknown_words.append(word)
        else:
            embedding_matrix[i] = vector
    return embedding_matrix, unknown_words

def create_embedding_index(file,header=False):
    """Read a text embedding file into a `{word: vector}` dictionary.

    Args:
        file: path of the embedding file (read as UTF-8).
        header: set to True when the first line is a header (as in the
            fastText text format) that must be skipped.

    Returns:
        dict mapping each word to the vector produced by `get_coefs`.
    """
    embedding_index = {}
    # The context manager closes the file (it was previously left open), and
    # the file is streamed line by line instead of being read with readlines().
    with open(file, 'r', encoding='utf-8') as f:
        if header:
            next(f, None)  # drop the header line; tolerate an empty file
        for line in f:
            word, vector = get_coefs(*line.strip().split(' '))
            embedding_index[word] = vector
    return embedding_index

According to kaggle, the fastText word embedder is a 

"300-dimensional pretrained FastText English word vectors released by Facebook.

The ** first line** of the file contains the number of words in the vocabulary and the size of the vectors. Each line contains a word followed by its vectors, like in the default fastText text format. Each value is space separated. Words are ordered by descending frequency."

Therefore, to build our embedding_index we need to skip the first line, keep the first word as the dictionary key, and the rest of the numbers as the values for the key.

In [None]:
# Build the fastText embedding MATRIX for the tokenizer vocabulary (default
# dim=300). Despite its name, `fastText_embedding_index` is the matrix
# returned by build_matrix, not a word -> vector dictionary.
fastText_embedding_index,unknown_words_ft = \
      build_matrix(tokenizer.word_index, fastText_wordEmbedder_f)

On the other hand the glove word embedder is also 300-dimensional but it does NOT have the first header line so we use the same code but don't skip that first line.

In [None]:
# GloVe 840B (300-d) embedding matrix for the tokenizer vocabulary; the list of
# unknown words is discarded.
glove_embedding_index,_ = build_matrix(tokenizer.word_index,glove_wordEmbedder_f,300)

In [None]:
# GloVe Twitter (200-d) embedding matrix.
# NOTE: this rebinds `glove_twitter` from the file path to the matrix, so the
# cell cannot be re-run without first re-running the cell that defines the path.
glove_twitter,_ = build_matrix(tokenizer.word_index,glove_twitter,200)

instead of writing 300 over and over to signify that my vectors are 300 dimensions I am going to make it a variable.

In [None]:
# Concatenate the three matrices along the feature axis, giving one row of
# 300 + 300 + 200 values per vocabulary index.
embedding_matrix = np.concatenate([fastText_embedding_index, glove_embedding_index,glove_twitter], axis=-1)
# Not the last expression of the cell, so this shape is not displayed
embedding_matrix.shape

# Free the per-source matrices now that they are merged
del glove_embedding_index
del fastText_embedding_index
del glove_twitter
gc.collect()

In [None]:
# Vector size of a single 300-d embedding source.
# NOTE(review): in the visible cells this is only referenced from disabled code.
word_vec_size=300

## Create Word Embedding Matrices Based on Unique Word Indices

As stated before, the vector of each word is 300-dimensional. Therefore we can create a matrix where each row represents a word in the vocabulary (word_index) and the values are based on these word embedding vectors. For example, if the word_index of "hello" is 3, then row 3 would hold the "hello" vector values given in the embedding_index.

In [None]:
def get_embedding_matrix(word_index, embedding_index, word_vec_size):
    """Arrange the vectors of a vocabulary into a matrix indexed by word index.

    Args:
        word_index: dict mapping each word to its integer index.
        embedding_index: dict mapping words to their embedding vectors.
        word_vec_size: length of each embedding vector.

    Returns:
        Array of shape `(len(word_index) + 1, word_vec_size)`; rows of words
        that are missing from `embedding_index` (and row 0) stay all zeros.
    """
    # one row per word index, plus row 0
    n_rows = 1 + len(word_index)
    matrix = np.zeros((n_rows, word_vec_size))
    for token, row in word_index.items():
        if token in embedding_index:
            matrix[row] = embedding_index[token]
    return matrix

In [None]:
# Disabled legacy code kept inside a string literal: nothing below is executed.
# It refers to names that the concatenation cell above deletes.
"""
fastText_embedding_matrix = get_embedding_matrix(tokenizer.word_index,fastText_embedding_index,word_vec_size)

glove_embedding_matrix = get_embedding_matrix(tokenizer.word_index,glove_embedding_index,word_vec_size)
#embedding_matrix = np.concatenate([fastText_embedding_matrix, glove_embedding_matrix], axis=-1)

#del fastText_embedding_matrix
#del glove_embedding_matrix
#gc.collect()
"""

To review, we now have a matrix which contains the comments in each row AND a combined embedding matrix which contains a row vector for each word in the training and testing sets. Next, we need to combine these two data structures in the model's embedding layer (PyTorch `nn.Embedding`).

# LSTM Model

## Weights for Model

To avoid a bias towards classifying comments that mention an identity as toxic, we use sample weights to push the toxicity classification to not be based on identity labels.

## Build Model

make sure to seed everything to remove a reproducibility issue

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices), sets
    PYTHONHASHSEED and asks cuDNN for deterministic algorithms.

    Args:
        seed: the integer seed to use everywhere.
    """
    # Imported locally: no cell imports `random` explicitly, so the function
    # previously only worked if the name leaked in through a wildcard import.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Only affects Python processes started after this point
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True

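Spatial dropout layer: instead of zeroing individual values, it drops entire embedding feature channels across all time steps of a comment.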

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to (N, T, K) sequence tensors.

    The input is rearranged so that the K features become the channel
    dimension of Dropout2d; a dropped feature is therefore zeroed for every
    time step of a sample. The output has the same (N, T, K) shape as the input.
    """

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T): features as channels
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        dropped = super(SpatialDropout, self).forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return dropped.permute(0, 3, 2, 1).squeeze(2)

In [None]:
# Scratch experiment for the unsqueeze loop used in weighted_avg: reshapes a
# length-10 vector to (1, 10, 1) by adding every axis except axis 1.
# NOTE(review): `a` is not used by any other visible cell.
a = torch.rand(10)
for s in range(3):
    if s != 1:
        a = a.unsqueeze(s)

In [None]:
def weighted_avg(tensor,dim):
    """Linearly weighted average of `tensor` along `dim`.

    Position `i` (0-based) along `dim` gets the weight
    `(i + 1) / (1 + 2 + ... + n)`, so later positions count more and the
    weights sum to 1.

    Args:
        tensor: input tensor (CPU or GPU).
        dim: dimension to average over; negative values count from the end.

    Returns:
        Tensor with `dim` reduced away.
    """
    # Normalise negative dims, which the former `a != dim` comparison missed
    dim = dim % tensor.dim()
    n = tensor.shape[dim]

    # Build the weights directly on the tensor's device instead of forcing
    # .cuda(), so the function also works on CPU tensors.
    weights = torch.arange(1, n + 1, dtype=torch.float32, device=tensor.device)
    weights = weights / weights.sum()

    # Shape (1, ..., n, ..., 1) so the weights broadcast along `dim` only
    view_shape = [1] * tensor.dim()
    view_shape[dim] = n
    return torch.sum(tensor * weights.view(view_shape), dim)
    

In [None]:
class NeuralNet(nn.Module):
    """Bidirectional-LSTM classifier that outputs one logit per comment.

    Pipeline: frozen pretrained embeddings -> spatial dropout -> 2-layer
    bidirectional LSTM -> max and mean pooling over time -> two dense branches
    with a residual sum -> linear output.

    Args:
        embedding_matrix: 2D array with one row per word index; its shape
            gives the vocabulary size and the embedding size.
        dropout_rate: dropout probability of the SpatialDropout layer applied
            to the embeddings.
        lstm_units: hidden size of each LSTM direction.
        dense_hidden_units: width of the dense layers. The dense layers are fed
            the concatenated max/mean pooled bidirectional LSTM output and
            their outputs are summed with it, so this presumably has to equal
            4 * lstm_units - TODO confirm.
    """
    def __init__(self, embedding_matrix, dropout_rate,
                lstm_units, dense_hidden_units):
        super(NeuralNet, self).__init__()
        
        vocab_size = embedding_matrix.shape[0]
        embed_size = embedding_matrix.shape[1]
        
        # Embedding table of shape (vocab_size, embed_size) ...
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # ... whose weights are replaced by the pretrained matrix and frozen
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix,
                                                          dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(dropout_rate)
        
        
        # A single 2-layer bidirectional LSTM (replaces earlier two-LSTM variants)
        self.lstm1 = nn.LSTM(embed_size,lstm_units,bidirectional=True,batch_first=True,num_layers=2)
        self.linear1 = nn.Linear(dense_hidden_units, dense_hidden_units)
        self.linear2 = nn.Linear(dense_hidden_units, dense_hidden_units)
        
        # NOTE(review): fixed at .2 rather than using `dropout_rate` - confirm intent
        self.dropout = nn.Dropout(.2)
        self.linear_out = nn.Linear(dense_hidden_units, 1)
        # NOTE(review): this single BatchNorm1d is applied to the outputs of BOTH
        # linear1 and linear2 in forward(), so the two branches share its
        # statistics and parameters - confirm this is intended.
        self.bn = nn.BatchNorm1d(dense_hidden_units)
        
    def forward(self, x, lengths=None):
        """Compute the logits for a batch of word-index sequences.

        Args:
            x: batch of word-index sequences; cast to long before the
                embedding lookup (assumes shape (batch, seq_len) - TODO confirm).
            lengths: accepted so the collated `[sequences, lengths]` inputs can
                be passed through, but not used here.

        Returns:
            Tensor with one raw logit per sample (no sigmoid is applied).
        """
        h_embedding = self.embedding(x.long())
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm2, _ = self.lstm1(h_embedding)
        # global average pooling over the time dimension
        avg_pool_1 = torch.mean(h_lstm2, 1)
        # global max pooling over the time dimension
        max_pool, _ = torch.max(h_lstm2, 1)
        
        # Concatenate both pooled summaries along the feature dimension
        h_conc = torch.cat((max_pool, avg_pool_1), 1)
        # Two parallel dense branches, both fed with h_conc
        h_conc_linear1  = F.relu(self.bn(self.linear1(h_conc)))
        h_conc_linear2  = self.dropout(F.relu(self.bn(self.linear2(h_conc))))
        
        # Residual sum of the pooled features and both branches
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        return result

In [None]:
def custom_loss(data, targets, weights):
    """Sample-weighted binary cross-entropy (on logits) for the 'target' column.

    Args:
        data: model output; only its first column is used, as logits.
        targets: labels; only the first column is used.
        weights: per-element weights applied to the loss before averaging.

    Returns:
        Scalar tensor: the mean of the weighted element-wise losses.
    """
    logits = data[:, :1].float()
    labels = targets[:, :1].float()
    return F.binary_cross_entropy_with_logits(logits, labels, weight=weights.float())

In [None]:
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Args:
        x: scalar or array of logits.

    Returns:
        The element-wise sigmoid: an array for array input, a NumPy scalar for
        scalar input.
    """
    x = np.asarray(x)
    # exp is only ever evaluated on non-positive values, so it cannot overflow
    # (the direct formula overflows in exp(-x) for large negative x).
    z = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # `[()]` turns a 0-d result back into a scalar and leaves arrays unchanged
    return out[()]

In [None]:
def train_model(learn,test,output_dim,lr=0.002,
                batch_size=512, n_epochs=5,
                enable_checkpoint_ensemble=True):
    """Train `learn` in rounds and predict the test set after every round.

    Args:
        learn: fast.ai Learner wrapping the model and the training data.
        test: test dataset whose first field is the model input.
        output_dim: number of model outputs per sample.
        lr: learning rate of the first phase; phase i uses lr * 0.6**i.
        batch_size: batch size used for test-time prediction.
        n_epochs: number of fit/predict rounds (and of scheduler phases).
        enable_checkpoint_ensemble: if True, return the average of the
            per-round predictions weighted by 2**round; otherwise return the
            predictions of the last round only.

    Returns:
        Array of shape (len(test), output_dim) with sigmoid probabilities.
    """
    all_test_preds = []
    # Later checkpoints get exponentially more weight in the ensemble
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    # One scheduler phase per round, each lasting one pass over the training
    # batches, with a decaying learning rate and a weight decay of 1e-2
    n = len(learn.data.train_dl)
    phases = [(TrainingPhase(n).schedule_hp('lr', lr * (0.6**(i)))).schedule_hp('wd',1e-2) for i in range(n_epochs)]
    sched = GeneralScheduler(learn, phases)
    learn.callbacks.append(sched)

    for epoch in range(n_epochs):
        # NOTE(review): fit(3) is requested each round although every phase is
        # sized for a single pass over the data - confirm how this interacts
        # with the scheduler and whether fit(1) was intended.
        learn.fit(3)

        # Predict with dropout/batch-norm in inference mode and without
        # building autograd graphs; restore the previous mode afterwards.
        # The model's own device is used instead of a hard-coded .cuda().
        was_training = learn.model.training
        learn.model.eval()
        device = next(learn.model.parameters()).device
        test_preds = np.zeros((len(test), output_dim))
        with torch.no_grad():
            for i, x_batch in enumerate(test_loader):
                X = x_batch[0].to(device)
                y_pred = sigmoid(learn.model(X).cpu().numpy())
                test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred
        learn.model.train(was_training)

        all_test_preds.append(test_preds)

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds

In [None]:

# Build and train a single model. Despite the "fastText" naming, it is given
# `embedding_matrix`, i.e. the concatenation of all three embedding sources.
print('fastText Model')
seed_everything(1234 + 0)
fastText_model = NeuralNet(embedding_matrix,dropout_rate=DROPOUT_RATE,lstm_units=LSTM_UNITS,dense_hidden_units=DENSE_HIDDEN_UNITS)
fastText_learn = Learner(databunch, fastText_model, loss_func=custom_loss)

# Train and collect the test predictions, shape (len(test_dataset), 1)
fastText_test_preds = train_model(fastText_learn,test_dataset,output_dim=1,batch_size = batch_size,n_epochs = EPOCHS)    

# Write the competition submission: one prediction per test id
fastText_model_submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': np.squeeze(fastText_test_preds[:,0])
})
fastText_model_submission.to_csv('submission.csv', index=False)


In [None]:


# Disabled legacy code kept inside a string literal: nothing below is executed.
# It trained separate fastText and GloVe models from per-source matrices that
# the active cells no longer create.
'''
all_test_preds = []

print('fastText Model')
seed_everything(1234 + 0)
fastText_model = NeuralNet(fastText_embedding_matrix,dropout_rate=DROPOUT_RATE,lstm_units=LSTM_UNITS,dense_hidden_units=DENSE_HIDDEN_UNITS)
fastText_learn = Learner(databunch, fastText_model, loss_func=custom_loss)

fastText_test_preds = train_model(fastText_learn,test_dataset,output_dim=1,batch_size = batch_size,n_epochs = EPOCHS)    
all_test_preds.append(fastText_test_preds)

fastText_model_submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': np.squeeze(fastText_test_preds[:,0])
})
fastText_model_submission.to_csv('fastText_submission.csv', index=False)
fastText_model_submission.to_csv('submission.csv', index=False)

print('glove Model')
seed_everything(1234 + 0)
glove_model = NeuralNet(glove_embedding_matrix,dropout_rate=DROPOUT_RATE,lstm_units=LSTM_UNITS,dense_hidden_units=DENSE_HIDDEN_UNITS)
glove_learn = Learner(databunch, glove_model, loss_func=custom_loss)
glove_test_preds = train_model(glove_learn,test_dataset,output_dim=1,batch_size = batch_size,n_epochs = EPOCHS)    
all_test_preds.append(glove_test_preds)


glove_model_submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': glove_test_preds
})
glove_model_submission.to_csv('glove_submission.csv', index=False)

'''

In [None]:
# Disabled legacy code kept inside a string literal: nothing below is executed.
# It averaged the predictions of several models; it refers to `test` and
# `all_test_preds`, which the active cells do not define at notebook level.
'''
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': np.mean(all_test_preds, axis=0)[:, 0]
})

submission.to_csv('submission.csv', index=False)
'''

# Build Model (Static Model: Deprecated)

The function below was used to model using static padding rather than binned padding...

get weighted average predictions using both the models