## Step 1 : Create Folds

In [None]:
from re import T
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":

    # load the raw IMDB reviews
    df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

    # encode the label: "positive" becomes 1, anything else becomes 0
    df["sentiment"] = (df["sentiment"] == "positive").astype("int64")

    # shuffle the rows and rebuild a clean 0..n-1 index
    df = df.sample(frac=1).reset_index(drop=True)

    # placeholder fold id; every row is overwritten in the loop below
    df["kfold"] = -1

    # stratify on the sentiment label so each fold keeps the class balance
    labels = df["sentiment"].values
    splitter = model_selection.StratifiedKFold(n_splits=5)

    # only the validation indices of each split are needed:
    # they identify the rows that belong to that fold
    for fold_id, (_, valid_idx) in enumerate(splitter.split(X=df, y=labels)):
        df.loc[valid_idx, "kfold"] = fold_id

    # persist the data together with its fold assignment
    df.to_csv("train_folds.csv", index=False)


## Step 2 : Make dataset class

In [None]:
import torch

class IMDBdataset:
    """Map-style dataset pairing tokenised reviews with their targets."""

    def __init__(self, reviews, targets):
        """
        :param reviews: numpy array, indexed as reviews[i, :] (one row per review)
        :param targets: a vector (numpy array) with one target per review
        """
        self.targets = targets
        self.reviews = reviews

    def __len__(self):
        # the number of samples is the number of review rows
        return len(self.reviews)

    def __getitem__(self, item):
        """
        Return the sample stored at integer position ``item``.
        :param item: index of the requested sample
        :return: dict with "review" (long tensor) and "target" (float tensor)
        """
        token_row = self.reviews[item, :]
        label = self.targets[item]

        return dict(
            review=torch.tensor(token_row, dtype=torch.long),
            target=torch.tensor(label, dtype=torch.float),
        )

## Step 3 : LSTM Model

In [None]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    """
    Bidirectional LSTM classifier on top of frozen pre-trained embeddings.
    Mean and max pooling over the time axis are concatenated and fed to a
    single-output linear layer, so the model returns one raw logit per sample.
    """

    def __init__(self, embedding_matrix, hidden_size=128):
        """
        :param embedding_matrix: numpy array with vectors for all words,
            shape (number of words, embedding dimension)
        :param hidden_size: hidden size of each LSTM direction
        """

        super().__init__()

        # rows = number of words, columns = embedding dimension
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        embed_dim = pretrained.shape[1]

        # load the pre-trained vectors into the embedding layer and freeze them.
        # The parameter of nn.Embedding is called `weight`; assigning to any
        # other attribute name would leave the layer randomly initialised
        # and trainable.
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)

        # a simple bidirectional LSTM
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            bidirectional=True,
            batch_first=True
        )

        # output layer which is a linear layer with a single output.
        # input = 2 * hidden_size (two directions) for mean pooling
        # plus the same for max pooling: 512 for the default hidden size
        self.linear = nn.Linear(4 * hidden_size, 1)

    def forward(self, x):
        """
        :param x: batch of token indices, N X T
        :return: raw logits (no sigmoid applied), N X 1
        """

        # pass the tokens through the embedding layer
        # N X T --> N X T X D ( because batch_first )
        x = self.embedding(x)

        # x : LSTM output for every time step
        # _ : final hidden and cell states, not needed here
        # N X T X D --> N X T X 2M
        # M is the hidden size, 2M because the LSTM is bidirectional
        x, _ = self.lstm(x)

        # apply mean and max pooling on the lstm output
        # dimension 1 is the time axis
        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)

        # concatenate mean and max pooling
        # N X 2M and N X 2M --> N X 4M
        out = torch.cat((avg_pool, max_pool), 1)

        # pass through the linear layer and return the logits
        return self.linear(out)

## Step 4 : Training and evaluation functions

In [None]:
import torch
import torch.nn as nn

def train(data_loader, model, optimizer, device):
    """
    Train the model for one epoch, i.e. one full pass over data_loader
    :param data_loader: iterable of batches, each a dict with the keys
        "review" and "target" (e.g. a torch DataLoader)
    :param model: model (lstm model) returning one logit per sample
    :param optimizer: torch optimizer, e.g. adam, sgd, etc.
    :param device: this can be "cuda" or "cpu"
    """

    # binary cross entropy on raw logits; built once and reused per batch
    loss_fn = nn.BCEWithLogitsLoss()

    # switch the model to training mode
    model.train()

    for batch in data_loader:

        # pull the tensors out of the batch and place them on the device
        inputs = batch["review"].to(device, dtype=torch.long)
        labels = batch["target"].to(device, dtype=torch.float)

        # drop gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass
        logits = model(inputs)

        # targets are reshaped to a column to match the model output
        batch_loss = loss_fn(logits, labels.view(-1, 1))

        # backward pass followed by a single optimization step
        batch_loss.backward()
        optimizer.step()

    
def evaluate(data_loader, model, device):
    """
    Run the model over data_loader without computing gradients
    :param data_loader: iterable of batches, each a dict with the keys
        "review" and "target" (e.g. a torch DataLoader)
    :param model: model whose outputs are collected
    :param device: this can be "cuda" or "cpu"
    :return: (predictions, targets) as python lists; predictions holds the
        raw model outputs row by row, targets holds the batch targets
    """

    # switch the model to eval mode
    model.eval()

    collected_predictions, collected_targets = [], []

    # no gradients are needed for inference
    with torch.no_grad():
        for batch in data_loader:

            # only the inputs have to live on the device
            inputs = batch["review"].to(device, dtype=torch.long)
            batch_output = model(inputs)

            # bring everything back to the cpu as plain python lists
            collected_predictions += batch_output.cpu().numpy().tolist()
            collected_targets += batch["target"].cpu().numpy().tolist()

    return collected_predictions, collected_targets

## Step 5 : Training Loop, Embeddings and Inference

In [None]:
# Settings shared by the training cells below
EPOCHS = 10            # maximum number of epochs per fold
MAX_LEN = 128          # sequence length after padding / truncation
TRAIN_BATCH_SIZE = 16  # batch size of the training data loader
VALID_BATCH_SIZE = 8   # batch size of the validation data loader

In [None]:
import io
import torch

import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn import metrics

def load_vectors(fname):
    """
    Load word vectors from a whitespace separated text file.
    :param fname: path to a file with one "word v1 v2 ... vn" entry per line
    :return: a dictionary with word:embedding_vector (numpy array of floats)
    """

    glove_model = {}

    # the context manager closes the file even if a line fails to parse;
    # the encoding is explicit so the result does not depend on the
    # platform default
    with open(fname, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()

            # skip blank lines instead of failing on fields[0]
            if not fields:
                continue

            # first field is the word, the rest are the vector components
            glove_model[fields[0]] = np.array(
                [float(value) for value in fields[1:]]
            )

    print(len(glove_model)," words loaded!")
    return glove_model


def create_embedding_matrix(word_index, embedding_dict, embedding_dim=None):
    """
    This function creates the word embedding matrix.
    :param word_index: a dictionary with word:index_value
    :param embedding_dict: a dictionary with word:embedding_vector
    :param embedding_dim: number of columns of the matrix; when None it is
        taken from the first vector in embedding_dict, falling back to 100
        if embedding_dict is empty
    :return: a numpy array with embedding vectors for all known words,
        shape (len(word_index) + 1, embedding_dim)
    """

    # derive the dimension from the vectors instead of hard-coding it,
    # so embeddings of any size can be used
    if embedding_dim is None:
        first_vector = next(iter(embedding_dict.values()), None)
        embedding_dim = 100 if first_vector is None else len(first_vector)

    # init matrix with zeros; the extra row makes every value stored in
    # word_index, up to len(word_index), a valid row number
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    # copy the pre-trained vector of every known word into its row;
    # words without a pre-trained vector keep the all-zeros row
    for word, i in word_index.items():
        vector = embedding_dict.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

    # return embedding matrix
    return embedding_matrix


def run(df, fold):
    """
    Run training and validation for a given fold and dataset
    :param df: pandas DataFrame with review, sentiment and kfold columns
    :param fold: current fold, int
    """

    # rows outside the fold are used for training
    train_df = df[df.kfold != fold].reset_index(drop=True)

    # rows inside the fold are used for validation
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print("Fitting tokenizer")

    # we use tf.keras for tokenization
    # NOTE: the vocabulary is built from every review in df,
    # so it also contains words of the validation fold
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert the reviews to sequences of word indices
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero pad the training sequences given the maximum length
    # this padding is done on left hand side
    # if sequence is > MAX_LEN, it is truncated on left hand side too
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(
        xtrain, maxlen=MAX_LEN
    )

    # zero pad the validation sequences
    xtest = tf.keras.preprocessing.sequence.pad_sequences(
        xtest, maxlen=MAX_LEN
    )

    # init dataset class for training
    train_dataset = IMDBdataset(
        reviews=xtrain,
        targets=train_df.sentiment.values
    )

    # create torch dataloader for training
    # torch dataloader loads the data using dataset
    # class in batches specified by batch_size
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=2
    )

    # init dataset class for validation
    valid_dataset = IMDBdataset(
        reviews=xtest,
        targets=valid_df.sentiment.values
    )

    # create torch dataloader for validation
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )

    print("Loading Embeddings")

    # load the embeddings
    embedding_dict = load_vectors("../input/glove-embeddings/glove.6B.100d.txt")
    embedding_matrix = create_embedding_matrix(
        tokenizer.word_index, embedding_dict
    )

    # use the gpu when one is available, otherwise fall back to the cpu
    # instead of failing on machines without cuda
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # fetch our LSTM model
    model = LSTM(embedding_matrix)

    # send model to device
    model.to(device)

    # init the Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print("Training Model")

    # set best accuracy to zero
    best_accuracy = 0.0

    # number of epochs that did not improve on the best accuracy;
    # it is cumulative and is not reset when the accuracy improves
    early_stopping_counter = 0

    # train and validate for all epochs
    for epoch in range(EPOCHS):

        # train one epoch
        train(train_data_loader, model, optimizer, device)

        # validate
        logits, targets = evaluate(valid_data_loader, model, device)

        # the model ends in a linear layer, so it returns raw logits.
        # A probability threshold of 0.5 after the sigmoid is the same as
        # a threshold of 0 on the logits: sigmoid(z) >= 0.5 <=> z >= 0.
        # ravel turns the N X 1 outputs into a vector like the targets
        predicted_labels = np.array(logits).ravel() >= 0.0

        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, predicted_labels)

        print(f"Fold: {fold}, Epoch: {epoch}, Accuracy Score = {accuracy}")

        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy

        else:
            early_stopping_counter += 1

        if early_stopping_counter > 2:
            break


if __name__ == '__main__':

    # read the reviews together with their fold assignment
    df = pd.read_csv("./train_folds.csv")

    # train and validate once per fold,
    # printing a separator block after each of them
    for fold_id in range(5):
        run(df, fold_id)
        for separator_line in ("", "--------------------------", ""):
            print(separator_line)