## Load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

In [None]:
# Create DataFrames (the CSV files are read as latin1)
df_train = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_train.csv", encoding="latin1")
df_test = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_test.csv", encoding="latin1")

# Shuffle DataFrames with a fixed seed so that a fresh "Restart & Run All"
# reproduces exactly the same row order.
df_train = df_train.sample(frac=1, random_state=42)
df_test = df_test.sample(frac=1, random_state=42)

In [None]:
# Preview the first rows of the (shuffled) training frame.
df_train.head()

Let's look at the length of the tweets.

In [None]:
from transformers import RobertaTokenizer

# Load the pretrained tokenizer registered under the "roberta-base" name.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def token_counter(text, tokenizer):
    """Return how many token ids ``tokenizer.encode`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Token count of every training tweet, followed by the largest count.
tok_len = df_train["OriginalTweet"].map(lambda tweet: token_counter(tweet, tokenizer))

max(tok_len)

The longest tweet contains 184 tokens, so there is no need to pad up to the 512th token; we stop at 200 to reduce the size of the tensors handled.

In [None]:
# Set the tokenizer's maximum sequence length to 200 tokens.
tokenizer.model_max_length = 200

## Data analysis

In [None]:
# Report how many rows each split holds.
print("The training dataframe contains {} Tweets".format(df_train.shape[0]))
print("The test dataframe contains {} Tweets".format(df_test.shape[0]))

In [None]:
# Class distribution, with the bars laid out from most negative to most positive.
(
    df_train["Sentiment"]
    .value_counts()
    .loc[["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]]
    .plot.bar()
)

The dataset is well balanced across the sentiment categories.

## Processing the text

In [None]:
import re
def remove_links(text):
    """Normalise a tweet before tokenisation.

    Despite its name this does more than strip links. In order, it:

    1. removes every ``http...`` URL (up to the next whitespace),
    2. turns carriage returns / line feeds into spaces,
    3. deletes the punctuation characters ``, ; : .``,
    4. lower-cases the text and collapses runs of spaces into one.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lower-cased text. Leading/trailing spaces are not stripped.
    """
    out = re.sub(r'http\S+', '', text)

    # Line breaks separate words, so they must become a space: deleting them
    # outright would glue the last word of a line to the first of the next.
    out = re.sub(r'[\r\n]', ' ', out)

    # Punctuation is dropped without leaving a gap.
    out = re.sub(r'[,;:.]', '', out)

    return re.sub(' +', ' ', out.lower()) #Remove duplicate spaces

def tokenize(text, tokenizer):
    """Encode ``text`` into a fixed-length list of token ids.

    The sequence is padded to the tokenizer's maximum length and, if it is
    longer than that, truncated to it. Without truncation an over-long tweet
    would yield a longer id list than the others and could not be stacked
    into a batch with them.

    Args:
        text: The (already cleaned) tweet text.
        tokenizer: Object exposing ``encode(text, padding=..., truncation=...)``.

    Returns:
        Whatever ``tokenizer.encode`` returns for these arguments.
    """
    return tokenizer.encode(text, padding='max_length', truncation=True)

# Sentiment label -> class index, ordered from most negative (0) to most positive (4).
name_to_idx = {
    label: index
    for index, label in enumerate(
        ("Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive")
    )
}

def process_tgt(value):
    """Return the integer class index of the sentiment label ``value``.

    Raises:
        KeyError: if ``value`` is not one of the five known labels.
    """
    class_index = name_to_idx[value]
    return class_index

In [None]:
# Clean and tokenize every tweet; map each sentiment name to its class index.
train_text = df_train["OriginalTweet"].map(remove_links).map(lambda tweet: tokenize(tweet, tokenizer)).tolist()
train_labels = df_train["Sentiment"].map(process_tgt).tolist()

test_text = df_test["OriginalTweet"].map(remove_links).map(lambda tweet: tokenize(tweet, tokenizer)).tolist()
test_labels = df_test["Sentiment"].map(process_tgt).tolist()

In [None]:
from torch.utils.data import Dataset

class CreateDataset(Dataset):
    """Map-style dataset pairing each sample with its label.

    Both sequences are stored as given; conversion to tensors happens lazily,
    one item at a time, when the dataset is indexed.
    """

    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        return torch.tensor(sample), torch.tensor(label)

# Wrap the tokenized tweets and their class indices for each split.
train_dataset = CreateDataset(data=train_text, labels=train_labels)
test_dataset = CreateDataset(data=test_text, labels=test_labels)

In [None]:
# (train size, test size)
(len(train_dataset), len(test_dataset))

In [None]:
from torch.utils.data import DataLoader

# Reshuffle the training samples at every epoch so the model does not see the
# exact same batches each time; evaluation order is left untouched.
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 32)

## Create the model

In [None]:
class LSTMModel(nn.Module):
    """Bidirectional LSTM classifier over fixed-length sequences of token ids.

    Token ids are embedded, fed through a stacked bidirectional LSTM, and the
    LSTM outputs of all positions are flattened and projected to one score per
    class by a single linear layer.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_size: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability passed to ``nn.LSTM``.
        max_len: Length every input sequence must have. The linear layer is
            sized from it, so inputs of another length fail in ``forward``.
            Defaults to 200.
        num_classes: Number of output scores. Defaults to 5.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout,
                 max_len = 200, num_classes = 5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size = embedding_dim,
                            hidden_size = hidden_size,
                            num_layers = num_layers,
                            dropout = dropout,
                            batch_first = True,
                            bidirectional = True)
        # A bidirectional LSTM emits 2 * hidden_size features per position and
        # forward() flattens all max_len positions. Deriving the width here
        # keeps the layer consistent with the constructor arguments.
        self.linear = nn.Linear(2 * hidden_size * max_len, num_classes)

    def forward(self, inputs):
        """Return class scores for a batch of token-id sequences.

        ``inputs`` is expected to be an integer tensor with the batch as first
        dimension (the LSTM is built with ``batch_first=True``).
        """
        emb = self.embedding(inputs)
        lstm_out, _ = self.lstm(emb)

        # Flatten every position's features into one vector per sample.
        flat = lstm_out.reshape(lstm_out.size(0), -1)

        return self.linear(flat)
    
# Instantiate the classifier with a vocabulary matching the tokenizer.
model = LSTMModel(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=256,
    hidden_size=256,
    num_layers=4,
    dropout=0.2,
)

## Train the model

In [None]:
from tqdm import tqdm

class Trainer():
    """Training / evaluation loop for a classification model.

    Args:
        model: Module mapping a batch of inputs to one score per class.
        train_loader: Iterable of ``(inputs, targets)`` batches used for optimisation.
        valid_loader: Iterable of ``(inputs, targets)`` batches used for evaluation only.
    """

    def __init__(self, model, train_loader, valid_loader):

        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader

    def _forward_batch(self, inputs, targets, f_loss, device):
        """Run one batch through the model.

        Returns the loss tensor, the number of correctly classified samples
        and the batch size.
        """
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = self.model(inputs)
        loss = f_loss(outputs, targets)

        predicted_targets = outputs.argmax(dim=1)
        n_correct = (predicted_targets == targets).sum().item()

        return loss, n_correct, inputs.shape[0]

    @staticmethod
    def _epoch_averages(tot_loss, correct, n_seen):
        """Convert epoch totals into per-sample averages (0.0 for an empty loader)."""
        if n_seen == 0:
            return 0.0, 0.0
        return tot_loss / n_seen, correct / n_seen

    def train_epoch(self, f_loss, optimizer, device):
        """Optimise the model for one pass over ``train_loader``.

        Args:
            f_loss: Callable ``(outputs, targets) -> loss tensor``. The running
                average assumes it returns the mean over the batch.
            optimizer: Optimizer stepping the model parameters.
            device: Device the batches are moved to.

        Returns:
            ``(average loss, accuracy)`` over the samples seen in this epoch.
        """
        # Train mode matters for layers such as dropout or batchnorm.
        self.model.train()

        tot_loss, correct, n_seen = 0.0, 0, 0

        for inputs, targets in self.train_loader:
            loss, n_correct, batch_size = self._forward_batch(inputs, targets, f_loss, device)

            # Clear the gradients *before* backpropagating so that nothing
            # accumulated earlier leaks into this update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch mean by the batch size: the last batch may be smaller.
            tot_loss += batch_size * loss.item()
            correct += n_correct
            n_seen += batch_size

        # Normalise by the samples actually seen, not by a hard-coded dataset size.
        return self._epoch_averages(tot_loss, correct, n_seen)

    def valid_epoch(self, f_loss, device):
        """Evaluate the model on ``valid_loader`` without updating it.

        Args:
            f_loss: Callable ``(outputs, targets) -> loss tensor``. The running
                average assumes it returns the mean over the batch.
            device: Device the batches are moved to.

        Returns:
            ``(average loss, accuracy)`` over the samples seen.
        """
        # Eval mode matters for layers such as dropout or batchnorm.
        self.model.eval()

        tot_loss, correct, n_seen = 0.0, 0, 0

        with torch.no_grad():
            for inputs, targets in self.valid_loader:
                loss, n_correct, batch_size = self._forward_batch(inputs, targets, f_loss, device)

                tot_loss += batch_size * loss.item()
                correct += n_correct
                n_seen += batch_size

        return self._epoch_averages(tot_loss, correct, n_seen)

    def training(self, f_loss, optimizer, device, epochs = 10):
        """Alternate training and validation epochs, printing the metrics of each.

        Returns:
            Four lists with one entry per epoch: training loss, training
            accuracy, validation loss, validation accuracy.
        """
        train_loss = []
        train_acc = []
        valid_loss = []
        valid_acc = []

        for i in range(epochs):
            print("EPOCH {}/{}".format(i + 1, epochs))
            train_results = self.train_epoch(f_loss, optimizer, device)
            print("Training loss : {: .3f} | Training accuracy : {: .3f}".format(*train_results))
            valid_results = self.valid_epoch(f_loss, device)
            print("Validation loss : {: .3f} | Validation accuracy : {: .3f}\n".format(*valid_results))

            train_loss.append(train_results[0])
            train_acc.append(train_results[1])
            valid_loss.append(valid_results[0])
            valid_acc.append(valid_results[1])

        return train_loss, train_acc, valid_loss, valid_acc

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of crashing here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

f_loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# The test split is passed as the validation loader, so the "validation"
# metrics reported during training are computed on the test set.
trainer = Trainer(model=model, train_loader=train_loader, valid_loader=test_loader)

train_loss, train_acc, valid_loss, valid_acc = trainer.training(
    f_loss=f_loss,
    optimizer=optimizer,
    device=device,
    epochs=8,
)

In [None]:
# One figure per metric, each comparing the train curve with the test curve.
for curve_title, train_curve, test_curve in (
    ("Loss of the model during training", train_loss, valid_loss),
    ("Accuracy of the model during training", train_acc, valid_acc),
):
    plt.plot(train_curve, label = "train set")
    plt.plot(test_curve, label = "test set")
    plt.legend()
    plt.title(curve_title)
    plt.show()