In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import numpy as np
from tqdm import tqdm
import transformers
from transformers import RobertaModel, RobertaTokenizer
import torch

from torch.utils.data import DataLoader, Dataset
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the train and test splits; both files are decoded as latin1.
train = pd.read_csv(
    "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv",
    encoding='latin1',
)
test = pd.read_csv(
    "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv",
    encoding='latin1',
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Keep only the tweet text and its label. `.copy()` makes each result an
# independent frame, so the column assignments in the cleaning cells that
# follow do not write into a slice of the original frame
# (pandas' SettingWithCopyWarning).
train = train[['OriginalTweet', 'Sentiment']].copy()
test = test[['OriginalTweet', 'Sentiment']].copy()

****Taking a look at the data****

In [None]:
# Join every training tweet into one space-separated string and show its
# first 1500 characters.
text = ' '.join(train['OriginalTweet'].tolist())
text[0:1500]

****Let's do some cleaning. (RoBERTa is good enough for this dataset even without cleaning; you can try it.)****

In [None]:
# Lower-case every tweet in both splits.
train['OriginalTweet'] = train['OriginalTweet'].map(lambda tweet: tweet.lower())
test['OriginalTweet'] = test['OriginalTweet'].map(lambda tweet: tweet.lower())

In [None]:
# Remove carriage-return characters from every tweet in both splits.
train['OriginalTweet'] = train['OriginalTweet'].map(
    lambda tweet: re.sub('\r', '', tweet)
)
test['OriginalTweet'] = test['OriginalTweet'].map(
    lambda tweet: re.sub('\r', '', tweet)
)

In [None]:
# Replace line feeds with a space instead of deleting them: deleting a line
# break fuses the last word of one line with the first word of the next.
# SentimentData.__getitem__ collapses runs of whitespace afterwards, so the
# extra spaces introduced here are harmless.
train['OriginalTweet'] = train['OriginalTweet'].apply(lambda x: re.sub('\n', ' ', x))
test['OriginalTweet'] = test['OriginalTweet'].apply(lambda x: re.sub('\n', ' ', x))

In [None]:
# Drop apostrophes from every tweet in both splits.
train['OriginalTweet'] = train['OriginalTweet'].map(
    lambda tweet: re.sub("\'", "", tweet)
)
test['OriginalTweet'] = test['OriginalTweet'].map(
    lambda tweet: re.sub("\'", "", tweet)
)

In [None]:
# Rebuild the joined corpus from the cleaned tweets and show its first
# 1500 characters.
text = str.join(' ', train['OriginalTweet'])
text[0:1500]

In [None]:
# Show the last 1500 characters of the joined corpus.
text[-1500:]

****Let's map the sentiment labels to integer class ids that the model can use****

In [None]:
# List the distinct label values present in the training split.
train['Sentiment'].unique()

In [None]:
# Collapse the five sentiment labels into three integer classes:
# 0 for both negative labels, 1 for neutral, 2 for both positive labels.
mapped = {
    'Extremely Negative': 0,
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Extremely Positive': 2,
}

In [None]:
# Convert the string labels to class ids. `Series.map` returns NaN for any
# value that is not a key of `mapped` -- which is every row if this cell is
# re-run on already-mapped integers -- so validate the result before writing
# it back, and fail loudly instead of training on NaN labels.
train_labels = train['Sentiment'].map(mapped)
test_labels = test['Sentiment'].map(mapped)
if train_labels.isna().any() or test_labels.isna().any():
    raise ValueError(
        "Sentiment column contains values that are not keys of `mapped`; "
        "reload the data before re-running this cell."
    )
train['Sentiment'] = train_labels
test['Sentiment'] = test_labels

In [None]:
pip install transformers

****Initializing the tokenizer****

In [None]:
# Load the pretrained tokenizer for the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# Passed to SentimentData as `max_len`, which forwards it to the tokenizer
# as `max_length`.
max_len = 200

In [None]:
class SentimentData(Dataset):
    """Torch dataset that tokenizes one tweet per item on access.

    Args:
        dataframe: frame exposing an ``OriginalTweet`` text column and a
            ``Sentiment`` label column.
        tokenizer: object exposing ``encode_plus`` and returning
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.OriginalTweet
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded tweet and its label at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors
            built from the tokenizer output) and ``targets`` (float scalar
            tensor holding the label).
        """
        # Positional lookup: label-based `series[index]` raises KeyError when
        # the frame's index is not 0..n-1 (e.g. after filtering rows).
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; `truncation=True` caps long tweets so
            # every item has exactly `max_len` tokens and batches can stack.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Kept as float for compatibility; the train/eval loops in this
            # notebook cast targets to long before computing the loss.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Report the shape of each split, then wrap both frames in SentimentData.
print(
    "TRAIN Dataset: {}".format(train.shape),
    "TEST Dataset: {}".format(test.shape),
    sep="\n",
)

training_set = SentimentData(dataframe=train, tokenizer=tokenizer, max_len=max_len)
testing_set = SentimentData(dataframe=test, tokenizer=tokenizer, max_len=max_len)

In [None]:
# Build the datasets consumed by the DataLoaders: one per split, sharing the
# tokenizer and the sequence length.
training_set, test_set = (
    SentimentData(frame, tokenizer, max_len=max_len) for frame in (train, test)
)

In [None]:
# Shuffle the training data every epoch so batches are not drawn in file
# order; the evaluation loader keeps the original order.
training_loader = DataLoader(training_set, batch_size=16, shuffle=True)
testing_loader = DataLoader(test_set, batch_size=16)

****Creating the network****

In [None]:
class RobertaClass(torch.nn.Module):
    """Pretrained 'roberta-base' encoder followed by a small classification head.

    Args:
        num_classes: number of output logits. Defaults to 3, matching the
            three class ids (0, 1, 2) produced by the notebook's `mapped`
            label dictionary; the head previously emitted 5 logits, two of
            which could never be a target.
        dropout: dropout probability applied before the final linear layer.
    """

    def __init__(self, num_classes=3, dropout=0.3):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # 768 is the width of the encoder output fed into the head.
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the logits produced from the first token's hidden state."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Take the hidden state at sequence position 0 for every batch item.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU avoids constructing a new module on every call.
        pooler = torch.nn.functional.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
from torch import cuda

# Run on the GPU when one is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Instantiate the classifier and move its parameters to the selected device.
# `model.to(device)` is the cell's last expression, so the module repr is
# displayed as the cell output.
model = RobertaClass()
model.to(device)

In [None]:
# Cross-entropy objective over the model's logits, optimized with Adam over
# all model parameters.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [None]:
def calcuate_accuracy(preds, targets):
    """Return how many entries of ``preds`` equal the matching entry of ``targets``.

    Despite the name, this is a count of correct predictions, not a ratio;
    callers divide by the number of examples themselves.
    """
    matches = preds == targets
    return matches.sum().item()

****Training****

In [None]:
# NOTE: defining this function rebinds the name `train`, which earlier cells
# use for the training DataFrame. Cells that read `train.shape` or build
# `SentimentData(train, ...)` must not be re-run after this cell without
# reloading the data first.
def train(epoch):
    """Run one training epoch over the notebook-level ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function``,
    ``device`` and ``calcuate_accuracy``.

    Args:
        epoch: epoch number; used only in the printed summary.

    Returns:
        tuple ``(epoch_loss, epoch_accu)``: the mean loss per batch and the
        accuracy in percent over all examples seen in this epoch.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself rather than `enumerate(...)` so tqdm can read the
    # loader's length and show real progress.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # `.detach()` replaces the deprecated `.data` access.
        _, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu

In [None]:
# Number of full passes over the training loader.
EPOCHS = 3
# `train` here is the training function defined above, not the DataFrame of
# the same name loaded at the top of the notebook.
for epoch in range(EPOCHS):
    train(epoch)

****Evaluation****

In [None]:
def testing(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return accuracy in percent.

    Uses the notebook-level ``device``, ``loss_function`` and
    ``calcuate_accuracy``. Gradients are disabled for the whole pass.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` and
            returning one row of logits per batch item.
        testing_loader: iterable of batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` entries.

    Returns:
        Accuracy over all evaluated examples, in percent.
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        # Wrap the loader itself so tqdm can read its length when available.
        for step, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            # No `.squeeze()` here: for a batch of one it would drop the batch
            # dimension and break both the loss and the max over dim=1.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                # The label now matches the 5000-step interval tested above.
                print(f"Test Loss per 5000 steps: {loss_step}")
                print(f"Test Accuracy per 5000 steps: {accu_step}")
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Test Loss Epoch: {epoch_loss}")
    print(f"Test Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [None]:
# Evaluate the fine-tuned model on the test loader; `testing` returns the
# accuracy already expressed in percent.
acc = testing(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

****Upvote if you learned something. Next, I'll try XLNet on this dataset.****

****Thanks****