<center><h1>Sentiment Analysis using Transformers by HuggingFace Pytorch</h1></center>
<br>
<center>Sentiment analysis refers to the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.</center>
<br>
<center><img height=200 width=200 src = https://pytorch.org/assets/images/huggingface-logo.png></center>

<br>
<center><h4>I will be using the HuggingFace Transformers Python package to predict the overall sentiment of survey text (a COVID-19 tweet dataset is loaded as well). I'm just a beginner with this, so please feel free to comment if I can do something better.</h4></center>

<br>
<center><img src = https://www.codemotion.com/magazine/wp-content/uploads/2020/05/bert-google-896x504.png></center>

In [None]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer

from torch.utils.data import TensorDataset

import transformers
from transformers import BertForSequenceClassification

import numpy as np
import pandas as pd
import re

In [None]:
# import torch
#     torch.cuda.empty_cache()

In [None]:
# COVID-19 tweets; the file is read as latin-1.
df = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding='latin-1',
)

# Survey data used for the fine-tuning below.
survey_df = pd.read_csv(
    '../input/survey-wos/survey_with_overall_sentiment.csv',
)

# Preview the first rows of the survey data.
survey_df.head()

## Extracting mentions and hashtags

In [None]:
def extract_hash_tags(s):
    """Return every hashtag in ``s`` (without the leading '#'), space-separated.

    A hashtag is a '#' immediately followed by one or more word characters.
    Returns an empty string when ``s`` contains none.
    """
    tag_pattern = re.compile(r"#(\w+)")
    return " ".join(match.group(1) for match in tag_pattern.finditer(s))
# Store the space-separated hashtags of each tweet in a new column.
df['hashtags'] = df['OriginalTweet'].apply(extract_hash_tags)

In [None]:
def extract_mentions(s):
    """Return every @-mention in ``s`` (without the leading '@'), space-separated.

    A mention is an '@' immediately followed by one or more word characters.
    Returns an empty string when ``s`` contains none.
    """
    mention_pattern = re.compile(r"@(\w+)")
    return " ".join(match.group(1) for match in mention_pattern.finditer(s))
# Store the space-separated @-mentions of each tweet in a new column.
df['mentions'] = df['OriginalTweet'].apply(extract_mentions)

## Encoding classes [total 5]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct OverallSentiment label to an integer class id.
# The fitted encoder is kept so class ids can be mapped back to label names.
encoder = LabelEncoder()
# df['encoded_sentiment'] = encoder.fit_transform(df['Sentiment'])
encoder.fit(survey_df['OverallSentiment'])
survey_df['encoded_sentiment'] = encoder.transform(survey_df['OverallSentiment'])

In [None]:
# print(survey_df.head(),df.head())

In [None]:
# df['OriginalTweet'] = df['OriginalTweet'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))

In [None]:
from sklearn.model_selection import train_test_split

# xtrain, xval, ytrain, yval = train_test_split(df['OriginalTweet'], df['encoded_sentiment'], test_size = 0.2)
# Hold out 20% of the survey rows for validation.
# random_state is fixed here because the notebook only seeds random/numpy/torch
# further down (seed_val = 17), i.e. after this split has already been drawn;
# without it every run would train and validate on different rows.
s_xtrain, s_xval, s_ytrain, s_yval = train_test_split(
    survey_df['CleanText'],
    survey_df['encoded_sentiment'],
    test_size=0.2,
    random_state=17,
)


In [None]:
# print(type(xtrain[7]),type(s_xtrain[7]))
# print(type(s_xtrain), type(xtrain))
# print(xtrain[7])


In [None]:
# Sanity check: record which Python types occur in the training texts and
# print any plain-float entries so they can be inspected.
types = {}
for sample in s_xtrain:
    sample_type = type(sample)
    types[sample_type] = 1
    if sample_type is float:
        print(sample)
print(types)

In [None]:
# Pretrained uncased BERT tokenizer; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

## Encoding Words to Vectors

In [None]:
# 98th percentile of the CleanText length, measured in characters.
# NOTE(review): this value is later passed as the tokenizer's max_length, which
# counts tokens rather than characters — confirm this is the intended bound.
text_lengths = survey_df['CleanText'].str.len()
max_str = text_lengths.quantile(0.98)
print(max_str, type(max_str), int(max_str))

In [None]:
def _encode_texts(texts):
    """Tokenize ``texts`` into fixed-length PyTorch tensors for BERT.

    Every sequence is padded or truncated to the same length so the batches
    can be stacked. The length is ``int(max_str)`` capped at 512, because
    bert-base-uncased cannot embed positions beyond 512 tokens.

    Returns the tokenizer's batch encoding (contains 'input_ids' and
    'attention_mask').
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        # `pad_to_max_length=True` is deprecated; these two arguments are the
        # explicit replacement and also make the truncation intentional.
        padding='max_length',
        truncation=True,
        max_length=min(int(max_str), 512),
        return_tensors='pt',
    )


s_encoded_data_train = _encode_texts(s_xtrain)
s_encoded_data_val = _encode_texts(s_xval)

# encoded_data_train = tokenizer.batch_encode_plus(
#     xtrain, 
#     add_special_tokens=True, 
#     return_attention_mask=True, 
#     pad_to_max_length=True, 
#     max_length=50, 
#     return_tensors='pt'
# )

# encoded_data_val = tokenizer.batch_encode_plus(
#     xval, 
#     add_special_tokens=True, 
#     return_attention_mask=True, 
#     pad_to_max_length=True, 
#     max_length=50, 
#     return_tensors='pt'
# )

## Extracting inputs and attention masks out of encoded data

In [None]:
# input_ids_train = encoded_data_train['input_ids']
# attention_masks_train = encoded_data_train['attention_mask']
# labels_train = torch.tensor(ytrain.values)

# Training split: token ids and attention masks from the encoding, plus the
# integer class labels as a tensor.
s_input_ids_train, s_attention_masks_train = (
    s_encoded_data_train[field] for field in ('input_ids', 'attention_mask')
)
s_labels_train = torch.tensor(s_ytrain.values)

# input_ids_val = encoded_data_val['input_ids']
# attention_masks_val = encoded_data_val['attention_mask']
# labels_val = torch.tensor(yval.values)

# Validation split, built the same way.
s_input_ids_val, s_attention_masks_val = (
    s_encoded_data_val[field] for field in ('input_ids', 'attention_mask')
)
s_labels_val = torch.tensor(s_yval.values)


# Pytorch TensorDataset Instance
# dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
# dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# Each dataset item is an (input_ids, attention_mask, label) triple.
s_dataset_train = TensorDataset(
    s_input_ids_train,
    s_attention_masks_train,
    s_labels_train,
)
s_dataset_val = TensorDataset(
    s_input_ids_val,
    s_attention_masks_val,
    s_labels_val,
)

In [None]:
# initializing the model

# model = transformers.BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                       num_labels=5,
#                                                       output_attentions=False,
#                                                       output_hidden_states=False)
# Pretrained BERT with a freshly initialised classification head.
# The head size is taken from the fitted label encoder instead of a hard-coded
# 4, so it always matches the number of distinct labels actually present in
# survey_df['OverallSentiment'].
s_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(encoder.classes_),
    output_attentions=False,
    output_hidden_states=False,
)

## Implementing Dataloaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# dataloader_train = DataLoader(dataset_train, 
#                               sampler=RandomSampler(dataset_train), 
#                               batch_size=128)

# dataloader_validation = DataLoader(dataset_val, 
#                                    sampler=SequentialSampler(dataset_val), 
#                                    batch_size=128)

# Training batches are drawn in random order, 8 examples at a time.
s_train_sampler = RandomSampler(s_dataset_train)
s_dataloader_train = DataLoader(
    s_dataset_train,
    sampler=s_train_sampler,
    batch_size=8,
)

# Validation batches keep the dataset order, 8 examples at a time.
s_val_sampler = SequentialSampler(s_dataset_val)
s_dataloader_validation = DataLoader(
    s_dataset_val,
    sampler=s_val_sampler,
    batch_size=8,
)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# optimizer = AdamW(model.parameters(),
#                   lr=1e-5, 
#                   eps=1e-8)
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. weight_decay=0.0 is passed explicitly because
# the transformers implementation defaulted to 0.0 while torch defaults to 0.01.
s_optimizer = torch.optim.AdamW(
    s_model.parameters(),
    lr=1e-5,
    eps=1e-8,
    weight_decay=0.0,
)

epochs = 5

# scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                             num_warmup_steps=0,
#                                             num_training_steps=len(dataloader_train)*epochs)

# Linear decay of the learning rate to 0 with no warm-up, sized for exactly
# `epochs` passes over the training loader (one scheduler step per batch).
# Training for more steps than this requires building a new scheduler.
s_scheduler = get_linear_schedule_with_warmup(
    s_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(s_dataloader_train) * epochs,
)

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of arg-max predictions against ``labels``.

    preds:  array of per-class scores; the predicted class of each row is the
            arg-max along axis 1.
    labels: array of true class ids; flattened before scoring.
    """
    return f1_score(
        labels.flatten(),
        np.argmax(preds, axis=1).flatten(),
        average='weighted',
    )

In [None]:
import random

# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices).
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Train on the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Training

In [None]:
# model.to(device)

# for epoch in tqdm(range(1, epochs+1)):
    
#     model.train()
    
#     loss_train_total = 0

#     progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
#     for batch in progress_bar:

#         model.zero_grad()
        
#         batch = tuple(b.to(device) for b in batch)
        
#         inputs = {'input_ids':      batch[0].to(device),
#                   'attention_mask': batch[1].to(device),
#                   'labels':         batch[2].to(device),
#                  }       

#         outputs = model(**inputs)
        
#         loss = outputs[0]
#         loss_train_total += loss.item()
#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

#         optimizer.step()
#         scheduler.step()
        
#         progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
        
#     tqdm.write(f'\nEpoch {epoch}')
    
#     loss_train_avg = loss_train_total/len(dataloader_train)            
#     tqdm.write(f'Training loss: {loss_train_avg}')

In [None]:
# Fine-tune s_model for `epochs` passes over the training loader.
s_model.to(device)

for epoch in tqdm(range(1, epochs+1)):

    s_model.train()

    # Sum of the per-batch losses, used for the epoch average below.
    loss_train_total = 0

    progress_bar = tqdm(s_dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        s_model.zero_grad()

        # Move the (input_ids, attention_mask, labels) tensors to the device once.
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = s_model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(s_model.parameters(), 1.0)

        s_optimizer.step()
        s_scheduler.step()

        # Show the batch loss as-is: `batch` is a 3-tuple of tensors, so the
        # previous division by len(batch) divided by 3, not by a batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(s_dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

In [None]:
# def evaluate(dataloader_val):

#     model.eval()
    
#     loss_val_total = 0
#     predictions, true_vals = [], []
    
#     for batch in dataloader_val:
        
#         batch = tuple(b.to(device) for b in batch)
        
#         inputs = {'input_ids':      batch[0],
#                   'attention_mask': batch[1],
#                   'labels':         batch[2],
#                  }

#         with torch.no_grad():        
#             outputs = model(**inputs)
            
#         loss = outputs[0]
#         logits = outputs[1]
#         loss_val_total += loss.item()

#         logits = logits.detach().cpu().numpy()
#         label_ids = inputs['labels'].cpu().numpy()
#         predictions.append(logits)
#         true_vals.append(label_ids)
    
#     loss_val_avg = loss_val_total/len(dataloader_val) 
    
#     predictions = np.concatenate(predictions, axis=0)
#     true_vals = np.concatenate(true_vals, axis=0)
            
#     return loss_val_avg, predictions, true_vals

In [None]:
def s_evaluate(dataloader_val):
    """Run ``s_model`` over ``dataloader_val`` without gradient tracking.

    Each batch is expected to provide input ids, attention mask and labels in
    positions 0, 1 and 2. The model is switched to eval mode and tensors are
    moved to the global ``device``.

    Returns a tuple ``(loss_val_avg, predictions, true_vals)``:
      * mean of the per-batch losses,
      * logits of all batches concatenated along axis 0 (NumPy array),
      * labels of all batches concatenated along axis 0 (NumPy array).
    """
    s_model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in batch)
        model_kwargs = dict(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=on_device[2],
        )

        with torch.no_grad():
            outputs = s_model(**model_kwargs)

        # With labels supplied, outputs[0] is the loss and outputs[1] the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(model_kwargs['labels'].cpu().numpy())

    return (
        running_loss / len(dataloader_val),
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )

In [None]:
# val_loss, predictions, true_vals = evaluate(dataloader_validation)
# val_f1 = f1_score_func(predictions, true_vals)

# Evaluate s_model on the validation loader: average loss, logits and true labels.
s_val_loss, s_predictions, s_true_vals = s_evaluate(s_dataloader_validation)
# Weighted F1 of the arg-max predictions against the true labels.
s_val_f1 = f1_score_func(s_predictions, s_true_vals)

In [None]:
# print('Val Loss = ', val_loss)
# print('Val F1 = ', val_f1)

# Report the validation loss and weighted F1 computed in the previous cell.
print('Val Loss = ', s_val_loss)
print('Val F1 = ', s_val_f1)

In [None]:
# Continue fine-tuning for up to 100 epochs, stopping as soon as the
# validation loss gets worse than in the previous epoch.
epochs = 100
s_model.to(device)

# The scheduler created earlier was sized for the initial 5-epoch run; once
# that run has finished its linear decay has reached a learning rate of 0 and
# further optimizer steps would not change the model. Build a new schedule
# covering this phase so training actually continues.
s_scheduler = get_linear_schedule_with_warmup(
    s_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(s_dataloader_train) * epochs,
)

# Validation loss of the previous epoch. Start at +inf (instead of an arbitrary
# 1) so the first epoch can never trigger the stop on its own.
temp_val_loss = float('inf')

for epoch in tqdm(range(1, epochs+1)):

    s_model.train()

    # Sum of the per-batch losses, used for the epoch average below.
    loss_train_total = 0

    progress_bar = tqdm(s_dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        s_model.zero_grad()

        # Move the (input_ids, attention_mask, labels) tensors to the device once.
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = s_model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(s_model.parameters(), 1.0)

        s_optimizer.step()
        s_scheduler.step()

        # Show the batch loss as-is: `batch` is a 3-tuple of tensors, so the
        # previous division by len(batch) divided by 3, not by a batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(s_dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    s_val_loss, s_predictions, s_true_vals = s_evaluate(s_dataloader_validation)
    s_val_f1 = f1_score_func(s_predictions, s_true_vals)

    print(temp_val_loss,s_val_loss)
    # Stop at the first epoch whose validation loss is worse than the previous one.
    # NOTE(review): the weights of the better, previous epoch are not restored.
    if s_val_loss > temp_val_loss:
        break
    temp_val_loss = s_val_loss

In [None]:
# Report the validation loss and weighted F1 from the last evaluated epoch.
print('Val Loss = ', s_val_loss)
print('Val F1 = ', s_val_f1)

In [None]:
# encoded_classes = encoder.classes_
# predicted_category = [encoded_classes[np.argmax(x)] for x in predictions]
# true_category = [encoded_classes[x] for x in true_vals]

# Translate class ids back to the original label names via the fitted encoder.
s_encoded_classes = encoder.classes_
# Predicted class of each row = position of its largest logit.
s_predicted_category = list(s_encoded_classes[np.argmax(s_predictions, axis=1)])
s_true_category = list(s_encoded_classes[s_true_vals])
print(s_encoded_classes)


In [None]:
# x = 0
# for i in range(len(true_category)):
#     if true_category[i] == predicted_category[i]:
#         x += 1
        
# print('Accuracy Score = ', x / len(true_category))

# Count the validation examples whose predicted label equals the true label.
x = sum(
    1
    for actual, predicted in zip(s_true_category, s_predicted_category)
    if actual == predicted
)

print('s_Accuracy Score = ', x / len(s_true_category))

In [None]:
from sklearn.metrics import confusion_matrix
# confusion_mat = confusion_matrix(y_true = true_category, y_pred = predicted_category, labels=list(encoded_classes))

# Confusion matrix over the label names, with rows/columns in encoder order.
s_confusion_mat = confusion_matrix(
    s_true_category,
    s_predicted_category,
    labels=list(s_encoded_classes),
)

In [None]:
# Print each validation example's true label next to its predicted label.
for actual, predicted in zip(s_true_category, s_predicted_category):
    print(actual, predicted)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# df = pd.DataFrame(confusion_mat, index = list(encoded_classes),columns = list(encoded_classes))
# sns.heatmap(df)

# Label the confusion matrix with the class names: rows are the true labels,
# columns the predicted labels.
s_df = pd.DataFrame(
    s_confusion_mat,
    index=list(s_encoded_classes),
    columns=list(s_encoded_classes),
)

# Annotate every cell with its count and label the axes so the figure can be
# read on its own.
ax = sns.heatmap(s_df, annot=True, fmt='d')
ax.set(
    xlabel='Predicted sentiment',
    ylabel='True sentiment',
    title='Validation confusion matrix',
);