### Installation

In [None]:
# Environment setup: transformers and torch are imported below; pycuda is
# installed here but is not imported anywhere in the visible notebook.
!pip install transformers
!pip install torch
!pip install pycuda

### Imports

In [None]:
#Importing the necessary libraries
import emoji
from emot.emo_unicode import EMOTICONS
from google.colab import drive
import pandas as pd
import numpy
import re
import seaborn as sns
import torch
import time
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from transformers import DistilBertModel, DistilBertTokenizerFast
from torch import cuda

# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
#Mount Google Drive at /content/drive so the files stored there can be read and written
drive.mount('/content/drive')

### Data

In [None]:
#Load the training split and replace each string label with its integer class id
train_label_ids = {'unknown_state':0,'Negative':1,'not-Tamil':2,'Positive':3,'Mixed_feelings':4}
train = pd.read_csv(
    '/content/drive/My Drive/Sentiment Analysis Fire/preprocessing/train/transliterated_train.csv',
    names=['category','text'],
)
# dict.get returns None for any label that is missing from the mapping.
train['category'] = train['category'].apply(train_label_ids.get)
train.head(6)

In [None]:
#Label name -> class id mapping (the same one applied to the 'category' column of train and val).
#It is defined here because no earlier cell creates `encode_dict`, so printing it raised NameError.
encode_dict = {'unknown_state':0,'Negative':1,'not-Tamil':2,'Positive':3,'Mixed_feelings':4}
print(encode_dict)

In [None]:
#Class distribution of the training split (columns reordered to text, category first)
train_columns = ['text', 'category']
train = train[train_columns]
sns.countplot(data=train, x='category')

In [None]:
#Load the validation split and replace each string label with its integer class id
val_label_ids = {'unknown_state':0,'Negative':1,'not-Tamil':2,'Positive':3,'Mixed_feelings':4}
val = pd.read_csv(
    '/content/drive/My Drive/Sentiment Analysis Fire/preprocessing/dev/transliterated_val.csv',
    names=['category','text'],
)
# dict.get returns None for any label that is missing from the mapping.
val['category'] = val['category'].apply(val_label_ids.get)
val.head(6)

In [None]:
#Class distribution of the validation split (columns reordered to text, category first)
val_columns = ['text', 'category']
val = val[val_columns]
sns.countplot(data=val, x='category')

In [None]:
#Load the unlabeled test split; its only named column is 'text'
test_csv_path = '/content/drive/My Drive/Sentiment Analysis Fire/preprocessing/test/transliterated_test.csv'
test = pd.read_csv(test_csv_path, names=['text'])
test.head(9)

In [None]:
# Show one test sentence (entry 5 of the 'text' column) as a sanity check.
sample_sentence = test.text[5]
print(sample_sentence)

### Model

In [None]:
#Hyper-parameters and tokenizer shared by the dataset, dataloader and training cells

MAX_LEN = 512                              #max_length handed to the tokenizer by the dataset classes
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16   #same batch size for both loaders
EPOCHS = 10
LEARNING_RATE = 1e-5

#Name of the pretrained checkpoint; the tokenizer is loaded from the same name
distilbert_multilingual = 'distilbert-base-multilingual-cased'
tokenizer = DistilBertTokenizerFast.from_pretrained(distilbert_multilingual)

In [None]:
##Prepare the dataset
class SentimentDataset(Dataset):
  """Map-style dataset that tokenizes one labelled sentence per item.

  Args:
    dataframe: pandas DataFrame with a ``text`` column and a ``category``
      column holding integer class ids.
    tokenizer: object exposing ``encode_plus`` (for example a Hugging Face
      tokenizer).
    max_len: ``max_length`` passed to the tokenizer; sentences are padded
      and truncated to it.
  """

  def __init__(self,dataframe,tokenizer,max_len):
    self.len = len(dataframe)
    self.data = dataframe
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __getitem__(self,index):
    """Return the encoded sentence and its label for position ``index``.

    Returns:
      dict with ``'ids'`` and ``'mask'`` (flattened tokenizer outputs) and
      ``'targets'`` (0-d ``torch.long`` tensor with the class id).

    Raises:
      IndexError: if ``index`` is outside ``range(len(self))``.
    """
    # Positional lookup: ``self.data.text[index]`` would look the row up by
    # index *label*, which fails as soon as the frame's index is not
    # 0..n-1 (for example after filtering or sampling rows).
    sentence = str(self.data['text'].iloc[index])
    # Collapse every run of whitespace into a single space.
    sentence = " ".join(sentence.split())
    encoding = self.tokenizer.encode_plus(
        sentence,
        add_special_tokens = True,
        max_length = self.max_len,
        padding = 'max_length',
        return_token_type_ids = False,
        return_tensors = 'pt',
        truncation = True
    )
    return {
        'ids' : encoding['input_ids'].flatten(),
        'mask': encoding['attention_mask'].flatten(),
        'targets': torch.tensor(self.data['category'].iloc[index],dtype=torch.long)
    }

  def __len__(self):
    """Number of rows in the wrapped DataFrame at construction time."""
    return self.len

In [None]:
#Create the datasets that feed the training and validation dataloaders
#(each label now matches the frame whose shape it prints)
print('Train dataset: {}'.format(train.shape))
print('Validation dataset: {}'.format(val.shape))

training_set = SentimentDataset(train,tokenizer,MAX_LEN)
validating_set = SentimentDataset(val,tokenizer,MAX_LEN)

In [None]:
# Training batches are reshuffled every epoch so the optimizer does not see
# the examples in the same file order on every pass.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Validation keeps file order and uses its own batch-size constant.
val_params = {'batch_size': VALID_BATCH_SIZE,
              'shuffle': False,
              'num_workers': 0
              }

training_loader = DataLoader(training_set, **train_params)
# The validation loader previously reused train_params, leaving val_params unused.
validating_loader = DataLoader(validating_set, **val_params)

In [None]:
# Fine-Tuning DistilBERT by adding a dropout and a dense layer on top of it to get the final output

class DistillBERTClass(torch.nn.Module):
    """Pretrained DistilBERT backbone followed by a small classification head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 5) and
    is applied to position 0 of the backbone's first output (presumably the
    hidden state of the leading special token).
    """

    def __init__(self):
        super().__init__()
        # Pretrained backbone.
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")

        # NOTE: forward() never calls these two layers. They are still
        # registered, so they show up in parameters() and in state_dict().
        self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(256*2, 5)

        # Classification head; the last layer emits 5 class scores.
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask):
        """Return one row of 5 unnormalised class scores per input sequence."""
        backbone_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        first_position = backbone_out[0][:, 0]
        activated = torch.relu(self.pre_classifier(first_position))
        return self.classifier(self.dropout(activated))

In [None]:
# Build the classifier, move its weights to the selected device, and show its layout.
model = DistillBERTClass()
model = model.to(device)
model

### Training

In [None]:
#Cross-entropy loss (placed on the training device) and the Adam optimizer over every model parameter
loss_function = nn.CrossEntropyLoss()
loss_function = loss_function.to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(big_idx, targets):
    """Count the positions where the predicted class id equals the target.

    Despite the name this is a raw count of correct predictions, not a ratio.
    """
    matches = torch.eq(big_idx, targets)
    return matches.sum().item()

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    # Seconds left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return (whole_minutes, leftover_seconds)

In [None]:
# Defining the training function for tuning the distilbert model

def train(epoch):
  """Run one optimisation pass over ``training_loader``, updating ``model`` in place.

  Uses the notebook-level ``model``, ``training_loader``, ``loss_function``,
  ``optimizer`` and ``device``.

  Args:
    epoch: zero-based epoch index; every printed line reports it one-based.

  Returns:
    ``(epoch_loss, epoch_accu)``: mean per-batch loss and accuracy in percent.
    Both are ``0.0`` when the loader yields no batches (this used to raise
    ZeroDivisionError).
  """
  tr_loss = 0
  n_correct = 0
  nb_tr_steps = 0
  nb_tr_examples = 0
  model.train()
  start_time = time.time()
  for data in training_loader:
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)

    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    outputs = model(ids, mask)
    loss = loss_function(outputs, targets)
    loss.backward()
    optimizer.step()

    tr_loss += loss.item()
    # detach() replaces the legacy ``.data`` attribute for reading values without autograd.
    _, big_idx = torch.max(outputs.detach(), dim=1)
    n_correct += calcuate_accuracy(big_idx, targets)
    nb_tr_steps += 1
    nb_tr_examples += targets.size(0)

  epoch_loss = tr_loss/nb_tr_steps if nb_tr_steps else 0.0
  epoch_accu = (n_correct*100)/nb_tr_examples if nb_tr_examples else 0.0
  # The epoch number is printed one-based here as well, matching the timing line below.
  print(f'The Total Accuracy for Epoch {epoch+1}: {epoch_accu}')
  print(f"Training Loss Epoch: {epoch_loss}")
  print(f"Training Accuracy Epoch: {epoch_accu}")
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')

  return epoch_loss, epoch_accu

In [None]:
# Call train() once per epoch for EPOCHS epochs; the bare print() puts a blank
# line between consecutive per-epoch reports.
for epoch in range(EPOCHS):
  train(epoch)
  print() 

In [None]:
#Persist the fine-tuned weights (state_dict only) to Google Drive
checkpoint_path = "/content/drive/My Drive/Sentiment Analysis Fire/models/preprocessed+translation+transliterated/distilmBERT-cased.bin"
torch.save(model.state_dict(), checkpoint_path)

In [None]:
 def valid(model,testing_loader):
   """Evaluate ``model`` on ``testing_loader`` without computing gradients.

   Uses the notebook-level ``device``, ``loss_function`` and
   ``calcuate_accuracy``.

   Args:
     model: module called as ``model(ids, mask)`` that returns one row of
       class scores per example.
     testing_loader: iterable of batches, each a dict with ``'ids'``,
       ``'mask'`` and ``'targets'`` tensors.

   Returns:
     Accuracy in percent over all examples seen, or ``0.0`` when the loader
     yields no batches (this used to raise ZeroDivisionError).
   """
   model.eval()
   n_correct = 0
   tr_loss = 0
   nb_tr_steps = 0
   nb_tr_examples = 0
   with torch.no_grad():
     for data in testing_loader:
       ids = data['ids'].to(device,dtype = torch.long)
       mask = data['mask'].to(device,dtype = torch.long)
       targets = data['targets'].to(device,dtype=torch.long)
       # No squeeze(): on a final batch of size 1 it collapsed the scores to
       # 1-D, which breaks both the loss call and max(dim=1).
       outputs = model(ids,mask)
       loss = loss_function(outputs,targets)
       tr_loss += loss.item()
       _, big_idx = torch.max(outputs,dim=1)
       n_correct += calcuate_accuracy(big_idx,targets)
       nb_tr_steps += 1
       nb_tr_examples += targets.size(0)

   epoch_loss = tr_loss/nb_tr_steps if nb_tr_steps else 0.0
   epoch_accuracy = (n_correct*100)/nb_tr_examples if nb_tr_examples else 0.0
   print(f"Validation Loss Epoch:{epoch_loss}")
   print(f"Validation Accuracy Epoch:{epoch_accuracy}")

   return epoch_accuracy
      

In [None]:
# The loader evaluated here is the validation split, so the message says so
# (it used to claim "test data").
acc = valid(model, validating_loader)
print("Accuracy on validation data = %0.2f%%" % acc)

### Prediction

In [None]:
#Dataset for the unlabeled test data
class SentimentDatasetTest(Dataset):
  """Map-style dataset that tokenizes one unlabeled sentence per item.

  Args:
    dataframe: pandas DataFrame with a ``text`` column.
    tokenizer: object exposing ``encode_plus`` (for example a Hugging Face
      tokenizer).
    max_len: ``max_length`` passed to the tokenizer; sentences are padded
      and truncated to it.
  """

  def __init__(self,dataframe,tokenizer,max_len):
    self.len = len(dataframe)
    self.data = dataframe
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __getitem__(self,index):
    """Return ``{'ids', 'mask'}`` for the sentence at position ``index``.

    Raises:
      IndexError: if ``index`` is outside ``range(len(self))``.
    """
    # Positional lookup: ``self.data.text[index]`` would look the row up by
    # index *label*, which fails whenever the frame's index is not 0..n-1.
    sentence = str(self.data['text'].iloc[index])
    # Collapse every run of whitespace into a single space.
    sentence = " ".join(sentence.split())
    encoding = self.tokenizer.encode_plus(
        sentence,
        add_special_tokens = True,
        max_length = self.max_len,
        padding = 'max_length',
        return_token_type_ids = False,
        return_tensors = 'pt',
        truncation = True
    )
    return {
        'ids' : encoding['input_ids'].flatten(),
        'mask': encoding['attention_mask'].flatten()
    }

  def __len__(self):
    """Number of rows in the wrapped DataFrame at construction time."""
    return self.len

In [None]:
# Wrap the test frame and batch it in file order (no shuffling) for inference.
testing_set = SentimentDatasetTest(test,tokenizer,MAX_LEN)

test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
def get_predictions(model, data_loader):
  """Run ``model`` over ``data_loader`` and collect its class predictions.

  Args:
    model: torch module called as ``model(input_ids=..., attention_mask=...)``
      that returns one row of class scores per example.
    data_loader: iterable of batches, each a dict with ``'ids'`` and
      ``'mask'`` tensors. A ``'targets'`` entry is not required; reading it
      raised KeyError for the unlabeled test set.

  Returns:
    ``(sentence, predictions, prediction_probs)``:
    ``sentence`` is always an empty list (batches carry no raw text) and is
    kept only so existing three-way unpacking keeps working;
    ``predictions`` is a 1-D CPU tensor of predicted class ids;
    ``prediction_probs`` is a CPU tensor with one row of softmax
    probabilities per example. Both tensors are empty when the loader
    yields no batches.
  """
  model = model.eval()
  # Send each batch to wherever the model's weights live (CPU if it has none).
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device('cpu')

  sentence = []
  pred_batches = []
  prob_batches = []
  with torch.no_grad():
    for d in data_loader:
      ids = d["ids"].to(run_device)
      mask = d["mask"].to(run_device)
      outputs = model(
        input_ids=ids,
        attention_mask=mask
      )
      _, preds = torch.max(outputs, dim=1)
      pred_batches.append(preds)
      # Normalise the raw scores so the returned values really are probabilities.
      prob_batches.append(torch.softmax(outputs, dim=1))

  if not pred_batches:
    # torch.cat / torch.stack reject an empty list, so return empty tensors directly.
    return sentence, torch.empty(0, dtype=torch.long), torch.empty((0, 0))

  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(prob_batches).cpu()
  return sentence, predictions, prediction_probs

In [None]:
# Run inference over the whole test loader and unpack the three returned values.
y_review_texts, y_pred, y_pred_probs = get_predictions(model, testing_loader)

### Write to csv

In [None]:
# Translate each predicted class id back to its label name; list position == class id.
class_name = ['unknown_state','Negative','not-Tamil','Positive','Mixed_feelings']
y_prediction = y_pred
arry = [class_name[class_id] for class_id in y_prediction.tolist()]
print(arry)

In [None]:
# One predicted label per row, written without an index column or header line.
a = numpy.array(arry)
# Keep the DataFrame itself in the variable: to_csv() returns None when given
# a path, so the old single-line form left `test_labels_dataframe` set to None.
test_labels_dataframe = pd.DataFrame(a)
test_labels_dataframe.to_csv("/content/drive/My Drive/Sentiment Analysis Fire/output/preprocessed+translation+transliterated/distilmBERT-cased.csv", index=False, header=False)