# Data Pre-Processing

## Installing and Importing the required libraries

In [22]:
!pip install pytreebank



In [23]:
!pip install numpy git+https://github.com/makcedward/nlpaug.git

Collecting git+https://github.com/makcedward/nlpaug.git
  Cloning https://github.com/makcedward/nlpaug.git to /tmp/pip-req-build-3p9j899t
  Running command git clone -q https://github.com/makcedward/nlpaug.git /tmp/pip-req-build-3p9j899t
Building wheels for collected packages: nlpaug
  Building wheel for nlpaug (setup.py) ... [?25l[?25hdone
  Created wheel for nlpaug: filename=nlpaug-1.1.3-cp37-none-any.whl size=837629 sha256=f328f764403df0c116888e580d6a5538a5999aca6667a51554264ab3afe419d0
  Stored in directory: /tmp/pip-ephem-wheel-cache-bdzdhtvx/wheels/2b/ef/30/a4e22f9a97373c9ab6763670c94aa5e111b0b956983f3892a4
Successfully built nlpaug


In [24]:
! pip install googletrans==3.1.0a0



In [25]:
import pytreebank
import pandas as pd
import random
import nlpaug.augmenter.word as naw
import googletrans

## Load Stanford Sentiment Analysis from the "pytreebank" library

In [26]:
dataset = pytreebank.load_sst()

In [27]:
[dataset.keys()]

[dict_keys(['train', 'test', 'dev'])]

In [28]:
len(dataset['train']), len(dataset['test']), len(dataset['dev'])

(8544, 2210, 1101)

## Prepare the Train, Validation and Test Dataset splits

In [29]:
train_df = pd.DataFrame({'sentence' : [dataset['train'][i].to_labeled_lines()[0][1] for i in range(len(dataset['train']))], 'labels' : [dataset['train'][i].to_labeled_lines()[0][0] for i in range(len(dataset['train']))]})
test_df = pd.DataFrame({'sentence' : [dataset['test'][i].to_labeled_lines()[0][1] for i in range(len(dataset['test']))], 'labels' : [dataset['test'][i].to_labeled_lines()[0][0] for i in range(len(dataset['test']))]})
val_df = pd.DataFrame({'sentence' : [dataset['dev'][i].to_labeled_lines()[0][1] for i in range(len(dataset['dev']))], 'labels' : [dataset['dev'][i].to_labeled_lines()[0][0] for i in range(len(dataset['dev']))]})
df = pd.concat([train_df, test_df, val_df]).reset_index(drop=True)
df = df.sample(frac=1).reset_index(drop=True)


In [30]:
df.shape

(11855, 2)

In [31]:
df.labels.value_counts()

1    3140
3    3111
2    2242
4    1852
0    1510
Name: labels, dtype: int64

In [32]:
# Dataset split to train and validation. Percent of data to be included for training set. Rest will go to validation set
train_split_pct = 70

In [33]:
df_train = df[:(train_split_pct*len(df))//100]
df_val = df[len(df_train):].reset_index(drop=True)

In [34]:
len(df_train), len(df_val)

(8298, 3557)

## Augment the Train Dataset

In [35]:
# Percentage of training data to be augmented using random augmentations
train_aug_pct = 5

In [36]:
indexes = random.choices(list(range(len(df_train))), k = (len(df_train)* train_aug_pct)//100)                         
sentences = [df_train['sentence'][i] for i in indexes]
augmented_sentences = []

### Random Augmentations from EDA

In [37]:
# random augmentations to be done and the percentage of words in the corresponding sentence to be augmented
augmentations = ['swap','delete']
pct = [1, 1]
aug_dict = dict(zip(augmentations, pct))

# Fraction of the percentage
frac_set = 3

In [38]:
aug_del = naw.RandomWordAug(action = augmentations[0], aug_p = aug_dict)
aug_swap = naw.RandomWordAug(action="swap", aug_p = 1)
aug_index_min, aug_index_max = 0, len(sentences)//frac_set
for augmentation in augmentations:
  aug = naw.RandomWordAug(action = augmentation, aug_p = aug_dict[augmentation])
  for i in range(aug_index_min, aug_index_max):  
    augmented_text = aug.augment(sentences[i])  
    augmented_sentences += [augmented_text]  
  aug_index_min = aug_index_max
  aug_index_max += min(len(sentences)-aug_index_max, len(sentences)//frac_set)
  


### Backtranslation from Google Translate

In [39]:
if aug_index_min < len(sentences):
  translator = googletrans.Translator()
  available_langs = list(googletrans.LANGUAGES.keys()) 
  trans_lang = random.choice(available_langs) 
  print(f"Translating to {googletrans.LANGUAGES[trans_lang]}")

  translations = translator.translate(sentences[aug_index_min:], dest=trans_lang) 
  t_text = [t.text for t in translations]

  translations_en_random = translator.translate(t_text, src=trans_lang, dest='en') 
  augmented_sentences += [t.text for t in translations_en_random]

Translating to serbian


In [47]:
df_train_aug = pd.DataFrame({'sentence': augmented_sentences, 'labels': [df_train['labels'][i] for i in indexes]})
df_aug = pd.concat([df_train, df_train_aug]).reset_index(drop=True)
df_aug = df_aug.sample(frac=1).reset_index(drop=True)

In [48]:
len(df_aug)

8712

# Data Flow Utilities

## Import the dependencies and configure reproducibility

In [49]:
import torch, torchtext
from torchtext import data

In [50]:
# Manual Seed
SEED = 43
torch.manual_seed(SEED)
CUDA_LAUNCH_BLOCKING=1

## Prepare the Train and Validation Datasets using the "torchtext" library

In [51]:
df_train = df_aug
df_val = df_val

In [52]:
df_val = df_val.reset_index(drop=True)

In [53]:
Sentence = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = torchtext.legacy.data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [54]:
fields = [('sentence', Sentence), ('label', Label)]

In [55]:
example_train = [torchtext.legacy.data.Example.fromlist([df_train.sentence[i],df_train.labels[i]], fields) for i in range(df_train.shape[0])] 
example_val = [torchtext.legacy.data.Example.fromlist([df_val.sentence[i],df_val.labels[i]], fields) for i in range(df_val.shape[0])] 



In [56]:
train = torchtext.legacy.data.Dataset(example_train, fields)
valid = torchtext.legacy.data.Dataset(example_val, fields)

In [57]:
len(train), len(valid)

(8712, 3557)

In [58]:
df_train.head()

Unnamed: 0,sentence,labels
0,"Donovan ... squanders his main asset , Jackie ...",1
1,Quelle surprise !,2
2,Seeks to transcend its genre with a curiously ...,3
3,"Metaphors abound , but it is easy to take this...",3
4,Pan Nalin 's exposition is beautiful and myste...,3


## Build the vocab for the Train Dataset

In [59]:
Sentence.build_vocab(train)
Label.build_vocab(train)

In [60]:
print('Size of input vocab : ', len(Sentence.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Sentence.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  16965
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 8227), (',', 7299), ('the', 6211), ('of', 4478), ('a', 4454), ('and', 4431), ('to', 3105), ('-', 2873), ('is', 2590), ("'s", 2515)]
Labels :  defaultdict(None, {3: 0, 1: 1, 2: 2, 4: 3, 0: 4})


## Setting the Device for Process Handling

In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Preparing DataLoaders for the Train and Validation Datasets

In [62]:
train_iterator, valid_iterator = torchtext.legacy.data.BucketIterator.splits((train, valid), batch_size = 16, 
                                                            sort_key = lambda x: len(x.sentence),
                                                            sort_within_batch=True, device = device)

In [63]:
next(iter(train_iterator))
#len(train.examples[11].tweet)


[torchtext.legacy.data.batch.Batch of size 16]
	[.sentence]:('[torch.cuda.LongTensor of size 16x26 (GPU 0)]', '[torch.cuda.LongTensor of size 16 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 16 (GPU 0)]

In [64]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Sentence.vocab.stoi, tokens)

# Building Model and defining Training and Validation Processes

## Model

In [65]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        #output = F.softmax(dense_outputs[0], dim=1)
            
        return dense_outputs[0]

In [66]:
# Define hyperparameters
size_of_vocab = len(Sentence.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 5
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [67]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(16965, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)
The model has 5,331,605 trainable parameters


In [68]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

## Training Process

In [69]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        # resets the gradients after every batch
        optimizer.zero_grad() 
        
        
        # retrieve text and no. of words
        sentence, sentence_lengths = batch.sentence  
        
        # convert to 1D tensor
        predictions = model(sentence, sentence_lengths).squeeze()  
        #print(predictions, batch.label)
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Validation Process

In [70]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            sentence, sentence_lengths = batch.sentence
            
            # convert to 1d tensor
            predictions = model(sentence, sentence_lengths).squeeze() 
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Main Process - Training and Validating for a certain number of epochs

In [71]:
N_EPOCHS = 50
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.586 | Train Acc: 25.45%
	 Val. Loss: 1.578 |  Val. Acc: 25.26% 

	Train Loss: 1.572 | Train Acc: 26.81%
	 Val. Loss: 1.574 |  Val. Acc: 25.99% 

	Train Loss: 1.570 | Train Acc: 27.11%
	 Val. Loss: 1.573 |  Val. Acc: 25.99% 

	Train Loss: 1.567 | Train Acc: 28.08%
	 Val. Loss: 1.572 |  Val. Acc: 26.55% 

	Train Loss: 1.565 | Train Acc: 28.66%
	 Val. Loss: 1.571 |  Val. Acc: 26.16% 

	Train Loss: 1.563 | Train Acc: 28.96%
	 Val. Loss: 1.571 |  Val. Acc: 26.41% 

	Train Loss: 1.561 | Train Acc: 29.53%
	 Val. Loss: 1.569 |  Val. Acc: 27.31% 

	Train Loss: 1.559 | Train Acc: 30.06%
	 Val. Loss: 1.568 |  Val. Acc: 27.56% 

	Train Loss: 1.557 | Train Acc: 30.39%
	 Val. Loss: 1.568 |  Val. Acc: 27.48% 

	Train Loss: 1.555 | Train Acc: 30.40%
	 Val. Loss: 1.566 |  Val. Acc: 27.81% 

	Train Loss: 1.552 | Train Acc: 31.36%
	 Val. Loss: 1.565 |  Val. Acc: 27.61% 

	Train Loss: 1.549 | Train Acc: 31.54%
	 Val. Loss: 1.564 |  Val. Acc: 27.58% 

	Train Loss: 1.546 | Train Acc: 31.61%
	

# Evaluating the Model on Random Inputs from the Validation Dataset

In [72]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_sentence(sentence):
    
    categories = {0: "Very Negative", 1:"Negative", 2:"Neutral", 3:"Positive", 4:"Very Positive"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [78]:
# random sentences from validation set for the model to be tested upon
num_val = 10

In [82]:
categories = {0: "Very Negative", 1:"Negative", 2:"Neutral", 3:"Positive", 4:"Very Positive"}
evaluation_sentence_indices = random.sample(range(0, len(df_val)), num_val)
for i in range(num_val):
  print(f"Actual Sentence : {df_val['sentence'][evaluation_sentence_indices[i]]} \nActual Sentiment: {categories[df_val['labels'][evaluation_sentence_indices[i]]]}")
  print(f"Predicted Sentiment: {classify_sentence(df_val['sentence'][evaluation_sentence_indices[i]])}")
  print("----------------------------------------")

Actual Sentence : The setting turns out to be more interesting than any of the character dramas , which never reach satisfying conclusions . 
Actual Sentiment: Negative
Predicted Sentiment: Negative
----------------------------------------
Actual Sentence : A culture clash comedy only half as clever as it thinks it is . 
Actual Sentiment: Negative
Predicted Sentiment: Negative
----------------------------------------
Actual Sentence : Formula 51 has dulled your senses faster and deeper than any recreational drug on the market . 
Actual Sentiment: Neutral
Predicted Sentiment: Negative
----------------------------------------
Actual Sentence : The movie , while beautiful , feels labored , with a hint of the writing exercise about it . 
Actual Sentiment: Neutral
Predicted Sentiment: Very Negative
----------------------------------------
Actual Sentence : This is art paying homage to art . 
Actual Sentiment: Positive
Predicted Sentiment: Very Negative
--------------------------------------