<a href="https://colab.research.google.com/github/rishubhkhurana/nlp/blob/main/S7/SentimentFromBasics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bookkeeping

In [1]:
#!pip install GPUtil
#!pip install google-trans-new
#!wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
#!unzip stanfordSentimentTreebank.zip

Collecting GPUtil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7412 sha256=2e22faf802b78ef5b7ba89220f1141bf0010cd81aa36eb936815fa292a837ef5
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0


## Importing Libraries

In [7]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import torchtext
from torchtext.data import Field, LabelField, Example
from torchtext.data import BucketIterator, Dataset
import random
import matplotlib.pyplot as plt
import google_trans_new
from google_trans_new import google_translator

In [9]:
import sys,os
sys.path.append('/content/drive/MyDrive')

In [10]:
#from Text import *
#from Text.text import *

## Set Seeds

In [11]:
# Seed every random number generator the notebook draws from so that a
# Restart & Run All reproduces the same augmentations and initial weights.
SEED = 1234
random.seed(SEED)      # stdlib RNG: used by random_deletion / random_swap
np.random.seed(SEED)
torch.manual_seed(SEED)
# `torch.set_deterministic` was renamed to `torch.use_deterministic_algorithms`
# in later PyTorch releases; call whichever one this install provides.
if hasattr(torch, 'use_deterministic_algorithms'):
    torch.use_deterministic_algorithms(True)
else:
    torch.set_deterministic(True)

## Reading Data

In [12]:
# dictionary.txt is read without a header row as '|'-separated columns:
# column 0 becomes 'phrase', column 1 becomes 'id' (rows ordered by id).
phrases_data = pd.read_csv(
    'stanfordSentimentTreebank/dictionary.txt', sep='|', header=None
)
phrases_data = phrases_data.sort_values(by=1).rename(columns={0:'phrase',1:'id'})

# sentiment_labels.txt carries its own header; shorten both column names.
label_data = pd.read_csv('stanfordSentimentTreebank/sentiment_labels.txt', sep='|')
label_data = label_data.rename(columns={'phrase ids':'id', 'sentiment values':'sentiment'})

# Both tables must have the same number of rows before they are joined on id.
assert phrases_data.shape[0] == label_data.shape[0]
joined_data = pd.merge(phrases_data, label_data, on='id')

In [13]:
joined_data.head()

Unnamed: 0,phrase,id,sentiment
0,!,0,0.5
1,',1,0.5
2,' (,2,0.44444
3,' ( the cockettes,3,0.5
4,' ( the cockettes ),4,0.42708


In [14]:
# Lookup table: phrase text -> sentiment value, paired row by row from joined_data.
label_dict = dict(zip(joined_data['phrase'],joined_data['sentiment']))

In [15]:
# For every line of SOStr.txt ('|'-separated tokens, one sentence per line),
# collect the sentiment value of each token via label_dict.
labels = []
with open('stanfordSentimentTreebank/SOStr.txt','r') as f:
    # Iterate the file directly: `progress_bar` is never imported in this
    # notebook (the `Text` helper imports are commented out), so wrapping the
    # lines in it raised NameError on a fresh kernel.
    for line in f:
        tokens = line.strip().split('|')
        # .get() yields None for a token that has no dictionary entry.
        labels.append([label_dict.get(token) for token in tokens])

In [16]:
# Tab-separated sentence table; attach the per-token sentiment lists built from
# SOStr.txt positionally (row i of text_data gets labels[i]).
text_data = pd.read_csv('stanfordSentimentTreebank/datasetSentences.txt',sep='\t')
text_data['label'] = labels

In [17]:
# Collapse each sentence's per-token sentiment values into one integer class:
# ceil(25 * mean). Computed from the `labels` list rather than from
# text_data['label'] itself so that re-running this cell is idempotent
# (re-applying the formula to an already-collapsed column corrupts the labels).
text_data['label'] = [int(np.ceil(25 * np.mean(token_scores))) for token_scores in labels]

In [19]:
# Split assignment table (comma-separated, with a header row).
datasplit = pd.read_csv('stanfordSentimentTreebank/datasetSplit.txt')

In [20]:
# Copied over by DataFrame index, not by a sentence-id join.
# NOTE(review): assumes datasetSplit.txt lists sentences in the same order as
# datasetSentences.txt — confirm.
text_data['split'] = datasplit['splitset_label']

In [21]:
# Split label 1 is the training set; every other label is pooled into validation.
is_train = text_data['split'] == 1
train_data = text_data[is_train]
valid_data = text_data[~is_train]

## Creating Dataset

In [22]:
## Defining Fields
# TEXT: spaCy-tokenised, lower-cased sentences. include_lengths=True makes
# batch.text a (token_ids, lengths) pair, which train()/evaluate() unpack.
# LABEL: the integer sentiment class of each sentence (the prediction target).
TEXT = Field(tokenize='spacy',include_lengths=True,batch_first=True,lower=True)
LABEL = LabelField(tokenize='spacy',batch_first=True,is_target=True)

In [23]:
# Pair each (sentence, label) row with the TEXT / LABEL fields to build
# torchtext Examples for both splits.
fields = [('text',TEXT),('label',LABEL)]
train_examples = [
    Example.fromlist([sentence, label], fields)
    for sentence, label in zip(train_data.sentence.values, train_data.label.values)
]
valid_examples = [
    Example.fromlist([sentence, label], fields)
    for sentence, label in zip(valid_data.sentence.values, valid_data.label.values)
]

In [24]:
' '.join(vars(train_examples[0])['text']),vars(train_examples[0])['label']

("the rock is destined to be the 21st century 's new ` ` conan '' and that he 's going to make a splash even greater than arnold schwarzenegger , jean - claud van damme or steven segal .",
 13)

In [25]:
# Wrap the example lists as torchtext Datasets that share the same field spec.
train_dset = Dataset(train_examples,fields)
valid_dset = Dataset(valid_examples,fields)

In [26]:
len(train_dset),len(valid_dset)

(8544, 3311)

In [27]:
#train_dset,valid_dset = dset.split(split_ratio=[0.85,0.15],random_state=random.seed(SEED))

In [28]:
#len(train_dset),len(valid_dset)

## Augmentations

In [29]:
def random_deletion(words, p=0.5):
    """Randomly drop tokens from ``words``, each independently with probability ``p``.

    Args:
        words: list of tokens.
        p: per-token deletion probability.

    Returns:
        ``words`` itself when it holds at most one token (nothing to delete).
        Otherwise a new list of the surviving tokens in their original order;
        if every token was dropped, a one-element list holding a randomly
        chosen token so the result is never empty.
    """
    # `<= 1` also covers the empty list, which would otherwise fall through to
    # random.choice([]) and raise IndexError.
    if len(words) <= 1:
        return words
    remaining = [word for word in words if random.uniform(0, 1) > p]
    if not remaining:
        return [random.choice(words)]
    return remaining

In [30]:
def random_swap(sentence, n=5):
    """Swap two randomly chosen positions of ``sentence``, ``n`` times.

    Args:
        sentence: list of tokens; it is modified in place.
        n: number of swaps to perform.

    Returns:
        The same list object, after the swaps. Lists with fewer than two
        tokens are returned untouched.
    """
    # random.sample(..., 2) needs at least two positions to draw from; shorter
    # inputs would raise ValueError.
    if len(sentence) < 2:
        return sentence
    positions = range(len(sentence))
    for _ in range(n):
        idx1, idx2 = random.sample(positions, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
    return sentence

In [31]:
# Build two augmented copies of the training sentences: one with random token
# deletions (p=0.2) and one with random token swaps. All other columns are
# carried over unchanged. The sentences are generated in one pass each and the
# frames concatenated once, instead of appending row by row (quadratic, and
# DataFrame.append no longer exists in pandas 2.x).
deleted_sentences = [
    ' '.join(random_deletion(sentence.split(), p=0.2))
    for sentence in train_data['sentence']
]
swapped_sentences = [
    ' '.join(random_swap(sentence.split()))
    for sentence in train_data['sentence']
]
aug_data = pd.concat(
    [
        train_data.assign(sentence=deleted_sentences),
        train_data.assign(sentence=swapped_sentences),
    ],
    ignore_index=True,
)

In [32]:
# Original training rows first, then the augmented rows, re-indexed from 0.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: this rebinds aug_data, so re-running this cell on its own stacks
# train_data on top again — re-run the augmentation cell first.
aug_data = pd.concat([train_data, aug_data], ignore_index=True)

In [33]:
aug_data.shape

(25632, 4)

## Building Vocab

In [34]:
# Vocabularies are built from the training split only.
TEXT.build_vocab(train_dset)
LABEL.build_vocab(train_dset)

In [35]:
len(TEXT.vocab),len(LABEL.vocab)

(15491, 14)

## DataLoader

In [36]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
# Batch both splits in groups of 32 on `device`. sort_key is each example's token
# count and sort_within_batch=True orders every batch by that key.
# NOTE(review): the model packs sequences with pack_padded_sequence, which by
# default expects lengths in decreasing order — confirm torchtext sorts descending.
train_dl,valid_dl = BucketIterator.splits([train_dset,valid_dset],batch_size=32,sort_key = lambda x: len(x.text), sort_within_batch=True,device=device)

In [38]:
len(train_dl),len(valid_dl)

(267, 104)

In [39]:
# Pull a single training batch for inspection.
batch = next(iter(train_dl))

In [40]:
len(batch.text),len(batch.label)

(2, 32)

## Model

In [41]:
class SentClassifier(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear head.

    Args:
        embed_dims: size of each token embedding.
        hidden_size: LSTM hidden units per direction.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of rows in the embedding table. ``None`` (default)
            resolves to ``len(TEXT.vocab)`` when the model is constructed, so
            a rebuilt ``TEXT`` vocabulary is picked up without re-running the
            class definition.
        num_outs: number of output logits.
        bidirectional: run the LSTM in both directions and concatenate the
            last layer's final forward and backward hidden states.
        dropout: dropout probability, applied between LSTM layers and to the
            final hidden state before the linear head.
    """
    def __init__(self,embed_dims=300, hidden_size=100,num_layers=2,vocab_size=None,num_outs=25,bidirectional=True,dropout=0):
        super().__init__()
        if vocab_size is None:
            # Looked up at construction time. A `len(TEXT.vocab)` default in the
            # signature is evaluated once, when the class is defined, and goes
            # stale as soon as TEXT is rebuilt.
            vocab_size = len(TEXT.vocab)
        self.embedder = nn.Embedding(vocab_size,embed_dims)
        # nn.LSTM applies dropout only between stacked layers and warns when it
        # is non-zero for a single layer, so pass 0 in that case.
        self.rnn = nn.LSTM(embed_dims,hidden_size,num_layers=num_layers,batch_first=True,bidirectional=bidirectional,dropout=dropout if num_layers > 1 else 0)
        self.dropout  = nn.Dropout(dropout)
        self.classifier = nn.Linear(2*hidden_size if bidirectional else hidden_size,num_outs)
        self.bidirectional = bidirectional
        self.num_layers = num_layers

    def forward(self,text,text_lengths):
        """Return logits of shape [B, num_outs].

        Args:
            text: [B, SeqL] tensor of token ids (batch first, padded).
            text_lengths: [B] tensor with the unpadded length of each row.
        """
        # [B, SeqL, embed_dims]
        embedded_text = self.embedder(text)
        # enforce_sorted=False lets callers pass batches in any length order;
        # the LSTM restores the original batch order in its returned states.
        packed = nn.utils.rnn.pack_padded_sequence(embedded_text,text_lengths.cpu(),batch_first=True,enforce_sorted=False)
        _,(h,_) = self.rnn(packed)
        # h: [num_layers*directions, B, hidden_size]
        if self.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            h = torch.cat([h[-2,...],h[-1,...]],dim=-1)
        else:
            h = h[-1,...]
        h = self.dropout(h)
        out = self.classifier(h)
        return out


In [34]:
# Instantiate with the default hyper-parameters; `model` is rebound with the
# tuned model_params in the Training section.
model = SentClassifier(bidirectional=True)

## Training

In [42]:
# Hyper-parameters forwarded to SentClassifier(**model_params).
model_params = {
    'dropout': 0.2,
    'vocab_size': len(TEXT.vocab),
    'embed_dims': 100,
    'num_layers': 2,
    'bidirectional': True,
    'hidden_size': 100,
    'num_outs': 25,
}
# The training loop defines and uses its own N_EPOCHS.
EPOCHS=20

In [43]:
# Fresh model, optimiser and loss for the baseline (un-augmented) run.
model = SentClassifier(**model_params).to(device)
opt = optim.Adam(model.parameters(),lr=5e-4)
criterion = nn.CrossEntropyLoss()

dls = {'train':train_dl,'valid':valid_dl}
# `Accuracy` belongs to the `Text` helpers whose import is commented out, so
# instantiating it here raised NameError on a fresh kernel. It only fed the
# commented-out trainModel call; train()/evaluate() use binary_accuracy instead.
# metric_func = Accuracy()

In [60]:
model

SentClassifier(
  (embedder): Embedding(16805, 100)
  (rnn): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (classifier): Linear(in_features=200, out_features=25, bias=True)
)

In [109]:
#trainRecorder, valRecorder = trainModel(model,opt,criterion,dls,NEpochs=10,device=device,metric_func=metric_func,print_utils=False)

In [44]:
# define metric
def binary_accuracy(preds, y):
    """Fraction of rows in ``preds`` whose highest-scoring column equals ``y``.

    Despite the name, this is top-1 accuracy over however many columns
    ``preds`` has.

    Args:
        preds: 2-D tensor of scores, one row per example.
        y: 1-D tensor of target class indices.

    Returns:
        0-dim float tensor: number of matches divided by the number of rows.
    """
    predicted_classes = torch.max(preds, 1)[1]
    hits = (predicted_classes == y).float()
    return hits.sum() / hits.shape[0]

In [45]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator``.

    Args:
        model: module called as ``model(text, text_lengths)`` returning logits.
        iterator: iterable of batches exposing ``batch.text`` (a
            ``(token_ids, lengths)`` pair) and ``batch.label``; must support
            ``len()``.
        optimizer: optimiser over ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` where both are averages of the
        per-batch values (each batch weighs the same regardless of its size).
    """
    epoch_loss = 0
    epoch_acc = 0

    # enable training-mode behaviour (e.g. dropout)
    model.train()

    for batch in iterator:

        # clear the gradients accumulated by the previous batch
        optimizer.zero_grad()

        # token ids and the number of tokens in each sentence
        text, text_lengths = batch.text

        # Keep the model output as-is. The former .squeeze() was a no-op for
        # ordinary batches but collapsed a batch of size 1 to a 1-D tensor,
        # which breaks both the loss and torch.max(preds, 1) in binary_accuracy.
        predictions = model(text, text_lengths)

        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        # backpropagate and update the weights
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [46]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: module called as ``model(text, text_lengths)`` returning logits.
        iterator: iterable of batches exposing ``batch.text`` (a
            ``(token_ids, lengths)`` pair) and ``batch.label``; must support
            ``len()``.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` where both are averages of the
        per-batch values (each batch weighs the same regardless of its size).
    """
    epoch_loss = 0
    epoch_acc = 0

    # switch to evaluation-mode behaviour (dropout disabled)
    model.eval()

    # no gradients are needed for scoring
    with torch.no_grad():

        for batch in iterator:

            # token ids and the number of tokens in each sentence
            text, text_lengths = batch.text

            # Keep the model output as-is. The former .squeeze() was a no-op
            # for ordinary batches but collapsed a batch of size 1 to a 1-D
            # tensor, which breaks both the loss and torch.max(preds, 1) in
            # binary_accuracy.
            predictions = model(text, text_lengths)

            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [47]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # one pass over the training batches, then score the validation split
    train_loss, train_acc = train(model, train_dl, opt, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dl, criterion)

    # checkpoint whenever the validation loss reaches a new minimum
    if best_valid_loss > valid_loss:
        torch.save(model.state_dict(), 'saved_weights.pt')
        best_valid_loss = valid_loss

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.266 | Train Acc: 54.74%
	 Val. Loss: 1.046 |  Val. Acc: 56.24% 

	Train Loss: 1.049 | Train Acc: 55.95%
	 Val. Loss: 1.024 |  Val. Acc: 56.36% 

	Train Loss: 0.992 | Train Acc: 59.42%
	 Val. Loss: 0.992 |  Val. Acc: 58.79% 

	Train Loss: 0.881 | Train Acc: 64.62%
	 Val. Loss: 0.969 |  Val. Acc: 59.70% 

	Train Loss: 0.726 | Train Acc: 71.96%
	 Val. Loss: 0.986 |  Val. Acc: 59.85% 

	Train Loss: 0.564 | Train Acc: 79.20%
	 Val. Loss: 1.058 |  Val. Acc: 60.45% 

	Train Loss: 0.417 | Train Acc: 85.28%
	 Val. Loss: 1.099 |  Val. Acc: 60.08% 

	Train Loss: 0.292 | Train Acc: 90.16%
	 Val. Loss: 1.283 |  Val. Acc: 59.94% 

	Train Loss: 0.210 | Train Acc: 93.16%
	 Val. Loss: 1.424 |  Val. Acc: 58.74% 

	Train Loss: 0.145 | Train Acc: 95.42%
	 Val. Loss: 1.635 |  Val. Acc: 61.14% 

	Train Loss: 0.094 | Train Acc: 97.39%
	 Val. Loss: 1.838 |  Val. Acc: 59.39% 

	Train Loss: 0.080 | Train Acc: 97.47%
	 Val. Loss: 1.947 |  Val. Acc: 56.84% 

	Train Loss: 0.072 | Train Acc: 97.88%
	

## Augmented Training

In [48]:
## Defining Fields
# Fresh TEXT / LABEL fields for the augmented run, configured exactly like the
# baseline ones; their vocabularies are rebuilt from the augmented training set.
TEXT = Field(tokenize='spacy',include_lengths=True,batch_first=True,lower=True)
LABEL = LabelField(tokenize='spacy',batch_first=True,is_target=True)

In [49]:
# Training examples now come from aug_data (original + augmented sentences);
# the validation examples are still built from the untouched valid_data.
fields = [('text',TEXT),('label',LABEL)]
train_examples = [
    Example.fromlist([sentence, label], fields)
    for sentence, label in zip(aug_data.sentence.values, aug_data.label.values)
]
valid_examples = [
    Example.fromlist([sentence, label], fields)
    for sentence, label in zip(valid_data.sentence.values, valid_data.label.values)
]

In [50]:
# Rebind the dataset names to the augmented training set and its validation set.
train_dset = Dataset(train_examples,fields)
valid_dset = Dataset(valid_examples,fields)

In [51]:
# Vocabularies are built from the (augmented) training split only.
TEXT.build_vocab(train_dset)
LABEL.build_vocab(train_dset)

In [52]:
len(TEXT.vocab)

15491

In [53]:
# Same batching as the baseline run. Use the `device` chosen earlier rather than
# a hard-coded 'cuda' so batches land where the model lives and the notebook
# still runs on CPU-only machines.
train_dl,valid_dl = BucketIterator.splits([train_dset,valid_dset],batch_size=32,sort_key = lambda x: len(x.text), sort_within_batch=True,device=device)

In [54]:
# Hyper-parameters forwarded to SentClassifier(**model_params) for the augmented run.
model_params = dict(
    dropout=0.2,
    vocab_size=len(TEXT.vocab),
    embed_dims=100,
    num_layers=2,
    bidirectional=True,
    hidden_size=100,
    num_outs=25,
)
# The training loop defines and uses its own N_EPOCHS.
EPOCHS=20

In [56]:
# Fresh model, optimiser and loss for the augmented run.
model = SentClassifier(**model_params).to(device)
opt = optim.Adam(model.parameters(),lr=5e-4)
criterion = nn.CrossEntropyLoss()
dls = {'train':train_dl,'valid':valid_dl}
# `Accuracy` belongs to the `Text` helpers whose import is commented out, so
# instantiating it here raised NameError on a fresh kernel. Nothing in the
# train()/evaluate() loop reads it; they use binary_accuracy instead.
# metric_func = Accuracy()

In [57]:
model

SentClassifier(
  (embedder): Embedding(15491, 100)
  (rnn): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (classifier): Linear(in_features=200, out_features=25, bias=True)
)

In [58]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # one pass over the augmented training batches, then score the validation split
    train_loss, train_acc = train(model, train_dl, opt, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dl, criterion)

    # checkpoint whenever the validation loss reaches a new minimum
    # (written to the same 'saved_weights.pt' file as the baseline run)
    if best_valid_loss > valid_loss:
        torch.save(model.state_dict(), 'saved_weights.pt')
        best_valid_loss = valid_loss

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.118 | Train Acc: 56.56%
	 Val. Loss: 0.993 |  Val. Acc: 58.83% 

	Train Loss: 0.828 | Train Acc: 67.25%
	 Val. Loss: 0.904 |  Val. Acc: 62.20% 

	Train Loss: 0.534 | Train Acc: 79.79%
	 Val. Loss: 0.956 |  Val. Acc: 65.02% 

	Train Loss: 0.314 | Train Acc: 88.75%
	 Val. Loss: 1.120 |  Val. Acc: 65.68% 

	Train Loss: 0.181 | Train Acc: 93.82%
	 Val. Loss: 1.483 |  Val. Acc: 62.91% 

	Train Loss: 0.108 | Train Acc: 96.26%
	 Val. Loss: 1.631 |  Val. Acc: 64.36% 

	Train Loss: 0.083 | Train Acc: 97.18%
	 Val. Loss: 1.855 |  Val. Acc: 65.14% 

	Train Loss: 0.058 | Train Acc: 98.14%
	 Val. Loss: 1.991 |  Val. Acc: 64.33% 

	Train Loss: 0.044 | Train Acc: 98.67%
	 Val. Loss: 2.032 |  Val. Acc: 65.05% 

	Train Loss: 0.035 | Train Acc: 98.81%
	 Val. Loss: 2.259 |  Val. Acc: 65.59% 

	Train Loss: 0.031 | Train Acc: 99.04%
	 Val. Loss: 2.530 |  Val. Acc: 64.54% 

	Train Loss: 0.029 | Train Acc: 99.03%
	 Val. Loss: 2.429 |  Val. Acc: 64.54% 

	Train Loss: 0.023 | Train Acc: 99.25%
	