<a href="https://colab.research.google.com/github/sagawritescode/ENDTwoPointOPhase1/blob/main/Sagar_Assignment_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 6 Submission: Encoder-decoder on Tweet dataset

## Preparing the data

In [2]:
from google.colab import files
tweetfile = files.upload()

Saving tweets.csv to tweets.csv


In [3]:
import pandas as pd
df = pd.read_csv('tweets.csv')
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


In [4]:
df.shape

(1364, 2)

In [5]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [6]:
# Import Library
import random
import torch, torchtext
from torchtext.legacy import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fdc3160dd50>

In [7]:
Tweet = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [8]:
fields = [('tweets', Tweet),('labels',Label)]

In [9]:
example = [data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])] 

In [10]:
# Creating dataset

twitterDataset = data.Dataset(example, fields)

In [11]:
(train, valid) = twitterDataset.split(split_ratio=[0.85, 0.15], random_state=random.seed(SEED))

In [12]:
(len(train), len(valid))

(1159, 205)

In [13]:
vars(train.examples[10])

{'labels': 0,
 'tweets': ['Obama',
  ',',
  'Romney',
  'agree',
  ':',
  'Admit',
  'women',
  'to',
  'Augusta',
  'golf',
  'club',
  ':',
  'US',
  'President',
  'Barack',
  'Obama',
  'believes',
  'women',
  'should',
  'be',
  'allowe',
  '...',
  'http://t.co/PVKrepqI']}

In [14]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [15]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4651
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1069), (':', 783), ('#', 780), ('.', 761), (',', 598), ('"', 550), ('the', 542), ('RT', 516), ('?', 419), ('to', 400)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)

In [18]:
import os, pickle

with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

## Preparing the Model

In [19]:
import torch.nn as nn
import torch.nn.functional as F


class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim_encoder, hidden_dim_decoder, output_dim, dropout):
        
        super().__init__()          
        
        self.embedding_dim = embedding_dim
        self.hidden_dim_encoder = hidden_dim_encoder
        self.hidden_dim_decoder = hidden_dim_decoder
        # Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Encoder consisting of GRU Cell
        self.encoder = nn.GRUCell(embedding_dim, hidden_dim_encoder)
        
        # Decoder consisting of GRU Cell
        self.decoder = nn.GRUCell(hidden_dim_encoder, hidden_dim_decoder)

        # Dense layer
        self.fc = nn.Linear(hidden_dim_decoder, output_dim)
        
    def forward(self, text, text_lengths, printOutput = False):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
        
        _, sentence_len = text.size()
        
        hidden = 0

        #### ENCODER LAYER passing word by word to the encoder cell
        for word_no in range(0, sentence_len):
            if word_no == 0:
                # By default GRU cell inside encoder will pass zeros if no hidden vector is passed
                hidden = self.encoder(embedded[:,word_no,:])
            else:
                hidden = self.encoder(embedded[:,word_no,:], hidden)

            if printOutput:
                print("Sending word no: ", word_no, "to the encoder")
                print("Output of the encoder: ", hidden)


        ### DECODER CELL passing the output of the encoder to the decoder
        decoder_output = self.decoder(hidden)             
        #decoder_output = [batch size, hidden_dim_decoder]

        dense_outputs = self.fc(decoder_output)
        #dense_outputs = [batch size, output_dim]
        if printOutput:
            print("Decoder output: ", dense_outputs)

        output = F.softmax(dense_outputs, dim=1)

        return output

In [20]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3
num_layers = 2
dropout = 0.2

# Instantiate the model

model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_hidden_nodes, num_output_nodes, dropout)

In [21]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(4651, 300)
  (encoder): GRUCell(300, 100)
  (decoder): GRUCell(100, 100)
  (fc): Linear(in_features=100, out_features=3, bias=True)
)
The model has 1,576,803 trainable parameters


## Model Training and Evaluation

First define the optimizer and loss functions

In [22]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    #print("binary accuracy pred: y:", preds.shape, y.shape)
    _, predictions = torch.max(preds, 1)
    #print("predictions: ", predictions.shape)
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [23]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets   
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = binary_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

**Let's Train and Evaluate**

In [25]:
N_EPOCHS = 25
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.054 | Train Acc: 61.78%
	 Val. Loss: 0.991 |  Val. Acc: 67.86% 

	Train Loss: 0.971 | Train Acc: 67.86%
	 Val. Loss: 0.916 |  Val. Acc: 68.75% 

	Train Loss: 0.906 | Train Acc: 69.21%
	 Val. Loss: 0.885 |  Val. Acc: 68.75% 

	Train Loss: 0.867 | Train Acc: 69.55%
	 Val. Loss: 0.873 |  Val. Acc: 68.75% 

	Train Loss: 0.848 | Train Acc: 71.57%
	 Val. Loss: 0.869 |  Val. Acc: 68.75% 

	Train Loss: 0.831 | Train Acc: 72.92%
	 Val. Loss: 0.866 |  Val. Acc: 69.20% 

	Train Loss: 0.816 | Train Acc: 74.44%
	 Val. Loss: 0.867 |  Val. Acc: 69.20% 

	Train Loss: 0.797 | Train Acc: 75.80%
	 Val. Loss: 0.846 |  Val. Acc: 70.54% 

	Train Loss: 0.767 | Train Acc: 79.01%
	 Val. Loss: 0.813 |  Val. Acc: 72.77% 

	Train Loss: 0.740 | Train Acc: 82.26%
	 Val. Loss: 0.783 |  Val. Acc: 78.12% 

	Train Loss: 0.720 | Train Acc: 84.04%
	 Val. Loss: 0.793 |  Val. Acc: 75.45% 

	Train Loss: 0.703 | Train Acc: 85.30%
	 Val. Loss: 0.786 |  Val. Acc: 77.23% 

	Train Loss: 0.688 | Train Acc: 86.91%
	

## Model Testing

In [26]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor, printOutput = True)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [27]:
classify_tweet("A valid explanation for why Trump won't let women on the golf course.")

Sending word no:  0 to the encoder
Output of the encoder:  tensor([[ 0.2058, -0.3136,  0.2856, -0.2534,  0.7220,  0.0217, -0.1964,  0.8956,
          0.2938,  0.2987,  0.4305, -0.0810, -0.6956,  0.8361,  0.0863, -0.1338,
          0.3219,  0.5745,  0.2199,  0.2676,  0.0442,  0.6336, -0.3882,  0.1586,
          0.2869, -0.4127, -0.5079,  0.0612, -0.3877, -0.1976,  0.1211, -0.1838,
         -0.3921,  0.5729, -0.1603, -0.1147,  0.8502, -0.4621, -0.3675, -0.0127,
         -0.2218,  0.3069, -0.1473,  0.4175,  0.4704,  0.1210,  0.3260,  0.2885,
         -0.2389, -0.3332,  0.0576,  0.0192,  0.1986,  0.3949,  0.4797, -0.1638,
          0.3114,  0.4872,  0.6424, -0.3082,  0.5766, -0.2461,  0.8563,  0.0576,
         -0.2997,  0.5999,  0.0838, -0.7253, -0.1684,  0.2831,  0.7313, -0.3701,
          0.4270,  0.2249,  0.3249,  0.3742,  0.3030,  0.4757,  0.4461,  0.4092,
          0.6452, -0.1660, -0.2076, -0.1364,  0.0728, -0.2401, -0.0221, -0.5756,
          0.6845,  0.4949, -0.0411,  0.1543, -0.32

'Negative'