<a href="https://colab.research.google.com/github/rajanm/END2_0_Session_6/blob/main/Tweets_Analysis_using_Encoder_Decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install gdown==3.13.0 --quiet
! pip install tweet-preprocessor --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for gdown (PEP 517) ... [?25l[?25hdone


In [2]:
import gdown
from tqdm.auto import tqdm
import pandas as pd
import preprocessor as tp

tqdm.pandas()

data_refresh = True
data_url = 'https://drive.google.com/uc?id=1FWLOXtYDiOZckzvn79cZ7OeQCI9fVA00'
data_file = 'tweets.csv'
cleaned_data_file = 'tweets_cleaned.csv'

  from pandas import Panel


In [3]:
def clean_text(text):
    text = tp.clean(text).strip()
    text = text.replace(":", "")
    return text

In [4]:
%%time
if data_refresh == True:
  gdown.download(data_url, data_file)
  dataset = pd.read_csv(data_file, header = 'infer')
  dataset['clean_tweets'] = dataset['tweets'].progress_apply(clean_text)
  dataset.to_csv(cleaned_data_file)
else:
  dataset = pd.read_csv(cleaned_data_file, header = 'infer')

Downloading...
From: https://drive.google.com/uc?id=1FWLOXtYDiOZckzvn79cZ7OeQCI9fVA00
To: /content/tweets.csv
100%|██████████| 160k/160k [00:00<00:00, 27.0MB/s]


HBox(children=(FloatProgress(value=0.0, max=1364.0), HTML(value='')))


CPU times: user 305 ms, sys: 25.2 ms, total: 331 ms
Wall time: 1.25 s


In [5]:
dataset.describe()

Unnamed: 0,labels
count,1364.0
mean,0.376833
std,0.594859
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,2.0


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364 entries, 0 to 1363
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweets        1364 non-null   object
 1   labels        1364 non-null   int64 
 2   clean_tweets  1364 non-null   object
dtypes: int64(1), object(2)
memory usage: 32.1+ KB


In [7]:
import random
import torch, torchtext
from torchtext import data

In [8]:
# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fdd50dd44f0>

In [9]:
%%time
Tweet = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)

CPU times: user 1.3 s, sys: 263 ms, total: 1.56 s
Wall time: 3.97 s


In [10]:
%%time
Label = torchtext.legacy.data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

CPU times: user 652 ms, sys: 75.7 ms, total: 728 ms
Wall time: 724 ms


In [11]:
fields = [('clean_tweets', Tweet), ('label', Label)]

In [12]:
%%time
example = [torchtext.legacy.data.Example.fromlist([dataset.tweets[i],dataset.labels[i]], fields) for i in range(dataset.shape[0])]

CPU times: user 1min 8s, sys: 1.26 s, total: 1min 9s
Wall time: 1min 9s


In [13]:
example_dataset = torchtext.legacy.data.Dataset(example, fields)

In [14]:
(train, test) = example_dataset.split(split_ratio=[70, 30], random_state = random.seed(SEED))

In [15]:
print(len(train))
print(len(test))

955
409


In [16]:
vars(train.examples[1])

{'clean_tweets': ['@TBCDG',
  '#',
  'WhatsRomneyHiding',
  '-',
  'OBAMA',
  'wants',
  'to',
  'come',
  'out',
  'of',
  'the',
  'closet',
  'but',
  'Mitt',
  'wants',
  'to',
  'wait',
  'until',
  'after',
  'Elections',
  '!',
  '#',
  'GOP',
  '#',
  'tcot'],
 'label': 0}

In [17]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [18]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4148
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 883), ('#', 644), (':', 644), ('.', 621), (',', 504), ('"', 472), ('the', 458), ('RT', 411), ('?', 362), ('to', 340)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
train_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits((train, test), batch_size = 64, 
                                                            sort_key = lambda x: len(x.clean_tweets),
                                                            sort_within_batch=True, device = device)

In [21]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

In [22]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           batch_first=True)

        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        packed_input = nn.utils.rnn.PackedSequence(torch.zeros(packed_output.data.shape[0],packed_output.data.shape[1]), packed_output.batch_sizes, packed_output.sorted_indices, packed_output.unsorted_indices)
        packed_output, (hidden, cell) = self.decoder(packed_input.cuda(), (hidden,cell))
    
        dense_outputs = self.fc(hidden)   
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [23]:
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3
num_layers = 1

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers)

In [24]:
%%time
model=model.to(device)
for batch in train_iterator:
  tweet = batch.clean_tweets[0]
  length = batch.clean_tweets[1]
  model(tweet, length)
  break

CPU times: user 11.1 s, sys: 864 ms, total: 12 s
Wall time: 18.6 s


In [25]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(4148, 300)
  (encoder): LSTM(300, 100, batch_first=True)
  (decoder): LSTM(100, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=3, bias=True)
)
The model has 1,486,303 trainable parameters


In [26]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [27]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.clean_tweets  
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [28]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.clean_tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [29]:
%%time
N_EPOCHS = 10
best_valid_loss = float('inf')
model_path='./saved_weights.pt'

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_path)
    
    print('\n\tEpoch No: ', epoch)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')


	Epoch No:  0
	Train Loss: 1.078 | Train Acc: 59.17%
	 Val. Loss: 1.074 |  Val. Acc: 67.51% 


	Epoch No:  1
	Train Loss: 1.067 | Train Acc: 69.79%
	 Val. Loss: 1.063 |  Val. Acc: 67.51% 


	Epoch No:  2
	Train Loss: 1.053 | Train Acc: 69.90%
	 Val. Loss: 1.044 |  Val. Acc: 67.51% 


	Epoch No:  3
	Train Loss: 1.020 | Train Acc: 69.90%
	 Val. Loss: 0.991 |  Val. Acc: 67.51% 


	Epoch No:  4
	Train Loss: 0.919 | Train Acc: 69.90%
	 Val. Loss: 0.891 |  Val. Acc: 67.51% 


	Epoch No:  5
	Train Loss: 0.863 | Train Acc: 69.90%
	 Val. Loss: 0.883 |  Val. Acc: 67.51% 


	Epoch No:  6
	Train Loss: 0.858 | Train Acc: 69.90%
	 Val. Loss: 0.881 |  Val. Acc: 67.51% 


	Epoch No:  7
	Train Loss: 0.855 | Train Acc: 69.90%
	 Val. Loss: 0.881 |  Val. Acc: 67.51% 


	Epoch No:  8
	Train Loss: 0.854 | Train Acc: 69.90%
	 Val. Loss: 0.881 |  Val. Acc: 67.51% 


	Epoch No:  9
	Train Loss: 0.853 | Train Acc: 69.90%
	 Val. Loss: 0.880 |  Val. Acc: 67.51% 

CPU times: user 2.81 s, sys: 141 ms, total: 2.95 s

In [30]:
model.load_state_dict(torch.load(model_path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [31]:
classify_tweet("Obama will improve the policies for the people.")

'Negative'