In [1]:
import pandas as pd

df = pd.read_csv("tweets.csv")
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


In [2]:
import torch.nn as nn
import torch

In [3]:
df.shape

(1364, 2)

In [4]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [5]:
import random
import torch, torchtext
from torchtext import data

In [6]:
# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f8517966ed0>

In [7]:
Tweet = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = torchtext.legacy.data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [8]:
fields = [('tweet', Tweet), ('label', Label)]

In [9]:
example = [torchtext.legacy.data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])] 

In [10]:
twitterDataset = torchtext.legacy.data.Dataset(example, fields)

In [11]:
(train, valid) = twitterDataset.split(split_ratio=[85, 15], random_state = random.seed(SEED))

In [12]:
len(train), len(valid)

(1159, 205)

In [13]:
vars(train.examples[11])

{'label': 1,
 'tweet': ['@sweetbay',
  'That',
  'was',
  'Paul',
  'Ryan',
  "'s",
  'budget',
  '.',
  'How',
  'did',
  'Obama',
  "'s",
  'budget',
  'do',
  '?',
  'Getting',
  'educated',
  'on',
  'the',
  'facts',
  'is',
  'the',
  'first',
  'step',
  'in',
  'losing',
  'that',
  'liberalism',
  '!']}

In [14]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [15]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4651
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1069), (':', 783), ('#', 780), ('.', 761), (',', 598), ('"', 550), ('the', 542), ('RT', 516), ('?', 419), ('to', 400)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [17]:
train_iterator, valid_iterator = torchtext.legacy.data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweet),
                                                            sort_within_batch=True, device = device)

In [18]:
next(iter(train_iterator))
#len(train.examples[11].tweet)


[torchtext.legacy.data.batch.Batch of size 32]
	[.tweet]:('[torch.cuda.LongTensor of size 32x8 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)]

In [19]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

In [20]:
import torch.nn as nn
import torch.nn.functional as F

In [21]:
import numpy as np


In [22]:
class Encoder(nn.Module):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, device):
        
        super().__init__()          
        self.device = device
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers,                            
                           batch_first=True)
        
  def forward(self, text, text_lengths, verbose):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]

        if not verbose:
          # packed sequence
          packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)        
          output, (hidden, cell) = self.lstm(packed_embedded)
          # output is a packed sequence [sum of sequence lengths, hidden layer]

        else:
          if verbose == 2:
            print("Encoder : \n\n")
          output_batch_list = []
          hidden_batch_list = []
          cell_batch_list = []
          for batch_length in range(embedded.shape[0]):
            hidden = torch.zeros(self.n_layers, 1, self.hidden_dim, device = self.device)
            cell = torch.zeros(self.n_layers, 1, self.hidden_dim, device = self.device)
            output_sequence_list = []
            if verbose == 2:
              print(f"\nSentence {batch_length + 1} : \n")
            for sequence_length in range(embedded.shape[1]):              
              output, (hidden, cell) = self.lstm(embedded[batch_length,sequence_length].unsqueeze(0).unsqueeze(0), (hidden, cell))
              if verbose == 2:
                print(f"\t Token {sequence_length + 1} : \n")
                print(f"\t\t Output Vector : {output} \n\t\tOutput Vector Shape: {output.shape} \n\t\tHidden Vector : {hidden} \n\t\tHidden Vector Shape: {hidden.shape} \n\t\tCell State : {cell} \n\t\tCell State Shape : {cell.shape} \n")
              output_sequence_list.append(output)
            output_batch_list.append(torch.cat(output_sequence_list, dim=1))
            hidden_batch_list.append(hidden)
            cell_batch_list.append(cell)
          output = torch.cat(output_batch_list, dim=0)
          hidden = torch.cat(hidden_batch_list, dim=1)
          cell = torch.cat(cell_batch_list, dim=1)
          if verbose == 2:
            print(f"Encoder Outputs : \n\tOutput Vector : {output} \n\tOutput Vector Shape: {output.shape} \n\tHidden Vector : {hidden} \n\tHidden Vector Shape: {hidden.shape} \n\tCell State : {cell} \n\tCell State Shape: {cell.shape}")    
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        return (output, (hidden, cell))

In [23]:
class Decoder(nn.Module):

 def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, device):
        
        super().__init__() 

        self.device = device
        self.n_layers = n_layers

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers,                            
                           batch_first=True)
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
 def forward(self, input, hidden, cell, verbose):

        if not verbose:
          packed_input = nn.utils.rnn.PackedSequence(torch.zeros(input.data.shape[0],input.data.shape[1]), input.batch_sizes, input.sorted_indices, input.unsorted_indices)
          output, (hidden, cell) = self.lstm(packed_input.to(self.device), (hidden,cell))

        else:
          if verbose == 2:
            print("Decoder : \n\n")
          output_batch_list = []
          hidden_batch_list = []
          cell_batch_list = []
          for layer in range(self.n_layers):
            for batch_length in range(input.shape[0]):            
              output_sequence_list = []
              if verbose == 2:
                print(f"\nEncoded Batch Vector {batch_length + 1} : \n")
              hidden_copy = hidden[layer, batch_length].unsqueeze(0).unsqueeze(0)
              cell_copy = cell[layer, batch_length].unsqueeze(0).unsqueeze(0)
              for sequence_length in range(input.shape[1]):              
                output, (hidden_copy, cell_copy) = self.lstm(torch.zeros(1, 1, input.shape[2], device = self.device), (hidden_copy, cell_copy))
                if verbose == 2:
                  print(f"\t Time Step {sequence_length + 1} : \n")
                  print(f"\t\t Output Vector : {output} \n\t\tOutput Vector Shape: {output.shape} \n\t\tHidden Vector : {hidden} \n\t\tHidden Vector Shape: {hidden.shape} \n\t\tCell State : {cell} \n\t\tCell State Shape : {cell.shape} \n")
                output_sequence_list.append(output)
              output_batch_list.append(torch.cat(output_sequence_list, dim=1))
              hidden_batch_list.append(hidden_copy)
              cell_batch_list.append(cell_copy)
          output = torch.cat(output_batch_list, dim=0)
          hidden = torch.cat(hidden_batch_list, dim=1)
          cell = torch.cat(cell_batch_list, dim=1)
          if verbose == 2:
            print(f"Decoder LSTM Outputs : \n\tOutput Vector : {output} \n\tOutput Vector Shape: {output.shape} \n\tHidden Vector : {hidden} \n\tHidden Vector Shape: {hidden.shape} \n\tCell State : {cell} \n\tCell State Shape: {cell.shape}")    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [24]:
class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, device):
        
        super().__init__()          
        
        self.encoder = Encoder(vocab_size, embedding_dim, hidden_dim, n_layers, device)
        self.decoder = Decoder(hidden_dim, hidden_dim, output_dim, n_layers, device)
        
    def forward(self, text, text_lengths, verbose):

        output, (hidden, cell) = self.encoder(text, text_lengths, verbose)
        output = self.decoder(output, hidden, cell, verbose)
            
        return output

In [25]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3
num_layers = 1

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, device)

In [26]:
next(iter(train_iterator))


[torchtext.legacy.data.batch.Batch of size 32]
	[.tweet]:('[torch.cuda.LongTensor of size 32x40 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)]

In [27]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (encoder): Encoder(
    (embedding): Embedding(4651, 300)
    (lstm): LSTM(300, 100, batch_first=True)
  )
  (decoder): Decoder(
    (lstm): LSTM(100, 100, batch_first=True)
    (fc): Linear(in_features=100, out_features=3, bias=True)
  )
)
The model has 1,637,203 trainable parameters


In [28]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [29]:
def train(model, iterator, optimizer, criterion, verbose):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweet  
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths, verbose).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
def evaluate(model, iterator, criterion, verbose):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweet
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths, verbose=True).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
N_EPOCHS = 10
best_valid_loss = float('inf')
verbose = 1

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, verbose)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, verbose)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.088 | Train Acc: 54.44%
	 Val. Loss: 1.074 |  Val. Acc: 68.30% 

	Train Loss: 1.038 | Train Acc: 69.12%
	 Val. Loss: 0.922 |  Val. Acc: 68.30% 

	Train Loss: 0.872 | Train Acc: 69.12%
	 Val. Loss: 0.870 |  Val. Acc: 68.30% 

	Train Loss: 0.862 | Train Acc: 69.12%
	 Val. Loss: 0.869 |  Val. Acc: 68.30% 

	Train Loss: 0.861 | Train Acc: 69.12%
	 Val. Loss: 0.869 |  Val. Acc: 68.30% 

	Train Loss: 0.861 | Train Acc: 69.12%
	 Val. Loss: 0.869 |  Val. Acc: 68.30% 

	Train Loss: 0.860 | Train Acc: 69.12%
	 Val. Loss: 0.869 |  Val. Acc: 68.30% 

	Train Loss: 0.860 | Train Acc: 69.12%
	 Val. Loss: 0.869 |  Val. Acc: 68.30% 

	Train Loss: 0.859 | Train Acc: 69.12%
	 Val. Loss: 0.869 |  Val. Acc: 68.30% 

	Train Loss: 0.858 | Train Acc: 69.12%
	 Val. Loss: 0.869 |  Val. Acc: 68.30% 



In [32]:
#load weights and tokenizer
verbose = 2
path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet, verbose=0):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor, verbose)

    _, pred = torch.max(prediction, 1) 
    
    return f"Prediction : {categories[pred.item()]}"

In [33]:
classify_tweet("A valid explanation for why Trump won't let women on the golf course.", verbose)

Encoder : 



Sentence 1 : 

	 Token 1 : 

		 Output Vector : tensor([[[ 0.2311, -0.1080,  0.1426, -0.1300,  0.1997,  0.0942, -0.2371,
           0.1375,  0.1587,  0.0797,  0.0872, -0.0191, -0.0968,  0.1260,
           0.0047,  0.0611,  0.0789,  0.2001,  0.2485, -0.0523, -0.0866,
           0.1587, -0.1764,  0.5200,  0.0469, -0.2001, -0.2917,  0.0109,
          -0.3948, -0.0362,  0.0682, -0.2144, -0.2874,  0.2068, -0.4267,
          -0.2933,  0.3453, -0.0605, -0.1362, -0.0485, -0.2290,  0.1022,
          -0.0898,  0.0860,  0.1775,  0.0146, -0.0124,  0.3733, -0.1292,
          -0.1644, -0.0508, -0.0655,  0.2081,  0.1266,  0.1518, -0.1809,
          -0.0268,  0.3593,  0.4176, -0.0852,  0.1124, -0.0787,  0.4671,
           0.0491, -0.0978,  0.2324,  0.3714, -0.0770, -0.0729,  0.0523,
           0.1445, -0.0512,  0.0304,  0.0975,  0.0943,  0.0627,  0.0119,
           0.1005,  0.1509,  0.2556,  0.1531, -0.0552, -0.1567, -0.0137,
          -0.0238, -0.1638,  0.0464, -0.2137,  0.3399,  0.0951

'Prediction : Negative'