<a href="https://colab.research.google.com/github/pratikiiitb2013/END2_p1/blob/main/Session6/%20END2_S6_howToMakeEncoderDecoderClasses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd


df = pd.read_csv("/content/tweets.csv")
# /content/tweets.csv
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


In [2]:
df.shape

(1364, 2)

In [3]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [4]:
import random
import torch, torchtext
from torchtext import data

In [5]:
# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f0f5a476ed0>

In [6]:
Tweet = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = torchtext.legacy.data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [7]:
fields = [('tweet', Tweet), ('label', Label)]

In [8]:
example = [torchtext.legacy.data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])] 

In [9]:
twitterDataset = torchtext.legacy.data.Dataset(example, fields)

In [10]:
(train, valid) = twitterDataset.split(split_ratio=[85, 15], random_state = random.seed(SEED))

In [11]:
len(train), len(valid)

(1159, 205)

In [12]:
vars(train.examples[11])

{'label': 1,
 'tweet': ['@sweetbay',
  'That',
  'was',
  'Paul',
  'Ryan',
  "'s",
  'budget',
  '.',
  'How',
  'did',
  'Obama',
  "'s",
  'budget',
  'do',
  '?',
  'Getting',
  'educated',
  'on',
  'the',
  'facts',
  'is',
  'the',
  'first',
  'step',
  'in',
  'losing',
  'that',
  'liberalism',
  '!']}

In [13]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [14]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4651
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1069), (':', 783), ('#', 780), ('.', 761), (',', 598), ('"', 550), ('the', 542), ('RT', 516), ('?', 419), ('to', 400)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [16]:
train_iterator, valid_iterator = torchtext.legacy.data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweet),
                                                            sort_within_batch=True, device = device)

In [17]:
next(iter(train_iterator))
#len(train.examples[11].tweet)


[torchtext.legacy.data.batch.Batch of size 32]
	[.tweet]:('[torch.cuda.LongTensor of size 32x8 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)]

In [18]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

In [19]:
import torch.nn as nn
import torch.nn.functional as F

class encoder_part(nn.Module):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout):
    super().__init__() 
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)

  def forward(self, text, text_lengths):
    # print('enc')
    # print(text.shape)
    embedded = self.embedding(text)
    # print(embedded.shape)
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
    packed_output, (hidden, cell) = self.encoder(packed_embedded)
    # print(hidden.shape, cell.shape)
    # print(packed_output.data.shape)
    return packed_output, hidden

class decoder_part(nn.Module):

  def __init__(self, input_to_decoder_size, decoder_hidden_size, no_times_decoder_cell_has_to_run):
    super().__init__()
    self.decoder_single_rnn_cell = nn.LSTMCell(input_to_decoder_size,decoder_hidden_size)
    self.no_times_decoder_cell_has_to_run = no_times_decoder_cell_has_to_run
    self.decoder_hidden_size = decoder_hidden_size

  def forward(self, encoder_context_vector):
    # print('dec')
    # print(encoder_context_vector.shape)
    encoder_context_vector = encoder_context_vector.squeeze(0)
    # print(encoder_context_vector.shape)
    hx = torch.zeros(encoder_context_vector.size(0),self.decoder_hidden_size).to(device)
    cx = torch.zeros(encoder_context_vector.size(0),self.decoder_hidden_size).to(device)
    otpt = []
    for i in range(self.no_times_decoder_cell_has_to_run):
      hx,cx = self.decoder_single_rnn_cell(encoder_context_vector,(hx,cx))
      otpt.append(hx)
      # print(i,hx.shape)
    otpt = torch.stack(otpt,dim = 0)
    return otpt, hx

class combining_encoder_decoder(nn.Module):
  
  def __init__(self, encoder, decoder, hidden_dim, output_dim):
      super().__init__()
      self.encoder = encoder
      self.decoder = decoder
      self.fc = nn.Linear(hidden_dim, output_dim)
  
  def forward(self,src,src_len):
    # print('combined')
    # print(src.shape)
    enc_packed_outputs, enc_hidden = self.encoder(src,src_len)
    # print(enc_hidden.shape)
    dec_otpt, dec_hidden = self.decoder(enc_hidden)
    # print(dec_hidden.shape)
    dense_outputs = self.fc(dec_hidden)
    # print(dense_outputs.shape)
    op = F.softmax(dense_outputs, dim=1)
    # return dense_outputs
    return op




In [20]:

# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3
num_layers = 1
dropout = 0.2

# Instantiate the model
# model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)
enc = encoder_part(size_of_vocab, embedding_dim, num_hidden_nodes, num_layers, dropout = dropout)
dec = decoder_part(num_hidden_nodes,num_hidden_nodes,5)

model = combining_encoder_decoder(enc,dec,num_hidden_nodes,num_output_nodes).to(device)


  "num_layers={}".format(dropout, num_layers))


In [21]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

combining_encoder_decoder(
  (encoder): encoder_part(
    (embedding): Embedding(4651, 300)
    (encoder): LSTM(300, 100, batch_first=True, dropout=0.2)
  )
  (decoder): decoder_part(
    (decoder_single_rnn_cell): LSTMCell(100, 100)
  )
  (fc): Linear(in_features=100, out_features=3, bias=True)
)
The model has 1,637,203 trainable parameters


In [22]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [23]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweet  
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweet
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [25]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.080 | Train Acc: 55.53%
	 Val. Loss: 1.038 |  Val. Acc: 69.20% 

	Train Loss: 0.982 | Train Acc: 69.12%
	 Val. Loss: 0.895 |  Val. Acc: 68.30% 

	Train Loss: 0.873 | Train Acc: 69.12%
	 Val. Loss: 0.850 |  Val. Acc: 68.30% 

	Train Loss: 0.829 | Train Acc: 72.67%
	 Val. Loss: 0.826 |  Val. Acc: 74.11% 

	Train Loss: 0.794 | Train Acc: 77.70%
	 Val. Loss: 0.813 |  Val. Acc: 75.89% 

	Train Loss: 0.766 | Train Acc: 79.90%
	 Val. Loss: 0.794 |  Val. Acc: 76.34% 

	Train Loss: 0.739 | Train Acc: 82.35%
	 Val. Loss: 0.788 |  Val. Acc: 75.89% 

	Train Loss: 0.716 | Train Acc: 84.12%
	 Val. Loss: 0.785 |  Val. Acc: 76.79% 

	Train Loss: 0.702 | Train Acc: 85.47%
	 Val. Loss: 0.773 |  Val. Acc: 77.68% 

	Train Loss: 0.687 | Train Acc: 86.49%
	 Val. Loss: 0.760 |  Val. Acc: 79.46% 



In [26]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    enc = encoder_part(size_of_vocab, embedding_dim, num_hidden_nodes, num_layers, dropout = dropout).to(device)
    encoder_packed_outputs, encoder_final_hidden = enc(tensor,length_tensor)
    print('Encoder LSTM output vector after each word: ')
    for i in range(encoder_packed_outputs.data.shape[0]):
      print('after',i+1, 'word')
      print(encoder_packed_outputs.data[i])
    print()
    print('Encoder LSTM final hidden vector')
    print(encoder_final_hidden)
    print()
    dec = decoder_part(num_hidden_nodes,num_hidden_nodes,3).to(device)
    decoder_outputs, decoder_final_hidden = dec(encoder_final_hidden)
    print('Decoder LSTM output vector after each time step(total 3 time steps)')
    for i in range(decoder_outputs.shape[0]):
      print('after',i+1, 'time step')
      print(decoder_outputs[i])
    print()
    print('Decoder LSTM final hidden vector')
    print(decoder_final_hidden)
    print()
    print('Final vector after FC layer')
    print(prediction)
    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]


# enc_packed_outputs, enc_hidden = self.encoder(src,src_len)
# # print(enc_hidden.shape)
# dec_otpt, dec_hidden = self.decoder(enc_hidden)
# # print(dec_hidden.shape)
# dense_outputs = self.fc(dec_hidden)

In [27]:
classify_tweet("A valid explanation for why Trump won't let women on the golf course.")

Encoder LSTM output vector after each word: 
after 1 word
tensor([ 0.0352,  0.3577, -0.0248, -0.0633,  0.2264,  0.3074,  0.0270,  0.1468,
        -0.0827, -0.1270,  0.4442,  0.1063, -0.1246,  0.0489, -0.1838,  0.0282,
         0.2339, -0.0226,  0.2054, -0.3047,  0.0681, -0.0149, -0.1499,  0.0845,
        -0.0093, -0.0070, -0.0747, -0.1579, -0.0337, -0.1937, -0.0428,  0.0438,
        -0.1791,  0.0837,  0.2437, -0.0607, -0.0908,  0.1342, -0.0877, -0.1110,
        -0.0668,  0.4461,  0.0657,  0.0326,  0.1159, -0.4542,  0.1636,  0.0097,
        -0.1505,  0.1608, -0.0608,  0.2582, -0.3004, -0.0185,  0.0580,  0.2978,
        -0.0135,  0.0025, -0.0142, -0.0347,  0.0399,  0.1452, -0.3567, -0.2266,
        -0.0751,  0.1065, -0.3240, -0.0757, -0.0326, -0.0850, -0.1516,  0.0220,
         0.1574,  0.2640,  0.0546,  0.0136,  0.0121,  0.2065,  0.3219, -0.0631,
        -0.2596, -0.0485,  0.0212,  0.0215,  0.1327, -0.1218,  0.0588,  0.1193,
         0.1561, -0.0640, -0.0593, -0.0570,  0.2283,  0.1248, 

  "num_layers={}".format(dropout, num_layers))


'Negative'

In [28]:
classify_tweet("RT @Talkmaster: Oh now I get it. Obama was talking in shorthand and we were just too dumb to understand how smart he really is.  Gagme.")

Encoder LSTM output vector after each word: 
after 1 word
tensor([-0.1336, -0.1873, -0.0797, -0.1048, -0.0683, -0.4130, -0.1124, -0.1877,
         0.1896,  0.0704,  0.1121, -0.0431,  0.1759, -0.0422, -0.0012, -0.1328,
        -0.3805, -0.1145,  0.1671, -0.3878, -0.2770,  0.0176,  0.1299,  0.0374,
        -0.1623, -0.2797,  0.0472, -0.0976, -0.2012,  0.1882,  0.0119, -0.0842,
        -0.3260,  0.0758,  0.3246,  0.1177, -0.0512,  0.2329,  0.0161,  0.0095,
         0.4957, -0.2579, -0.1149,  0.1888, -0.0805,  0.0674, -0.2215,  0.3768,
        -0.1377, -0.1988,  0.0228, -0.0469,  0.2688,  0.0443,  0.1323, -0.0745,
        -0.2560,  0.1402, -0.0408, -0.1374, -0.0790, -0.1629, -0.3620, -0.0447,
         0.1269, -0.0353, -0.0540, -0.4123, -0.1184,  0.2504, -0.0104,  0.1671,
         0.0325,  0.2604,  0.0101,  0.2449,  0.2173,  0.1621, -0.0966,  0.0006,
         0.0543, -0.1400,  0.0901, -0.0347, -0.3698, -0.0932,  0.1912, -0.0879,
         0.3361, -0.0133, -0.0915,  0.0136,  0.3797,  0.1884, 

  "num_layers={}".format(dropout, num_layers))


'Positive'