In [1]:
%%capture
!pip install wandb --upgrade

In [2]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
import pandas as pd
df = pd.read_csv("/content/tweets.csv")
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


In [4]:
df.shape

(1364, 2)

In [5]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [6]:
import random
import torch, torchtext
from torchtext import data

In [7]:
# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fed76fe75d0>

In [8]:
Tweet = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = torchtext.legacy.data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [9]:
fields = [('tweet', Tweet), ('label', Label)]

In [10]:
example = [torchtext.legacy.data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])] 

In [11]:
twitterDataset = torchtext.legacy.data.Dataset(example, fields)

In [12]:
(train, valid) = twitterDataset.split(split_ratio=[85, 15], random_state = random.seed(SEED))

In [13]:
len(train), len(valid)

(1159, 205)

In [14]:
vars(train.examples[11])

{'label': 1,
 'tweet': ['@sweetbay',
  'That',
  'was',
  'Paul',
  'Ryan',
  "'s",
  'budget',
  '.',
  'How',
  'did',
  'Obama',
  "'s",
  'budget',
  'do',
  '?',
  'Getting',
  'educated',
  'on',
  'the',
  'facts',
  'is',
  'the',
  'first',
  'step',
  'in',
  'losing',
  'that',
  'liberalism',
  '!']}

In [15]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [16]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4651
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1069), (':', 783), ('#', 780), ('.', 761), (',', 598), ('"', 550), ('the', 542), ('RT', 516), ('?', 419), ('to', 400)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [18]:
train_iterator, valid_iterator = torchtext.legacy.data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweet),
                                                            sort_within_batch=True, device = device)

In [19]:
next(iter(train_iterator))
#len(train.examples[11].tweet)


[torchtext.legacy.data.batch.Batch of size 32]
	[.tweet]:('[torch.cuda.LongTensor of size 32x8 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)]

In [20]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

In [25]:
import torch.nn as nn
import torch.nn.functional as F

class classifier_connected_arch(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=1, 
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        self.decoder = nn.LSTM(100, 
                           hidden_dim, 
                           num_layers=1, 
                           batch_first=True)
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)

        packed_encoded_output, (hidden, cell) = self.encoder(packed_embedded)

        packed_decoded_output, (hidden, cell) = self.decoder(packed_encoded_output)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output, packed_encoded_output, packed_decoded_output

In [26]:
import torch.nn as nn
import torch.nn.functional as F

class classifier_disconnected_arch(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=1, 
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        self.decoder = nn.LSTM(100, 
                           hidden_dim, 
                           num_layers=1, 
                           batch_first=True)
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        
        encoded_output, (hidden, cell) = self.encoder(embedded)
        last_layer_output = encoded_output.data[:,-1:,:]

        output_stack = []
        for i in range(4):
                decoded_output, (hidden, cell) = self.decoder(last_layer_output)
                output_stack.append(decoded_output)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            

        return output, encoded_output, output_stack

In [50]:
config = dict(
    size_of_vocab = len(Tweet.vocab),
    embedding_dim = 300,
    num_hidden_nodes = 100,
    num_output_nodes = 3,
    num_layers = 1,
    dropout = 0.2
    )

# Define hyperparameters

config["size_of_vocab"]
# Instantiate the model classifier_connected_arch
# model = classifier_connected_arch(config["size_of_vocab"], config["embedding_dim"], config["num_hidden_nodes"], config["num_output_nodes"], config["num_layers"], dropout = config["dropout"])
model = classifier_disconnected_arch(config["size_of_vocab"], config["embedding_dim"], config["num_hidden_nodes"], config["num_output_nodes"], config["num_layers"], dropout = config["dropout"])


In [51]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier_disconnected_arch(
  (embedding): Embedding(4651, 300)
  (encoder): LSTM(300, 100, batch_first=True)
  (decoder): LSTM(100, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=3, bias=True)
)
The model has 1,637,203 trainable parameters


In [52]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)


In [53]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweet  
        
        # convert to 1D tensor
        predictions, _, _ = model(tweet, tweet_lengths)  
        predictions = predictions.squeeze()
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [54]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweet
            
            # convert to 1d tensor
            predictions, _ , _ = model(tweet, tweet_lengths)
            predictions = predictions.squeeze()
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [55]:
N_EPOCHS = 20
best_valid_loss = float('inf')
with wandb.init(project="session-6", config=config):
    wandb.watch(model, criterion, log="all", log_freq=10)
    for epoch in range(N_EPOCHS):
        
        # train the model
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        
        # evaluate the model
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
        # save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'saved_weights.pt')
        wandb.log({"epoch": epoch, "train_loss": train_loss , "train_acc": train_acc,  "val_loss": valid_loss, "val_acc": valid_acc})
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.086 | Train Acc: 69.12%
	 Val. Loss: 1.076 |  Val. Acc: 68.30% 

	Train Loss: 1.071 | Train Acc: 69.12%
	 Val. Loss: 1.056 |  Val. Acc: 68.30% 

	Train Loss: 1.052 | Train Acc: 69.12%
	 Val. Loss: 1.032 |  Val. Acc: 68.30% 

	Train Loss: 1.029 | Train Acc: 69.12%
	 Val. Loss: 1.000 |  Val. Acc: 68.30% 

	Train Loss: 1.002 | Train Acc: 69.12%
	 Val. Loss: 0.973 |  Val. Acc: 68.30% 

	Train Loss: 0.972 | Train Acc: 69.12%
	 Val. Loss: 0.950 |  Val. Acc: 68.30% 

	Train Loss: 0.944 | Train Acc: 69.12%
	 Val. Loss: 0.932 |  Val. Acc: 68.30% 

	Train Loss: 0.930 | Train Acc: 69.12%
	 Val. Loss: 0.919 |  Val. Acc: 68.30% 

	Train Loss: 0.920 | Train Acc: 69.12%
	 Val. Loss: 0.910 |  Val. Acc: 68.30% 

	Train Loss: 0.906 | Train Acc: 69.12%
	 Val. Loss: 0.903 |  Val. Acc: 68.30% 

	Train Loss: 0.901 | Train Acc: 69.12%
	 Val. Loss: 0.897 |  Val. Acc: 68.30% 

	Train Loss: 0.890 | Train Acc: 69.12%
	 Val. Loss: 0.892 |  Val. Acc: 68.30% 

	Train Loss: 0.890 | Train Acc: 69.12%
	

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,19.0
train_loss,0.86537
train_acc,0.69124
val_loss,0.87902
val_acc,0.68304
_runtime,12.0
_timestamp,1623339097.0
_step,19.0


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_loss,██▇▆▅▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁
train_acc,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▇▆▅▄▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▂▂▃▃▃▃▄▄▅▅▅▆▆▆▆▇▇██
_timestamp,▁▂▂▃▃▃▃▄▄▅▅▅▆▆▆▆▇▇██
_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██


In [58]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    print(tokenized)
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction, enc, dec = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()], enc, dec, tokenized

In [59]:
pred, enc, dec, tokenized = classify_tweet("A valid explanation for why Trump won't let women on the golf course.")

['A', 'valid', 'explanation', 'for', 'why', 'Trump', 'wo', "n't", 'let', 'women', 'on', 'the', 'golf', 'course', '.']


In [60]:
enc.data.shape[0]

1

In [61]:
for i in range(enc.data.shape[1]):
    print(tokenized[i])
    print(enc[0][i])
    print("-"* 100)

A
tensor([ 0.0431,  0.3128, -0.0834, -0.0561,  0.0753, -0.4185, -0.0869,  0.0546,
        -0.0699, -0.0739, -0.2454,  0.1826,  0.0034,  0.1956, -0.2409, -0.0176,
        -0.1535,  0.0246,  0.2640, -0.1745, -0.0527,  0.1092,  0.3016,  0.2156,
        -0.0108,  0.0161,  0.1804,  0.0386, -0.2951, -0.3025,  0.0943, -0.1377,
        -0.2137,  0.0958, -0.0568,  0.1755, -0.0952,  0.0382, -0.0458, -0.0078,
        -0.0657,  0.0849,  0.0266, -0.0072,  0.0755, -0.2042,  0.3718,  0.0024,
        -0.2598,  0.3956, -0.1675,  0.1869, -0.0673, -0.0793,  0.0922,  0.0134,
        -0.2729,  0.0040,  0.0280, -0.0348,  0.3063,  0.1800, -0.1911, -0.0460,
        -0.0604,  0.1184, -0.3419, -0.1800,  0.0823, -0.0234,  0.2799,  0.1220,
         0.0264, -0.1698,  0.0613,  0.0186,  0.0480,  0.2481,  0.1661, -0.0455,
        -0.0689,  0.0621,  0.0337, -0.1824,  0.2746, -0.1772,  0.0857,  0.0864,
         0.0382,  0.0246, -0.0810, -0.0392,  0.1773, -0.0134, -0.2248,  0.0152,
         0.0447,  0.2105, -0.3283,  0.

In [63]:
for i in range(4):
    print(dec[i])

tensor([[[ 0.0576,  0.0086,  0.1627,  0.1050,  0.0709, -0.0079,  0.0873,
          -0.1580, -0.0468,  0.1488, -0.0952, -0.0512,  0.2427,  0.0115,
          -0.0869,  0.0545, -0.0095, -0.0430,  0.1399,  0.2015,  0.1519,
           0.0887, -0.0443,  0.0204,  0.1215,  0.0808, -0.0367,  0.1204,
          -0.1028, -0.0247,  0.0463, -0.0911, -0.0046, -0.0258, -0.1187,
          -0.1129,  0.1020, -0.0941, -0.1348,  0.1232, -0.1346, -0.0585,
           0.2150, -0.0254,  0.0695, -0.2412,  0.0406, -0.0386, -0.2741,
           0.1612, -0.2460, -0.0604,  0.1664, -0.2078,  0.1106, -0.0500,
          -0.1183, -0.1632,  0.3300,  0.0408, -0.1406,  0.2410,  0.1216,
          -0.0505, -0.0950,  0.0771,  0.0400, -0.0595, -0.0246, -0.2470,
          -0.1390,  0.0558,  0.0748, -0.0749,  0.0563,  0.0674,  0.1807,
          -0.1447, -0.0081, -0.0025,  0.0721,  0.1840, -0.0920,  0.0180,
          -0.0142,  0.1209, -0.0565, -0.1928,  0.2055,  0.0058, -0.0376,
          -0.2789, -0.3549,  0.2111,  0.1810, -0.01