In [121]:
import pandas as pd
df = pd.read_csv('/content/tweets-1.csv')

In [122]:
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


In [123]:
df.shape

(1364, 2)

In [124]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [125]:
# !pip install torch --quiet
# !pip uninstall torchtext -y
# !pip install torchtext==0.9.0 --quiet


# Import Library
import random
import torch, torchtext
from torchtext.legacy import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fafd188bb10>

In [126]:
Tweet = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)  # sequential means data is sequential and tokenization is required
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)



In [127]:
fields = [('tweets', Tweet),('labels',Label)]
fields

[('tweets', <torchtext.legacy.data.field.Field at 0x7faef898cbb0>),
 ('labels', <torchtext.legacy.data.field.LabelField at 0x7faef86d5fd0>)]

In [128]:
example = [data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])]   # data.example defines a single training or test example.Stores each column of the example as an attribute.
len(example)

1364

In [129]:
twitterDataset = data.Dataset(example, fields)  # Defines a dataset composed of Examples along with its Fields.
twitterDataset

<torchtext.legacy.data.dataset.Dataset at 0x7faef633cd60>

In [130]:
(train, valid) = twitterDataset.split(split_ratio=[0.85, 0.15], random_state=random.seed(SEED))

In [131]:
(len(train), len(valid))
# len(valid.examples)

(1159, 205)

In [132]:
# type(vars((train.examples[10])))
(vars(train.examples[10]))

{'tweets': ['Obama',
  ',',
  'Romney',
  'agree',
  ':',
  'Admit',
  'women',
  'to',
  'Augusta',
  'golf',
  'club',
  ':',
  'US',
  'President',
  'Barack',
  'Obama',
  'believes',
  'women',
  'should',
  'be',
  'allowe',
  '...',
  'http://t.co/PVKrepqI'],
 'labels': 0}

In [133]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [134]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)
print('Labels : ', Tweet.vocab.stoi)


Size of input vocab :  4653
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1069), (':', 781), ('#', 780), ('.', 761), (',', 598), ('"', 550), ('the', 542), ('RT', 516), ('?', 419), ('to', 400)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [135]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [136]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size = 20, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)

In [137]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)
# len(Tweet.vocab)

In [138]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer 
        # self.encoder = nn.LSTM(embedding_dim, 
        #                    hidden_dim, 
        #                    num_layers=n_layers, 
        #                    dropout=dropout,
        #                    batch_first=True)
        # self.encoder = nn.RNN(embedding_dim, hidden_dim, num_layers = n_layers,dropout=dropout,batch_first=True )  # RNN
        # self.encoder = nn.GRU(embedding_dim, 
        #                    hidden_dim, 
        #                    num_layers=n_layers, 
        #                    dropout=dropout,
        #                    batch_first=True)
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True, bidirectional= True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        # print('Step 1', text.shape, (text.data.shape)) # [32, 15] - 32 (batch size) sentences with 15 word max
        # text = [batch size, sent_length]  
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]  
        # print('Step 2', embedded.shape)  # [32, 15, 300] - 32 sentences with 15 word max and each word with 300d embedding
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        # print('Step 3', packed_embedded.data.shape)  # [476, 300])  # 476 total words in 32 sentences and each word has 300 dim embedding. This excludes <pad> and <unk> sequences in input

        packed_output, (hidden, cell) = self.encoder(packed_embedded) # LSTM
        # packed_output, hidden = self.encoder(packed_embedded) # RNN
        # packed_output, (hidden) = self.encoder(packed_embedded) # GRU
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        # print('Step 4', hidden.shape) # torch.Size([2, 32, 100]) - 2 layers of LSTM with 32 output words (output of LSTM is a word not a sentence) and 100 as hidden layer dimension
        # Hidden = [batch size, hid dim * num directions]

        dense_outputs = self.fc(hidden)   
        # print('Step 5', dense_outputs.shape)# torch.Size([2, 32, 3]) 

        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
        # print('Step 6', output.shape)    # torch.Size([32, 3])  # 32 words for each of input sentences with 3 outputs coming out as relative probabilities.
        return output


In [139]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3
num_layers = 1
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)



In [140]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data.shape)  # 5*3 not 3*5; same is issue with hidden weights

classifier(
  (embedding): Embedding(4653, 300)
  (encoder): LSTM(300, 100, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=100, out_features=3, bias=True)
)
The model has 1,717,803 trainable parameters
embedding.weight torch.Size([4653, 300])
encoder.weight_ih_l0 torch.Size([400, 300])
encoder.weight_hh_l0 torch.Size([400, 100])
encoder.bias_ih_l0 torch.Size([400])
encoder.bias_hh_l0 torch.Size([400])
encoder.weight_ih_l0_reverse torch.Size([400, 300])
encoder.weight_hh_l0_reverse torch.Size([400, 100])
encoder.bias_ih_l0_reverse torch.Size([400])
encoder.bias_hh_l0_reverse torch.Size([400])
fc.weight torch.Size([3, 100])
fc.bias torch.Size([3])


In [141]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [142]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets   
        # print('seperate', tweet)
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [143]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = binary_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [144]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.060 | Train Acc: 55.42%
	 Val. Loss: 1.007 |  Val. Acc: 64.09% 

	Train Loss: 0.959 | Train Acc: 72.50%
	 Val. Loss: 0.915 |  Val. Acc: 68.18% 

	Train Loss: 0.866 | Train Acc: 71.46%
	 Val. Loss: 0.854 |  Val. Acc: 71.82% 

	Train Loss: 0.811 | Train Acc: 75.86%
	 Val. Loss: 0.841 |  Val. Acc: 71.82% 

	Train Loss: 0.781 | Train Acc: 78.27%
	 Val. Loss: 0.811 |  Val. Acc: 75.91% 

	Train Loss: 0.757 | Train Acc: 80.69%
	 Val. Loss: 0.814 |  Val. Acc: 74.09% 

	Train Loss: 0.739 | Train Acc: 81.98%
	 Val. Loss: 0.787 |  Val. Acc: 76.82% 

	Train Loss: 0.723 | Train Acc: 83.79%
	 Val. Loss: 0.783 |  Val. Acc: 76.82% 

	Train Loss: 0.709 | Train Acc: 85.08%
	 Val. Loss: 0.779 |  Val. Acc: 78.18% 

	Train Loss: 0.698 | Train Acc: 86.81%
	 Val. Loss: 0.778 |  Val. Acc: 78.64% 



LSTM: 

Step 1 torch.Size([20, 26]) torch.Size([20, 26]) 

Step 2 torch.Size([20, 26, 300]) 

Step 3 torch.Size([506, 300]) 

Step 4 torch.Size([1, 20, 100]) 

Step 5 torch.Size([1, 20, 3]) 

Step 6 torch.Size([20, 3]) 


RNN: Underfitting 76:85  
LSTM also shows similar result except for intial layers which start very low.   
GRU & Bidirectional LSTM works best still having underfitting 79:86 & 79:87
 