In [1]:
import pandas as pd 
import spacy
from torch.optim import Adam
nlp = spacy.load("en")

## Dataset Preview

In [2]:
df = pd.read_csv('tweets.csv')
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


In [3]:
df.shape

(1364, 2)

In [4]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [5]:
df['tweets'][4]

'RT @wardollarshome: Obama has approved more targeted assassinations than any modern US prez; READ & RT: http://t.co/bfC4gbBW'

In [6]:
text = nlp(df['tweets'][4])

In [7]:
for token in text:
    print(token.text, "-->", token.dep_, "-->", token.pos_)

RT --> dep --> PROPN
@wardollarshome --> advmod --> X
: --> punct --> PUNCT
Obama --> nsubj --> PROPN
has --> aux --> AUX
approved --> ROOT --> VERB
more --> advmod --> ADV
targeted --> amod --> VERB
assassinations --> dobj --> NOUN
than --> prep --> SCONJ
any --> det --> DET
modern --> amod --> ADJ
US --> compound --> PROPN
prez --> pobj --> PROPN
; --> punct --> PUNCT
READ --> appos --> PROPN
& --> cc --> CCONJ
RT --> conj --> PROPN
: --> punct --> PUNCT
http://t.co/bfC4gbBW --> appos --> PROPN


## Defining Fields

In [8]:
import random
import torch, torchtext
from torchtext import data

SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x2a26e6a48d0>

In [89]:
Tweet = data.Field(sequential=True, lower=True, 
                   tokenize='spacy', batch_first=True, include_lengths=True)
Label = data.LabelField(sequential=False, batch_first=True)

In [90]:
fields = [('tweets', Tweet), ('labels', Label)]

In [91]:
twitterDataset = data.TabularDataset("tweets.csv", format="CSV", 
                    fields=fields, skip_header=True)

In [92]:
twitterDataset.examples[0].tweets, twitterDataset.examples[0].labels

(['obama',
  'has',
  'called',
  'the',
  'gop',
  'budget',
  'social',
  'darwinism',
  '.',
  'nice',
  'try',
  ',',
  'but',
  'they',
  'believe',
  'in',
  'social',
  'creationism',
  '.'],
 '1')

In [93]:
train, valid = twitterDataset.split(split_ratio=[0.85, 0.15], 
                                    random_state=random.seed(SEED))

In [94]:
len(train), len(valid)

(1159, 205)

In [95]:
vars(train.examples[0])

{'tweets': ['as',
  'obama',
  "'s",
  'policies',
  'have',
  'turned',
  'women',
  'into',
  '3rd',
  'class',
  'citizens',
  ',',
  'he',
  'is',
  'working',
  'on',
  'bringing',
  'down',
  'the',
  'whole',
  'economy',
  '.',
  '#',
  'edshow'],
 'labels': '0'}

In [96]:
Tweet.build_vocab(train)

In [97]:
Tweet.vocab.freqs.most_common(5)

[('obama', 1149), ('#', 822), (':', 784), ('.', 775), ('the', 617)]

In [98]:
Label.build_vocab(train)

In [99]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [100]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size=32, 
                                                           sort_key = lambda x:len(x.tweets),
                                                           sort_within_batch = True, device = device)

In [101]:
import os, pickle
with open('tweet_vocab.pkl', 'wb') as tokens:
    pickle.dump(Tweet.vocab.stoi, tokens)

In [102]:
import torch.nn as nn
import torch.nn.functional as F

In [162]:
class classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(classifier, self).__init__()
        
        #Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #Encoder
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, 
                               num_layers = n_layers, dropout = dropout,
                               batch_first = True)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
#         print("Embedding Shape >>> :", embedded.shape)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, 
                                                            text_lengths.cpu(), 
                                                            batch_first=True)

        packed_output, (hidden, cell) = self.encoder(packed_embedded)
#         print("Hidden Shape >>> :", hidden.shape)
        dense_outputs = self.fc(hidden)

        output = F.softmax(dense_outputs[0], dim=1)
#         print("Output Shape >>> :", output.shape)
        return output

In [163]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3
num_layers = 2
dropout = 0.2

In [164]:
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,
                  num_output_nodes, num_layers, dropout)

In [165]:
print(model)

classifier(
  (embedding): Embedding(3996, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=3, bias=True)
)


In [166]:
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [167]:
count_params(model)

1440703

In [193]:
def binary_accuracy(preds, y):
#     correct = (torch.max(preds, 1)[1] == y).float()
    correct = (torch.argmax(preds, axis=1) == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [194]:
optimizer = Adam(model.parameters(), lr = 0.0001)
criterion = nn.CrossEntropyLoss()

In [195]:
model = model.to(device)
criterion = criterion.to(device)

In [196]:
def train(model, criterion, optimizer, iterator):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        tweet, tweet_length = batch.tweets
        pred = model(tweet, tweet_length).squeeze()
#         print(pred.shape, batch.labels.shape)
        loss = criterion(pred, batch.labels)
        acc = binary_accuracy(pred, batch.labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss+= loss.item()
        epoch_acc+= acc.item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [197]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():        
        for batch in iterator:
            tweet, tweet_length = batch.tweets
            pred = model(tweet, tweet_length)
            loss = criterion(pred, batch.labels)
            acc = binary_accuracy(pred, batch.labels)
            
            epoch_loss+= loss.item()
            epoch_acc+= acc.item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)
        

In [198]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, criterion, optimizer, train_iterator)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Sentiment_weights.pt')
        
    print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% \n")
    

	Train Loss: 1.026 | Train Acc: 62.54%
	Valid Loss: 1.001 | Valid Acc: 65.18% 

	Train Loss: 0.992 | Train Acc: 70.56%
	Valid Loss: 0.967 | Valid Acc: 70.09% 

	Train Loss: 0.958 | Train Acc: 70.39%
	Valid Loss: 0.935 | Valid Acc: 70.09% 

	Train Loss: 0.925 | Train Acc: 70.98%
	Valid Loss: 0.902 | Valid Acc: 72.32% 

	Train Loss: 0.894 | Train Acc: 70.90%
	Valid Loss: 0.870 | Valid Acc: 73.21% 

	Train Loss: 0.862 | Train Acc: 72.00%
	Valid Loss: 0.840 | Valid Acc: 74.11% 

	Train Loss: 0.836 | Train Acc: 74.36%
	Valid Loss: 0.819 | Valid Acc: 75.45% 

	Train Loss: 0.817 | Train Acc: 75.63%
	Valid Loss: 0.806 | Valid Acc: 77.23% 

	Train Loss: 0.803 | Train Acc: 76.81%
	Valid Loss: 0.796 | Valid Acc: 77.68% 

	Train Loss: 0.791 | Train Acc: 77.74%
	Valid Loss: 0.789 | Valid Acc: 77.23% 



In [199]:
one_batch = list(train_iterator)[0]
one_batch_tweets, one_batch_tl = one_batch.tweets
one_batch_labels = one_batch.labels

In [203]:
class classifier_B(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        
        #Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #Encoder
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, 
                               num_layers = n_layers, dropout = dropout,
                               batch_first = True)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
#         print("Embedding Shape >>> :", embedded.shape)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, 
                                                            text_lengths.cpu(), 
                                                            batch_first=True)

        packed_output, (hidden, cell) = self.encoder(packed_embedded)
#         print("Hidden Shape >>> :", hidden.shape)
        dense_outputs = self.fc(hidden)

#         output = F.softmax(dense_outputs[0], dim=1)
#         print("Output Shape >>> :", output.shape)
        return dense_outputs

In [204]:
model1 = classifier_B(size_of_vocab, embedding_dim, num_hidden_nodes,
                  num_output_nodes, num_layers, dropout).to(device)

In [207]:
model1(one_batch_tweets, one_batch_tl)[1]

tensor([[ 3.0392e-02, -5.4347e-02, -7.9423e-02],
        [-2.9230e-02, -4.5042e-02, -8.6598e-02],
        [ 2.4554e-02, -4.5730e-02, -7.2890e-02],
        [-2.6812e-02, -9.4583e-02,  1.8838e-02],
        [-3.8122e-02, -1.0819e-01, -5.4539e-02],
        [ 8.4793e-03, -4.4041e-02, -1.7424e-02],
        [ 2.2971e-02, -1.0575e-01, -2.2305e-02],
        [-4.0405e-05, -1.4334e-01, -6.5632e-02],
        [ 1.9115e-03, -1.5202e-01, -4.9661e-02],
        [ 3.9820e-02, -1.2150e-01,  3.0164e-02],
        [ 1.8126e-02, -1.3804e-01, -6.4614e-02],
        [-1.0994e-02, -1.0100e-01, -3.6021e-02],
        [-3.7468e-02, -6.9810e-02, -5.8716e-02],
        [ 5.7924e-03, -1.0713e-01, -3.1258e-02],
        [-1.7007e-02, -9.2673e-02, -4.1319e-03],
        [-3.8182e-03, -8.1070e-02, -2.2898e-02],
        [ 1.6265e-02, -1.4541e-01, -5.7484e-02],
        [ 4.8177e-03, -3.4969e-02, -5.0477e-02],
        [ 2.1575e-02, -5.0200e-02, -3.0151e-02],
        [ 1.5401e-02, -7.3037e-02, -2.0071e-02],
        [ 1.3708e-02