### NLP in PyTorch
# Binary Classification
### Using GLoVe Embeddings 

In [1]:
import torch
import torchtext
import pandas as pd
import numpy as np
import spacy
import re

In [2]:
tweets = pd.read_csv('tweets.csv')
tweets.sample(5)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
30524,30537,pos,Sentiment140,@Admance good morning mate
127,128,pos,Sentiment140,wide awake NOT!!!
8045,8048,pos,Sentiment140,#StarTrek in 41 minutes haven't acquired #IPh...
38391,38404,neg,Sentiment140,@amethystgurl07 i know exactly what you feel. ...
49987,50000,pos,Sentiment140,@ashsimpsonwentz @petewentz happy anniversary!...


In [3]:
tweets = tweets[['SentimentText', 'Sentiment']]
tweets.head()

Unnamed: 0,SentimentText,Sentiment
0,is so sad for my APL frie...,neg
1,I missed the New Moon trail...,neg
2,omg its already 7:30 :O,pos
3,.. Omgaga. Im sooo im gunna CRy. I've been at...,neg
4,i think mi bf is cheating on me!!! ...,neg


In [4]:
# good split ~ low skewness
tweets['Sentiment'].value_counts()

pos    26921
neg    23079
Name: Sentiment, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(tweets, test_size=0.2)

In [6]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [7]:
train.to_csv('processed/train_tweets.csv', index=False)
test.to_csv('processed/test_tweets.csv', index=False)

In [8]:
# need to remove urls and @ mentions from twitter data and convert them to lower case
# p1 keeps all the alphabets and numeric data
# p2 removes http(s) word

def clean_tweets(text):
    p1 = re.compile(r'[^A-Za-z0-9]+')
    text = re.sub(p1, ' ', str(text))
    
    p2 = re.compile(r'https? ')
    text = re.sub(p2, ' ', text)
    
    text = re.sub('\s\s', ' ', text)
    text = text.lower()
    text = text.strip()
    return text.split()

In [9]:
# if pre-processing is defined, it will not tokenize by default which is necessary
TEXT = torchtext.data.Field(preprocessing=clean_tweets)
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [10]:
datafields = [('SentimentText', TEXT), ('Sentiment', LABEL)]
datafields

[('SentimentText', <torchtext.data.field.Field at 0x23c50326988>),
 ('Sentiment', <torchtext.data.field.LabelField at 0x23c503269c8>)]

In [11]:
from torchtext.data import TabularDataset
trn, tst = TabularDataset.splits(path='processed', train='train_tweets.csv', test='test_tweets.csv',
                                format='csv', skip_header=True, fields=datafields)

In [12]:
print(vars(trn[1]))

{'SentimentText': ['aceofsabres', 'apparently', 'the', 'iphone', 'still', 'likes', 'to', 'fuck', 'with', 'my', 'dyslexia', 'that', 'should', 'have', 'been', 'dragons'], 'Sentiment': 'neg'}


### building vocab using GLoVe Embeddings

In [13]:
glove = torchtext.vocab.GloVe(name='6B', dim=100)

In [14]:
# trn has a len of 40K, we'll encode top 25K and use unk for rest
TEXT.build_vocab(trn, max_size=25000, vectors=glove, unk_init=torch.Tensor.normal_)
len(TEXT.vocab)

25002

In [15]:
LABEL.build_vocab(trn)
LABEL.vocab.freqs

Counter({'neg': 18472, 'pos': 21528})

In [16]:
print(TEXT.vocab.freqs.most_common(25))

[('i', 24333), ('the', 12196), ('to', 11987), ('you', 10680), ('a', 9148), ('it', 8357), ('and', 6873), ('my', 6302), ('quot', 5593), ('is', 5161), ('that', 5016), ('for', 4966), ('s', 4964), ('in', 4907), ('t', 4769), ('me', 4554), ('of', 4407), ('on', 3935), ('have', 3679), ('so', 3626), ('but', 3513), ('m', 3458), ('be', 2913), ('not', 2860), ('just', 2761)]


In [17]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'i', 'the', 'to', 'you', 'a', 'it', 'and', 'my']

In [18]:
TEXT.vocab.stoi['london']

736

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [20]:
from torchtext.data import BucketIterator

train_iterator, test_iterator = BucketIterator.splits(
    (trn, tst), device = device, batch_size=256, sort_key= lambda x: len(x.SentimentText)
)

In [21]:
len(train_iterator), len(test_iterator)

(157, 40)

In [22]:
import torch.nn as nn

In [23]:
# NN architecture
input_dim = len(TEXT.vocab)
embedding_dim = 100 # since we have taken GLoVe of 100 D
hidden_dim = 25
output_dim = 1

In [24]:
class BiDirectRNN(nn.Module):
    def __init__(self):
        super(BiDirectRNN, self).__init__()
            
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # two layer RNN -> GRU cells stacked over each other
        # dropout inside GRU config, is for each GRU layer
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2,
                          bidirectional=True, dropout=0.3)
        # dimensionality of the input is hidden * 2, because we get two hidden states in a bidirectional RNN
        # one in the forward direction and another when we pass input in the reverse direction
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(p=0.3)
        
    def forward(self, text):
        embedded = self.embedding(text)
        embed_drop = self.dropout(embedded)
        output, hidden = self.gru(embed_drop)
        # access the last two hidden states from the forward and backward RNNs and concatenate them to get final hidden
        hidden = torch.cat((hidden[-2,:], hidden[-1,:]), dim=1)
        hidden = self.dropout(hidden)
        hidden_1d = hidden.squeeze(0)
        out = self.fc(hidden_1d)       
        
        return out

In [25]:
model = BiDirectRNN()
model.to(device)
model

BiDirectRNN(
  (embedding): Embedding(25002, 100)
  (gru): GRU(100, 25, num_layers=2, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=50, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [26]:
pretrained_embeddings = TEXT.vocab.vectors
pretrained_embeddings.shape

torch.Size([25002, 100])

### replace the initial weights of the embedding layer with the pre-trained GLoVe embeddings

In [27]:
# since we had added unk_init during build_vocab, unk and pad are already present as first two vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

In [28]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())

In [29]:
# training the model
epochs = 15
model.train()

for epoch in range(epochs):
    
    epoch_loss = 0
    epoch_acc = 0
    
    for batch in train_iterator:
        batch.SentimentText = batch.SentimentText.to(device)
        batch.Sentiment = batch.Sentiment.to(device)
        
        optimizer.zero_grad()
        
        predictions = model(batch.SentimentText).squeeze(1)
        
        loss = criterion(predictions, batch.Sentiment)
        
        rounded_pred = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_pred == batch.Sentiment).float()
        acc = correct.sum() / len(correct)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    train_loss = epoch_loss/len(train_iterator)
    train_acc = epoch_acc/len(train_iterator)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f} %')

| Epoch: 01 | Train Loss: 0.618 | Train Acc: 65.38 %
| Epoch: 02 | Train Loss: 0.500 | Train Acc: 75.85 %
| Epoch: 03 | Train Loss: 0.435 | Train Acc: 80.09 %
| Epoch: 04 | Train Loss: 0.376 | Train Acc: 83.42 %
| Epoch: 05 | Train Loss: 0.335 | Train Acc: 85.65 %
| Epoch: 06 | Train Loss: 0.305 | Train Acc: 87.06 %
| Epoch: 07 | Train Loss: 0.280 | Train Acc: 88.41 %
| Epoch: 08 | Train Loss: 0.262 | Train Acc: 89.08 %
| Epoch: 09 | Train Loss: 0.247 | Train Acc: 89.91 %
| Epoch: 10 | Train Loss: 0.232 | Train Acc: 90.54 %
| Epoch: 11 | Train Loss: 0.221 | Train Acc: 91.07 %
| Epoch: 12 | Train Loss: 0.209 | Train Acc: 91.39 %
| Epoch: 13 | Train Loss: 0.199 | Train Acc: 91.91 %
| Epoch: 14 | Train Loss: 0.188 | Train Acc: 92.54 %
| Epoch: 15 | Train Loss: 0.181 | Train Acc: 92.82 %


In [30]:
# testing the model

epoch_loss = 0
epoch_acc = 0

model.eval()
with torch.no_grad():
    for batch in test_iterator:
        batch.Sentiment = batch.Sentiment.to(device)
        batch.SentimentText = batch.SentimentText.to(device)
        
        predictions = model(batch.SentimentText).squeeze(1)
        loss = criterion(predictions, batch.Sentiment)
        
        rounded_pred = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_pred == batch.Sentiment).float()
        acc = correct.sum() / len(correct)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    test_loss = epoch_loss/len(test_iterator)
    test_acc = epoch_acc/len(test_iterator)
    
    print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

| Test Loss: 0.692 | Test Acc: 73.42%


In [31]:
# manual testing

def manual_test(scentence):
    tokens = clean_tweets(scentence)
    embed = [TEXT.vocab.stoi[w] for w in tokens]
    tensor = torch.LongTensor(embed)
    tensor = tensor.unsqueeze(1)
    cuda_tensor = tensor.to(device)
    pred = torch.sigmoid(model(cuda_tensor))
    return pred.item()

In [32]:
manual_test('I hated the show #awful')

0.9903268218040466

In [33]:
manual_test('great movie... #nice #enjoyed')

0.024314794689416885

In [34]:
manual_test('okayish')

0.41913822293281555