### NLP in PyTorch
# Binary Classification

In [1]:
import torchtext
import torch
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')
data.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2889,ham,Shuhui has bought ron's present it's a swatch ...,,,
529,ham,Jay says that you're a double-faggot,,,
2590,ham,Lol I have to take it. member how I said my au...,,,
4033,ham,Wot u up 2? Thout u were gonna call me!! Txt b...,,,
2519,ham,Ok. I only ask abt e movie. U wan ktv oso?,,,


In [3]:
data = data.rename(columns={'v1' : 'label', 'v2' : 'text'})
data = data[['text', 'label']]
data.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)

train.reset_index(drop=True)
test.reset_index(drop=True)

train.to_csv('processed/train.csv', index=False)
test.to_csv('processed/test.csv', index=False)

In [5]:
# torchtext Field parameter specify how data should be preprocessed
from torchtext.data import Field

In [6]:
# Field - by default tokenizes the words based on space
TEXT = Field()
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [7]:
datafields = [('text', TEXT), ('label', LABEL)]
datafields

[('text', <torchtext.data.field.Field at 0x21a02154208>),
 ('label', <torchtext.data.field.LabelField at 0x21a02145fc8>)]

In [8]:
# field objects know how to process the raw data 
# we need to specify what raw data they should work on which is done using TabularDataset
from torchtext.data import TabularDataset

# splits data into train/test as torchtext.datasets objects & process the data using the Fields defined
trn, tst = TabularDataset.splits(path='processed', train='train.csv', test='test.csv',
                                 format='csv', skip_header=True, fields=datafields)

In [9]:
# every record is now represented using an example object
trn[:3]

[<torchtext.data.example.Example at 0x21a02155fc8>,
 <torchtext.data.example.Example at 0x21a02162048>,
 <torchtext.data.example.Example at 0x21a02162088>]

In [10]:
tst[:3]

[<torchtext.data.example.Example at 0x21a02776248>,
 <torchtext.data.example.Example at 0x21a027762c8>,
 <torchtext.data.example.Example at 0x21a02776688>]

In [11]:
# each object is represented by text and label keys
trn[0].__dict__.keys()

dict_keys(['text', 'label'])

In [12]:
print(trn[5].text)

['Text', 'PASS', 'to', '69669', 'to', 'collect', 'your', 'polyphonic', 'ringtones.', 'Normal', 'gprs', 'charges', 'apply', 'only.', 'Enjoy', 'your', 'tones']


In [13]:
print(vars(trn.examples[5]))

{'text': ['Text', 'PASS', 'to', '69669', 'to', 'collect', 'your', 'polyphonic', 'ringtones.', 'Normal', 'gprs', 'charges', 'apply', 'only.', 'Enjoy', 'your', 'tones'], 'label': 'spam'}


In [14]:
# For the TEXT field to convert words into integers, it needs to be told what the entire vocabulary is
# To do this, build_vocab by passing in the dataset to build the vocabulary on
TEXT.build_vocab(trn, max_size=10000)

In [15]:
len(TEXT.vocab)
# extra 2 are for unk (words that are outside of top 10K will be considered to be unknown) 
# and padding (to make words of the same length)

10002

In [16]:
LABEL.build_vocab(tst)
len(LABEL.vocab)
# contains just spam and ham labels

2

In [17]:
# most common 10 words
print(TEXT.vocab.freqs.most_common(25))

[('to', 1744), ('you', 1311), ('I', 1177), ('a', 1060), ('the', 957), ('and', 682), ('in', 638), ('is', 611), ('i', 591), ('u', 549), ('for', 522), ('my', 507), ('of', 463), ('me', 455), ('your', 434), ('on', 395), ('have', 394), ('that', 347), ('2', 321), ('are', 315), ('it', 315), ('call', 307), ('or', 301), ('at', 295), ('be', 280)]


In [18]:
# what words correspond to the numeric integers, you can use itos
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'to', 'you', 'I', 'a', 'the', 'and', 'in', 'is']

In [19]:
# numeric representation of individual strings can be known using stoi
TEXT.vocab.stoi['you']

3

set up an iterator that will iterate over our text in batches

Bucket iterator returns a batch of examples where every example is of a similar length
For each batch of text data that is feed into an RNN, the RNN memory cell has to be unrolled to the length of the sentences in that batch.
All of the sentences in the batch have to have the same length, which means they may need to be padded.
The bucket iterator tries to return examples of a similar length in each batch, thus minimizing the padding per example

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [21]:
from torchtext.data import BucketIterator

train_iterator, test_iterator = BucketIterator.splits(
    (trn, tst), # pass in the datasets we want the iterator to draw data from
    device = device,
    batch_size = 128,
    sort_key = lambda x: len(x.text)
)

In [22]:
import torch.nn as nn

nn.Embedding class in PyTorch converts the one-hot encoded sentences into a dense format using embeddings to represent individual words, rather than one-hot feature vectors

In [23]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # input to RNN cell is the current word's embeddings and the previous hidden state
        # instead of basic RNN cell, can use GRU cell without anyother code changes
        # self.rnn = nn.GRU(embedding_dim, hidden_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(p=0.3)
        
    def forward(self, text):
        
        # text: [scentence_length, batch_size]
        # every input scentence is a list of indices of the one-hot encoded words
        
        embedded = self.embedding(text)
        # words will now be represented using its embeddings
        # embedded: [scentence_length, batch_size, embedding_dim]
        embed_drop = self.dropout(embedded)
        
        # RNN cells produces, output as well as hidden state for next iteration
        output, hidden = self.rnn(embed_drop)
        # output: [scentence_length, batch_size, hidden_dim]
        # hidden: [1, batch_size, hidden_dim]
        
        hidden_1d = hidden.squeeze(0)
        
        out = self.fc(hidden_1d)
        return out

In [24]:
class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super(LSTM, self).__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # input to LSTM cell is the current word's embeddings and the previous hidden state
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(p=0.2)
        
    def forward(self, text):
        
        # text: [scentence_length, batch_size]
        # every input scentence is a list of indices of the one-hot encoded words
        
        embedded = self.embedding(text)
        # words will now be represented using its embeddings
        # embedded: [scentence_length, batch_size, embedding_dim]
        embed_out = self.dropout(embedded)
        # LSTM cells produces, output, last hidden state and last cell state of LSTM cell
        output, (hidden, _) = self.lstm(embed_out)
        # output: [scentence_length, batch_size, hidden_dim]
        # hidden: [1, batch_size, hidden_dim]
        
        hidden_1d = hidden.squeeze(0)
        
        out = self.fc(hidden_1d)
        return out

In [25]:
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

In [26]:
# model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)
model = LSTM(input_dim, embedding_dim, hidden_dim, output_dim)
model.to(device)

LSTM(
  (embedding): Embedding(10002, 100)
  (lstm): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [27]:
# Binary cross-entropy with logits = cross-entropy for binary classification+sigmoid function to get predictions b/w 0 and 1
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [28]:
def train_model(model, iterator):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        batch.text = batch.text.to(device)

        optimizer.zero_grad()
        # model outputs data in form [batch_size, 1]
        # needs to be converted to [batch_size]
        predictions =  model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.label)

        # sigmoid the predictions to get val b.w 0 and 1
        # round it off, to make it fall under a single label
        rounded_pred = torch.round(torch.sigmoid(predictions))

        correct = (rounded_pred == batch.label).float()

        acc = correct.sum() / len(correct)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [29]:
epochs = 15

for epoch in range(epochs):
    train_loss, train_acc = train_model(model, train_iterator)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f} %')

| Epoch: 01 | Train Loss: 0.470 | Train Acc: 86.35 %
| Epoch: 02 | Train Loss: 0.398 | Train Acc: 86.37 %
| Epoch: 03 | Train Loss: 0.398 | Train Acc: 86.39 %
| Epoch: 04 | Train Loss: 0.398 | Train Acc: 86.45 %
| Epoch: 05 | Train Loss: 0.398 | Train Acc: 86.48 %
| Epoch: 06 | Train Loss: 0.398 | Train Acc: 86.48 %
| Epoch: 07 | Train Loss: 0.397 | Train Acc: 86.47 %
| Epoch: 08 | Train Loss: 0.398 | Train Acc: 86.54 %
| Epoch: 09 | Train Loss: 0.398 | Train Acc: 86.59 %
| Epoch: 10 | Train Loss: 0.398 | Train Acc: 86.54 %
| Epoch: 11 | Train Loss: 0.399 | Train Acc: 86.61 %
| Epoch: 12 | Train Loss: 0.397 | Train Acc: 86.60 %
| Epoch: 13 | Train Loss: 0.398 | Train Acc: 86.59 %
| Epoch: 14 | Train Loss: 0.398 | Train Acc: 86.61 %
| Epoch: 15 | Train Loss: 0.397 | Train Acc: 86.58 %


In [30]:
epoch_loss = 0
epoch_acc = 0

In [31]:
# to check the model against test data
model.eval()

with torch.no_grad():
    
    for batch in test_iterator:
        batch.text = batch.text.to(device)
        
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        
        rounded_pred = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_pred == batch.label).float()
        acc = correct.sum() / len(correct)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    test_loss = epoch_loss/len(test_iterator)
    test_acc = epoch_acc/len(test_iterator)
        
    print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f} %')

| Test Loss: 0.469 | Test Acc: 86.56 %
