## Importing the libraries

In [1]:
!pip install -q torchtext 



In [2]:
import torch
import torchtext
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Data Preprocessing

In [3]:
df = pd.read_csv('../input/nlp-getting-started/train.csv') 
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

In [4]:
# shuffling the dataset
df = df.sample(frac=1)

In [5]:
df.shape

(7613, 5)

In [6]:
df.head()

Unnamed: 0,id,keyword,location,text,target
2569,3684,destroy,,(SJ GIST): 148 Houses Farm Produce Destroy... ...,1
4149,5898,harm,Kansas City,@dinallyhot Love what you picked! We're playin...,0
767,1110,blew%20up,california mermaid ?,Some guy whistled at me in the parking lot &am...,0
7482,10704,wreck,"Atlanta, Georgia",#Trump debate will be most highly watched show...,0
1228,1769,buildings%20burning,"Washington, D.C.",Watching Xela firefighters struggle to save bu...,1


In [33]:
# training and validation set

train_df = df[:6090]
valid_df = df[6090:6850]
test_df = df[6850:]

train_df.shape, valid_df.shape, test_df.shape

((6090, 5), (760, 5), (763, 5))

**Dataset**

In [34]:
class Data(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['text']
        label = self.df.iloc[idx]['target']
        return text, label

In [35]:
train_dataset = Data(train_df)
valid_dataset = Data(valid_df)
test_dataset = Data(test_df)
len(train_dataset), len(valid_dataset), len(test_dataset)

(6090, 760, 763)

In [36]:
ds = Data(df)

**tokenization**

In [37]:
# finding unique tokens (words)

import re
from collections import Counter, OrderedDict
from nltk.tokenize import word_tokenize

def tokenizer(text):
    tokenized = word_tokenize(text)
    return tokenized

token_counts = Counter()

for text, label in ds:
    tokens = tokenizer(text)
    token_counts.update(tokens)

print('Vocab length', len(token_counts))

Vocab length 27291


**numericalization**

In [38]:
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[42, 17, 60, 3299]


**Dataloader**

In [39]:
# Step 3-A: define the functions for transformation

# device = torch.device("cuda:0")
device = 'cpu'

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _text, _label in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), 
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [40]:
## Take a small batch

from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[  41, 8169, 8170,   38,    2, 8171, 2036, 5285, 8172, 1821,   16,    3,
            2, 8173,    4, 8174,   83,   34,    2, 8175,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
        [   7, 8176,  517,   87,   22, 2037,   15,  130,   91, 1054, 3170,  985,
           31, 3982, 3983, 3171,  172,   12,   22,   15, 1314,   32,   43,   25,
         2644,    2,    3,    2, 3984],
        [ 330,  752, 8177,   30,   46,   11,    8, 3172,  414,   32,   43,   25,
           24,  138,   53,  233,   27,    8,  415,  491,   28, 3985,   48,  249,
           11,    8,  154,    5,    5],
        [   4, 2038, 2297,   58,   36,  295, 3986, 1822,  545,  387,  194, 1823,
         8178,    6,   13,  168,   58,  342, 3987,   18,    9, 5286, 3988,  260,
            6,    0,    0,    0,    0]])
tensor([0., 0., 0., 0.])
tensor([20, 29, 29, 25])
torch.Size([4, 29])


In [41]:
## Step 4: batching the datasets

batch_size = 32  

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [42]:
len(train_dl), len(valid_dl), len(test_dl)

(191, 24, 24)

## Model Building

In [43]:
embedding = nn.Embedding(num_embeddings=10, 
                         embedding_dim=3, 
                         padding_idx=0)
 
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-1.0233, -0.5962, -1.0055],
         [-0.2106, -0.0075, -1.7869],
         [-0.9962, -0.8313,  1.3075],
         [-1.1628,  0.1196, -0.1631]],

        [[-0.9962, -0.8313,  1.3075],
         [ 1.6103, -0.7040, -0.1853],
         [-0.2106, -0.0075, -1.7869],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward>)


In [44]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
         
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [45]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [48]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} train_loss: {loss_train:.4f} valid_loss: {loss_valid:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 train_loss: 0.0864 valid_loss: 0.0020 val_accuracy: 1.0000
Epoch 1 train_loss: 0.0008 valid_loss: 0.0007 val_accuracy: 1.0000
Epoch 2 train_loss: 0.0004 valid_loss: 0.0003 val_accuracy: 1.0000
Epoch 3 train_loss: 0.0002 valid_loss: 0.0002 val_accuracy: 1.0000
Epoch 4 train_loss: 0.0001 valid_loss: 0.0001 val_accuracy: 1.0000
Epoch 5 train_loss: 0.0001 valid_loss: 0.0001 val_accuracy: 1.0000
Epoch 6 train_loss: 0.0000 valid_loss: 0.0000 val_accuracy: 1.0000
Epoch 7 train_loss: 0.0000 valid_loss: 0.0000 val_accuracy: 1.0000
Epoch 8 train_loss: 0.0000 valid_loss: 0.0000 val_accuracy: 1.0000
Epoch 9 train_loss: 0.0000 valid_loss: 0.0000 val_accuracy: 1.0000
