## NLP - Binary Text Classification using RNNs

In [None]:
!pip install torch==1.1.0



In [None]:
import re
import numpy as np
import pandas as pd

from pprint import pprint
from collections import Counter

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

torch.manual_seed(1)

<torch._C.Generator at 0x7f02cf5a5ed0>

In [None]:
!wget https://raw.githubusercontent.com/hallr/DAT_SF_19/master/data/yelp_labelled.txt

In [None]:
# Read the tab-separated file (it has no header row) into the columns
# `text` and `tag`, discarding every row that contains a missing value.
df = pd.read_csv(
    "yelp_labelled.txt", sep="\t", header=None, names=['text', 'tag']
).dropna()
df.head()

Unnamed: 0,text,tag
0,Wow... Loved this place.,1.0
3,Crust is not good.,0.0
4,Not tasty and the texture was just nasty.,0.0
10,Stopped by during the late May bank holiday of...,1.0
11,The selection on the menu was great and so wer...,1.0


In [None]:
# `to_list()` already returns a fresh Python list, one entry per row.
sentence_list = df['text'].to_list()
tag_list = df['tag'].to_list()

In [None]:
sentence_list[1:10]

['Crust is not good.',
 'Not tasty and the texture was just nasty.',
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
 'The selection on the menu was great and so were the prices.',
 'Now I am getting angry and I want my damn pho.',
 "Honeslty it didn't taste THAT fresh.)",
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
 'The fries were great too.',
 'A great touch.']

In [None]:
tag_list[1:10]

[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]

In [None]:
regex_remove_nonalphabets = re.compile('[^a-zA-Z]')
regex_remove_shortwords = re.compile(r'\b\w{1,2}\b')


def _normalise(text):
    """Lower-case `text`, blank out non-letters, then strip 1-2 character words."""
    letters_only = regex_remove_nonalphabets.sub(' ', text.lower())
    return regex_remove_shortwords.sub("", letters_only)


sentence_list = list(map(_normalise, sentence_list))

# Corpus-wide token frequencies, used to drop words that occur only once.
c = Counter()
for _sentence in sentence_list:
    c.update(_sentence.split())

# Keep only tokens seen more than once; joining the split tokens with a
# single space also collapses any leftover runs of whitespace.
sentence_list = [
    ' '.join(filter(lambda token: c[token] > 1, s.split()))
    for s in sentence_list
]

In [None]:
sentence_list[1:10]

['crust not good',
 'not tasty and the texture was just nasty',
 'stopped during the late may off recommendation and loved',
 'the selection the menu was great and were the prices',
 'now getting and want damn pho',
 'didn taste that fresh',
 'the potatoes were like and you could tell they had been made time being kept under',
 'the fries were great too',
 'great touch']

In [None]:
# Build the vocabulary as a *sorted* list of the distinct tokens.
# Iterating a set of strings yields an order that depends on Python's
# per-process hash randomisation, so `list(set(...))` would assign different
# indices to the same word on every kernel restart. Sorting makes the
# word -> index mapping reproducible.
words = sorted({w for sentence in sentence_list for w in sentence.split()})
print(f"Size of word-vocabulary: {len(words)}\n")

Size of word-vocabulary: 844



In [None]:
# Map every vocabulary word to its position in `words`.
word2idx = dict(zip(words, range(len(words))))

In [None]:
# Distinct labels in ascending order. Sorting (instead of relying on set
# iteration order) guarantees that the smallest label is always mapped to
# index 0 when the label -> index table is built from this list.
tags = sorted(set(tag_list))

print(f"Size of tag-vocab: {len(tags)}\n")
print(tags)

Size of tag-vocab: 2

[0.0, 1.0]


In [None]:
# Map every distinct label to its position in `tags`.
tag2idx = dict(zip(tags, range(len(tags))))
print(tag2idx)

{0.0: 0, 1.0: 1}


In [None]:
# Encode each cleaned sentence as the list of its word indices.
X = [list(map(word2idx.__getitem__, s.split())) for s in sentence_list]
X[:3]

[[576, 224, 707, 793],
 [234, 325, 307],
 [325, 447, 282, 120, 135, 546, 28, 315]]

In [None]:
# Encode each label as its integer index.
y = list(map(tag2idx.__getitem__, tag_list))
y[:3]

[1, 0, 0]

In [None]:
# Hold out 30% of the examples for testing. `torch.manual_seed` does not seed
# scikit-learn, so fix `random_state` explicitly to get the same split on
# every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Report how many examples ended up in each split.
for _label, _split in (("X_train", X_train), ("X_test", X_test)):
    print(f"{_label} size: ", len(_split))

X_train size:  700
X_test size:  300


In [None]:
# --- Sizes derived from the data ---
VOCAB_SIZE = len(word2idx)    # number of distinct words
TARGET_SIZE = len(tag2idx)    # number of distinct labels

# --- Model shape ---
EMBEDDING_SIZE = 256          # width of each word embedding
HIDDEN_SIZE = 8               # width of the LSTM hidden state
STACKED_LAYERS = 8            # number of stacked LSTM layers

# --- Optimisation ---
EPOCHS = 100                  # passes over the training loader
BATCH_SIZE = 128              # examples per training batch
LEARNING_RATE = 0.001         # same value as Adam's default learning rate

In [None]:
class TrainData(Dataset):
    """Index-aligned pairing of training inputs with their labels.

    Args:
        X_data: indexable collection of inputs.
        y_data: indexable collection of labels, aligned with `X_data`.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Number of examples (taken from the inputs)."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the `(input, label)` pair stored at `index`."""
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

In [None]:
train_data = TrainData(X_train, y_train)
# shuffle=True reshuffles the examples every epoch; without it each epoch
# replays the same batches in the same order. The identity collate_fn keeps
# every batch as a plain list of (sequence, label) pairs, because the
# sequences have different lengths and cannot be stacked by the default
# collate function.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda x:x)

In [None]:
class TestData(Dataset):
    """Index-aligned pairing of test inputs with their labels.

    Args:
        X_data: indexable collection of inputs.
        y_data: indexable collection of labels, aligned with `X_data`.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Number of examples (taken from the inputs)."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the `(input, label)` pair stored at `index`."""
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

In [None]:
# One example per batch; the identity collate_fn hands each batch over as a
# plain list containing a single (sequence, label) pair.
test_data = TestData(X_test, y_test)
test_loader = DataLoader(
    test_data,
    batch_size=1,
    collate_fn=lambda x:x,
)

In [None]:
class ModelLSTM(nn.Module):
    """Stacked-LSTM binary classifier over variable-length word-index sequences.

    The model embeds each word index, runs the packed batch through a stacked
    LSTM, and projects the final hidden state of the top layer to one logit
    per sequence (no sigmoid is applied here).

    Args:
        embedding_size: dimensionality of the word embeddings.
        vocab_size: number of rows in the embedding table.
        hidden_size: width of the LSTM hidden state.
        target_size: accepted for interface compatibility but unused; the
            output layer always emits a single logit.
        stacked_layers: number of stacked LSTM layers.
        dropout: dropout probability between LSTM layers (default 0.3, the
            previously hard-coded value). Ignored when `stacked_layers` is 1.
    """

    def __init__(self, embedding_size, vocab_size, hidden_size, target_size, stacked_layers, dropout=0.3):
        super(ModelLSTM, self).__init__()

        self.word_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            batch_first=True,
            num_layers=stacked_layers,
            # nn.LSTM only applies dropout *between* layers and warns when a
            # non-zero value is given for a single-layer network.
            dropout=dropout if stacked_layers > 1 else 0.0,
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=1)
        self.tanh = nn.Tanh()

    def forward(self, x_batch):
        """Return one logit per sequence.

        Args:
            x_batch: list of 1-D LongTensors of word indices, one per
                sequence; lengths may differ.

        Returns:
            Tensor of shape (len(x_batch), 1) holding the raw logits.
        """
        len_list = list(map(len, x_batch))

        # Pad to a rectangular batch, embed, then pack so the LSTM skips the
        # padding positions.
        padded_batch = pad_sequence(x_batch, batch_first=True)
        embeds = self.word_embeddings(padded_batch)
        pack_embeds = pack_padded_sequence(embeds, lengths=len_list, batch_first=True, enforce_sorted=False)

        _, (rnn_h, _) = self.lstm(pack_embeds)

        # rnn_h stacks the final hidden state of every layer along dim 0.
        # Select the top layer *before* the projection so the linear head is
        # not evaluated on layers whose output is thrown away.
        top_layer_h = rnn_h[-1]
        y_out = self.linear(self.tanh(top_layer_h))

        return y_out

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Build the model from the hyperparameter constants so that editing the
# configuration cell is enough to change the run.
lstm_model = ModelLSTM(embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE, hidden_size=HIDDEN_SIZE,
                       target_size=TARGET_SIZE, stacked_layers=STACKED_LAYERS)

lstm_model.to(device)
print(lstm_model)

# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Pass LEARNING_RATE explicitly: previously the constant was defined but never
# used, so changing it had no effect on training.
optimizer = optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

ModelLSTM(
  (word_embeddings): Embedding(844, 256)
  (lstm): LSTM(256, 8, num_layers=8, batch_first=True, dropout=0.3)
  (linear): Linear(in_features=8, out_features=1, bias=True)
  (tanh): Tanh()
)


In [None]:
def binary_acc(y_pred, y_test):
    """Percentage of correct binary predictions, rounded to a whole number.

    Args:
        y_pred: tensor of raw logits.
        y_test: tensor of 0/1 targets with the same shape as `y_pred`.

    Returns:
        Scalar tensor holding the rounded accuracy in percent.
    """
    # Sigmoid turns logits into probabilities; rounding turns them into labels.
    predicted_labels = torch.sigmoid(y_pred).round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    fraction_correct = n_correct / y_test.size(0)
    return (fraction_correct * 100).round()

In [None]:
# Training mode enables the dropout between the LSTM layers.
lstm_model.train()

n_batches = len(train_loader)

for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for batch in train_loader:
        # Each batch is a list of (sequence, label) pairs; split it into a
        # list of index tensors and one float target tensor.
        x_batch, y_batch = map(list, zip(*batch))
        x_batch = [torch.tensor(i).to(device) for i in x_batch]
        y_batch = torch.tensor(y_batch).float().to(device)

        optimizer.zero_grad()

        # The model returns (batch, 1); drop the trailing dim to match the targets.
        y_pred = lstm_model(x_batch).squeeze(1)

        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Report the mean over all batches. The previous version printed `acc`,
    # i.e. only the accuracy of the last batch of the epoch, while the
    # accumulated `epoch_acc` was never used.
    print(f'Epoch {e:03}: | Loss: {epoch_loss/n_batches:.5f} | Acc: {epoch_acc/n_batches:.3f}')

Epoch 001: | Loss: 0.70801 | Acc: 37.0
Epoch 002: | Loss: 0.70620 | Acc: 37.0
Epoch 003: | Loss: 0.70413 | Acc: 37.0
Epoch 004: | Loss: 0.70295 | Acc: 37.0
Epoch 005: | Loss: 0.70134 | Acc: 37.0
Epoch 006: | Loss: 0.70013 | Acc: 37.0
Epoch 007: | Loss: 0.69907 | Acc: 37.0
Epoch 008: | Loss: 0.69788 | Acc: 37.0
Epoch 009: | Loss: 0.69639 | Acc: 37.0
Epoch 010: | Loss: 0.69587 | Acc: 37.0
Epoch 011: | Loss: 0.69502 | Acc: 45.0
Epoch 012: | Loss: 0.69531 | Acc: 52.0
Epoch 013: | Loss: 0.69450 | Acc: 53.0
Epoch 014: | Loss: 0.69442 | Acc: 55.0
Epoch 015: | Loss: 0.69470 | Acc: 52.0
Epoch 016: | Loss: 0.69462 | Acc: 60.0
Epoch 017: | Loss: 0.69391 | Acc: 58.0
Epoch 018: | Loss: 0.69405 | Acc: 58.0
Epoch 019: | Loss: 0.69378 | Acc: 63.0
Epoch 020: | Loss: 0.69362 | Acc: 60.0
Epoch 021: | Loss: 0.69347 | Acc: 60.0
Epoch 022: | Loss: 0.69393 | Acc: 57.0
Epoch 023: | Loss: 0.69352 | Acc: 58.0
Epoch 024: | Loss: 0.69378 | Acc: 58.0
Epoch 025: | Loss: 0.69267 | Acc: 60.0
Epoch 026: | Loss: 0.6918

In [None]:
y_out_tags_list = []

# Switch to evaluation mode so the LSTM's inter-layer dropout is disabled;
# without this the predictions are computed with dropout still active.
lstm_model.eval()

with torch.no_grad():
    for batch in test_loader:
        # Labels are not needed for prediction; only the sequences are used.
        x_batch, _ = map(list, zip(*batch))
        x_batch = [torch.tensor(i).to(device) for i in x_batch]

        y_pred = lstm_model(x_batch)
        y_pred = torch.sigmoid(y_pred)
        # Round the probability to a hard 0/1 label.
        y_pred_tag = torch.round(y_pred)

        y_out_tags_list.append(y_pred_tag.squeeze(0).cpu().numpy())

In [None]:
# Flatten each per-example prediction to a plain Python float.
# np.asarray makes the cell safe to re-run: on a second execution the entries
# are already floats, which have no `.squeeze` method of their own.
y_out_tags_list = [np.asarray(a).squeeze().tolist() for a in y_out_tags_list]

In [None]:
# Rows are the true labels, columns the predicted labels.
test_confusion = confusion_matrix(y_test, y_out_tags_list)
print(test_confusion)

[[111  35]
 [ 38 116]]


In [None]:
# Per-class precision / recall / F1 on the held-out test set.
test_report = classification_report(y_test, y_out_tags_list)
print(test_report)

              precision    recall  f1-score   support

           0       0.74      0.76      0.75       146
           1       0.77      0.75      0.76       154

    accuracy                           0.76       300
   macro avg       0.76      0.76      0.76       300
weighted avg       0.76      0.76      0.76       300



In [None]:
# Inverse lookups: index -> word and index -> label.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))

In [None]:
# Header row, then the first ten test sentences (decoded back to words)
# next to their predicted label.
print(f"{'Sentence':80}: {'Sentiment':15}\n")

for token_ids, predicted in zip(X_test[:10], y_out_tags_list[:10]):
    decoded = " ".join(idx2word[token_id] for token_id in token_ids)
    print(f"{decoded:80}: {predicted:5}\n")

Sentence                                                                        : Sentiment      

why are these sad little vegetables overcooked                                  :   0.0

this place has                                                                  :   1.0

the service was great even the manager came and helped with our table           :   1.0

needless say won going back anytime soon                                        :   0.0

how hard its actually rare for give star                                        :   0.0

the right next our table was large been stepped and green                       :   1.0

this                                                                            :   1.0

boy was that dry                                                                :   0.0

perfect for someone who only beer ice cold this even                            :   1.0

warm atmosphere fun and fresh appetizers steaks steak                           :   0.0

