[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/prabuscihero/NLP-Basic-to-Bert/blob/master/Basic_lstm.ipynb)

# Model - Basic LSTM model with glove features

In [0]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from sklearn.model_selection import train_test_split
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch.optim.lr_scheduler import StepLR
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tn
from sklearn.metrics import classification_report
from torch import optim
import pickle

In [2]:
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
cd '/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert'

/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert


In [0]:
# Load the training data
train_df = pd.read_csv('training_data.csv')
test_df = pd.read_csv('testing_data.csv')


In [0]:
# Find the number of words in user review
train_df['length']= train_df.user_review.str.split().apply(len)
test_df['length'] = test_df.user_review.str.split().apply(len)

In [0]:
#sample_train_df_full = train_df.head(5000)
#sample_test_df = test_df.head(5000)

sample_train_df_full = train_df
sample_test_df = test_df

sample_df, valid_df, train_labels, valid_labels = train_test_split(sample_train_df_full, sample_train_df_full.user_rating, random_state=42, stratify=sample_train_df_full.user_rating,test_size=0.20)

In [0]:
# Get the maximum number of words
MAX_LEN =  max(sample_df['length'].max(),sample_df['length'].max(),valid_df['length'].max()) 

x_train = sample_df['user_review']
x_test  = sample_test_df['user_review']
x_valid = valid_df['user_review']

# Tokenize the words
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test)+list(x_valid))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_valid = tokenizer.texts_to_sequences(x_valid)

# Pad the zero at the end to the same length
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN,padding='post')
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN,padding='post')
x_valid = sequence.pad_sequences(x_valid, maxlen=MAX_LEN,padding='post')

In [0]:
# Function to get the indices from glove vector
def build_matrix(word_index):
    
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embeddings[word_to_indx[word]]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [0]:
# Load the embedding and corresponding word indices
embeddings = np.load('embed.npy')
word_to_indx = pickle.load(open('stoi.pkl', 'rb'))

In [0]:
# Get the glove matrix for the user review
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index)

In [0]:
# Convert the data set type to tensor
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
x_train_torch = torch.tensor(x_train,dtype=torch.long).to(device)
x_test_torch = torch.tensor(x_test,dtype=torch.long).to(device)
x_valid_torch = torch.tensor(x_valid,dtype=torch.long).to(device)
y_train_torch = torch.tensor(sample_df['user_rating'].values,dtype=torch.long)
y_test_torch = torch.tensor(sample_test_df['user_rating'].values,dtype=torch.long)
y_valid_torch = torch.tensor(valid_df['user_rating'].values,dtype=torch.long)

In [0]:
# Initialize the variables
max_features = len(tokenizer.index_word)
num_class = 2

In [0]:
# Create the data loader for train and test set
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch,y_test_torch)
valid_dataset = data.TensorDataset(x_valid_torch,y_valid_torch)

train_dl = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_dl = DataLoader(test_dataset, batch_size=1024, shuffle=True)
valid_dl = DataLoader(valid_dataset, batch_size=1024, shuffle=True)

In [0]:
# Create Spatial Dropout class
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

In [0]:
# Class to create basic lstm model
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix):
        super(NeuralNet, self).__init__()
        embed_size = embedding_matrix.shape[1]
        self.n_hid = 150
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.4)
        self.lstm1 = nn.LSTM(embed_size, self.n_hid,bidirectional=True,dropout=0.4,num_layers =2)
        self.out = nn.Linear(self.n_hid*4, 2)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = self.embedding_dropout(embedded)
        pre_clas,(final_hidden_state, final_cell_state) = self.lstm1(embedded)
        avg_pool = torch.mean(pre_clas, 1)
        max_pool,_ = torch.max(pre_clas,1)
        h_conc = torch.cat((max_pool, avg_pool), 1)
        out = self.out(h_conc)
        return out        

In [0]:
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

In [32]:
# Train the Lstm model
epochs = 30
loss_func = F.cross_entropy
mbc = NeuralNet(glove_matrix)
mbc.to(device)
opt = optim.Adam(mbc.parameters(), lr=0.01)
scheduler = StepLR(opt, step_size=5, gamma=0.95)

for epoch in tn(range(epochs)):
    scheduler.step()
    # Print Learning Rate
    print('Epoch:', epoch,'LR:', scheduler.get_lr())
    mbc.train()
    for xb, yb in tn(train_dl):
        xb = xb.to(device)
        yb = yb.to(device)
        pred = mbc(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

    mbc.eval()
    
    valid_accs = []
    with torch.no_grad():
      for xb, yb in valid_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        pred = mbc(xb)
        #batch_loss = loss_func(pred, yb)
        batch_acc = accuracy(pred, yb)
        #losses.append(batch_loss.item())
        valid_accs.append(batch_acc.item())

    train_accs = []
    with torch.no_grad():
      for xb, yb in train_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        pred = mbc(xb)
        #batch_loss = loss_func(pred, yb)
        batch_acc = accuracy(pred, yb)
        #losses.append(batch_loss.item())
        train_accs.append(batch_acc.item())

    print('validation accuracy', np.mean(valid_accs),'training accuracy',np.mean(train_accs))

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

Epoch: 0 LR: [0.01]




HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8322943210601806 training accuracy 0.83330078125
Epoch: 1 LR: [0.01]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8368084311485291 training accuracy 0.8461669921875
Epoch: 2 LR: [0.01]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8632613182067871 training accuracy 0.873046875
Epoch: 3 LR: [0.01]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.816886556148529 training accuracy 0.8201416015625
Epoch: 4 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8823899865150452 training accuracy 0.8954345703125
Epoch: 5 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8791234850883484 training accuracy 0.894970703125
Epoch: 6 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8905691921710968 training accuracy 0.9056884765625
Epoch: 7 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8732401907444001 training accuracy 0.888232421875
Epoch: 8 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8884366989135742 training accuracy 0.9103759765625
Epoch: 9 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8900669634342193 training accuracy 0.91396484375
Epoch: 10 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8928392052650451 training accuracy 0.914990234375
Epoch: 11 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8903459787368775 training accuracy 0.9177490234375
Epoch: 12 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8984773576259613 training accuracy 0.9230712890625
Epoch: 13 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8973293960094452 training accuracy 0.9247802734375
Epoch: 14 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8976980984210968 training accuracy 0.92548828125
Epoch: 15 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8941167056560516 training accuracy 0.9259033203125
Epoch: 16 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8907505571842194 training accuracy 0.9203369140625
Epoch: 17 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8956473171710968 training accuracy 0.9264892578125
Epoch: 18 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8946946740150452 training accuracy 0.931494140625
Epoch: 19 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8986746609210968 training accuracy 0.9330078125
Epoch: 20 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8964803874492645 training accuracy 0.93212890625
Epoch: 21 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8993801772594452 training accuracy 0.9328369140625
Epoch: 22 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8905811548233032 training accuracy 0.9209716796875
Epoch: 23 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8964604556560516 training accuracy 0.9326904296875
Epoch: 24 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.9003128945827484 training accuracy 0.9380859375
Epoch: 25 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8974589407444 training accuracy 0.933203125
Epoch: 26 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.9004982471466064 training accuracy 0.937841796875
Epoch: 27 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.9006277859210968 training accuracy 0.941064453125
Epoch: 28 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8933115422725677 training accuracy 0.93486328125
Epoch: 29 LR: [0.007350918906249998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8977758288383484 training accuracy 0.9435546875


In [28]:
mbc.parameters()

<generator object Module.parameters at 0x7fd1c227b9e8>

In [0]:
# Predict the output for the test data
predict_list = []
expected_list = []
mbc.eval()
for xb, yb in test_dl:
    xb = xb.to(device)
    yb = yb.to(device)
    pred = mbc(xb)
    pred_label = torch.argmax(pred, dim=1)
    predict_list.extend(pred_label.cpu().detach().numpy().tolist())
    expected_list.extend(yb.cpu().detach().numpy().tolist())

In [34]:
print(classification_report(expected_list,predict_list))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90     24626
           1       0.91      0.88      0.90     25374

    accuracy                           0.90     50000
   macro avg       0.90      0.90      0.90     50000
weighted avg       0.90      0.90      0.90     50000



In [35]:
# Train the Lstm model
epochs = 30
loss_func = F.cross_entropy
mbc = NeuralNet(glove_matrix)
mbc.to(device)
opt = optim.Adam(mbc.parameters(), lr=0.01)
scheduler = StepLR(opt, step_size=5, gamma=0.95)
for epoch in tn(range(epochs)):
    scheduler.step()
    # Print Learning Rate
    print('Epoch:', epoch,'LR:', scheduler.get_lr())
    mbc.train()
    for xb, yb in tn(train_dl):
        xb = xb.to(device)
        yb = yb.to(device)
        pred = mbc(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

    mbc.eval()
    
    valid_accs = []
    with torch.no_grad():
      for xb, yb in valid_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        pred = mbc(xb)
        #batch_loss = loss_func(pred, yb)
        batch_acc = accuracy(pred, yb)
        #losses.append(batch_loss.item())
        valid_accs.append(batch_acc.item())

    train_accs = []
    with torch.no_grad():
      for xb, yb in train_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        pred = mbc(xb)
        #batch_loss = loss_func(pred, yb)
        batch_acc = accuracy(pred, yb)
        #losses.append(batch_loss.item())
        train_accs.append(batch_acc.item())

    print('validation accuracy', np.mean(valid_accs),'training accuracy',np.mean(train_accs))

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

Epoch: 0 LR: [0.01]




HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8156389474868775 training accuracy 0.822412109375
Epoch: 1 LR: [0.01]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8363261282444 training accuracy 0.8415771484375
Epoch: 2 LR: [0.01]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8643195927143097 training accuracy 0.872412109375
Epoch: 3 LR: [0.01]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.875577962398529 training accuracy 0.8837158203125
Epoch: 4 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8728814542293548 training accuracy 0.8866943359375
Epoch: 5 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8705676019191741 training accuracy 0.884521484375
Epoch: 6 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.88935546875 training accuracy 0.906005859375
Epoch: 7 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8877092599868774 training accuracy 0.9048095703125
Epoch: 8 LR: [0.0095]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8884307205677032 training accuracy 0.9083984375
Epoch: 9 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8904894769191742 training accuracy 0.9136962890625
Epoch: 10 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.898736447095871 training accuracy 0.91806640625
Epoch: 11 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.889359450340271 training accuracy 0.90810546875
Epoch: 12 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8926279485225678 training accuracy 0.9187255859375
Epoch: 13 LR: [0.009025]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8972078263759613 training accuracy 0.92158203125
Epoch: 14 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8947604417800903 training accuracy 0.925244140625
Epoch: 15 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8894132614135742 training accuracy 0.915625
Epoch: 16 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8964006662368774 training accuracy 0.930078125
Epoch: 17 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8923509240150451 training accuracy 0.921533203125
Epoch: 18 LR: [0.00857375]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8981046676635742 training accuracy 0.9298095703125
Epoch: 19 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8971380710601806 training accuracy 0.935205078125
Epoch: 20 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8993283629417419 training accuracy 0.9351806640625
Epoch: 21 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.9012675344944 training accuracy 0.9378173828125
Epoch: 22 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8973473370075226 training accuracy 0.937841796875
Epoch: 23 LR: [0.0081450625]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8976223647594452 training accuracy 0.9392333984375
Epoch: 24 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8962193071842194 training accuracy 0.93740234375
Epoch: 25 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8995077311992645 training accuracy 0.9405517578125
Epoch: 26 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.897171950340271 training accuracy 0.9423828125
Epoch: 27 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8979691445827485 training accuracy 0.9431884765625
Epoch: 28 LR: [0.007737809374999998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8950932681560516 training accuracy 0.94423828125
Epoch: 29 LR: [0.007350918906249998]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

validation accuracy 0.8994280099868774 training accuracy 0.9431396484375


In [0]:
# Predict the output for the test data
predict_list = []
expected_list = []
mbc.eval()
for xb, yb in test_dl:
    xb = xb.to(device)
    yb = yb.to(device)
    pred = mbc(xb)
    pred_label = torch.argmax(pred, dim=1)
    predict_list.extend(pred_label.cpu().detach().numpy().tolist())
    expected_list.extend(yb.cpu().detach().numpy().tolist())

In [37]:
print(classification_report(expected_list,predict_list))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90     24626
           1       0.91      0.89      0.90     25374

    accuracy                           0.90     50000
   macro avg       0.90      0.90      0.90     50000
weighted avg       0.90      0.90      0.90     50000

