In [1]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tn
from sklearn.metrics import classification_report
from torch import optim
from google.colab import files
from google.colab import drive
import pickle

Using TensorFlow backend.


# Basic LSTM model with glove features

In [2]:
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
cd '/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert'

/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert


In [0]:
# Load the training data
train_df = pd.read_csv('training_data.csv')
test_df = pd.read_csv('testing_data.csv')

In [0]:
# Find the number of words in user review
train_df['length']= train_df.user_review.str.split().apply(len)
test_df['length'] = test_df.user_review.str.split().apply(len)

In [0]:
# Load the numb

#sample_df = train_df.head(1000)
#sample_test_df = test_df.head(1000)

sample_df = train_df #.head(1000)
sample_test_df = test_df #.head(1000)

In [0]:
# Get the maximum number of words
MAX_LEN =  max(train_df['length'].max(),test_df['length'].max()) 

x_train = sample_df['user_review']
x_test  = sample_test_df['user_review']

# Tokenize the words
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Pad the zero at the end to the same length
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN,padding='post')
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN,padding='post')

In [0]:
# Function to get the indices from glove vector
def build_matrix(word_index):
    
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embeddings[word_to_indx[word]]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [0]:
# Load the embedding and corresponding word indices
embeddings = np.load('embed.npy')
word_to_indx = pickle.load(open('stoi.pkl', 'rb'))

In [0]:
# Get the glove matrix for the user review
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index)

In [0]:
# Convert the data set type to tensor
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
x_train_torch = torch.tensor(x_train,dtype=torch.long).to(device)
x_test_torch = torch.tensor(x_test,dtype=torch.long).to(device)
y_train_torch = torch.tensor(sample_df['user_rating'],dtype=torch.long)
y_test_torch = torch.tensor(sample_test_df['user_rating'],dtype=torch.long)

In [0]:
# Initialize the variables
max_features = len(tokenizer.index_word)
num_class = 2
epochs = 10

In [0]:
# Create the data loader for train and test set
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch,y_test_torch)

train_dl = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dl = DataLoader(test_dataset, batch_size=256, shuffle=True)

In [0]:
# Create Spatial Dropout class
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

In [0]:
# Class to create basic lstm model
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix):
        super(NeuralNet, self).__init__()
        embed_size = embedding_matrix.shape[1]
        self.n_hid = 30
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        self.lstm1 = nn.LSTM(embed_size, self.n_hid,bidirectional=True,dropout=0.15)
        self.lstm2 = nn.LSTM(self.n_hid*2, self.n_hid,bidirectional=True,dropout=0.15)
        self.fc = nn.Linear(self.n_hid*4,2)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = self.embedding_dropout(embedded)
        pre_clas,(final_hidden_state, final_cell_state) = self.lstm1(embedded)
        pre_clas2,(final_hidden_state, final_cell_state) = self.lstm2(pre_clas)
        avg_pool = torch.mean(pre_clas2, 1)
        max_pool,_ = torch.max(pre_clas2,1)
        h_conc = torch.cat((max_pool, avg_pool), 1)
        out = self.fc(h_conc)
        return out        

In [28]:
# Train the Lstm model
loss_func = F.cross_entropy
mbc = NeuralNet(glove_matrix)
mbc.to(device)
opt = optim.Adam(mbc.parameters(), lr=0.01)
for epoch in tn(range(epochs)):
    mbc.train()
    for xb, yb in tn(train_dl):
        xb = xb.to(device)
        yb = yb.to(device)
        pred = mbc(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

  "num_layers={}".format(dropout, num_layers))


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))




In [0]:
# Predict the output for the test data
batch_size = 256
predict_list = []
expected_list = []
for xb, yb in test_dl:
    xb = xb.to(device)
    yb = yb.to(device)
    pred = mbc(xb)
    pred_label = torch.argmax(pred, dim=1)
    predict_list.extend(pred_label.cpu().detach().numpy().tolist())
    expected_list.extend(yb.cpu().detach().numpy().tolist())


In [30]:
print(classification_report(predict_list,expected_list))

              precision    recall  f1-score   support

           0       0.89      0.92      0.90    193654
           1       0.92      0.89      0.91    206346

    accuracy                           0.91    400000
   macro avg       0.91      0.91      0.91    400000
weighted avg       0.91      0.91      0.91    400000

