# Sentiment Analysis using LSTM

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
from collections import Counter
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

import pandas as pd
import numpy as np
import bz2
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
device = "cpu"

## 1) Load in and visualize the data

In [3]:
df = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 2) Data Processing — convert to lower case, Remove punctuation etc

In [4]:
def data_preprocessing(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text) # Remove HTML from text
    text = ''.join([c for c in text if c not in string.punctuation])# Remove punctuation
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    return text

df['cleaned_reviews'] = df['review'].apply(data_preprocessing)
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


## 5) Tokenize — Create Vocab to Int mapping dictionary
In most of the NLP tasks, you will create an index mapping dictionary in such a way that your frequently occurring words are assigned lower indexes. One of the most common way of doing this is to use Counter method from Collections library.

In [5]:
max_features = 8192
maxlen = 30

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['cleaned_reviews'])

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size :", vocab_size)

Vocabulary Size : 222610


In [6]:
training_token = tokenizer.texts_to_sequences(df['cleaned_reviews'])
x_data = pad_sequences(training_token, maxlen = maxlen, padding = 'post')

In [7]:
y_data = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [8]:
X_train, X_remain, y_train, y_remain = train_test_split(x_data, y_data, test_size=0.2, random_state=1)
X_valid, X_test, y_valid, y_test = train_test_split(X_remain, y_remain, test_size=0.5, random_state=1)

In [9]:
# create tensor dataset
train_data = TensorDataset(torch.from_numpy(X_train.astype('float64')), torch.from_numpy(np.array(y_train).astype('float64')))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(np.array(y_test).astype('float64')))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(np.array(y_valid).astype('float64')))

# dataloaders
batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [10]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size:  torch.Size([50, 30])
Sample input: 
 tensor([[3.5720e+03, 6.0000e+00, 4.4100e+02,  ..., 2.3800e+02, 2.3890e+03,
         1.0690e+03],
        [3.3200e+02, 1.5000e+01, 6.9750e+03,  ..., 1.1200e+02, 1.0000e+01,
         6.5430e+03],
        [2.5300e+02, 3.5500e+02, 2.7000e+02,  ..., 3.2500e+02, 8.0000e+00,
         1.8700e+02],
        ...,
        [1.1490e+03, 5.0000e+00, 2.1900e+02,  ..., 1.0700e+02, 2.2330e+03,
         2.4840e+03],
        [6.2580e+03, 1.8400e+02, 5.5150e+03,  ..., 5.4600e+02, 6.8200e+02,
         3.1170e+03],
        [6.3500e+02, 2.7200e+02, 3.6790e+03,  ..., 6.6250e+03, 5.8500e+02,
         7.9200e+02]], dtype=torch.float64)
Sample input: 
 tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0.,
        1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0.,
        1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1.],
       dtype=torch.float64)


In [11]:
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #self.conv = nn.Conv1d(embedding_dim, 8, kernel_size=5)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        #self.gru = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        if len(x) != 1:
            batch_size = x.size(0)
        else:
            batch_size = 1
        
        x = x.long()
        embeds = self.embedding(x)

        #conv = self.conv(x)
        
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        #hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                     weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden

In [12]:
def model_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

In [13]:
output_size = 1
embedding_dim = 10
hidden_dim = 32
n_layers = 8

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(model_params(model))
model.to(device)
lr=0.008
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

2290901


In [14]:
epochs = 5
counter = 0
print_every = 100
clip = 5
valid_loss_min = np.Inf

model.train()
for i in range(epochs):
    h = model.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output, labels.float())
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inp, lab in test_loader:
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = criterion(out, lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 1/5... Step: 100... Loss: 0.694815... Val Loss: 0.693394
Validation loss decreased (inf --> 0.693394).  Saving model ...
Epoch: 1/5... Step: 200... Loss: 0.696078... Val Loss: 0.693334
Validation loss decreased (0.693394 --> 0.693334).  Saving model ...
Epoch: 1/5... Step: 300... Loss: 0.701727... Val Loss: 0.693100
Validation loss decreased (0.693334 --> 0.693100).  Saving model ...
Epoch: 1/5... Step: 400... Loss: 0.692034... Val Loss: 0.693638
Epoch: 1/5... Step: 500... Loss: 0.694883... Val Loss: 0.693095
Validation loss decreased (0.693100 --> 0.693095).  Saving model ...
Epoch: 1/5... Step: 600... Loss: 0.692668... Val Loss: 0.693103
Epoch: 1/5... Step: 700... Loss: 0.697681... Val Loss: 0.693061
Validation loss decreased (0.693095 --> 0.693061).  Saving model ...
Epoch: 1/5... Step: 800... Loss: 0.692893... Val Loss: 0.693198
Epoch: 2/5... Step: 900... Loss: 0.693024... Val Loss: 0.693403
Epoch: 2/5... Step: 1000... Loss: 0.693959... Val Loss: 0.693639
Epoch: 2/5... Step:

In [15]:
h = model.init_hidden(1)
sentence = "I love this movie because it is great"
trial = torch.tensor(pad_sequences(tokenizer.texts_to_sequences([sentence]), maxlen = maxlen)).to(device)


model(trial, h)[0]

tensor([0.5013], grad_fn=<SelectBackward>)

In [16]:
import torch.onnx
torch.onnx.export(model,               # model being run
                  (trial, (torch.randn(8,1,32).to(device), torch.randn(8,1,32).to(device))),                   # model input (or a tuple for multiple inputs)
                  "lstm.onnx")

  "or define the initial states (h0/c0) as inputs of the model. ")
