# Sentiment Analysis using LSTM

In [1]:
import numpy as np
import pandas as pd
from torch import nn

import matplotlib.pyplot as plt
import seaborn as sns

import torch.nn.functional as F

import re
import string
from collections import Counter
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

import pandas as pd
import numpy as np
import bz2
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

2021-11-26 12:08:49.295294: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
device = "cuda:0"

## 1) Load in and visualize the data

In [3]:
df = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 2) Data Processing — convert to lower case, Remove punctuation etc

In [4]:
def data_preprocessing(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text) # Remove HTML from text
    text = ''.join([c for c in text if c not in string.punctuation])# Remove punctuation
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    return text

df['cleaned_reviews'] = df['review'].apply(data_preprocessing)
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


## 5) Tokenize — Create Vocab to Int mapping dictionary
In most of the NLP tasks, you will create an index mapping dictionary in such a way that your frequently occurring words are assigned lower indexes. One of the most common way of doing this is to use Counter method from Collections library.

In [5]:
max_features = 8192
maxlen = 30

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['cleaned_reviews'])

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size :", vocab_size)

Vocabulary Size : 222610


In [6]:
training_token = tokenizer.texts_to_sequences(df['cleaned_reviews'])
x_data = pad_sequences(training_token, maxlen = maxlen, padding = 'post')

In [7]:
y_data = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [8]:
X_train, X_remain, y_train, y_remain = train_test_split(x_data, y_data, test_size=0.2, random_state=1)
X_valid, X_test, y_valid, y_test = train_test_split(X_remain, y_remain, test_size=0.5, random_state=1)

In [9]:
y_train = y_train.to_numpy().reshape(-1,1)
y_test = y_test.to_numpy().reshape(-1,1)

In [10]:
# create tensor dataset
train_data = TensorDataset(torch.from_numpy(X_train.astype('float64')), torch.from_numpy(np.array(y_train).astype('float64')))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(np.array(y_test).astype('float64')))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(np.array(y_valid).astype('float64')))

# dataloaders
batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [11]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size:  torch.Size([50, 30])
Sample input: 
 tensor([[8.3600e+02, 1.0000e+00, 1.1200e+02,  ..., 4.5000e+01, 5.9000e+01,
         2.9330e+03],
        [2.0000e+00, 5.0780e+03, 5.8200e+02,  ..., 3.4300e+02, 1.4000e+01,
         5.3000e+01],
        [1.1400e+02, 2.1800e+02, 1.4190e+03,  ..., 2.4800e+02, 3.4750e+03,
         1.2000e+01],
        ...,
        [1.0590e+03, 6.0870e+03, 2.1180e+03,  ..., 3.2500e+02, 3.9520e+03,
         1.1300e+03],
        [4.9000e+01, 1.1000e+01, 2.7100e+02,  ..., 1.9800e+03, 2.3700e+02,
         3.2570e+03],
        [5.6100e+02, 8.0500e+02, 3.1300e+02,  ..., 4.4000e+02, 1.0000e+01,
         1.8030e+03]], dtype=torch.float64)
Sample input: 
 tensor([[0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
  

In [12]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [13]:
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.2):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(drop_prob)
        self.embedding_dim = embedding_dim
        self.sigmoid = nn.Sigmoid()
        
        
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim, drop_prob)
        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, 2, hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        

        self.fc = nn.Linear(embedding_dim * 30 , 1)

     
    """
    def forward(self, x):
        #print(x.shape)
        if len(x) != 1:
            batch_size = x.size(0)

        x = x.long()
        
        embeds = self.embedding(x)
        
        print(embeds.shape)
        
        out = self.transformer_encoder(embeds)
        
        print(out.shape)

        out = self.fc(out)
                
        print(out.shape)

        return out
        """
    
    def forward(self, src):
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = output.view(output.size(0), -1)
        output = self.sigmoid(self.fc(output))
        return output

In [14]:
def model_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

In [15]:
import math

output_size = 1
embedding_dim = 10
hidden_dim = 32
n_layers = 8

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)
print(model_params(model))
lr=0.008
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

2235697


In [16]:
epochs = 10
counter = 0
print_every = 100
clip = 5
valid_loss_min = np.Inf

model.train()
for i in range(epochs):
    
    for inputs, labels in train_loader:
        counter += 1
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output = model(inputs.long())
        
        loss = criterion(output, labels.float())
        
        
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_losses = []
            model.eval()
            for inp, lab in test_loader:
                inp, lab = inp.to(device), lab.to(device)
                out = model(inp.long())
                val_loss = criterion(out, lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 1/10... Step: 100... Loss: 0.678639... Val Loss: 0.703627
Validation loss decreased (inf --> 0.703627).  Saving model ...
Epoch: 1/10... Step: 200... Loss: 0.714005... Val Loss: 0.694654
Validation loss decreased (0.703627 --> 0.694654).  Saving model ...
Epoch: 1/10... Step: 300... Loss: 0.703290... Val Loss: 0.702088
Epoch: 1/10... Step: 400... Loss: 0.705424... Val Loss: 0.693558
Validation loss decreased (0.694654 --> 0.693558).  Saving model ...
Epoch: 1/10... Step: 500... Loss: 0.692307... Val Loss: 0.693089
Validation loss decreased (0.693558 --> 0.693089).  Saving model ...
Epoch: 1/10... Step: 600... Loss: 0.701554... Val Loss: 0.695090
Epoch: 1/10... Step: 700... Loss: 0.700285... Val Loss: 0.693584
Epoch: 1/10... Step: 800... Loss: 0.698686... Val Loss: 0.694521
Epoch: 2/10... Step: 900... Loss: 0.694636... Val Loss: 0.693161
Epoch: 2/10... Step: 1000... Loss: 0.689194... Val Loss: 0.694009
Epoch: 2/10... Step: 1100... Loss: 0.696859... Val Loss: 0.693594
Epoch: 2/10.

In [17]:
sentence = "I love you"
trial = torch.tensor(pad_sequences(tokenizer.texts_to_sequences([sentence]), maxlen = maxlen)).long().to(device)

model(trial)

tensor([[0.4973]], device='cuda:0', grad_fn=<SigmoidBackward>)

In [18]:
import torch.onnx
torch.onnx.export(model,               # model being run
                  trial,                  # model input (or a tuple for multiple inputs)
                  "transformer-imdb.onnx", opset_version = 11)

  app.launch_new_instance()
