# Sentiment Analysis using RNN and LSTM

In [1]:
# Importing all the required libraries
import numpy as np
import pandas as pd
from string import punctuation 
from collections import Counter
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### Load and visualize the Data

In [2]:
# Read data
with open('review.txt', 'r', encoding='utf-8') as f:
    reviews = f.read()
with open('sentiment.txt', 'r', encoding='utf-8') as f:
    labels = f.read()

print(reviews[:500])
print()
print(labels[:40])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ

positive
positive
positive
negative
posi


## Data Preprocessing

In [3]:
# Removing all the punctuation and converting to lower case
reviews = reviews.lower() 
alltext = ''.join([c for c in reviews if c not in punctuation])

# Splitting by new lines and spaces
r_split = alltext.split('\n')
alltext = ' '.join(r_split)

# Create a list of words
words = alltext.split()
words[:25]

['one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'youll',
 'be',
 'hooked',
 'they',
 'are',
 'right',
 'as',
 'this',
 'is',
 'exactly',
 'what']

## Encoding the words

### Tokenize

In [4]:
## Build a dictionary that maps words to integers 
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [5]:
reviews_int = [] 
for review in r_split:     
  reviews_int.append([vocab_to_int[word] for word in review.split()])

# Test the encoding
# stats about vocabulary
print('Unique words: ', len((vocab_to_int)))

# Print tokens in first review 
print('Tokenized review: \n', reviews_int[:1])

Unique words:  182727
Tokenized review: 
 [[28, 4, 1, 77, 1941, 44, 1063, 11, 100, 145, 40, 479, 3324, 393, 461, 26, 3190, 34, 23, 205, 14, 10, 6, 601, 48, 590, 15, 2136, 12, 1, 87, 146, 11, 3255, 69, 42, 3324, 13, 29, 5600, 2, 15378, 134, 4, 582, 61, 282, 7, 205, 35, 1, 670, 138, 1707, 69, 10, 6, 21, 3, 118, 16, 1, 8330, 5794, 39, 11861, 10, 118, 2507, 55, 6063, 15, 5636, 5, 1469, 381, 39, 582, 29, 6, 3407, 7, 1, 352, 339, 4, 1, 23503, 12, 8, 6, 469, 3324, 14, 11, 6, 1, 11516, 338, 5, 1, 16020, 6870, 2543, 1061, 61700, 8, 2636, 1375, 20, 25362, 536, 33, 4727, 2520, 4, 1, 1208, 112, 31, 1, 7152, 25, 2992, 13017, 2, 408, 61701, 37, 17526, 6, 21, 319, 20, 1, 5097, 3745, 536, 6, 344, 5, 81870, 8469, 41125, 15379, 5170, 7893, 2461, 2, 18403, 61702, 329, 9263, 7467, 13444, 2, 8720, 34937, 23, 109, 224, 5435, 12, 9, 57, 128, 1, 269, 1303, 4, 1, 118, 6, 668, 5, 1, 187, 11, 8, 262, 112, 77, 257, 548, 2999, 819, 178, 1271, 4349, 16, 2499, 1095, 819, 1412, 819, 81871, 147, 978, 181, 1, 87, 393, 

In [6]:
# 1=positive, 0=negative 
l_split = labels.split('\n') 
encoded_labels = np.array([1 if label == 'positive' else 0 for label in l_split])

In [7]:
# Identifying the number of outliers
review_lens = Counter([len(x) for x in reviews_int])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2469


In [8]:
# Removing the Outliers
print('Number of reviews before removing outliers: ', len(reviews_int))  
non_zero_idx = [ii for ii, review in enumerate(reviews_int) if len(review) != 0]

# Remove all the details about 0-length reviews 
reviews_int = [reviews_int[ii] for ii in non_zero_idx] 
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])  
print('Number of reviews after removing outliers: ', len(reviews_int))

Number of reviews before removing outliers:  50001
Number of reviews after removing outliers:  50000


### Padding Sequences

In [9]:
# Making all reviews of same length
def pad_features(reviews_int, seq_length):

  features = np.zeros((len(reviews_int), seq_length), dtype=int)
  for i, row in enumerate(reviews_int):
    features[i, -len(row):] = np.array(row)[:seq_length]
  return features

# Testing the implementation
seq_length = 200
features = pad_features(reviews_int, seq_length=seq_length)

## Test statements
assert len(features) == len(reviews_int), "Your features should have as many rows as reviews."
assert len(features[0]) == seq_length, "Each feature row should contain seq_length values."
print(features[:30,:10])

[[   28     4     1    77  1941    44  1063    11   100   145]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [81876 34938   110     7     1    59     4   291     6     3]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [   37   141    21     3   191   320     4 16021   163    18]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0

## Training, Validation and Test

In [10]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)
split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(40000, 200) 
Validation set: 	(5000, 200) 
Test set: 		(5000, 200)


### Dataloaders and batching

In [11]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
batch_size = 50

# Shuffling the training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [12]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[    0,     0,     0,  ...,    13,   178,   981],
        [    0,     0,     0,  ...,    31,     1,    98],
        [    0,     0,     0,  ...,   266,     1,  1222],
        ...,
        [    0,     0,     0,  ...,     2,   103,    11],
        [    0,     0,     0,  ...,    37,   115,  3451],
        [28980,   878,   122,  ...,    50,     8,  1999]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
        1, 1], dtype=torch.int32)


## Sentiment Network with PyTorch

In [13]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()
if(train_on_gpu):
  print('Training on GPU.')
else:
  print('No GPU available, training on CPU.')

Training on GPU.


In [14]:
class SentimentRNN(nn.Module):

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):

        super(SentimentRNN, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        # linear and sigmoid layers
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc1(out)
        out = self.fc2(out)
        sig_out = self.sig(out)
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [15]:
# Instantiating the hyperparameters
vocab_size = len(vocab_to_int)+1
output_size = 1
embedding_dim = 512
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)
# Loss and optimization
lr=0.001

criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=lr)

SentimentRNN(
  (embedding): Embedding(182728, 512)
  (lstm): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


### Training

In [16]:
# training params

epochs = 4
counter = 0
printing = 100
batch_size = 50
clip = 5 # Gradient clipping

if(train_on_gpu):
    net.cuda()

net.train()
for e in range(epochs):
    h = net.init_hidden(batch_size)

    # Batch loop
    for inputs, labels in train_loader:
        counter += 1
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        h = tuple([each.data for each in h])
        net.zero_grad()
        output, h = net(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        if counter % printing == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                val_h = tuple([each.data for each in val_h])
                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())
                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.750410... Val Loss: 0.746545
Epoch: 1/4... Step: 200... Loss: 0.573088... Val Loss: 0.639460
Epoch: 1/4... Step: 300... Loss: 0.691579... Val Loss: 0.692826
Epoch: 1/4... Step: 400... Loss: 0.678167... Val Loss: 0.665632
Epoch: 1/4... Step: 500... Loss: 0.670626... Val Loss: 0.587338
Epoch: 1/4... Step: 600... Loss: 0.534300... Val Loss: 0.564771
Epoch: 1/4... Step: 700... Loss: 0.421392... Val Loss: 0.510150
Epoch: 1/4... Step: 800... Loss: 0.504427... Val Loss: 0.490066
Epoch: 2/4... Step: 900... Loss: 0.404255... Val Loss: 0.427317
Epoch: 2/4... Step: 1000... Loss: 0.322211... Val Loss: 0.400227
Epoch: 2/4... Step: 1100... Loss: 0.221610... Val Loss: 0.402740
Epoch: 2/4... Step: 1200... Loss: 0.290652... Val Loss: 0.365611
Epoch: 2/4... Step: 1300... Loss: 0.202752... Val Loss: 0.374199
Epoch: 2/4... Step: 1400... Loss: 0.274044... Val Loss: 0.351327
Epoch: 2/4... Step: 1500... Loss: 0.299229... Val Loss: 0.363770
Epoch: 2/4... Step: 1600... Loss: 

### Testing

In [17]:
# Get test data loss and accuracy

test_losses = []
num_correct = 0
h = net.init_hidden(batch_size)

net.eval()
# Iterating over test data
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    output, h = net(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # Converting output probabilities to predicted class
    pred = torch.round(output.squeeze())
    # Comparing predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.433
Test accuracy: 0.858


###  Testing the reviews

In [18]:
review_neg = 'It is a film with a faulty assembly, as boring and exhausting as the predecessor.'
review_pos = 'What you will be getting when you walk into an inevitably overstuffed movie theater is something singular that reflects our age in a way that none of the Marvel films that preceded it have - indeed, very few Hollywood spectacles ever have.'

In [19]:
def tokenize_review(test_review):
    test_review = test_review.lower()
    test_text = ''.join([c for c in test_review if c not in punctuation])
    test_words = test_text.split()
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])
    return test_ints

test_ints = tokenize_review(review_neg)
print(test_ints)

[[8, 6, 3, 19, 15, 3, 14542, 12813, 14, 347, 2, 13049, 14, 1, 5618]]


In [20]:
seq_length=200
features = pad_features(test_ints, seq_length)
print(features)

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [21]:
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

torch.Size([1, 200])


In [22]:
def predict(net, test_review, sequence_length=200):

    net.eval()
    
    # Tokenize 
    test_ints = tokenize_review(test_review)
    # Pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # Convert to tensor to pass into the model
    feature_tensor = torch.from_numpy(features)
    batch_size = feature_tensor.size(0)
    # Initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    output, h = net(feature_tensor, h)
    pred = torch.round(output.squeeze()) 
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response
    if(pred.item()==1):
        print("Positive review detected:)")
    else:
        print("Negative review detected:(")

### Checking the negative review

In [23]:
seq_length = 200 
predict(net, review_neg, seq_length)

Prediction value, pre-rounding: 0.049122
Negative review detected:(


### Checking the positive review

In [24]:
predict(net, review_pos, seq_length)

Prediction value, pre-rounding: 0.918858
Positive review detected:)
