Data Collection

In [None]:
!pip install -U -q PyDrive 
  
from pydrive.auth import GoogleAuth 
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials 
  
  
# Authenticate and create the PyDrive client. 
# Colab's interactive sign-in has to run first: the credentials fetched two
# lines below are the ones this call makes available to the runtime.
auth.authenticate_user() 
gauth = GoogleAuth() 
# Hand the runtime's default Google credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default() 
# `drive` is the handle the download cell uses to fetch the dataset.
drive = GoogleDrive(gauth)

In [None]:
link = 'https://drive.google.com/file/d/1YFQduaJ57Lx0EvhStxafhSXKM-3jbIbe/view?usp=sharing'

import pandas as pd

# The share link has the form .../file/d/<file id>/view?...; the id is the
# second-to-last path segment.
# Stored as `file_id` rather than `id` so the builtin id() is not shadowed for
# the rest of the kernel session.
file_id = link.split("/")[-2]

# Download the file from Drive into the working directory.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('IMDB Dataset.csv')
# Importing the dataset
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Conversion to the text file
# index=False for the reviews: otherwise the row number is written in front of
# every review and ends up in the corpus as a spurious leading token
# ('0', '1', ...), adding one junk vocabulary entry per row.
df.review.to_csv('reviews.txt', sep=" ", header=False, index=False)
# sentiment.txt keeps its "<row index> <label>" layout, which the label-parsing
# cell expects.
df.sentiment.to_csv('sentiment.txt', sep=" ", header=False)

In [None]:
# Importing text file
# Read the files back as UTF-8, the encoding pandas' to_csv writes by default,
# instead of relying on the platform's default encoding.
with open("reviews.txt", encoding="utf-8") as f:
    reviews = f.read()

with open("sentiment.txt", encoding="utf-8") as f:
    sentiment = f.read()

Data Preprocessing

In [None]:
from string import punctuation

def preprocess(text):
    """Normalise raw review text and split it into reviews and words.

    The text is lower-cased, HTML line breaks (``<br />``) are turned into
    spaces and all ASCII punctuation is removed.

    Args:
        text: one string holding all reviews, one review per line.

    Returns:
        (universe, corpus): ``universe`` is the list of cleaned lines (one per
        review, split on newlines); ``corpus`` is the flat list of all
        whitespace-separated words in the cleaned text.
    """
    text = text.lower()
    # Replace line-break tags *before* stripping punctuation; otherwise the tag
    # collapses to the token "br" and glues itself to the preceding word
    # (e.g. "me.<br /><br />the" -> "mebr br the").
    text = text.replace("<br />", " ")
    # Omitting punctuation marks from the text file
    text = text.translate(str.maketrans("", "", punctuation))
    # collecting all the reviews
    universe = text.split("\n")
    # collecting all the words
    corpus = text.split()

    return universe, corpus


# `universe` holds one cleaned string per line of the reviews text;
# `corpus` is the flat list of every word across all reviews.
universe, corpus = preprocess(reviews)

In [None]:
# Inspect the first three cleaned reviews.
universe[0:3]

['0 one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with mebr br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordbr br it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awaybr br i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare

In [None]:
# Inspect the first ten words of the flat word list.
corpus[0:10]

['0',
 'one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after']

Tokenisation

In [None]:
from collections import Counter
# Frequency of every word in the corpus (word -> number of occurrences).
word_count = Counter(corpus)

In [None]:
# Vocabulary ordered from the most to the least frequent word; words with equal
# counts keep the order in which they were first seen.
vocab = [word for word, _ in word_count.most_common()]
vocab[:5]

['the', 'and', 'a', 'of', 'to']

In [None]:
# Word -> integer id. Ids start at 1, so id 0 is never assigned to a word
# (pad_text uses 0 as its filler value).
vocab_to_int = {word: idx+1 for idx, word in enumerate(vocab)}
# Integer id -> word. items() yields (word, id) pairs, so they must be unpacked
# in that order; unpacking them as (id, word) just rebuilds vocab_to_int.
int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}

In [None]:
# Turn every review (one entry of `universe`) into the list of integer ids of
# its whitespace-separated words; an empty line becomes an empty list.
encoded_reviews = [[vocab_to_int[word] for word in review.split()] for review in universe]

In [None]:
import re
# Each line of the sentiment text is "<row index> <label>". Strip the digits
# and the surrounding whitespace line by line (raw string: "\d" is not a valid
# escape in a normal string literal), then drop blank lines such as the one
# produced by the trailing newline.
labels = [re.sub(r"\d", "", line).strip() for line in sentiment.split("\n")]
labels = [label for label in labels if label]

In [None]:
# Binary targets: "positive" -> 1, any other label string -> 0.
encoded_labels=[1 if label == "positive" else 0 for label in labels]

Analysis of Labels

Analysis of Reviews

In [None]:
# Number of tokens in each encoded review, followed by summary statistics of
# those lengths (a minimum of 0 means at least one review is empty).
word_len=[len(x) for x in encoded_reviews]
pd.Series(word_len).describe()

count    50001.000000
mean       231.253615
std        170.665314
min          0.000000
25%        127.000000
50%        173.000000
75%        281.000000
max       2470.000000
dtype: float64

In [None]:
import numpy as np
# 99th percentile of the review lengths.
# NOTE(review): pad_max is not referenced again in the visible cells; the
# padding call passes a literal seq_length instead — confirm which is intended.
pad_max=np.quantile(word_len, 0.99)

In [None]:
import torch

In [None]:
# Keep only the labels whose review (at the same position) has at least one
# token; the labels must be filtered first, while the positions still line up.
encoded_labels = np.array([
    encoded_labels[pos]
    for pos in range(len(encoded_labels))
    if encoded_reviews[pos]
])
# Then drop the empty reviews themselves.
encoded_reviews = list(filter(None, encoded_reviews))


In [None]:
# Sanity check: after filtering, the number of reviews and labels should match.
print(len(encoded_reviews))
print(len(encoded_labels))

50000
50000


Padding


In [None]:
def pad_text(encoded_reviews, seq_length):
    """Force every encoded review to exactly ``seq_length`` tokens.

    Reviews with ``seq_length`` or more tokens keep only their first
    ``seq_length`` tokens; shorter ones are padded on the left with zeros.

    Args:
        encoded_reviews: iterable of lists of integer token ids.
        seq_length: target length of every row.

    Returns:
        A NumPy array built from the fixed-length rows.
    """

    def fit(tokens):
        missing = seq_length - len(tokens)
        if missing <= 0:
            return tokens[:seq_length]
        return [0] * missing + tokens

    return np.array([fit(tokens) for tokens in encoded_reviews])
# Left-pad / truncate every review to 900 tokens so they stack into one array.
padded_reviews = pad_text(encoded_reviews, seq_length = 900)

In [None]:
# Display the padded array (NumPy abbreviates large arrays in the repr).
padded_reviews

array([[    0,     0,     0, ...,   122,  4020,   501],
       [    0,     0,     0, ...,  1900,    73,   223],
       [    0,     0,     0, ...,    64,    15,   333],
       ...,
       [    0,     0,     0, ..., 23659,     2,  6059],
       [    0,     0,     0, ...,    68,   711,    42],
       [    0,     0,     0, ...,   782,    10,    17]])

Train/Validation/Test Split

In [None]:
# Fractions: 80% training; the remainder is shared equally between validation
# and test.
train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2
total = padded_reviews.shape[0]
# Row indices at which the training and validation parts end.
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

# Cut features and labels at the same two positions so rows stay aligned.
train_x, valid_x, test_x = np.split(padded_reviews, [train_cutoff, valid_cutoff])
train_y, valid_y, test_y = np.split(encoded_labels, [train_cutoff, valid_cutoff])

In [None]:
from torch.utils.data import DataLoader, TensorDataset
# Wrap each (features, labels) split in a TensorDataset
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    for features, targets in ((train_x, train_y), (valid_x, valid_y), (test_x, test_y))
)
# Mini-batch size shared by the three loaders
batch_size = 50
# All three loaders reshuffle their split on every pass
train_loader, valid_loader, test_loader = (
    DataLoader(split, shuffle=True, batch_size=batch_size)
    for split in (train_data, valid_data, test_data)
)

In [None]:
# Display the training features and labels.
train_x,train_y

(array([[   0,    0,    0, ...,  122, 4020,  501],
        [   0,    0,    0, ..., 1900,   73,  223],
        [   0,    0,    0, ...,   64,   15,  333],
        ...,
        [   0,    0,    0, ...,    5,  129,  120],
        [   0,    0,    0, ...,  284,   59,  153],
        [   0,    0,    0, ...,   59,   15,   10]]),
 array([1, 1, 1, ..., 1, 0, 0]))

In [None]:
# Display the validation features and labels.
valid_x, valid_y

(array([[     0,      0,      0, ...,      5,    128,   1601],
        [     0,      0,      0, ...,    162,    111,   3790],
        [     0,      0,      0, ...,      9,     13,    990],
        ...,
        [     0,      0,      0, ...,   1827,   1320,     22],
        [     0,      0,      0, ..., 215947,     12,   1979],
        [     0,      0,      0, ...,      4,      1,    188]]),
 array([0, 0, 0, ..., 1, 1, 0]))

In [None]:
# Display the test features and labels.
test_x, test_y

(array([[     0,      0,      0, ...,     53, 215950,    120],
        [     0,      0,      0, ...,   7515,     16,   5322],
        [     0,      0,      0, ...,     70,     10,    625],
        ...,
        [     0,      0,      0, ...,  23659,      2,   6059],
        [     0,      0,      0, ...,     68,    711,     42],
        [     0,      0,      0, ...,    782,     10,     17]]),
 array([1, 1, 0, ..., 0, 0, 0]))

Model Design

In [None]:
from torch import nn

class SentimentLSTM(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    The score of a sequence is computed from the LSTM output at its final
    time step.
    """

    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        """Build the layers.

        Args:
            n_vocab: number of rows of the embedding table; every token id
                passed to ``forward`` must be smaller than this.
            n_embed: size of each embedding vector.
            n_hidden: number of hidden units per LSTM layer.
            n_output: number of output units of the final linear layer.
            n_layers: number of stacked LSTM layers.
            drop_p: dropout probability, used between LSTM layers and before
                the linear layer.
        """
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM

        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_words, hidden = None):
        """Score a batch of token-id sequences.

        Args:
            input_words: integer tensor of shape (batch, seq_length).
            hidden: optional ``(h, c)`` initial LSTM state, e.g. from
                ``init_hidden``; ``None`` lets the LSTM start from zeros.

        Returns:
            ``(scores, hidden)`` where ``scores`` has shape (batch,) and holds
            the sigmoid of the last output unit at the last time step, and
            ``hidden`` is the final ``(h, c)`` state of the LSTM.
        """
        embedded_words = self.embedding(input_words)           # (batch, seq_length, n_embed)
        lstm_out, hidden = self.lstm(embedded_words, hidden)   # (batch, seq_length, n_hidden)

        # Only the final time step feeds the prediction. Selecting it by
        # indexing keeps the batch dimension taken from the input itself, so
        # any batch size works (partial last batch, a single review, ...).
        last_step = self.dropout(lstm_out[:, -1, :])           # (batch, n_hidden)
        sigmoid_out = self.sigmoid(self.fc(last_step))         # (batch, n_output)
        sigmoid_last = sigmoid_out[:, -1]                      # (batch,)

        return sigmoid_last, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h, c)`` state for ``batch_size`` sequences.

        Both tensors have shape (n_layers, batch_size, n_hidden) and are
        created with the dtype and on the device of the model's parameters,
        so they are always compatible with the model wherever it lives.
        """
        weights = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (weights.new_zeros(shape), weights.new_zeros(shape))


In [None]:
# Word ids run from 1 to len(vocab_to_int) and 0 is used as the padding value,
# so the embedding table needs len(vocab_to_int) + 1 rows. With only
# len(vocab_to_int) rows, the id of the rarest word is out of range and the
# embedding lookup raises IndexError.
n_vocab = len(vocab_to_int) + 1
n_embed = 50   # size of each word embedding
n_hidden = 32  # hidden units per LSTM layer
n_output = 1   # 1 ("positive") or 0 ("negative")
n_layers = 2   # stacked LSTM layers

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
print(net)

SentimentLSTM(
  (embedding): Embedding(230647, 50)
  (lstm): LSTM(50, 32, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


Training 

In [None]:
from torch import optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The batches are moved to `device` below, so the model has to live there too
# (moved before the optimizer is built so it tracks the final parameters).
net.to(device)

criterion = nn.BCELoss()
# lr lowered from 0.1 to Adam's default of 0.001: with 0.1 the logged loss
# stayed around 0.69 (chance level for a binary cross-entropy).
optimizer = optim.Adam(net.parameters(), lr = 0.001)

print_every = 50   # run validation and log every `print_every` training steps
step = 0
n_epochs = 2  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN

for epoch in range(n_epochs):
    # Batches are independent, shuffled reviews, so no LSTM state is carried
    # from one batch to the next; the network starts each batch from zeros.
    # The loop variable is `targets`, not `labels`, so the notebook-level
    # `labels` list is not overwritten.
    for inputs, targets in train_loader:
        step += 1
        inputs, targets = inputs.to(device), targets.to(device)

        net.zero_grad()
        output, _ = net(inputs)
        loss = criterion(output.squeeze(), targets.float())
        loss.backward()
        # clip_grad_norm_ is the in-place replacement for the deprecated
        # clip_grad_norm.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        if (step % print_every) == 0:

            net.eval()
            valid_losses = []

            # No gradients are needed for validation.
            with torch.no_grad():
                for v_inputs, v_targets in valid_loader:
                    # Evaluate on the validation batch itself (not on the
                    # current training batch).
                    v_inputs, v_targets = v_inputs.to(device), v_targets.to(device)

                    v_output, _ = net(v_inputs)
                    v_loss = criterion(v_output.squeeze(), v_targets.float())
                    valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()




Epoch: 1/2 Step: 50 Training Loss: 0.7233 Validation Loss: 0.6801
Epoch: 1/2 Step: 100 Training Loss: 0.6884 Validation Loss: 0.6960
Epoch: 1/2 Step: 150 Training Loss: 0.6955 Validation Loss: 0.6878
Epoch: 1/2 Step: 200 Training Loss: 0.7069 Validation Loss: 0.6816
Epoch: 1/2 Step: 250 Training Loss: 0.7152 Validation Loss: 0.6825
Epoch: 1/2 Step: 300 Training Loss: 0.7147 Validation Loss: 0.6993
Epoch: 1/2 Step: 350 Training Loss: 0.6762 Validation Loss: 0.6730
Epoch: 1/2 Step: 400 Training Loss: 0.6935 Validation Loss: 0.6778
Epoch: 1/2 Step: 450 Training Loss: 0.6922 Validation Loss: 0.6914
Epoch: 1/2 Step: 500 Training Loss: 0.6700 Validation Loss: 0.6768
Epoch: 1/2 Step: 550 Training Loss: 0.6888 Validation Loss: 0.6867
Epoch: 1/2 Step: 600 Training Loss: 0.6956 Validation Loss: 0.6895
Epoch: 1/2 Step: 650 Training Loss: 0.6882 Validation Loss: 0.6975
Epoch: 1/2 Step: 700 Training Loss: 0.6982 Validation Loss: 0.6884
Epoch: 1/2 Step: 750 Training Loss: 0.6981 Validation Loss: 0.6

In [None]:
# Evaluate on the held-out test split: mean loss and accuracy.
net.eval()
test_losses = []
num_correct = 0

# Inference only: no gradients, and no LSTM state carried between batches.
# The loop variable is `targets`, not `labels`, so the notebook-level `labels`
# list is not overwritten.
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        test_output, _ = net(inputs)
        loss = criterion(test_output.squeeze(), targets.float())
        test_losses.append(loss.item())

        # Round the sigmoid score to 0/1 and count matches on the tensor's own
        # device; .item() returns a Python number, so this also works on a GPU
        # (calling .numpy() on a CUDA tensor does not).
        preds = torch.round(test_output.squeeze())
        num_correct += preds.eq(targets.float().view_as(preds)).sum().item()

print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

IndexError: ignored

In [None]:
def predict(net, review, seq_length = 200):
    """Classify a single raw review as positive or negative.

    Uses the notebook's ``preprocess``, ``vocab_to_int`` and ``pad_text``.

    Args:
        net: the trained model; called as ``net(batch)`` and expected to return
            ``(scores, hidden)``.
        review: raw review text.
        seq_length: length the encoded review is padded / truncated to.

    Returns:
        A message string, or ``None`` (after printing a hint) when the review
        contains no word known to the vocabulary.
    """
    # Run on whatever device the model's parameters live on.
    device = next(net.parameters()).device

    # preprocess returns (lines, words); only the flat word list is needed.
    _, words = preprocess(review)
    # Words that are not in the vocabulary have no id and are skipped.
    encoded_words = [vocab_to_int[word] for word in words if word in vocab_to_int]

    # Check before padding: after padding the batch always has one row.
    if len(encoded_words) == 0:
        print("Your review must contain at least 1 word!")
        return None

    padded_words = pad_text([encoded_words], seq_length)
    padded_words = torch.from_numpy(padded_words).to(device)

    net.eval()
    with torch.no_grad():
        output, _ = net(padded_words)
    # The score of the review is the final element of the model output.
    pred = torch.round(output.reshape(-1)[-1])
    # Label 1 is "positive" (see how the labels are encoded).
    msg = "This is a positive review." if pred == 1 else "This is a negative review."

    return msg