In [1]:
!git clone https://github.com/lukysummer/Movie-Review-Sentiment-Analysis-LSTM-Pytorch.git

Cloning into 'Movie-Review-Sentiment-Analysis-LSTM-Pytorch'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 36 (delta 0), reused 0 (delta 0), pack-reused 33[K
Unpacking objects: 100% (36/36), done.


In [2]:
!ls Movie-Review-Sentiment-Analysis-LSTM-Pytorch/data

labels.txt  reviews.txt


In [3]:
with open("Movie-Review-Sentiment-Analysis-LSTM-Pytorch/data/reviews.txt") as f:
    reviews = f.read()
    
with open("Movie-Review-Sentiment-Analysis-LSTM-Pytorch/data/labels.txt") as f:
    labels = f.read()

In [4]:
from string import punctuation

In [5]:
def preprocess(text):
    text = text.lower()
    text = "".join([ch for ch in text if ch not in punctuation])
    all_reviews = text.split("\n")
    text = " ".join(text)
    all_words = text.split()
    
    return all_reviews, all_words

In [6]:
all_reviews, all_words = preprocess(reviews)

In [7]:
from collections import Counter

In [8]:
word_counts = Counter(reviews)

In [9]:
word_counts

Counter({'b': 515850,
         'r': 1550946,
         'o': 1922852,
         'm': 709785,
         'w': 509705,
         'e': 3086458,
         'l': 1143358,
         ' ': 7434318,
         'h': 1408705,
         'i': 1984336,
         'g': 537842,
         's': 1754598,
         'a': 2082813,
         'c': 720691,
         't': 2420932,
         'n': 1705883,
         'd': 906473,
         'y': 533174,
         '.': 327192,
         'p': 441581,
         'u': 687982,
         'f': 572095,
         'v': 326565,
         'k': 223823,
         'x': 43360,
         '\n': 25000,
         'z': 23095,
         'j': 58707,
         'q': 20148})

In [10]:
word_list = sorted(word_counts, key = word_counts.get, reverse = True)

In [11]:
vocab_to_int = {word:idx for idx, word in enumerate(word_list)}

In [12]:
vocab_to_int

{' ': 0,
 'e': 1,
 't': 2,
 'a': 3,
 'i': 4,
 'o': 5,
 's': 6,
 'n': 7,
 'r': 8,
 'h': 9,
 'l': 10,
 'd': 11,
 'c': 12,
 'm': 13,
 'u': 14,
 'f': 15,
 'g': 16,
 'y': 17,
 'b': 18,
 'w': 19,
 'p': 20,
 '.': 21,
 'v': 22,
 'k': 23,
 'j': 24,
 'x': 25,
 '\n': 26,
 'z': 27,
 'q': 28}

In [13]:
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()}

In [14]:
int_to_vocab

{0: ' ',
 1: 'e',
 2: 't',
 3: 'a',
 4: 'i',
 5: 'o',
 6: 's',
 7: 'n',
 8: 'r',
 9: 'h',
 10: 'l',
 11: 'd',
 12: 'c',
 13: 'm',
 14: 'u',
 15: 'f',
 16: 'g',
 17: 'y',
 18: 'b',
 19: 'w',
 20: 'p',
 21: '.',
 22: 'v',
 23: 'k',
 24: 'j',
 25: 'x',
 26: '\n',
 27: 'z',
 28: 'q'}

In [15]:
encoded_reviews = [[vocab_to_int[word] for word in review] for review in all_reviews]

In [16]:
# encoded_reviews = []
# i = 0
# for review in all_reviews:
#     curr_rev = []
#     for letter in review:
#         if letter==" ":
#             continue
#         curr_rev.append(vocab_to_int[letter])
#     encoded_reviews.append(letter)

In [17]:
all_labels = labels.split("\n")
encoded_labels = [1 if label == "positive" else 0 for label in all_labels]

Note: you may need to restart the kernel to use updated packages.


Note: you may need to restart the kernel to use updated packages.


In [18]:
import numpy as np

import torch

In [19]:
encoded_labels = np.array( [label for idx, label in enumerate(encoded_labels) if len(encoded_reviews[idx]) > 0] )
encoded_reviews = [review for review in encoded_reviews if len(review) > 0]

In [20]:
def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append([0]*(seq_length-len(review)) + review)
        
    return np.array(reviews)

In [21]:
padded_reviews = pad_text(encoded_reviews, seq_length = 200)

In [22]:
train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2
total = padded_reviews.shape[0]
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

In [23]:
train_x, train_y = padded_reviews[:train_cutoff], encoded_labels[:train_cutoff]
valid_x, valid_y = padded_reviews[train_cutoff : valid_cutoff], encoded_labels[train_cutoff : valid_cutoff]
test_x, test_y   = padded_reviews[valid_cutoff:], encoded_labels[valid_cutoff:]

In [24]:
train_x = torch.Tensor(train_x).type(torch.LongTensor)
train_y = torch.Tensor(train_y).type(torch.LongTensor)
valid_x = torch.Tensor(valid_x).type(torch.LongTensor)
valid_y = torch.Tensor(valid_y).type(torch.LongTensor)
test_x  = torch.Tensor(test_x).type(torch.LongTensor)
test_y  = torch.Tensor(test_y).type(torch.LongTensor)

In [25]:
from torch.utils.data import TensorDataset, DataLoader

In [26]:
train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

In [27]:
batch_size = 25
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [28]:
import torch

In [29]:
from torch import nn

In [30]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words, h):
        
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words, h)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        device = "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

In [31]:
n_vocab = len(vocab_to_int)+1
n_embed = 400
n_hidden = 512
n_output = 1   # 1 ("positive") or 0 ("negative")
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [32]:
from torch import optim

In [40]:
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

In [41]:
print_every = 100
step = 0
n_epochs = 4  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available else 'cpu'

In [None]:
for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        # making requires_grad = False for the latest set of h
        h = tuple([each.data for each in h])   
        
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            v_h = net.init_hidden(batch_size)
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

In [None]:
net.eval()
test_losses = []
num_correct = 0
test_h = net.init_hidden(batch_size)

for inputs, labels in test_loader:
    test_h = tuple([each.data for each in test_h])
    test_output, test_h = net(inputs, test_h)
    loss = criterion(test_output, labels)
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

In [None]:
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

In [None]:
words = preprocess("It made me cry.")

In [None]:
words

In [None]:
encoded_words = [vocab_to_int[word] for word in words[1]]

In [None]:
def predict(net, review, seq_length = 200):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu"
    words = preprocess(review)
    encoded_words = [vocab_to_int[word] for word in words[1]]
    padded_words = pad_text([encoded_words], seq_length)
    padded_words = torch.from_numpy(padded_words).to(device)
    
    if(len(padded_words) == 0):
        "Your review must contain at least 1 word!"
        return None
    
    net.eval()
    h = net.init_hidden(1)
    output, h = net(padded_words, h)
    pred = (output.squeeze())
    msg = "This is a positive review." if pred[0] == 0 else "This is a negative review."
    
    return msg

In [None]:
review1 = "It made me cry."
predict(net, review1)  ## negative ##

'This is a negative review.'

In [None]:
review2 = "It was so good it made me cry."
predict(net, review2)  ## positive ##

'This is a negative review.'

In [None]:
review3 = "It's ok."
predict(net, review3)  ## negative ##

In [None]:
review4 = "This movie had the best acting and the dialogue was so good. I loved it."
predict(net, review4)  ## positive ##

In [None]:
review5 = "Garbage"
predict(net, review5)  ## negative ##