In [2]:
cd drive/My\ Drive/Colab\ Notebooks/

/content/drive/My Drive/Colab Notebooks


In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import time
import torch
import numpy as np
from string import punctuation
from collections import Counter
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

**DATA PREPARATION**

In [0]:
log_dir=""
writer = SummaryWriter(log_dir)

# read data from text files
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

#remove punctuation    
reviews = reviews.lower()
all_text = ''.join([c for c in reviews if c not in punctuation])  

# split by new lines and spaces
reviews_split = all_text.split('\n')
all_text = ' '.join(reviews_split)

# create a list of words
words = all_text.split()  

# word embedding and mapping
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab,1)}

#tokenize
reviews_ints = []
for review in reviews_split:
  reviews_ints.append([vocab_to_int[word] for word in review.split()])
  
# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == 'positive' else 0 for label in labels_split])

# review length
review_lens = Counter([len(x) for x in reviews_ints])


# remove 0-length review with their labels
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx], dtype=np.int64)


def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    ## getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=np.int64)
    
    ## for each review, I grab that review
    for i, row in enumerate(reviews_ints):
      features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

seq_length = 200

features = pad_features(reviews_ints, seq_length=seq_length)

# train test split
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)
split_idx = int(len(features)*0.8)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

**DATA LOAD**

In [0]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

**LSTM Model**

In [0]:
class ReviewLSTM(nn.Module):
    """
    The LSTM model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(ReviewLSTM, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)
        
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())

        
        return hidden

**TRAINING**

In [14]:
vocab_size = len(vocab_to_int) + 1 # +1 for zero padding + our word tokens
output_size = 1
embedding_dim = 400 
hidden_dim = 256
n_layers = 2

model = ReviewLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.cuda()
model.train()
        
lr=0.0001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 17 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping


for e in range(epochs):

    #time calculation
    start = time.time()

    #initialize total training loss
    total_training_loss= 0.0

    # initialize hidden state
    h = model.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        inputs, labels = inputs.cuda(), labels.cuda()
            

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        model.zero_grad()

        # get the output from the model
        output, h = model(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_training_loss+=loss.item()
        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, labels = inputs.cuda(), labels.cuda()
                    

                output, val_h = model(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            model.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            writer.add_scalar('Validation_Loss',np.mean(val_losses),counter)
    end = time.time()
    writer.add_scalar('Time_Taken_for_each_epoch',end-start,e)
    writer.add_scalar('Total_training_loss_for_each_epoch', total_training_loss, e)
            
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = model.init_hidden(batch_size)

model.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    inputs, labels = inputs.cuda(), labels.cuda()
        
    
    # get predicted outputs
    output, h = model(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Epoch: 1/17... Step: 100... Loss: 0.694316... Val Loss: 0.692038
Epoch: 1/17... Step: 200... Loss: 0.686400... Val Loss: 0.680889
Epoch: 1/17... Step: 300... Loss: 0.583242... Val Loss: 0.596320
Epoch: 1/17... Step: 400... Loss: 0.638035... Val Loss: 0.561320
Epoch: 2/17... Step: 500... Loss: 0.420604... Val Loss: 0.569542
Epoch: 2/17... Step: 600... Loss: 0.567373... Val Loss: 0.502231
Epoch: 2/17... Step: 700... Loss: 0.456642... Val Loss: 0.451310
Epoch: 2/17... Step: 800... Loss: 0.416352... Val Loss: 0.446645
Epoch: 3/17... Step: 900... Loss: 0.358730... Val Loss: 0.492935
Epoch: 3/17... Step: 1000... Loss: 0.530183... Val Loss: 0.452473
Epoch: 3/17... Step: 1100... Loss: 0.383606... Val Loss: 0.477211
Epoch: 3/17... Step: 1200... Loss: 0.334373... Val Loss: 0.427914
Epoch: 4/17... Step: 1300... Loss: 0.268622... Val Loss: 0.455816
Epoch: 4/17... Step: 1400... Loss: 0.438608... Val Loss: 0.513695
Epoch: 4/17... Step: 1500... Loss: 0.330515... Val Loss: 0.410962
Epoch: 4/17... Step

**RAW TESTING**

In [0]:
test_review_neg = 'Nice  make one in every State of India.  Do in whole India  state wise.'

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuatuon
    test_text = ''.join([c for c in test_review if c not in punctuation])
    
    # splitting by spaces
    test_words = test_text.split()
    
    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])
    
    return test_ints
  
# test code and generate tokenized review
test_ints = tokenize_review(test_review_neg)

seq_length = 200
features = pad_features(test_ints, seq_length)

feature_tensor = torch.from_numpy(features)

def predict(net, test_review, sequence_length=200):
    ''' Prints out whether a give review is predicted to be 
        positive or negative in sentiment, using a trained model.
        
        params:
        net - A trained net 
        test_review - a review made of normal text and punctuation
        sequence_length - the padded length of a review
        '''
    
    net.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenize sequence
    seq_length = sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass to model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    feature_tensor = feature_tensor.cuda() 
      
    # get the output from the model
    output, h = net(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response based on whether test_review is pos/neg
    if(pred.item()==1):
      print('Positive review detected!')
    else:
      print('Negative review detected!')

      
test_review_pos = 'This movie had the best acting and the dialogue was so good. I loved it.'

seq_length=200      

**RESULT**

In [0]:
predict(model, test_review_neg, seq_length)
predict(model, test_review_pos, seq_length)

Prediction value, pre-rounding: 0.135852
Negative review detected!
Prediction value, pre-rounding: 0.646279
Positive review detected!
