In [27]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from numpy import array

In [28]:
reviews = pd.read_csv("~/Desktop/ece684/synthetic_reviews.csv", sep="\t")
reviews

Unnamed: 0,index,review,sentiment
0,1194,attempt line i let the nuclear scene beautiful...,0
1,1128,self from with could socially least in the old...,0
2,1281,gambling for about jungle one the plays have t...,0
3,879,cancelled be slashers trying wasn all go more ...,0
4,729,semester the think but lacks creek of it of go...,0
...,...,...,...
4995,974,big there work just read was of angrier saw am...,0
4996,241,movies this danny understand to propaganda on ...,1
4997,27,old stop with shots it in in actors sargent ke...,1
4998,647,not crazy that film didn for did which victori...,1


In [29]:
import re

# Reformats Reviews


def cleanReview(rev):
    # Removing anything within a HTML tag
    edited_rev = re.compile(r"<[^>]+>").sub(" ", rev)
    # Removing Punctuation
    edited_rev = re.sub(r"[^\w\s]", " ", edited_rev)
    # Removing Numbers
    edited_rev = re.sub(r"[0-9]", " ", edited_rev)
    # Removing single characters
    edited_rev = re.sub(r"\s+[a-zA-Z]\s+", " ", edited_rev)
    # Removing multiple spaces
    edited_rev = re.sub(r"\s+", " ", edited_rev)

    return edited_rev

In [30]:
reviews["review"] = reviews["review"].apply(lambda x: x.replace("wouldn't", "would not"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("won't", "will not"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("can't", "can not"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("couldn't", "could not"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("I'm", "I am"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("ain't", "is not"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("shouldn't", "should not"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("(\w+)'ll", "\g<1> will"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("(\w+)'ve", "\g<1> have"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("(\w+)'s", "\g<1> is"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("(\w+)'re", "\g<1> are"))
reviews["review"] = reviews["review"].apply(lambda x: x.replace("(\w+)'d", "\g<1> would"))

In [31]:
review_list = []

# Adding all of the movie reviews to a list
for r in reviews["review"]:
    review_list.append(cleanReview(r))

In [32]:
#Start of tokenizing
words = " ".join(review_list)
words = list(words.split(" "))

#Getting word counts
totalCounts = {}
for j in range(len(words)):
    if words[j] not in totalCounts:
        totalCounts[words[j]] = 1
    if words[j] in totalCounts:
        totalCounts[words[j]] += 1

In [33]:
#Sorting dictionary to have highest word counts first
sorted_counts=dict(sorted(totalCounts.items(), key=lambda x: x[1], reverse=True))

In [34]:
#Making the highest word count value be 1, etc.
w_to_i = {w:i+1 for i, (w) in enumerate(sorted_counts)}

In [35]:
# Encoding the words by replacing the word with its given integer
X = []
for m in review_list:
    r = [w_to_i[w] for w in m.split()]
    X.append(r)

In [36]:
import statistics
lengths = []
for i in range(len(X)):
    l = len(X[i])
    lengths.append(l)
statistics.mean(lengths)

221.1912

In [37]:
#Padding for different review lengths
X_array = np.zeros((len(X), 221), dtype =int)
for i, rev2 in enumerate(X):
    rev_length = len(rev2)

    if rev_length <= 221:
        zeros = list(np.zeros(221-rev_length))
        mat = zeros + rev2
    elif rev_length > 221:
        mat = rev2[0:221]
    X_array[i,:] = np.array(mat)

In [38]:
# Creating a training and test data sets
y = reviews["sentiment"]

y = np.array(list(map(lambda x: 1 if x == 1 else 0, y)))

In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X_array, y, test_size=0.20, random_state=42
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

In [40]:
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn

# create Tensor Dataset
train_data = TensorDataset(torch.LongTensor(X_train), torch.LongTensor(y_train))
valid_data=TensorDataset(torch.LongTensor(X_valid), torch.LongTensor(y_valid))
test_data = TensorDataset(torch.LongTensor(X_test), torch.LongTensor(y_test))

In [41]:
#dataloader
batch_size=50 #50
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [42]:
class SentimentalLSTM(nn.Module):
    def __init__(
        self, num_words, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5
    ):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embedding and LSTM layers
        self.embedding = nn.Embedding(num_words, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True
        )

        # dropout layer
        self.dropout = nn.Dropout(0.4) #0.3

        # Linear and sigmoid layer
        self.fc1 = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size()

        #   Embadding and LSTM output
        embedd = self.embedding(x)
        lstm_out, hidden = self.lstm(embedd, hidden)

        # stack up the lstm output
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully connected layers
        out = self.dropout(lstm_out)
        out = self.fc1(out)
        out = self.dropout(out)
        '''out = self.fc2(out)
        out = self.dropout(out)
        out = self.fc3(out)'''
        sig_out = self.sigmoid(out)

        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        """Initialize Hidden STATE"""
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [43]:
# Instantiate the model w/ hyperparams
#vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
num_words = len(words)
output_size = 1
embedding_dim = 221
hidden_dim = 250 #256 250
n_layers = 2 #2

net = SentimentalLSTM(num_words, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

SentimentalLSTM(
  (embedding): Embedding(1105958, 221)
  (lstm): LSTM(221, 250, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc1): Linear(in_features=250, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [44]:
lr=0.0001 #0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

# training params

epochs = 10 # is approx where I noticed the validation loss stop decreasing, 3

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()


net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)


    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()
        
        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, labels = inputs, labels 
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))


Epoch: 2/10... Step: 100... Loss: 0.688107... Val Loss: 0.692925
Epoch: 4/10... Step: 200... Loss: 0.684471... Val Loss: 0.692397
Epoch: 5/10... Step: 300... Loss: 0.664346... Val Loss: 0.685722
Epoch: 7/10... Step: 400... Loss: 0.554848... Val Loss: 0.587087
Epoch: 9/10... Step: 500... Loss: 0.650548... Val Loss: 0.468002
Epoch: 10/10... Step: 600... Loss: 0.440894... Val Loss: 0.459061


In [45]:
test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()


    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.435
Test accuracy: 0.817


In [46]:
train_acc = num_correct/len(train_loader.dataset)
print("Train accuracy: {:.3f}".format(test_acc))

Train accuracy: 0.817


In [47]:
def predictions(reviews):
    review_list2 = []

    # Adding all of the movie reviews to a list
    for r2 in reviews["review"]:
        review_list2.append(cleanReview(r2))

    # Encoding the words by replacing the word with its given integer
    X2 = []
    for m2 in review_list2:
        r2 = [w_to_i[w2] for w2 in m2.split()]
        X2.append(r)

    #Padding for different review lengths
    X_array2 = np.zeros((len(X2), 221), dtype =int)
    for i2, rev8 in enumerate(X2):
        rev_length2 = len(rev8)

        if rev_length2 <= 221:
            zeros2 = list(np.zeros(221-rev_length2))
            mat2 = zeros2 + rev8
        elif rev_length2 > 221:
            mat2 = rev8[0:221]
        X_array2[i2,:] = np.array(mat2)

    counter = 0
    h = net.init_hidden(batch_size)
    
    final_probs = []
    with torch.no_grad():
        # batch loop
        for inputs, labels in train_loader:
            counter += 1

            if(train_on_gpu):
                inputs=inputs.cuda()
                labels=labels.cuda()
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
        
            # get the output from the model
            output, h = net(inputs, h)
            final_probs.append(output)

        return final_probs

In [48]:
prediction = predictions(reviews)
print(prediction)

[tensor([0.2858, 0.8430, 0.7066, 0.1627, 0.5354, 0.7859, 0.1638, 0.3929, 0.8117,
        0.8063, 0.2262, 0.8595, 0.7969, 0.8467, 0.8300, 0.3722, 0.8839, 0.6210,
        0.8147, 0.2015, 0.8303, 0.2208, 0.2096, 0.8066, 0.8634, 0.1695, 0.8436,
        0.8700, 0.1722, 0.3576, 0.6575, 0.8552, 0.1606, 0.8099, 0.3998, 0.6622,
        0.4305, 0.1939, 0.8112, 0.7640, 0.6031, 0.1888, 0.8462, 0.6844, 0.2827,
        0.1378, 0.7942, 0.7163, 0.8433, 0.8506]), tensor([0.7668, 0.4739, 0.8199, 0.7997, 0.6442, 0.8627, 0.8560, 0.7416, 0.4116,
        0.8116, 0.1251, 0.8396, 0.1847, 0.8031, 0.8595, 0.2980, 0.6873, 0.1682,
        0.2212, 0.7987, 0.7941, 0.9019, 0.7900, 0.7873, 0.1510, 0.1704, 0.2554,
        0.2221, 0.1739, 0.3663, 0.3053, 0.8765, 0.8667, 0.6595, 0.1449, 0.8365,
        0.8543, 0.2647, 0.1431, 0.2348, 0.4740, 0.8054, 0.7681, 0.7320, 0.8961,
        0.7796, 0.2212, 0.8743, 0.6902, 0.8722]), tensor([0.1716, 0.5915, 0.8472, 0.8663, 0.1582, 0.1850, 0.1117, 0.2438, 0.8215,
        0.7638, 0.8

In [52]:
reviews.head(20)

Unnamed: 0,index,review,sentiment
0,1194,attempt line i let the nuclear scene beautiful...,0
1,1128,self from with could socially least in the old...,0
2,1281,gambling for about jungle one the plays have t...,0
3,879,cancelled be slashers trying wasn all go more ...,0
4,729,semester the think but lacks creek of it of go...,0
5,1652,be for chemistry explicit more is so would to ...,0
6,783,the work me about one seizures for of film ave...,0
7,2476,etc tunnel many as wasted hordes california vi...,1
8,2390,the that loved edgy or on seen glorious are su...,1
9,985,voice who production be car the the mountain h...,0


In [54]:
reviews["review"][7]

