In [38]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from numpy import array

In [39]:
reviews = pd.read_csv("~/Desktop/ece684/movies.csv")
reviews

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0
...,...,...
4995,This is the kind of picture John Lassiter woul...,1
4996,A MUST SEE! I saw WHIPPED at a press screening...,1
4997,NBC should be ashamed. I wouldn't allow my chi...,0
4998,This movie is a clumsy mishmash of various gho...,0


In [40]:
subset = reviews[reviews["label"]==0]
subset
#about half are negative and half are positive

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
4,"when i first read about ""berlin am meer"" i did...",0
6,I saw a screening of this movie last night. I ...,0
...,...,...
4988,"I had high expectations following ""My Beautifu...",0
4992,This one is just like the 6th movie. The movie...,0
4997,NBC should be ashamed. I wouldn't allow my chi...,0
4998,This movie is a clumsy mishmash of various gho...,0


In [41]:
reviews.isnull().values.any()
# The data set has no missing values

False

In [42]:
reviews["text"][6]

"I saw a screening of this movie last night. I had high expectations going into it, but was definitely disappointed. Within 5 minutes of the opening, Williams is already campaigning for his presidency. And he becomes president in the first 40 minutes. So there goes all that aspect of the movie. The first half hour are hilarious. Don't get me wrong, the movie has its moments. But after the first half hour, it takes a turn for the worst. It becomes less of a comedy, and more of a thriller/drama/love story...which is pointless. the movie goes nowhere and stands still for a good 30 minutes. there are laughs interspersed here and there, but the consistently funny part is in the beginning and only the beginning. at one point, the biggest cheer i heard in the audience is when a person in the crowd yelled 'boooo' during a very confusingly emotional scene. Williams gives a great performance, right on par with his comedic style. Walken also delivers a strong supporting role as only he can. I thi

In [43]:
import re

# Reformats Reviews


def cleanReview(rev):
    # Removing anything within a HTML tag
    edited_rev = re.compile(r"<[^>]+>").sub(" ", rev)
    # Removing Punctuation
    edited_rev = re.sub(r"[^\w\s]", " ", edited_rev)
    # Removing Numbers
    edited_rev = re.sub(r"[0-9]", " ", edited_rev)
    # Removing single characters
    edited_rev = re.sub(r"\s+[a-zA-Z]\s+", " ", edited_rev)
    # Removing multiple spaces
    edited_rev = re.sub(r"\s+", " ", edited_rev)

    return edited_rev

In [44]:
reviews["text"] = reviews["text"].apply(lambda x: x.replace("wouldn't", "would not"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("won't", "will not"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("can't", "can not"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("couldn't", "could not"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("I'm", "I am"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("ain't", "is not"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("shouldn't", "should not"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("(\w+)'ll", "\g<1> will"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("(\w+)'ve", "\g<1> have"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("(\w+)'s", "\g<1> is"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("(\w+)'re", "\g<1> are"))
reviews["text"] = reviews["text"].apply(lambda x: x.replace("(\w+)'d", "\g<1> would"))

In [45]:
reviews["text"]

0       I always wrote this series off as being a comp...
1       1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...
2       This movie was so poorly written and directed ...
3       The most interesting thing about Miryang (Secr...
4       when i first read about "berlin am meer" i did...
                              ...                        
4995    This is the kind of picture John Lassiter woul...
4996    A MUST SEE! I saw WHIPPED at a press screening...
4997    NBC should be ashamed. I would not allow my ch...
4998    This movie is a clumsy mishmash of various gho...
4999    Formula movie about the illegitimate son of a ...
Name: text, Length: 5000, dtype: object

In [46]:
review_list = []

# Adding all of the movie reviews to a list
for r in reviews["text"]:
    review_list.append(cleanReview(r))

In [47]:
#Start of tokenizing
words = " ".join(review_list)
words = list(words.split(" "))

#Getting word counts
totalCounts = {}
for j in range(len(words)):
    if words[j] not in totalCounts:
        totalCounts[words[j]] = 1
    if words[j] in totalCounts:
        totalCounts[words[j]] += 1

In [48]:
#Sorting dictionary to have highest word counts first
sorted_counts=dict(sorted(totalCounts.items(), key=lambda x: x[1], reverse=True))

In [49]:
#Making the highest word count value be 1, etc.
w_to_i = {w:i+1 for i, (w) in enumerate(sorted_counts)}

In [50]:
# Encoding the words by replacing the word with its given integer
X = []
for m in review_list:
    r = [w_to_i[w] for w in m.split()]
    X.append(r)

In [51]:
import statistics
lengths = []
for i in range(len(X)):
    l = len(X[i])
    lengths.append(l)
statistics.mean(lengths)

221.3644

In [52]:
#Padding for different review lengths
X_array = np.zeros((len(X), 221), dtype =int)
for i, rev2 in enumerate(X):
    rev_length = len(rev2)

    if rev_length <= 221:
        zeros = list(np.zeros(221-rev_length))
        mat = zeros + rev2
    elif rev_length > 221:
        mat = rev2[0:221]
    X_array[i,:] = np.array(mat)

In [53]:
# Creating a training and test data sets
y = reviews["label"]

y = np.array(list(map(lambda x: 1 if x == 1 else 0, y)))

In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X_array, y, test_size=0.20, random_state=42
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

In [55]:
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn

# create Tensor Dataset
train_data = TensorDataset(torch.LongTensor(X_train), torch.LongTensor(y_train))
valid_data=TensorDataset(torch.LongTensor(X_valid), torch.LongTensor(y_valid))
test_data = TensorDataset(torch.LongTensor(X_test), torch.LongTensor(y_test))

In [56]:
#dataloader
batch_size=50 #50
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [57]:
class SentimentalLSTM(nn.Module):
    def __init__(
        self, num_words, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5
    ):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embedding and LSTM layers
        self.embedding = nn.Embedding(num_words, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True
        )

        # dropout layer
        self.dropout = nn.Dropout(0.4) #0.3

        # Linear and sigmoid layer
        self.fc1 = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size()

        #   Embadding and LSTM output
        embedd = self.embedding(x)
        lstm_out, hidden = self.lstm(embedd, hidden)

        # stack up the lstm output
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully connected layers
        out = self.dropout(lstm_out)
        out = self.fc1(out)
        out = self.dropout(out)
        '''out = self.fc2(out)
        out = self.dropout(out)
        out = self.fc3(out)'''
        sig_out = self.sigmoid(out)

        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        """Initialize Hidden STATE"""
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [58]:
# Instantiate the model w/ hyperparams
#vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
num_words = len(words)
output_size = 1
embedding_dim = 221
hidden_dim = 250 #256 250
n_layers = 2 #2

net = SentimentalLSTM(num_words, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

SentimentalLSTM(
  (embedding): Embedding(1111828, 221)
  (lstm): LSTM(221, 250, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc1): Linear(in_features=250, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [59]:
lr=0.0001 #0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

# training params

epochs = 10 # is approx where I noticed the validation loss stop decreasing, 3

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()


net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)


    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()
        
        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, labels = inputs, labels 
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))


Epoch: 2/10... Step: 100... Loss: 0.696152... Val Loss: 0.691314
Epoch: 4/10... Step: 200... Loss: 0.692318... Val Loss: 0.687720
Epoch: 5/10... Step: 300... Loss: 0.661424... Val Loss: 0.672706
Epoch: 7/10... Step: 400... Loss: 0.579090... Val Loss: 0.621010
Epoch: 9/10... Step: 500... Loss: 0.614898... Val Loss: 0.600089
Epoch: 10/10... Step: 600... Loss: 0.484117... Val Loss: 0.571037


In [60]:
test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()


    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.577
Test accuracy: 0.690


In [61]:
train_acc = num_correct/len(train_loader.dataset)
print("Train accuracy: {:.3f}".format(test_acc))

Train accuracy: 0.690


import matplotlib.pyplot as plt

"""Plot training progress."""
fig, ax = plt.subplots(2, 1, figsize=(12, 8))
ax[0].plot(test_losses)
ax[0].set_ylabel("Test Loss")
ax[0].set_title("Test Loss")

ax[1].plot(test_acc)
ax[1].set_ylabel("Classification Accuracy")
ax[1].set_title("Test Accuracy")

plt.tight_layout()
plt.show()

fig, ax = plt.subplots(2, 1, figsize=(12, 8))
ax[0].plot(val_losses)
ax[0].set_ylabel("Validation Loss")
ax[0].set_title("Validation Loss")

ax[1].plot(test_acc)
ax[1].set_ylabel("Classification Accuracy")
ax[1].set_title("Test Accuracy")

plt.tight_layout()
plt.show()

In [62]:
'''def predict(reviews, net):
    rev1_lst=[]
    rev1_lst.append(cleanReview(rev1))
    rev1_lst

    X2 = []
    for m2 in rev1_lst:
        r2 = [w_to_i[w] for w in m2.split()]
        X2.append(r2)

    X_array2 = np.zeros((len(X2), 221), dtype =int)
    rev_length2 = len(X2)

    if rev_length2 <= 221:
        zeros2 = list(np.zeros(221-rev_length))
        mat2 = X2 + zeros2
    elif rev_length2 > 221:
        mat2 = rev2[0:221]
    X_array2 = np.array(mat2)

    h = net.init_hidden(batch_size)


    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()
        
        # get the output from the model
        output, h = net(inputs, h)

    return output.item()'''

"def predict(reviews, net):\n    rev1_lst=[]\n    rev1_lst.append(cleanReview(rev1))\n    rev1_lst\n\n    X2 = []\n    for m2 in rev1_lst:\n        r2 = [w_to_i[w] for w in m2.split()]\n        X2.append(r2)\n\n    X_array2 = np.zeros((len(X2), 221), dtype =int)\n    rev_length2 = len(X2)\n\n    if rev_length2 <= 221:\n        zeros2 = list(np.zeros(221-rev_length))\n        mat2 = X2 + zeros2\n    elif rev_length2 > 221:\n        mat2 = rev2[0:221]\n    X_array2 = np.array(mat2)\n\n    h = net.init_hidden(batch_size)\n\n\n    # batch loop\n    for inputs, labels in train_loader:\n        counter += 1\n\n        if(train_on_gpu):\n            inputs=inputs.cuda()\n            labels=labels.cuda()\n        # Creating new variables for the hidden state, otherwise\n        # we'd backprop through the entire training history\n        h = tuple([each.data for each in h])\n\n        # zero accumulated gradients\n        net.zero_grad()\n        \n        # get the output from the model\n    

In [65]:
def predictions(reviews):
    review_list2 = []

    # Adding all of the movie reviews to a list
    for r2 in reviews["text"]:
        review_list2.append(cleanReview(r2))

    # Encoding the words by replacing the word with its given integer
    X2 = []
    for m2 in review_list2:
        r2 = [w_to_i[w2] for w2 in m2.split()]
        X2.append(r)

    #Padding for different review lengths
    X_array2 = np.zeros((len(X2), 221), dtype =int)
    for i2, rev8 in enumerate(X2):
        rev_length2 = len(rev8)

        if rev_length2 <= 221:
            zeros2 = list(np.zeros(221-rev_length2))
            mat2 = zeros2 + rev8
        elif rev_length2 > 221:
            mat2 = rev8[0:221]
        X_array2[i2,:] = np.array(mat2)

    counter = 0
    h = net.init_hidden(batch_size)
    
    final_probs = []
    with torch.no_grad():
        # batch loop
        for inputs, labels in train_loader:
            counter += 1

            if(train_on_gpu):
                inputs=inputs.cuda()
                labels=labels.cuda()
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
        
            # get the output from the model
            output, h = net(inputs, h)
            final_probs.append(output)

        return final_probs

In [69]:
prediction = predictions(reviews)
print(prediction)

[tensor([0.1833, 0.7122, 0.6963, 0.7764, 0.4810, 0.7806, 0.7723, 0.2054, 0.8332,
        0.1642, 0.7441, 0.2583, 0.2985, 0.2478, 0.1306, 0.1607, 0.8262, 0.1646,
        0.6686, 0.3993, 0.1189, 0.1562, 0.4004, 0.2801, 0.8193, 0.2178, 0.8933,
        0.3521, 0.2682, 0.8213, 0.2880, 0.7835, 0.6741, 0.7987, 0.9099, 0.1131,
        0.7110, 0.3359, 0.8120, 0.8211, 0.5013, 0.8275, 0.5388, 0.8589, 0.8655,
        0.1233, 0.8406, 0.1916, 0.8341, 0.5186]), tensor([0.8223, 0.1499, 0.5615, 0.6986, 0.5422, 0.8505, 0.6626, 0.8429, 0.1389,
        0.2150, 0.8184, 0.5047, 0.8741, 0.6570, 0.6241, 0.1812, 0.8994, 0.1934,
        0.6604, 0.6460, 0.7666, 0.6805, 0.5312, 0.7316, 0.8447, 0.8233, 0.8060,
        0.8306, 0.3077, 0.1358, 0.2183, 0.1828, 0.2796, 0.8595, 0.1610, 0.4470,
        0.2911, 0.8276, 0.7262, 0.1439, 0.3001, 0.8865, 0.3945, 0.5489, 0.3301,
        0.1960, 0.7462, 0.8535, 0.8358, 0.3651]), tensor([0.8845, 0.1149, 0.1112, 0.4812, 0.1107, 0.1877, 0.8932, 0.4947, 0.2166,
        0.8265, 0.6

In [67]:
reviews

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0
...,...,...
4995,This is the kind of picture John Lassiter woul...,1
4996,A MUST SEE! I saw WHIPPED at a press screening...,1
4997,NBC should be ashamed. I would not allow my ch...,0
4998,This movie is a clumsy mishmash of various gho...,0
