# Sentiment Analysis with an RNN using Word Embedding

The following implements word embedding to learn the sentiment (positive or negative) of movie reviews

In [None]:
# DON'T MODIFY THIS CELL

# import os
# os.environ["CUDA_VISIBLE_DEVICES"]="-1" 

# import some modules
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.optimizers import RMSprop
import pandas as pd
import numpy as np
import re
import requests
import matplotlib.pyplot as plt

In [None]:
# DON'T MODIFY THIS CELL

# import the data
reviews_url = 'https://raw.githubusercontent.com/prof-groff/deep-learning/master/data/sentiment/reviews.txt'
labels_url = 'https://raw.githubusercontent.com/prof-groff/deep-learning/master/data/sentiment/labels.txt'

# fail loudly on a bad download instead of silently tokenizing an HTTP error page
reviews_response = requests.get(reviews_url, timeout=30)
reviews_response.raise_for_status()
reviews = reviews_response.text
labels_response = requests.get(labels_url, timeout=30)
labels_response.raise_for_status()
labels = labels_response.text

# do some additional preprocessing
# drop stray "br" tokens (remnants of <br> tags); the lookahead leaves the
# following space in place so the words on either side are never fused together
reviews = re.sub(r' br(?= )', '', reviews)
# collapse every run of whitespace EXCEPT newlines into a single space
# (newlines are kept because the text is split into one review per line below)
reviews = re.sub(r'[^\S\n]+', ' ', reviews)

# split the reviews and labels text at \n characters to make lists
# the final element of each split is dropped -- presumably it is the empty string
# left after the file's trailing newline (TODO confirm against the data files)
reviews_list = reviews.split('\n')[:-1]
labels_list = labels.split('\n')[:-1]

In [None]:
# DON'T MODIFY THIS CELL

# print two sample reviews, each preceded by its sentiment label in upper case
for position, sample_idx in enumerate((1001, 1002)):
    if position:
        # blank-line separator between the two samples
        print('\n')
    print(labels_list[sample_idx].upper())
    print('\n')
    print(reviews_list[sample_idx])


In [None]:
# DON'T MODIFY THIS CELL

# histogram of the number of whitespace-separated words in each review
# some of the reviews are really long but most are under 500 words long
review_lengths = [len(review.split()) for review in reviews_list]

plt.hist(review_lengths, bins=50)
plt.xlabel('review length')
plt.ylabel('counts')
plt.show()

In [None]:
# TO-DO: PICK A MAX_LENGTH (AT LEAST 100) AND TRIM THE DATA TO ONLY THOSE REVIEWS
# WITH NO MORE THAN THIS MANY WORDS
# max_length is a word count: longer reviews are dropped below, and it is also used
# later as the width of the padded token matrix and the Embedding input_length
max_length = 

# DO NOT MODIFY THIS CELL BELOW THIS POINT

# keep only the (review, label) pairs whose review has at most max_length words
reviews_trimmed = []
labels_trimmed = []

for review, label in zip(reviews_list, labels_list):
    if len(review.split()) <= max_length:
        reviews_trimmed.append(review)
        labels_trimmed.append(label)

print('TOTAL REVIEWS: ' + str(len(reviews_list)))
# the filter above is inclusive, so the message says "<=" rather than "<"
print('REVIEWS WITH <= ' + str(max_length) + ' WORDS: ' + str(len(reviews_trimmed)))

In [None]:
# DO NOT MODIFY THIS CELL

# collect every distinct whitespace-separated word found in the trimmed reviews
vocab = set()
for trimmed_review in reviews_trimmed:
    vocab.update(trimmed_review.split())

# one more than the number of distinct words, so that zero is reserved for padding
vocab_length = 1 + len(vocab)
print('VOCAB LENGTH: ' + str(vocab_length))

In [None]:
# DO NOT MODIFY THIS CELL

# a function to create mappings between words and integers
# each integer is a key for each word and vice versa
def create_lookup_tables(vocab):
    """Build word <-> integer lookup tables for a vocabulary.

    Ids are assigned consecutively starting at 1, so 0 is never used as a
    word id.

    Args:
        vocab: iterable of words. An unordered ``set``/``frozenset`` is sorted
            first so that the same vocabulary always yields the same ids
            (set iteration order for strings can change between interpreter
            runs). Ordered inputs such as lists keep their given order.

    Returns:
        A ``(vocab_to_int, int_to_vocab)`` tuple of dicts that are inverses of
        each other.
    """
    if isinstance(vocab, (set, frozenset)):
        try:
            vocab = sorted(vocab)
        except TypeError:
            # items are not mutually comparable; keep the set's own order
            pass

    # enumerate pairs each word with a running index, starting at 1
    int_to_vocab = dict(enumerate(vocab, 1))
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab

# build both lookup tables (word -> id and id -> word) from the review vocabulary
vocab_to_int, int_to_vocab = create_lookup_tables(vocab)

In [None]:
# DO NOT MODIFY THIS CELL

# turn each review into a fixed-width row of word ids and each label into a one-hot row
n_reviews = len(reviews_trimmed)
reviews_vect = np.zeros([n_reviews, max_length])
labels_vect = np.zeros([n_reviews, 2])
for row, (review, label) in enumerate(zip(reviews_trimmed, labels_trimmed)):
    tokens = review.split()
    # left-pad with zeros: the words fill the last len(tokens) columns of the row
    first_col = max_length - len(tokens)
    for col, word in enumerate(tokens, start=first_col):
        reviews_vect[row, col] = vocab_to_int[word]
    # column 1 marks 'positive'; any other label text is marked in column 0
    label_col = 1 if label == 'positive' else 0
    labels_vect[row, label_col] = 1

In [None]:
# DO NOT MODIFY THIS CELL

# show the first trimmed review three ways: its label, its text, and its token row
print(
    labels_trimmed[0].upper(),
    '\n',
    reviews_trimmed[0],
    '\n',
    reviews_vect[0],
    sep='\n',
)

In [None]:
# TO-DO: BREAK THE DATA INTO TRAINING, VALIDATION, AND TESTING SETS
# WITH AT LEAST 1000 ELEMENTS IN THE VALIDATION AND TESTING SETS
# both sizes are row counts taken from the END of the data: the last test_size rows
# become the test set and the val_size rows just before them the validation set
val_size = 
test_size = 

# DO NOT MODIFY THIS CELL BELOW THIS POINT

# slice from the end: the last test_size rows are the test set, the val_size rows
# before them the validation set, and everything earlier is the training set
holdout_size = val_size + test_size
x_train, y_train = reviews_vect[:-holdout_size], labels_vect[:-holdout_size]
x_val, y_val = reviews_vect[-holdout_size:-test_size], labels_vect[-holdout_size:-test_size]
x_test, y_test = reviews_vect[-test_size:], labels_vect[-test_size:]

In [None]:
# TO-DO: SELECT MODEL SIZE PARAMETERS
# embedding_size is passed to the Embedding layer as output_dim
# memory_units is not used by the provided code -- presumably it is meant to be
# the number of units of the LSTM layer you add below
embedding_size = 
memory_units = 

# TO-DO: SELECT HYPERPARAMETERS
# batch_size is passed to model.fit and model.evaluate, epochs to model.fit,
# and learning_rate to the RMSprop optimizer
batch_size = 
epochs = 
learning_rate = 

# BUILD A MODEL
print('build model...'.upper())
# first layer: maps each word id in a max_length-long row to an embedding_size vector
embedding_layer = Embedding(
    input_dim=vocab_length,
    output_dim=embedding_size,
    input_length=max_length,
)
model = Sequential()
model.add(embedding_layer)
# TO-DO: ADD A LSTM LAYER
model.add()
# TO-DO: ADD A DENSE (FULLY CONNECTED) LAYER WITH 2 OUTPUT UNITS AND SOFTMAX ACTIVATION
model.add()

# pass the learning rate positionally: its keyword was renamed from `lr` to
# `learning_rate` in later Keras releases, but it is the first parameter either way
optimizer = RMSprop(learning_rate)
# ALTERNATIVE OPTIMIZER
# optimizer = 'adam'

# NOTE(review): the labels are one-hot rows and the TO-DO above asks for a 2-unit
# softmax output; 'categorical_crossentropy' is the conventional loss for that
# pairing -- confirm whether 'binary_crossentropy' is intentional
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

model.summary()
print('\ntrain...\n'.upper())
# train on the training split, reporting validation metrics after every epoch
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_val, y_val))
# final check on the held-out test split: evaluate returns the loss and the accuracy metric
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('\ntest score: '.upper() + str(score))
print('test accuracy: '.upper() + str(acc))


In [None]:
# let's look at a specific review and compare the predicted sentiment to the actual sentiment

# TO-DO: PICK A TEST CASE
# test_case is used below as a row index into both x_test and y_test
test_case =  # index of one of the reviews in the test set

# DO NOT MODIFY THIS CELL BELOW THIS POINT

# pull out the chosen test row and its one-hot label
x = x_test[test_case]
y = y_test[test_case]

# convert from tokens back to text, skipping the zero padding entries
test_review = ' '.join(int_to_vocab[token] for token in x if token != 0)

# show the review text that is about to be classified
print('predicting sentiment of following review...\n'.upper())
print(test_review)
print('\n')

print('passing the review through the trained model...\n'.upper())
# the model expects a batch, so present the single review as a 1-row matrix
x = x.reshape(1, len(x))
y_prime = model.predict(x)

# index 0 of the output / label row means negative, index 1 means positive
sentiment_names = ['negative', 'positive']
print('predicted sentiment - 0 - negative, 1 - positive'.upper())
print(y_prime[0])
print(sentiment_names[np.argmax(y_prime[0])].upper())
print('\n')
print('actual sentiment'.upper())
print(sentiment_names[np.argmax(y)].upper())


In [None]:
# CHALLENGE: WRITE YOUR OWN REVIEW AND USE THE MODEL TO PREDICT ITS SENTIMENT