In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') 

[nltk_data] Downloading package punkt to /home/sid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv('movie_review.csv',
                 names=['Col1', 'Col2', 'Col3','Col4','Col5','Col6'], header=1)
df.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6
0,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
1,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
2,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
3,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos
4,0,cv000,29590,5,if you can get past the whole comic book thing...,pos


In [3]:
df.shape

(64719, 6)

In [4]:
reviews = df['Col5']
labels = df['Col6']
reviews = reviews[0:201]
reviews

0      for starters , it was created by alan moore ( ...
1      to say moore and campbell thoroughly researche...
2      the book ( or " graphic novel , " if you will ...
3      in other words , don't dismiss this film becau...
4      if you can get past the whole comic book thing...
                             ...                        
196    because american history x is a mainstream fil...
197    i partially disagree with this , because , alt...
198    the reasons for why he becomes a skinhead are ...
199    it's clear that he's passionate about his beli...
200    of course , it helps that kaye has an actor as...
Name: Col5, Length: 201, dtype: object

In [5]:
labels = labels[0:201]
labels

0      pos
1      pos
2      pos
3      pos
4      pos
      ... 
196    pos
197    pos
198    pos
199    pos
200    pos
Name: Col6, Length: 201, dtype: object

## Cleaning and pre-processing

In [6]:
from string import punctuation
punctuations1 = '''!()-[]``{};:'"\,<>./?@#$%^&*_~'''

In [7]:
clean_text=''
reviews_split=[]
for review in reviews:
    reviews_split.append(review)
    for ch in nltk.word_tokenize(review):
        if ch not in punctuation and ch not in punctuations1:
            clean_text=clean_text+ ' '+ch
clean_text

" for starters it was created by alan moore and eddie campbell who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd the book or graphic novel if you will is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes in other words do n't dismiss this film because of its source if you can get past the whole comic book thing you might find another stumbling block in from hell 's directors albert and allen hughes getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in well anything but riddle me this who better to direct a film that 's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society the ghetto in question is of course whitechapel in 1888 london 's east end it 's a filthy so

In [8]:
len(reviews_split)

201

In [9]:
words = clean_text.split()
words[:30]

['for',
 'starters',
 'it',
 'was',
 'created',
 'by',
 'alan',
 'moore',
 'and',
 'eddie',
 'campbell',
 'who',
 'brought',
 'the',
 'medium',
 'to',
 'a',
 'whole',
 'new',
 'level',
 'in',
 'the',
 'mid',
 "'80s",
 'with',
 'a',
 '12-part',
 'series',
 'called',
 'the']

## Vectorization

In [10]:
from collections import Counter

counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
print(vocab)

['the', 'a', 'and', 'of', 'to', 'in', 'is', 'it', "'s", 'that', 'his', 'with', 'as', 'he', 'but', 'from', 'for', "n't", 'this', 'film', 'an', 'on', 'by', 'i', 'has', 'was', 'like', 'all', 'not', 'be', 'you', 'have', 'because', 'are', 'about', 'shark', 'do', 'so', 'does', 'than', 'even', 'they', 'at', 'one', 'who', 'movie', 'lumumba', 'what', 'two', 'new', 'first', 'there', 'story', 'more', 'when', 'him', 'had', 'jackie', 'or', 'been', 'how', 'much', 'time', 'were', 'right', "'ve", 'us', '--', 'its', 'up', 'good', 'performance', 'only', 'jaws', 'we', 'chan', 'other', 'into', 'no', 'then', 'know', '_election_', 'money', 'spielberg', 'men', 'history', 'little', 'if', 'better', 'enough', 'both', 'make', 'see', 'now', 'production', 'her', 'every', 'films', 'high', 'school', 'makes', '_rushmore_', 'most', 'gets', 'life', 'why', 'got', 'them', 'out', 'shaw', 'american', 'black', 'man', 'patrice', 'derek', 'campbell', 'over', 'can', 'get', 'seems', 'me', 'place', 'through', 'world', 'here', 'i

In [11]:
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
print(vocab_to_int)

{'the': 1, 'a': 2, 'and': 3, 'of': 4, 'to': 5, 'in': 6, 'is': 7, 'it': 8, "'s": 9, 'that': 10, 'his': 11, 'with': 12, 'as': 13, 'he': 14, 'but': 15, 'from': 16, 'for': 17, "n't": 18, 'this': 19, 'film': 20, 'an': 21, 'on': 22, 'by': 23, 'i': 24, 'has': 25, 'was': 26, 'like': 27, 'all': 28, 'not': 29, 'be': 30, 'you': 31, 'have': 32, 'because': 33, 'are': 34, 'about': 35, 'shark': 36, 'do': 37, 'so': 38, 'does': 39, 'than': 40, 'even': 41, 'they': 42, 'at': 43, 'one': 44, 'who': 45, 'movie': 46, 'lumumba': 47, 'what': 48, 'two': 49, 'new': 50, 'first': 51, 'there': 52, 'story': 53, 'more': 54, 'when': 55, 'him': 56, 'had': 57, 'jackie': 58, 'or': 59, 'been': 60, 'how': 61, 'much': 62, 'time': 63, 'were': 64, 'right': 65, "'ve": 66, 'us': 67, '--': 68, 'its': 69, 'up': 70, 'good': 71, 'performance': 72, 'only': 73, 'jaws': 74, 'we': 75, 'chan': 76, 'other': 77, 'into': 78, 'no': 79, 'then': 80, 'know': 81, '_election_': 82, 'money': 83, 'spielberg': 84, 'men': 85, 'history': 86, 'little'

In [24]:
reviews_ints = [] 
punctuations1 = '''!()-[]``{};:'"\,<>./?@#$%^&*_~'''
for review in reviews:
    print("------------------------------------------")
    print(review)  
    reviews_ints1 = [] 
    for c in nltk.word_tokenize(review):
        if c not in punctuation and c not in punctuations1:
            l=vocab_to_int[c]        
            reviews_ints1.append(l)
            print(reviews_ints1)  
    reviews_ints.append(reviews_ints1)     

------------------------------------------
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
[17]
[17, 560]
[17, 560, 8]
[17, 560, 8, 26]
[17, 560, 8, 26, 304]
[17, 560, 8, 26, 304, 23]
[17, 560, 8, 26, 304, 23, 561]
[17, 560, 8, 26, 304, 23, 561, 202]
[17, 560, 8, 26, 304, 23, 561, 202, 3]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563, 1]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563, 1, 564]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563, 1, 564, 5]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563, 1, 564, 5, 2]
[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563, 1, 564, 5, 2, 152]
[17, 560, 8, 26, 304, 23, 561, 202, 

[6]
[6, 1]
[6, 1, 494]
[6, 1, 494, 19]
[6, 1, 494, 19, 266]
[6, 1, 494, 19, 266, 5]
[6, 1, 494, 19, 266, 5, 1225]
[6, 1, 494, 19, 266, 5, 1225, 27]
[6, 1, 494, 19, 266, 5, 1225, 27, 1]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2, 1228]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2, 1228, 278]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2, 1228, 278, 163]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2, 1228, 278, 163, 12]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2, 1228, 278, 163, 12, 2]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2, 1228, 278, 163, 12, 2, 500]
[6, 1, 494, 19, 266, 5, 1225, 27, 1, 1226, 499, 45, 1227, 2, 1228, 278, 163, 12, 2,

In [25]:
for i in reviews_ints:
    print(i)

[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563, 1, 564, 5, 2, 152, 50, 565, 6, 1, 566, 567, 12, 2, 568, 569, 203, 1, 570]
[5, 153, 202, 3, 116, 571, 572, 1, 204, 4, 305, 1, 205, 206, 30, 27, 573, 574, 575, 7, 306, 5, 207, 2, 87, 307]
[1, 154, 59, 576, 308, 88, 31, 309, 7, 117, 577, 578, 208, 3, 579, 209, 310, 54, 10, 580, 4, 581, 15, 582]
[6, 77, 583, 37, 18, 584, 19, 20, 33, 4, 69, 585]
[88, 31, 118, 119, 586, 1, 152, 210, 154, 211, 31, 212, 311, 587, 588, 155, 6, 16, 156, 9, 312, 589, 3, 313, 314]
[590, 1, 314, 591, 5, 315, 19, 120, 213, 13, 592, 13, 316, 593, 594, 6, 214, 215, 15, 595, 121, 19, 45, 89, 5, 315, 2, 20, 10, 9, 317, 6, 1, 318, 3, 596, 319, 597, 598, 599, 40, 1, 600, 601, 320, 602, 321, 603]
[1, 318, 6, 216, 7, 4, 157, 322, 6, 604, 323, 9, 605, 158]
[8, 9, 2, 606, 607, 122, 217, 1, 608, 203, 609, 34, 306, 5, 119, 2, 87, 610, 35, 19, 611, 612, 45, 25, 60, 613, 123, 218, 614, 12, 615, 616]
[55, 1, 51, 617, 219, 70, 324, 325, 618, 619, 620, 1, 124, 7, 29, 90, 621,

In [26]:
# stats about vocabulary
print('Unique words: ', len((vocab_to_int)))
# should ~ 74000+ print() # print tokens in first review 
print('Tokenized review: \n', reviews_ints[:10])

Unique words:  1693
Tokenized review: 
 [[17, 560, 8, 26, 304, 23, 561, 202, 3, 562, 116, 45, 563, 1, 564, 5, 2, 152, 50, 565, 6, 1, 566, 567, 12, 2, 568, 569, 203, 1, 570], [5, 153, 202, 3, 116, 571, 572, 1, 204, 4, 305, 1, 205, 206, 30, 27, 573, 574, 575, 7, 306, 5, 207, 2, 87, 307], [1, 154, 59, 576, 308, 88, 31, 309, 7, 117, 577, 578, 208, 3, 579, 209, 310, 54, 10, 580, 4, 581, 15, 582], [6, 77, 583, 37, 18, 584, 19, 20, 33, 4, 69, 585], [88, 31, 118, 119, 586, 1, 152, 210, 154, 211, 31, 212, 311, 587, 588, 155, 6, 16, 156, 9, 312, 589, 3, 313, 314], [590, 1, 314, 591, 5, 315, 19, 120, 213, 13, 592, 13, 316, 593, 594, 6, 214, 215, 15, 595, 121, 19, 45, 89, 5, 315, 2, 20, 10, 9, 317, 6, 1, 318, 3, 596, 319, 597, 598, 599, 40, 1, 600, 601, 320, 602, 321, 603], [1, 318, 6, 216, 7, 4, 157, 322, 6, 604, 323, 9, 605, 158], [8, 9, 2, 606, 607, 122, 217, 1, 608, 203, 609, 34, 306, 5, 119, 2, 87, 610, 35, 19, 611, 612, 45, 25, 60, 613, 123, 218, 614, 12, 615, 616], [55, 1, 51, 617, 219, 70,

In [27]:
# 1=positive, 0=negative label conversion  
encoded_labels = np.array([1 if label == 'pos' else 0 for label in labels])
print(len(encoded_labels))

201


In [28]:
print('Number of reviews before removing outliers: ', len(reviews_ints))  
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
print(non_zero_idx)

Number of reviews before removing outliers:  201
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200]


In [29]:
print('Number of reviews after removing outliers: ', len(reviews_ints))

Number of reviews after removing outliers:  201


## Splitting data

In [30]:
from keras.preprocessing import sequence
max_words = 201
X_train = sequence.pad_sequences(reviews_ints, maxlen=max_words)
print(X_train[0])
print(X_train[1])
print(X_train[2])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0  17 560   8  26 304  23 561 202   3 562
 116  45 563   1 564   5   2 152  50 565   6   1 566 567  12   2 568 569
 203   1 570]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 

In [31]:
split_frac = 0.8
split_idx = int(len(X_train)*split_frac)
train_x, remaining_x = X_train[:split_idx], X_train[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]
test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

In [32]:
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), "\nValidation set: \t{}".format(val_x.shape),"\nTest set: \t\t{}".format(test_x.shape))
print("Train set: \t\t{}".format(train_y.shape), "\nValidation set: \t{}".format(val_y.shape),"\nTest set: \t\t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(160, 201) 
Validation set: 	(20, 201) 
Test set: 		(21, 201)
Train set: 		(160,) 
Validation set: 	(20,) 
Test set: 		(21,)


In [33]:
train_x

array([[   0,    0,    0, ...,  203,    1,  570],
       [   0,    0,    0, ...,    2,   87,  307],
       [   0,    0,    0, ...,  581,   15,  582],
       ...,
       [   0,    0,    0, ...,    2,  533,  532],
       [   0,    0,    0, ..., 1439, 1440, 1441],
       [   0,    0,    0, ...,    4,    1,  534]], dtype=int32)

In [34]:
train_y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1])

## Training 

In [53]:
from keras import Sequential
from keras.layers import Embedding,Dropout,Dense,LSTM,GlobalMaxPool1D

embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(input_dim=1681, output_dim=64))
model.add(LSTM(32, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(train_x,train_y, epochs=3, batch_size=32)

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, None, 64)          107584    
_________________________________________________________________
lstm_13 (LSTM)               (None, None, 32)          12416     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 32)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 17        
Total params: 120,545
Trainable params: 120,545
Non-trainable params: 0
_______________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f7e5c5b35b0>

## Compare with IMDB

In [54]:
from keras.datasets import imdb
np.random.seed(22)

In [55]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [56]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 500, 32)           160000    
_________________________________________________________________
lstm_14 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7e148db400>

In [57]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.32%
