In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Recurrent Neural Network for classifying movie reviews from IMDb dataset
@Author: Sameer Kesava
          * 50000 reviews
          * 1 and 0 labels (sentiment for positive and negative respectively)
          * Train-Test split: 4 to 1
          * 1 LSTM layer, 1 Dense Layer
          * Dropout: 0.5 (Keras LSTM `dropout` argument, which is applied to the layer inputs)
          * Binary Cross Entropy Loss function    
          * Adam Optimizer(learning rate: .001)
          * Batch size: 100
          * Epochs: 10
          * Embed size = 200, LSTM size = 256, Stateful = True - Train data accuracy: 99%; Test data accuracy: 87% => Overfitting
          * Embed size = 200, LSTM size = 256, Stateful = False - Train data accuracy: 99%; Test data accuracy: 87% => Overfitting
          * Embed size = 200, LSTM size = 64, Stateful = False - Train data accuracy: 99%; Test data accuracy: 88% => Still overfitting
          * Requires regularization to reduce overfitting
          * Embed size = 50, LSTM size = 64; L2 Regularization: 0.01, Stateful = False - Train data accuracy: 98%; Test data accuracy: 86% => Still overfitting


        * Dataset from "Learning Word Vectors for Sentiment Analysis", Maas et al, Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pages 142-150, Portland, Oregon, USA, Association for Computational Linguistics, June 2011

#### Importing dataset

In [2]:
#from google.colab import files
#data_file = files.upload()

In [3]:
import pandas as pd

In [4]:
review_data = pd.read_csv('movie_data.csv')

In [5]:
review_data.columns

Index(['review', 'sentiment'], dtype='object')

In [6]:
review_data.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [7]:
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 781.3+ KB


### Counting the number of words including punctuation

In [8]:
from collections import Counter

In [9]:
counter = Counter()

In [10]:
from string import punctuation

In [11]:
# including punctuation in the text as features
test_str = ''.join([c if c not in punctuation else ' '+c+' ' for c in review_data['review'][0]])
test_str

"I went and saw this movie last night after being coaxed to by a few friends of mine .  I ' ll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy .  I was wrong .  Kutcher played the character of Jake Fischer very well ,  and Kevin Costner played Ben Randall with such professionalism .  The sign of a good movie is that it can toy with our emotions .  This one did exactly that .  The entire theater  ( which was sold out )  was overcome by laughter during the first half of the movie ,  and were moved to tears during the second half .  While exiting the theater I not only saw many women in tears ,  but many full grown men as well ,  trying desperately not to let anyone see them crying .  This movie was great ,  and I suggest that you go see it before you judge . "

In [12]:
# removing punctuation in the text as features
test_str_2 = ''.join([c if c not in punctuation else '' for c in review_data['review'][0]]).lower()
test_str_2

'i went and saw this movie last night after being coaxed to by a few friends of mine ill admit that i was reluctant to see it because from what i knew of ashton kutcher he was only able to do comedy i was wrong kutcher played the character of jake fischer very well and kevin costner played ben randall with such professionalism the sign of a good movie is that it can toy with our emotions this one did exactly that the entire theater which was sold out was overcome by laughter during the first half of the movie and were moved to tears during the second half while exiting the theater i not only saw many women in tears but many full grown men as well trying desperately not to let anyone see them crying this movie was great and i suggest that you go see it before you judge'

In [13]:
test_str_2.split()[:10]

['i', 'went', 'and', 'saw', 'this', 'movie', 'last', 'night', 'after', 'being']

##### Including punctuation in the sequence

In [14]:
def _space_out_punctuation(text):
    """Surround every punctuation character with spaces and lowercase the text."""
    pieces = []
    for ch in text:
        if ch in punctuation:
            pieces.append(' ' + ch + ' ')
        else:
            pieces.append(ch)
    return ''.join(pieces).lower()

# Each punctuation mark becomes its own whitespace-separated token
review_data['review'] = review_data['review'].map(_space_out_punctuation)

In [15]:
review_data.loc[0, 'review']

"i went and saw this movie last night after being coaxed to by a few friends of mine .  i ' ll admit that i was reluctant to see it because from what i knew of ashton kutcher he was only able to do comedy .  i was wrong .  kutcher played the character of jake fischer very well ,  and kevin costner played ben randall with such professionalism .  the sign of a good movie is that it can toy with our emotions .  this one did exactly that .  the entire theater  ( which was sold out )  was overcome by laughter during the first half of the movie ,  and were moved to tears during the second half .  while exiting the theater i not only saw many women in tears ,  but many full grown men as well ,  trying desperately not to let anyone see them crying .  this movie was great ,  and i suggest that you go see it before you judge . "

##### Updating the counter with each word and its number of occurrences

In [16]:
# Tally token frequencies over all reviews.
# The counter is cleared first so that re-running this cell does not
# double-count, and a plain loop is used because the work is a side effect
# (Series.map would build a throwaway Series of None values).
counter.clear()
for review_text in review_data['review']:
    counter.update(review_text.split())

In [17]:
# Number of distinct tokens (words and punctuation marks) held in the counter
len(counter)

102966

In [18]:
# Tokens ordered from most frequent to least frequent
word_counts = sorted(counter.keys(), key = lambda token: counter[token], reverse = True)
print(word_counts[:5])

['the', '.', ',', 'and', 'a']


In [19]:
print(word_counts[-5:])

['hoodies', 'mwuhahahaa', '\x91autumn', 'bellwood', 'whelk']


### Converting unique words into integers

In [20]:
n_unique_words = len(word_counts)
n_unique_words

102966

In [21]:
# Integer ids follow frequency order and start at 1; id 0 is not assigned to any word
word_to_int = dict(zip(word_counts, range(1, len(word_counts) + 1)))

In [22]:
word_to_int['the']

1

###### Testing

In [23]:
def word_int(text, vocab=None):
    """Encode a whitespace-separated string as a list of integer word ids.

    Parameters
    ----------
    text : str
        Text to encode; it is split on whitespace.
    vocab : dict, optional
        Mapping from word to integer id. When omitted, the notebook-level
        ``word_to_int`` dictionary is used.

    Returns
    -------
    list of int
        One id per word, in the order the words appear in ``text``.

    Raises
    ------
    KeyError
        If a word is missing from the vocabulary.
    """
    if vocab is None:
        # Fall back to the vocabulary built earlier in the notebook
        vocab = word_to_int
    return [vocab[word] for word in text.split()]

In [24]:
word_int('sentiment analysis')

[6133, 5092]

###### End Testing

In [25]:
review_data['review_int'] = review_data['review'].map(lambda text: [word_to_int[word] for word in text.split()])

In [26]:
review_data.head(5)

Unnamed: 0,review,sentiment,review_int
0,i went and saw this movie last night after bei...,1,"[16, 433, 4, 227, 17, 24, 254, 328, 114, 125, ..."
1,actor turned director bill paxton follows up h...,1,"[297, 672, 169, 972, 5826, 1119, 68, 37, 2424,..."
2,as a recreational golfer with some knowledge o...,1,"[23, 5, 25748, 28252, 26, 61, 1859, 6, 1, 4005..."
3,"i saw this film in a sneak preview , and it i...",1,"[16, 227, 17, 28, 15, 5, 5174, 4070, 3, 4, 14,..."
4,bill paxton has taken the true story of the 19...,1,"[972, 5826, 58, 621, 1, 307, 76, 6, 1, 16605, ..."


#### Calculating word count of each review

In [27]:
review_data['word_count'] = review_data['review_int'].apply(len)

In [28]:
review_data[:5]

Unnamed: 0,review,sentiment,review_int,word_count
0,i went and saw this movie last night after bei...,1,"[16, 433, 4, 227, 17, 24, 254, 328, 114, 125, ...",170
1,actor turned director bill paxton follows up h...,1,"[297, 672, 169, 972, 5826, 1119, 68, 37, 2424,...",421
2,as a recreational golfer with some knowledge o...,1,"[23, 5, 25748, 28252, 26, 61, 1859, 6, 1, 4005...",296
3,"i saw this film in a sneak preview , and it i...",1,"[16, 227, 17, 28, 15, 5, 5174, 4070, 3, 4, 14,...",162
4,bill paxton has taken the true story of the 19...,1,"[972, 5826, 58, 621, 1, 307, 76, 6, 1, 16605, ...",241


In [29]:
review_data[['word_count']].describe()

Unnamed: 0,word_count
count,50000.0
mean,292.14896
std,217.394254
min,8.0
25%,157.0
50%,218.0
75%,356.0
max,3046.0


### Function for setting a constant sequence length
        * Can also use masking layer in tensorflow

In [30]:
import numpy as np

In [31]:
def seq_len(review, ntimesteps, start = False):
    """Cut or left-pad an integer-encoded review to exactly ntimesteps entries.

    Parameters
    ----------
    review : sequence of int
        Integer-encoded tokens of one review (list, tuple or 1-D array).
    ntimesteps : int
        Required output length; must be non-negative.
    start : bool, default False
        Only matters when the review has at least ntimesteps tokens:
        if False the last ntimesteps tokens are kept, otherwise the first
        ntimesteps tokens are kept.

    Returns
    -------
    numpy.ndarray
        Array of length ntimesteps. Reviews shorter than ntimesteps are
        padded with zeros at the front, regardless of ``start``.

    Raises
    ------
    ValueError
        If ntimesteps is negative.
    """
    if ntimesteps < 0:
        raise ValueError("ntimesteps must be non-negative, got {}".format(ntimesteps))

    # Copy into a list so tuples and arrays can be sliced and concatenated
    # with the zero padding exactly like lists.
    tokens = list(review)
    review_len = len(tokens)

    if review_len >= ntimesteps:
        if not start:
            # Slice from an explicit index: tokens[-ntimesteps:] would return
            # the whole review when ntimesteps is 0.
            kept = tokens[review_len - ntimesteps:]
        else:
            kept = tokens[:ntimesteps]
        # An empty list would otherwise be converted to a float array
        return np.array(kept) if kept else np.array([], dtype=int)
    else:
        return np.array([0]*(ntimesteps-review_len) + tokens)
        

In [32]:
seq_len(list(range(2,8)), 10)

array([0, 0, 0, 0, 2, 3, 4, 5, 6, 7])

In [33]:
seq_len(list(range(2,20)), 10)

array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [34]:
seq_len(list(range(2,20)), 10, start = True)

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

#### Applying seq_len function on the dataframe

In [35]:
# Number of time steps, used as the fixed sequence length passed to seq_len
ntimesteps = 200

In [36]:
# Keep the last `ntimesteps` tokens of reviews longer than the sequence length
# and pad shorter reviews with zeros at the front.
# Series.map is applied to the single column directly; DataFrame.applymap on a
# one-column frame is deprecated in recent pandas releases.
review_data['sequenced'] = review_data['review_int'].map(lambda x: seq_len(x, ntimesteps))

In [37]:
review_data[:3]

Unnamed: 0,review,sentiment,review_int,word_count,sequenced
0,i went and saw this movie last night after bei...,1,"[16, 433, 4, 227, 17, 24, 254, 328, 114, 125, ...",170,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,actor turned director bill paxton follows up h...,1,"[297, 672, 169, 972, 5826, 1119, 68, 37, 2424,...",421,"[1138, 7, 1, 3990, 2, 51, 8, 21, 61, 1506, 119..."
2,as a recreational golfer with some knowledge o...,1,"[23, 5, 25748, 28252, 26, 61, 1859, 6, 1, 4005...",296,"[1, 533, 6, 1, 3144, 15072, 1507, 47, 20218, 4..."


In [38]:
review_data['sequenced'].apply(len).describe()

count    50000.0
mean       200.0
std          0.0
min        200.0
25%        200.0
50%        200.0
75%        200.0
max        200.0
Name: sequenced, dtype: float64

### Unique labels

In [39]:
y_unique = review_data['sentiment'].unique()
y_unique

array([1, 0])

### Checking the counts of each label in case up/down-sampling is required

In [40]:
review_data['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

        * Equal counts of labels 1 and 0

### Splitting into train and test dataframes       

In [41]:
train_df = review_data.iloc[:40000]
test_df = review_data.iloc[40000:]

In [42]:
len(train_df)

40000

In [43]:
len(test_df)

10000

In [44]:
train_df['sentiment'].value_counts()

1    25000
0    15000
Name: sentiment, dtype: int64

In [45]:
test_df['sentiment'].value_counts()

0    10000
Name: sentiment, dtype: int64

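##### Shuffling is required: the rows are ordered by label, so the unshuffled split above leaves only negative reviews in the test set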

In [46]:
review_data.head(5)

Unnamed: 0,review,sentiment,review_int,word_count,sequenced
0,i went and saw this movie last night after bei...,1,"[16, 433, 4, 227, 17, 24, 254, 328, 114, 125, ...",170,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,actor turned director bill paxton follows up h...,1,"[297, 672, 169, 972, 5826, 1119, 68, 37, 2424,...",421,"[1138, 7, 1, 3990, 2, 51, 8, 21, 61, 1506, 119..."
2,as a recreational golfer with some knowledge o...,1,"[23, 5, 25748, 28252, 26, 61, 1859, 6, 1, 4005...",296,"[1, 533, 6, 1, 3144, 15072, 1507, 47, 20218, 4..."
3,"i saw this film in a sneak preview , and it i...",1,"[16, 227, 17, 28, 15, 5, 5174, 4070, 3, 4, 14,...",162,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,bill paxton has taken the true story of the 19...,1,"[972, 5826, 58, 621, 1, 307, 76, 6, 1, 16605, ...",241,"[1, 421, 9449, 1059, 4, 2553, 696, 1, 3275, 10..."


In [47]:
random_seed = 123

In [48]:
review_data = review_data.sample(frac = 1, replace = False, random_state=random_seed)

In [49]:
review_data.head()

Unnamed: 0,review,sentiment,review_int,word_count,sequenced
11872,sherlock holmes ( basil rathbone ) begins th...,1,"[6186, 2411, 30, 7986, 7891, 29, 847, 17, 76, ...",393,"[2403, 24571, 2, 20, 39, 3823, 85, 57, 24571, ..."
40828,probably the worst bollywood film i ' ve seen ...,0,"[256, 1, 265, 2880, 28, 16, 8, 153, 124, 2, 12...",179,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36400,a bit quirky and bordering bad taste ; but in...,1,"[5, 241, 2646, 4, 12599, 88, 1288, 123, 27, 11...",174,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5166,lackawanna blues is a drama through and throug...,1,"[18071, 3907, 10, 5, 464, 159, 4, 159, 2, 14, ...",231,"[10, 5249, 7, 23, 7483, 45, 42, 47, 135, 54, 3..."
30273,one more of those brilliant young men who went...,1,"[39, 65, 6, 162, 529, 204, 356, 47, 433, 42, 5...",134,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [50]:
# 4:1 train/test split of the shuffled data. The boundary is derived from the
# dataset size (integer arithmetic) instead of a hard-coded row count.
n_train = len(review_data) * 4 // 5
train_df = review_data.iloc[:n_train]
test_df = review_data.iloc[n_train:]

In [51]:
train_df['sentiment'].value_counts()

0    20039
1    19961
Name: sentiment, dtype: int64

In [52]:
test_df['sentiment'].value_counts()

1    5039
0    4961
Name: sentiment, dtype: int64

In [53]:
train_df[:3]

Unnamed: 0,review,sentiment,review_int,word_count,sequenced
11872,sherlock holmes ( basil rathbone ) begins th...,1,"[6186, 2411, 30, 7986, 7891, 29, 847, 17, 76, ...",393,"[2403, 24571, 2, 20, 39, 3823, 85, 57, 24571, ..."
40828,probably the worst bollywood film i ' ve seen ...,0,"[256, 1, 265, 2880, 28, 16, 8, 153, 124, 2, 12...",179,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36400,a bit quirky and bordering bad taste ; but in...,1,"[5, 241, 2646, 4, 12599, 88, 1288, 123, 27, 11...",174,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [54]:
test_df[:3]

Unnamed: 0,review,sentiment,review_int,word_count,sequenced
1554,"if you listen to ween ( the pod , god / sata...",1,"[57, 31, 1661, 7, 64534, 30, 1, 9630, 3, 519, ...",190,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57, 31, 1661, 7..."
33704,"after seeing jeremy brett as sherlock holmes ,...",1,"[114, 334, 3644, 7538, 23, 6186, 2411, 3, 69, ...",184,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
23967,"the cast of "" all that "" returns for good hu...",0,"[1, 195, 6, 20, 42, 18, 20, 1644, 25, 62, 477,...",210,"[477, 15, 1, 20, 62, 14063, 20, 5, 291, 756, 7..."


### Building the neural network

In [55]:
#!pip install tensorflow==2.0.0-alpha0

In [56]:
# version 2.0.0
import tensorflow as tf

#### Building an LSTM Model in Keras (this run uses stateful = False)

In [82]:
model = tf.keras.Sequential()

In [83]:
batch_size = 100

# Input layer
model.add(tf.keras.layers.Input(shape = (ntimesteps,), batch_size = batch_size, dtype = tf.int32, name = 'input_x'))
print(model)

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7fdfd276c8d0>


In [84]:
# Embedding layer. input_dim = n_unique_words + 1
embed_size = 50
model.add(tf.keras.layers.Embedding(input_dim = n_unique_words+1, output_dim = embed_size, input_length=ntimesteps, \
                                    name = 'embed_x'))
print(model)

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7fdfd276c8d0>


In [85]:
# Checking for embedding
model.compile('rmsprop', 'mse')
test_output = model.predict(train_df.iloc[0]['sequenced'].reshape(1,ntimesteps)) 
# reshape is important. Data needs to be of the format (batch_size, ntimesteps)
assert test_output.shape == (1, ntimesteps, embed_size)

##### LSTM layer with dropout (applied to the layer inputs) and stateful = False

In [86]:
dropout = 0.5
model.add(tf.keras.layers.LSTM(units = 64, batch_input_shape = (batch_size, ntimesteps, embed_size),
                               dropout =  dropout, stateful = False, name = 'lstm_64', 
                              kernel_regularizer=tf.keras.regularizers.l2(l=0.01))) 
# return_sequences = True if adding another LSTM layer

##### Output layer with predicted probability

In [87]:
model.add(tf.keras.layers.Dense(units = 1, activation = tf.nn.sigmoid, name = 'predicted'))

##### Compiling

In [88]:
model.compile(optimizer = tf.optimizers.Adam(learning_rate=0.001), loss = tf.keras.losses.binary_crossentropy, metrics = ['accuracy'])

##### Testing the output of the layer 

In [89]:
test = [x for x in train_df.iloc[0:batch_size]['sequenced'].values]

In [90]:
model.predict(np.array(test))[:10]

array([[0.49884742],
       [0.50319916],
       [0.5022807 ],
       [0.50312054],
       [0.49897352],
       [0.50171447],
       [0.498179  ],
       [0.50145274],
       [0.49652454],
       [0.5027386 ]], dtype=float32)

##### Fitting

In [66]:
train_X = np.array([x for x in train_df['sequenced'].values])
train_X.shape

(40000, 200)

In [67]:
train_y = train_df['sentiment'].values
train_y.shape

(40000,)

In [91]:
model.fit(x = train_X, y = train_y, batch_size = batch_size, epochs = 10, verbose = 1, shuffle = False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdfc9a63860>

### Testing

In [69]:
test_X = np.array([x for x in test_df['sequenced'].values])
test_X.shape

(10000, 200)

In [70]:
test_y = test_df['sentiment'].values
test_y.shape

(10000,)

In [92]:
test_loss, test_accuracy = model.evaluate(test_X, test_y, batch_size=batch_size, verbose = 1, workers=1)



##### Manual Testing

In [93]:
pred_prob_y = model.predict(x = test_X)

In [94]:
pred_prob_y[:5]

array([[0.99985313],
       [0.26783568],
       [0.997473  ],
       [0.99478924],
       [0.99971807]], dtype=float32)

In [95]:
pred_prob_y = np.squeeze(pred_prob_y)
pred_prob_y[:5]

array([0.99985313, 0.26783568, 0.997473  , 0.99478924, 0.99971807],
      dtype=float32)

In [96]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels
is_positive = pred_prob_y > 0.5
pred_y = np.where(is_positive, 1, 0)
pred_y[:5]

array([1, 0, 1, 1, 1])

In [97]:
# Manual accuracy in percent: share of test reviews whose thresholded
# prediction equals the label. This rebinds `test_accuracy`, which was
# previously assigned from model.evaluate.
n_correct = np.sum(pred_y == test_y)
test_accuracy = n_correct / len(test_y) * 100
test_accuracy

85.66