In [1]:
import tensorflow as tf

In [2]:
import tensorflow.keras as keras
import numpy as np
import pandas as pd

In [3]:
reviews=pd.read_csv("imdb_reviews.csv")

In [4]:
test=pd.read_csv("test_reviews.csv")

In [5]:
reviews.head()

Unnamed: 0,Reviews,Sentiment
0,<START this film was just brilliant casting lo...,positive
1,<START big hair big boobs bad music and a gian...,negative
2,<START this has to be one of the worst films o...,negative
3,<START the <UNK> <UNK> at storytelling the tra...,positive
4,<START worst mistake of my life br br i picked...,negative


In [6]:
reviews.shape

(25000, 2)

In [7]:
test.head()

Unnamed: 0,Reviews,Sentiment
0,<START please give this one a miss br br <UNK>...,negative
1,<START this film requires a lot of patience be...,positive
2,<START many animation buffs consider <UNK> <UN...,positive
3,<START i generally love this type of movie how...,negative
4,<START like some other people wrote i'm a die ...,positive


In [8]:
test.shape

(25000, 2)

In [9]:
words=pd.read_csv("word_indexes.csv")

In [10]:
words.head()

Unnamed: 0,Words,Indexes
0,tsukino,52009
1,nunnery,52010
2,sonja,16819
3,vani,63954
4,woods,1411


Mapping words to numbers.

In [11]:
words.shape

(88584, 2)

# Pre-Processing 

1. Converting word_indexes dataframe to python dictionary- Words as keys, indices as values.

In [12]:
words=dict(zip(words.Words,words.Indexes))

In [13]:
words

{'tsukino': 52009,
 'nunnery': 52010,
 'sonja': 16819,
 'vani': 63954,
 'woods': 1411,
 'spiders': 16118,
 'hanging': 2348,
 'woody': 2292,
 'trawling': 52011,
 "hold's": 52012,
 'comically': 11310,
 'localized': 40833,
 'disobeying': 30571,
 "'royale": 52013,
 "harpo's": 40834,
 'canet': 52014,
 'aileen': 19316,
 'acurately': 52015,
 "diplomat's": 52016,
 'rickman': 25245,
 'arranged': 6749,
 'rumbustious': 52017,
 'familiarness': 52018,
 "spider'": 52019,
 'hahahah': 68807,
 "wood'": 52020,
 'transvestism': 40836,
 "hangin'": 34705,
 'bringing': 2341,
 'seamier': 40837,
 'wooded': 34706,
 'bravora': 52021,
 'grueling': 16820,
 'wooden': 1639,
 'wednesday': 16821,
 "'prix": 52022,
 'altagracia': 34707,
 'circuitry': 52023,
 'crotch': 11588,
 'busybody': 57769,
 "tart'n'tangy": 52024,
 'burgade': 14132,
 'thrace': 52026,
 "tom's": 11041,
 'snuggles': 52028,
 'francesco': 29117,
 'complainers': 52030,
 'templarios': 52128,
 '272': 40838,
 '273': 52031,
 'zaniacs': 52133,
 '275': 34709,


2. Integer encoding for unknown values.

In [14]:
words["<PAD>"]=0
words["<START"]=1
words["<UNK>"]=2
words["<UNUSED>"]=3

3. Encoding reviews into integers according to mapping specified

In [15]:
def review_encoder(text):
    arr=[words[word] for word in text]
    return arr

# Splitting The Dataset

In [16]:
train_data,train_labels=reviews['Reviews'],reviews['Sentiment']
test_data, test_labels=test['Reviews'],test['Sentiment']

# Tokenization

In [17]:
train_data=train_data.apply(lambda review:review.split())
test_data=test_data.apply(lambda review:review.split())

In [18]:
train_data[0]

['<START',
 'this',
 'film',
 'was',
 'just',
 'brilliant',
 'casting',
 'location',
 'scenery',
 'story',
 'direction',
 "everyone's",
 'really',
 'suited',
 'the',
 'part',
 'they',
 'played',
 'and',
 'you',
 'could',
 'just',
 'imagine',
 'being',
 'there',
 'robert',
 '<UNK>',
 'is',
 'an',
 'amazing',
 'actor',
 'and',
 'now',
 'the',
 'same',
 'being',
 'director',
 '<UNK>',
 'father',
 'came',
 'from',
 'the',
 'same',
 'scottish',
 'island',
 'as',
 'myself',
 'so',
 'i',
 'loved',
 'the',
 'fact',
 'there',
 'was',
 'a',
 'real',
 'connection',
 'with',
 'this',
 'film',
 'the',
 'witty',
 'remarks',
 'throughout',
 'the',
 'film',
 'were',
 'great',
 'it',
 'was',
 'just',
 'brilliant',
 'so',
 'much',
 'that',
 'i',
 'bought',
 'the',
 'film',
 'as',
 'soon',
 'as',
 'it',
 'was',
 'released',
 'for',
 '<UNK>',
 'and',
 'would',
 'recommend',
 'it',
 'to',
 'everyone',
 'to',
 'watch',
 'and',
 'the',
 'fly',
 'fishing',
 'was',
 'amazing',
 'really',
 'cried',
 'at',
 'the

Now that we have tokenized the words based on white spaces, we can easily map these words into numbers. It was important as we had mapping for distinct words and not strings individually.

# Applying Review Encoder

In [19]:
train_data=train_data.apply(review_encoder)
test_data=test_data.apply(review_encoder)

In [20]:
train_data.head(n=10)

0    [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...
1    [1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...
2    [1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...
3    [1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153,...
4    [1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...
5    [1, 778, 128, 74, 12, 630, 163, 15, 4, 1766, 7...
6    [1, 6740, 365, 1234, 5, 1156, 354, 11, 14, 532...
7    [1, 4, 2, 716, 4, 65, 7, 4, 689, 4367, 6308, 2...
8    [1, 43, 188, 46, 5, 566, 264, 51, 6, 530, 664,...
9    [1, 14, 20, 47, 111, 439, 3445, 19, 12, 15, 16...
Name: Reviews, dtype: object

# Encoding Sentiments

In [21]:
def encode_sentiments(x):
    if x=='positive':
        return 1
    else:
        return 0

train_labels=train_labels.apply(encode_sentiments)
test_labels=test_labels.apply(encode_sentiments)

In [22]:
train_labels.head(n=10)

0    1
1    0
2    0
3    1
4    0
5    0
6    1
7    0
8    1
9    0
Name: Sentiment, dtype: int64

In [23]:
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

Since, different reviews are of different lengths, it is important for us to remove ambiguity and pre process them. Thus, all reviews must be equal in lengths.

In [30]:
train_data

0        [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...
1        [1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...
2        [1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...
3        [1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153,...
4        [1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...
                               ...                        
24995    [1, 14, 9, 6, 2758, 20, 21, 1517, 7, 2078, 5, ...
24996    [1, 4679, 2784, 299, 6, 1042, 37, 80, 81, 233,...
24997    [1, 11, 6, 230, 245, 6401, 9, 6, 1225, 446, 2,...
24998    [1, 1446, 7079, 69, 72, 3305, 13, 610, 930, 8,...
24999    [1, 17, 6, 194, 337, 7, 4, 204, 22, 45, 254, 8...
Name: Reviews, Length: 25000, dtype: object

In [61]:
for i in train_data:
    x=len(i)
    print(x)

218
189
141
550
147
43
123
562
233
130
450
99
117
238
109
129
163
752
212
177
129
140
256
888
93
142
220
193
171
221
174
647
233
162
597
234
51
336
139
231
704
142
861
132
122
570
55
214
103
186
113
169
469
138
302
766
351
146
59
206
107
152
186
431
147
684
383
324
252
263
787
211
314
118
390
132
710
306
167
115
95
158
156
82
502
314
190
174
60
145
214
659
408
515
461
202
238
170
107
171
158
145
790
258
287
67
123
975
775
236
195
274
214
91
1038
815
183
206
50
118
147
141
60
56
439
439
213
144
533
303
203
563
129
153
55
92
174
187
183
165
78
198
156
223
127
61
362
84
57
176
159
57
159
165
213
194
149
130
203
19
98
466
525
130
322
153
408
215
472
143
136
354
260
319
125
209
282
810
142
240
148
198
193
123
128
103
479
345
263
165
205
333
184
92
177
335
120
121
259
180
160
114
59
343
513
133
206
152
206
572
153
139
151
129
129
196
433
199
140
311
151
200
584
127
513
781
932
526
161
646
135
52
267
174
185
219
81
219
131
153
270
644
155
546
284
85
293
155
358
45
231
124
178
118
260
393
127


Let's take length of each review= 550. This means that if length of any of the reviews>550, rest of the words will be omitted. And, if length of any of the reviews<550, we will pad it with zero to extend its length to 550.

In [64]:
train_data=keras.preprocessing.sequence.pad_sequences(train_data,value=words["<PAD>"],padding='post',maxlen=550)
test_data=keras.preprocessing.sequence.pad_sequences(test_data,value=words["<PAD>"],padding='post',maxlen=550)

#Padding=post signifies, padding is done after the last word of the review.

# Model

1. Layer1- Word Embeddings
2. Layer2- Global Average Pooling Layer: to reduce number of params. Thus, reducing overfitting.
It takes average of all the values at the same index position in the vector and stores it in output. Hence, number of params are reduced.
3. Layer3- Activation function is ReLu. Thus, transforms the combined units.
4. Layer4- Final layer is the output layer that uses sigmoid function. Hence, giving us values between 0 and 1.

In [66]:
model=keras.Sequential([keras.layers.Embedding(10000,16,embeddings_initializer="uniform",input_length=550),
                        keras.layers.GlobalAveragePooling1D(),
                        keras.layers.Dense(16,activation='relu'),
                        keras.layers.Dense(1,activation='sigmoid')])

Embedding layer:

1. Each word will be converted to a vector of 16 values.
2. 10000 is the vocabulary size.
3. Weights of hidden layers set to uniform.
4. Constant input length=550.

Dense Layer:
1. 16 activation units
2. Activation function is ReLu, that is, to get the element wise maximum- max(x,0).

Output Layer:
1. Uses sigmoid function 
2. Returns a value between 0 and 1.
3. 1 is the input tensor.

Other layers in CNN:
sSoftmax function- Probability distribution of output vector values. They range between 0 to 1 and sum to 1.