In [1]:
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

In [2]:
import pandas as pd

# Load IMDB Dataset

In [3]:
# Load the IMDB reviews, keeping only the 10000 most frequent word ids.
# valid_portion=0.1 holds out 10% of the data as a "validation set".
# NOTE(review): the sizes displayed later (22500 train rows, 2500 `test` rows)
# indicate that `test` here is that 10% hold-out; the third split is discarded.
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)

In [4]:
# Each split is an (inputs, labels) pair; unpack both splits in one statement.
(trainX, trainY), (testX, testY) = train, test

### All about trainX

In [5]:
# Last five training reviews; each entry is a sequence of integer word ids.
pd.Series(trainX).tail(5)

22495    [2544, 547, 44, 581, 3692, 2383, 6, 2609, 133,...
22496    [44, 167, 30, 155, 254, 563, 6, 240, 55, 2, 17...
22497    [119, 40, 40, 87, 1028, 87, 40, 40, 119, 12, 1...
22498    [119, 119, 704, 1028, 119, 119, 12, 13, 9, 11,...
22499    [1017, 360, 84, 4926, 774, 2739, 1017, 1, 1527...
dtype: object

In [6]:
# Print one complete review (index 5555) as a plain list of word ids.
print(list(trainX[5555]))

[17, 27, 10, 6, 303, 5, 1357, 27, 4, 14, 10, 67, 4493, 8, 163, 1, 2, 8769, 1394, 3, 99, 14, 10, 1286, 1252, 303, 4, 14, 10, 45, 333, 27, 5, 16, 393, 14, 4, 12, 13, 9, 11, 12, 13, 9, 11, 14, 10, 67, 7336, 3, 1139, 3, 5, 1, 4, 50, 35, 61, 46, 117, 14, 10, 33, 7336, 3, 26, 14, 10, 4, 12, 13, 9, 11, 12, 13, 9, 11, 14, 10, 55, 2, 1, 7, 2, 1394, 7, 1818, 2958, 7, 1548, 273, 8213, 3, 5, 55, 2, 8936, 3, 2175, 1194, 3, 46, 4689, 14, 4, 14, 56, 156, 693, 15, 36, 136, 22, 2, 3550, 18, 34, 80, 65, 34, 21, 6, 414, 448, 3, 36, 1, 2449, 1194, 3, 2449, 3, 36, 332, 3, 745, 2564, 3, 5, 48, 31, 4, 26, 105, 3487, 14, 2639, 2, 1, 7, 2, 1394, 4]


In [7]:
# Number of training reviews (one Series entry per review).
pd.Series(trainX).shape

(22500,)

### All about trainY

In [8]:
# Last five training labels (integer class ids).
pd.Series(trainY).tail(5)

22495    1
22496    1
22497    1
22498    1
22499    1
dtype: int64

In [9]:
# Number of training labels; should equal the number of training reviews.
pd.Series(trainY).shape

(22500,)

In [10]:
# Class balance: how many training reviews carry each label.
pd.Series(trainY).value_counts()

0    11271
1    11229
dtype: int64

In [11]:
# The distinct label values present in the training set.
pd.Series(trainY).value_counts().index.tolist()

[0, 1]

In [12]:
# Number of distinct classes in the training labels.
pd.Series(trainY).nunique()

2

# Data Preprocessing

### Sequence Padding

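Pad each sequence to the same length: the length of the longest sequence.
If maxlen is provided, any sequence longer than maxlen is truncated to
maxlen. Truncation happens off either the beginning (default) or the
end of the sequence. Supports post-padding and pre-padding (default).

Here `maxlen=100` and `value=0.0`, so every review becomes exactly 100 word
ids: longer reviews are truncated and shorter ones are filled with 0.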

In [13]:
# Force every review in both splits to exactly 100 word ids, filling with 0.
# The same maxlen/value must be used for both splits, so apply them in one place.
trainX, testX = (
    pad_sequences(sequences, maxlen=100, value=0.0)
    for sequences in (trainX, testX)
)

In [14]:
# After padding, trainX is a 2-D array: (number of reviews, maxlen).
trainX.shape

(22500, 100)

In [15]:
# Last five padded training reviews, one column per sequence position.
pd.DataFrame(trainX).tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
22495,49,6,179,7,534,3,6,303,1708,4,...,30,4,12,13,9,11,12,13,9,11
22496,255,1247,6,64,2494,319,167,37,2,589,...,1,99,167,64,63,1,6,779,27,4
22497,60,40,184,59,3443,288,15,4,885,14,...,17,25,5307,6,1,47,2,7930,84,4
22498,19,5953,19,60,974,4,26,57,14,1512,...,12,13,9,11,1479,87,6,609,87,903
22499,741,204,58,7,2,1072,5,309,8896,1,...,6,1,12,13,9,11,12,13,9,11


In [16]:
# Last five padded reviews of the held-out split, one column per position.
pd.DataFrame(testX).tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2495,5,5548,9090,4,15,721,7,64,1119,693,...,9,11,12,13,9,11,12,13,9,11
2496,9080,8,852,19,329,19,102,88,34,742,...,8,401,59,10,187,31,15,2,9295,4
2497,26,2,27,145,640,31,45,6888,895,4,...,813,3,61,7,2,6853,93,852,1,4
2498,13,9,11,895,84,119,64,1,85,6681,...,10,6,1,23,1352,4239,6471,7107,1,4
2499,8,69,958,1583,4,14,10,6,230,445,...,1284,305,164,4,104,32,725,17,39,4


### Convert Labels to Vectors
Convert the integer class labels (0 or 1) into one-hot vectors of length 2, with one column per class.

In [17]:
# One-hot encode the labels of both splits into two columns (one per class).
# NOTE(review): this overwrites trainY/testY with their encoded form, so the
# cell is presumably meant to run once per kernel session.
trainY, testY = (
    to_categorical(labels, nb_classes=2)
    for labels in (trainY, testY)
)

In [18]:
# Inspect the encoded labels: one row per review, one column per class.
trainY

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]])

In [19]:
# Last five encoded training labels shown as a two-column table.
pd.DataFrame(trainY).tail(5)

Unnamed: 0,0,1
22495,0.0,1.0
22496,0.0,1.0
22497,0.0,1.0
22498,0.0,1.0
22499,0.0,1.0


# Network Building

In [20]:
# Sentiment classifier: word ids -> embeddings -> LSTM -> 2-way softmax.

# Input shape is [batch size, sequence length]. The batch size is left open
# (None); the sequence length is 100 because the reviews were padded with
# maxlen=100.
net = tflearn.input_data(shape=[None, 100])

# input_dim is the vocabulary size (number of ids); it matches n_words=10000
# used when loading the data. Each id is mapped to a 128-dimensional vector.
net = tflearn.embedding(net, input_dim=10000, output_dim=128)

# Long Short Term Memory recurrent layer with 128 units.
net = tflearn.lstm(net, n_units=128, dropout=0.8)

# Two output units (one per class) with softmax, matching the one-hot labels.
net = tflearn.fully_connected(net, n_units=2, activation='softmax')

# Training op: Adam optimizer on categorical cross-entropy.
net = tflearn.regression(net,
                         loss='categorical_crossentropy',
                         optimizer='adam',
                         learning_rate=1e-4)

# Training

In [21]:
# Wrap the network in a trainer and fit it on the padded reviews.
# n_epoch=10 spells out the value the recorded run used (its log ends at
# epoch 010); the held-out split is evaluated as the validation set.
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX,
          trainY,
          n_epoch=10,
          validation_set=(testX, testY),
          show_metric=True,
          batch_size=32)

Training Step: 7040  | total loss: 0.10768
| Adam | epoch: 010 | loss: 0.10768 - acc: 0.9734 | val_loss: 0.55273 - val_acc: 0.8196 -- iter: 22500/22500
Training Step: 7040  | total loss: 0.10768
| Adam | epoch: 010 | loss: 0.10768 - acc: 0.9734 | val_loss: 0.55273 - val_acc: 0.8196 -- iter: 22500/22500
--
