In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
seed = 123
np.random.seed(seed)

### Import IMDB dataset 

In [3]:
from keras.datasets import imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=10000,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Combining independent and dependent variables to form 'X' and 'y' lists repectively

In [4]:
X = np.concatenate((x_train, x_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

'X' represents sequence of word indexes for each movie review

'y' represents review category i.e. positive or negative where '1' indicates 'positive review' and '0' indicates 'negative review'

In [5]:
# Number of instances in X and y
print('X:{}'.format(len(X)))
print('y:{}'.format(len(y)))

X:50000
y:50000


In [6]:
# Number of unique words in all the 50,000 moview reviews
len(np.unique(X))

49579

#### Function to obtain a list with word counts of all the reviews

In [7]:
def elements(X):
    num = []
    i = 0
    for i in X[i]:
        num.append(len(X[i]))
    return num

In [8]:
# List 'word_count' is created by calling function 'elements'
word_count = elements(X)

In [9]:
# Converting list to numpy array
word_count = np.array(word_count)

In [10]:
# Getting average number of words per review
np.mean(word_count)

229.27522935779817

In [11]:
# Getting the maximum number of words for a review
np.max(word_count)

1011

In [12]:
# Getting the manimum number of words for a review
np.min(word_count)

43

In [13]:
# Getting the median value of words for all the reviews
np.median(word_count)

163.0

#### Number of positive and negative reviews

In [14]:
unique, counts = np.unique(y, return_counts=True)
print(np.asarray((unique, counts)).T)

[[    0 25000]
 [    1 25000]]


#### Importing sequence class to pad sequences of words to a maximum limit

In [15]:
from keras.preprocessing import sequence
max_words = 1000
X = sequence.pad_sequences(X, maxlen=max_words)   
# 'sequence.pad_sequences' pads sequences of words in all reviews to the same length which is set at 10000 words per review 

### Train (75%) - Test (25%) Split 

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

### Building Neural Network model

In [17]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [18]:
most_freq = 10000 # to get top 10,000 most frequently words used in all the reviews

#### Create model

In [19]:
model = Sequential()
model.add(Embedding(most_freq, 50, input_length=max_words))
model.add(Flatten())
model.add(Dense(200, kernel_initializer = 'normal', activation='relu'))
model.add(Dense(300, kernel_initializer = 'normal', activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#### Compile model

In [20]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 50)          500000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               10000200  
_________________________________________________________________
dense_2 (Dense)              (None, 300)               60300     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 301       
Total params: 10,560,801
Trainable params: 10,560,801
Non-trainable params: 0
_________________________________________________________________
None


#### Fit the model

In [21]:
model.fit(X_train, y_train, epochs=2, batch_size=500, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1f2f573b6d8>

### Evaluation

#### Accuracy

In [22]:
scores = model.evaluate(X_test, y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 88.90%


#### Confusion matrix

In [26]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5 )
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[5310  943]
 [ 445 5802]]


#### Classification report

In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.92      0.85      0.88      6253
          1       0.86      0.93      0.89      6247

avg / total       0.89      0.89      0.89     12500

