# Data Preprocessing

In [1]:
from keras.datasets import imdb

Using TensorFlow backend.


In [2]:
# Fetch the IMDB vocabulary: a dict that maps each word to its integer index
idx = imdb.get_word_index()

In [3]:
# Order the vocabulary by ascending index, which puts the most used words first
idx_arr = sorted(idx.keys(), key=lambda word: idx[word])

In [4]:
# The 10 most used words (the ones with the lowest indexes)
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

Mapping index to word (`idx` maps word → index, so we invert it)

In [5]:
# Invert the word -> index dict so that an index can be looked up back to its word
idx2word = {index: word for word, index in idx.items()}

In [6]:
from keras.utils import get_file
import pickle

In [7]:
# The file is downloaded once; a previously downloaded copy is reused.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted origin.
# The context manager closes the file handle once the data has been read.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [8]:
# Number of training reviews
len(x_train)

25000

Here's the first review. The words have been replaced by their indexes; you can look each one up with `idx2word`.

In [9]:
# First training review, encoded as a list of word indexes
print(x_train[0])

[23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215]


Let's look up the word behind the first index

In [10]:
# 23022 is the first index of the first review printed above
idx2word[23022]

'bromwell'

Here's the whole 1st review mapped from idx to word

In [11]:
# Decode the first review by translating every word index back to its word
print([idx2word[word_id] for word_id in x_train[0]])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', 'i', 'immediately', 'recalled', 'at', 'high', 'a', 'classic', 'line', 'inspector', "i'm", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', 'student', 'welcome', 'to', 'brom

Let's have a look at the first ten labels

In [12]:
# Labels of the first ten training reviews
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

1 represents positive feedback and 0 represents negative feedback

* For simplicity, all rare words (index ≥ `vocab_size - 1`) are collapsed into a single index

In [13]:
import numpy as np
vocab_size = 5000

# Cap every word index at vocab_size-1, so all rarer words share that one "rare word" id
trn = [np.array([min(word_id, vocab_size - 1) for word_id in review]) for review in x_train]
test = [np.array([min(word_id, vocab_size - 1) for word_id in review]) for review in x_test]

The embedding layer needs a rectangular matrix as input.
There are 25,000 reviews; let's make each review exactly 500 words long.

In [14]:
from keras.preprocessing import sequence
# Fixed length of every review after padding
pad_seq = 500
# Bring every review to exactly pad_seq entries, using 0 as the filler value
trn = sequence.pad_sequences(trn, maxlen=pad_seq, value=0)
test = sequence.pad_sequences(test, maxlen=pad_seq, value=0)

In [15]:
# Sanity check: one row per review, pad_seq columns
trn.shape

(25000, 500)

In [16]:
# Length of the embedding vector learned for each word
n_factors = 32

## First let's try a simple Neural Network and find out the performance
* with an embedding vector size of 32

In [25]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Dropout

In [None]:
# Baseline network: embedding -> flatten -> one hidden dense layer -> sigmoid output
model = Sequential()
model.add(Embedding(vocab_size, n_factors, input_length=pad_seq))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary classification: cross-entropy loss, Adam optimizer, accuracy as the reported metric
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='Adam')

In [None]:
# Train for 2 epochs in batches of 64, using the test split as validation data
model.fit(trn, labels_train, 
          epochs=2,
          batch_size=64,
          validation_data=(test, labels_test))

### OMG, this is a great result for the first attempt
   

## CNN 
You heard that right! Let's use a 1D CNN model (even though we already have a great result).

In [None]:
from keras.layers import Convolution1D, MaxPooling1D

# 1D CNN over the embedded review: embedding -> conv -> max-pool -> dense -> sigmoid.
# The rest of the notebook uses the Keras 2 API (fit(epochs=...)), so Keras 2 argument
# names are used here as well:
#   * `border_mode` is spelled `padding` in Keras 2
#   * Embedding no longer takes a `dropout` argument; the Dropout layer that directly
#     follows the embedding provides the dropout on the embedded input.
cnn = Sequential([
    Embedding(vocab_size, n_factors, input_length=pad_seq),
    Dropout(0.2),  # dropout on the embedded input
    Convolution1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])

In [23]:
from keras.optimizers import Adam

In [None]:
# Same loss and metric as the baseline, with an explicitly constructed Adam optimizer
cnn.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 2 epochs in batches of 64, using the test split as validation data
cnn.fit(trn, labels_train, 
       epochs=2,
       batch_size=64,
       validation_data=(test, labels_test))

In [None]:
# Call fit again on the same model for 4 more epochs
cnn.fit(trn, labels_train, 
       epochs=4,
       batch_size=64,
       validation_data=(test, labels_test))

## That's very good accuracy
That's well past the Stanford paper's accuracy - another win for CNNs!

In [None]:
# Save the CNN weights to cnn.h5 in the current working directory
cnn.save_weights('cnn.h5')

# Pre trained vectors
* The GloVe pre-trained vectors can be downloaded from http://files.fast.ai/models/glove/6B.50d.tgz.
* Untar the file and place the extracted `6B.50d` folder in the current working directory.
> Note: I have already downloaded and extracted it.

In [17]:
import bcolz
# Read the whole GloVe vector array into memory
vecs = bcolz.open('6B.50d/6B.50d.dat')[:]
# The pickle files were written by Python 2; encoding='latin1' lets Python 3 read them.
# Context managers make sure both file handles are closed after loading.
with open('6B.50d/6B.50d_words.pkl', 'rb') as words_file:
    words = pickle.load(words_file, encoding='latin1')
with open('6B.50d/6B.50d_idx.pkl', 'rb') as idx_file:
    wordidx = pickle.load(idx_file, encoding='latin1')

* I will use the pre-trained vectors in the embedding layer and make that layer non-trainable.
* The GloVe word ids and the IMDB word ids use different indexes, so we need a simple function that builds an embedding matrix using the indexes from IMDB and the embeddings from GloVe (where they exist). Courtesy: fast.ai

In [18]:
import re #regular expressions
from numpy.random import normal

def create_emb():
    """Build the embedding matrix for the IMDB vocabulary from the GloVe vectors.

    Reads the notebook-level names ``vecs`` (GloVe vectors), ``wordidx``
    (GloVe word -> row in ``vecs``), ``idx2word`` (IMDB index -> word) and
    ``vocab_size``.

    Returns:
        A ``(vocab_size, vecs.shape[1])`` array. Row 0 stays all zeros. Row ``i``
        holds the GloVe vector of IMDB word ``i`` when that word is made only of
        letters, digits and '-' and is present in ``wordidx``; otherwise the row
        is drawn from a normal distribution. The last row (the shared "rare word"
        id) is always random. The whole matrix is divided by 3 before returning.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    glove_token = re.compile(r"^[a-zA-Z0-9\-]*$")

    for i in range(1, len(emb)):
        # .get() keeps a gap in the IMDB index from raising; a missing word falls
        # through to the random branch below.
        word = idx2word.get(i)
        # The membership test is what makes the fallback real: indexing wordidx
        # directly raised KeyError for any word that GloVe does not contain.
        if word and glove_token.match(word) and word in wordidx:
            emb[i] = vecs[wordidx[word]]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb

In [None]:
# Embedding matrix aligned to the IMDB word indexes
emb = create_emb()


In [None]:
# Width of each GloVe vector. It must match the second dimension of `emb`
# (hint: the "50d" in the pre-trained file name).
glove_factors = 50

# Same CNN as before, but the embedding layer starts from the GloVe-based matrix and is
# frozen. `padding` is the Keras 2 spelling of the Keras 1 `border_mode` argument.
glove = Sequential([
    Embedding(vocab_size, glove_factors, input_length=pad_seq, trainable=False, weights=[emb]),
    Dropout(0.25),
    Convolution1D(64, 5, activation='relu', padding='same'),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [None]:
# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
glove.compile(optimizer=Adam(), loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# Train for 4 epochs in batches of 64, using the test split as validation data
glove.fit(trn, labels_train,
          epochs=4,
          batch_size=64,
          validation_data=(test,labels_test)
        )

Not so great

But let's **fine-tune** the embedding weights - especially since the words we couldn't find in glove just have random embeddings.

In [None]:
# Unfreeze the embedding layer and lower the learning rate for fine-tuning.
# Assigning `glove.lr` only created an unused attribute on the model, and a changed
# `trainable` flag is only picked up when the model is compiled again - so recompile
# with an optimizer that carries the new learning rate.
glove.layers[0].trainable = True
glove.compile(optimizer=Adam(lr=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fine-tuning run: 4 more epochs on the same model, batches of 64, test split as validation data
glove.fit(trn, labels_train,
         epochs=4,
         batch_size=64,
         validation_data=(test,labels_test))

#### Little improvement

In [None]:
# Save the fine-tuned weights to glove_weights.h5 in the current working directory
glove.save_weights('glove_weights.h5')

# Multi - size CNN
* This is an implementation of a multi-size CNN as shown in Ben Bowles' excellent blog post.
* Let's create multiple layers using Functional API

In [None]:
from keras.layers import Input, Concatenate
from keras.models import Model

# Multi-size convolution block: three parallel branches with kernel sizes 3, 4 and 5
# read the same input, and their flattened outputs are concatenated.
# The block is fed by the Embedding layer, whose output per review is pad_seq steps of
# glove_factors values each - so the input shape is (pad_seq, glove_factors), not
# (vocab_size, glove_factors).
x_in = Input((pad_seq, glove_factors))
conv = []
for layer_size in range(3, 6):
    # `padding` is the Keras 2 spelling of the Keras 1 `border_mode` argument
    x = Convolution1D(64, layer_size, activation='relu', padding='same')(x_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    conv.append(x)
# Concatenate is the Keras 2 replacement for the removed Merge(mode='concat') layer
out = Concatenate()(conv)
graph = Model(x_in, out)

In [None]:
# Build a fresh embedding matrix (its random rows are drawn again on every call)
emb = create_emb()

In [None]:
# Full classifier: GloVe-initialised embedding -> multi-size conv block -> dense head
multi = Sequential()
multi.add(Embedding(vocab_size, glove_factors, input_length=pad_seq, weights=[emb]))
multi.add(Dropout(0.2))
multi.add(graph)
multi.add(Dropout(0.5))
multi.add(Dense(100, activation='relu'))
multi.add(Dropout(0.7))
multi.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
multi.compile(optimizer=Adam(),loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 4 epochs in batches of 64, using the test split as validation data
multi.fit(trn, labels_train, 
         epochs=4,
         batch_size=64,
         validation_data=(test,labels_test))

### Let's make the embedding layer non-trainable

In [None]:
# Freeze the embedding layer. A changed `trainable` flag is only picked up when the
# model is compiled again, so recompile with the same loss/optimizer settings
# (the learned weights are kept; only the optimizer state starts fresh).
multi.layers[0].trainable = False
multi.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# 4 more epochs on the same model, run after the embedding layer's trainable flag was switched off
multi.fit( trn, labels_train, 
         epochs=4,
         batch_size=64,
         validation_data=(test,labels_test))

# Highest accuracy. Even Jeremy doesn't know the reason behind this.

# Predicting user input review

In [None]:
review_text = 'this is a good movie'

# Encode the text exactly like the training data:
#   1. map every word to its IMDB index (`idx` is word -> index); unknown words get
#      the shared "rare word" id,
#   2. cap indexes at vocab_size-1, as was done for trn/test,
#   3. pad to pad_seq with pad_sequences, so the padding sits on the same side as in
#      the training matrix.
# The previous version passed the raw word strings to the model and appended the zeros
# by hand, with a hard-coded length of 500.
rare_id = vocab_size - 1
review_ids = [min(idx.get(word, rare_id), rare_id) for word in review_text.lower().split()]
review = sequence.pad_sequences([review_ids], maxlen=pad_seq, value=0)
review.shape

In [None]:
# Run the multi-size CNN on the user review; the model's last layer is a single sigmoid unit
preds = multi.predict(review)

# LSTM
LSTM is a variant of the RNN designed to mitigate the vanishing-gradient problem, which lets it learn longer-range dependencies.

In [31]:
from keras.layers import LSTM
from keras.regularizers import l2

In [38]:
# LSTM classifier: L2-regularised embedding (32 values per word) -> LSTM(100) -> sigmoid.
# `embeddings_regularizer` is the Keras 2 name of the Keras 1 `W_regularizer` argument
# of the Embedding layer.
lstm = Sequential([
    Embedding(vocab_size, 32, input_length=pad_seq, embeddings_regularizer=l2(0.01)),
    LSTM(100),
    Dense(1, activation='sigmoid')
])
lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  


In [39]:
# Train for a single epoch in batches of 64, using the test split as validation data
lstm.fit(trn, labels_train,
         epochs=1,
         batch_size=64,
         validation_data=(test, labels_test) )

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x22da4e87940>

* This takes a considerable amount of time to train.
* The ratio of resources spent to accuracy gained is high compared to the CNN.
* The CNN is the best choice for this NLP task.
