Set the seed

In [1]:
import os

import numpy as np

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE(review): only NumPy is seeded here; TensorFlow presumably keeps its own
# random state for weight initialisation — confirm and seed it as well if
# training results must be reproducible.
np.random.seed(42)

# Directory that holds the downloaded Kaggle files. Set the IMDB_DATA_DIR
# environment variable to point somewhere else; the original location is kept
# as the fallback so existing setups keep working.
your_local_path = os.environ.get(
    "IMDB_DATA_DIR",
    "C:/Users/s.mudalapuram/Documents/PythonMe/data/",
)
# File names are appended to this value with plain string concatenation, so
# make sure it always ends with a path separator.
if not your_local_path.endswith(("/", "\\")):
    your_local_path += "/"

Data can be downloaded from Kaggle at the following URL

- https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [3]:
import pandas as pd

# Location of the labelled training archive; adjust your_local_path to match
# where the Kaggle download was saved.
train_file = your_local_path + 'labeledTrainData.zip'

# Tab-separated file whose first row is the header. quoting=3 is
# csv.QUOTE_NONE, so quote characters inside reviews are kept as plain text.
df = pd.read_csv(train_file, sep="\t", header=0, quoting=3)

print(df.shape)

(25000, 3)


Split Data into Training and Test Data

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled reviews for evaluation. The fixed random_state
# makes the split identical on every run.
features, labels = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [5]:
# Number of reviews that ended up in the training split.
X_train.shape

(20000,)

## Prepare Data

1. Convert reviews to number sequences using Tokenizer

In [6]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

# Vocabulary size passed to the tokenizer as num_words
top_words = 5000
t = Tokenizer(num_words=top_words)

# Materialise both splits as plain Python lists of review strings
train_texts = X_train.tolist()
test_texts = X_test.tolist()

# Learn the word index from the training reviews only
t.fit_on_texts(train_texts)

# Replace every review with its sequence of word indices.
# NOTE(review): X_train / X_test are overwritten here, so this cell cannot be
# re-run without first re-running the train/test split.
X_train = t.texts_to_sequences(train_texts)
X_test = t.texts_to_sequences(test_texts)

In [7]:
# Tokenised reviews still have different numbers of words, so their
# sequence lengths differ.
raw_len_32 = len(X_train[32])
print('Length of review# 32 is: ', raw_len_32)
raw_len_1208 = len(X_train[1208])
print('Length of review# 1208 is: ', raw_len_1208)

Length of review# 32 is:  317
Length of review# 1208 is:  117


2. Pad the sequences so that every review has the same length

In [8]:
from tensorflow.python.keras.preprocessing import sequence

# Every review is brought to exactly this many word indices
max_review_length = 300

# Same settings for both splits: shorter reviews get padding appended at the
# end ('post').
# NOTE(review): `truncating` is left at its default; per the Keras docs that
# is 'pre', i.e. reviews longer than max_review_length lose their beginning —
# confirm this is intended.
pad_kwargs = dict(maxlen=max_review_length, padding='post')

X_train = sequence.pad_sequences(X_train, **pad_kwargs)
X_test = sequence.pad_sequences(X_test, **pad_kwargs)

In [9]:
# After padding, both reviews should report the same length.
padded_len_32 = len(X_train[32])
print('Length of review# 32 is: ', padded_len_32)
padded_len_1208 = len(X_train[1208])
print('Length of review# 1208 is: ', padded_len_1208)

Length of review# 32 is:  300
Length of review# 1208 is:  300


In [10]:
# Inspect one padded review: its word indices followed by the zeros that
# post-padding appended.
X_train[1208]

array([  11,   17,    6,    3,  977,    3,   62,    4,    3,  183,  251,
        311,    1,  317,    2,    9,   63,  585,   21,  622,   14,    1,
         17,   18,    9,    6,    3,   82,   62,   10,   13, 4142,   31,
         11,   19,    1,  112,    2,    1,   62,  117,   82,   10,   37,
         11,   19,   85,    9,    6,    3,  278,   62,   42,   68,    3,
        543,   12,   10,   13,   46,    2,   10,  229,  788,   15,    1,
         12,    6,  396,   85,   34,  485,    5,  127,  130,  111,   12,
         94,   10,  383,   12, 1441,   25,    5,   64,   11,   19,  318,
          1,  183,  657,    2,   31,    1,  845,  138,   36,   11,   19,
         11,   19,   22,   67, 1631,    1,   17, 1689,   36, 4960,   39,
         98,  143,   62,   12,  563,    8,    1,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## Build the Graph

In [11]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dropout, Dense, Embedding, Flatten

In [12]:
# Number of values in the vector learned for each word (the embedding width).
embedding_vector_length = 50 

# Start an empty Sequential model; later cells append layers with model.add().
model = Sequential()

Add Embedding layer

In [13]:
# Map each word index to a trainable vector.
# input_dim has to be larger than the highest index fed to the layer; index 0
# is used as the padding value, hence one row more than top_words.
# NOTE(review): Tokenizer(num_words=top_words) is documented to emit only
# indices below top_words, so the +1 may be redundant — confirm.
embedding_layer = Embedding(
    input_dim=top_words + 1,
    output_dim=embedding_vector_length,  # values per word
    input_length=max_review_length,      # words per review
)
model.add(embedding_layer)

The output of the Embedding layer is 3-dimensional:
- batch_size x max_review_length x embedding_vector_length.

We need to flatten this output before passing it to the Dense layers.

In [14]:
# NOTE(review): model.add() mutates the model, so re-running this cell stacks
# a second copy of these layers; rebuild the model from Sequential() first.

# Collapse the per-word embedding vectors into one flat vector per review
model.add(Flatten())

# Hidden fully connected layers, widest first, all with ReLU activation
for units in (200, 100, 60, 30):
    model.add(Dense(units, activation='relu'))

# Single sigmoid unit: one score between 0 and 1 per review
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Execute the graph

In [16]:
# Train the model; raise or lower `epochs` as needed.
# The held-out test split doubles as validation data, so the validation
# metrics reported per epoch are computed on the test reviews.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=3,
    shuffle=True,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras._impl.keras.callbacks.History at 0x196cd067278>

In [18]:
# Sigmoid outputs for the first four test reviews
model.predict(X_test[:4])

array([[0.05418413],
       [0.9932783 ],
       [0.35033822],
       [0.991521  ]], dtype=float32)