## Load the modules

In [1]:
## Import Libraries 
import numpy as np      # for array operations
import pandas as pd     # for reading data operations

from keras.preprocessing.text import Tokenizer          # for tokenizing text
from keras.preprocessing.sequence import pad_sequences  # for padding sentences with zeros. To make the sentence length same
from keras.utils import to_categorical                  # for one-hot encoding of the labels
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Embedding
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load the data

In [2]:
##Loading data using Pandas
train = pd.read_csv("train.csv", encoding='ISO-8859-1')
test = pd.read_csv("test.csv", encoding='ISO-8859-1')

train.head(6)

Unnamed: 0,Label,Message
0,ham,oh how abt 2 days before Christmas
1,info,"Welcome to OVATION HOLD R.No. 184, 114, 395, 3..."
2,info,Thank you for using your ICICI bank CREDITcard...
3,ham,schedule a meeting with the entire team in the...
4,ham,Tommy is my brother
5,spam,OTP is 817453 for the txn of INR 8262.00 at SP...


## Preprocesing data

In [3]:
## Define the sequence lengths, max number of words and embedding dimensions
MAX_SEQUENCE_LENGTH = 300   # Sequence length of each sentence. If more, crop. If less, pad with zeros
MAX_NB_WORDS = 20000        # Top 20000 frequently occuring words

In [4]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)   # get the frequently occuring words
tokenizer.fit_on_texts(train.Message)           
train_sequences = tokenizer.texts_to_sequences(train.Message)
test_sequences = tokenizer.texts_to_sequences(test.Message)

word_index = tokenizer.word_index               # dictionary containing words and their index
# print(tokenizer.word_index)                   # print to check
print('Found %s unique tokens.' % len(word_index)) # total words in the corpus
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH) # get only the top frequent words on train
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)   # get only the top frequent words on test

print(train_data.shape)
print(test_data.shape)

Found 38393 unique tokens.
(29000, 300)
(1000, 300)


In [5]:
print word_index

{u'1733palkun': 32884, u'j85784373': 14176, u'woods': 8790, u'11546': 13084, u'45787': 36662, u'hanging': 30079, u'pti84l': 21804, u'blgt': 4663, u'8z316': 36281, u'blgr': 4394, u'taughannock': 18632, u'5983': 17733, u'em4rnj': 13091, u'19394': 35321, u'f3fbmu': 16057, u'19391': 17600, u'jc84716483': 36231, u'82443': 33993, u'iguaz': 21402, u'66338615': 21523, u'ka88l3454': 16337, u'fqpv00': 19897, u'7d374': 11153, u'metroborgari': 8916, u'tourister': 4869, u'taj': 2377, u'18f': 8095, u'bringing': 6390, u'kottayam': 547, u'tcby': 20942, u'kaushal': 8371, u'46133': 11404, u'wednesday': 663, u'9137574318': 11102, u'73851771': 35344, u'9651574847': 32563, u'9246468633': 27301, u'ka45y5353': 13029, u'pygu1q': 10969, u"tom's": 2393, u'frederick': 22652, u'0055': 17638, u'270': 2612, u'271': 2464, u'272': 2744, u'273': 6669, u'274': 3990, u'275': 4041, u'276': 2088, u'277': 3896, u'278': 3648, u'279': 5509, u'121hydkot': 18169, u'14225zv414449': 32425, u'hrucy3': 14291, u'cooking': 6328, u'1

In [14]:
[i for i in word_index.keys() if word_index[i]==1 ]

[u'for']

In [15]:
train_labels = train['Label']
test_labels = test['Label']

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()                  # converts the character array to numeric array. Assigns levels to unique labels.
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

[u'ham' u'info' u'spam']
(array([0, 1, 2]), array([ 9666, 12916,  6418]))
(array([0, 1, 2]), array([334, 459, 207]))


In [16]:
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)

('Shape of data tensor:', (29000, 300))
('Shape of label tensor:', (29000, 3))
('Shape of label tensor:', (1000, 3))


## Network architecture

In [20]:
(5*100 * 64)+64

32064

In [17]:
print('Training model.')
EMBEDDING_DIM = 100

model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
                    EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH
                    ))
model.add(Conv1D(64, 5, activation='relu'))
# model.add(MaxPooling1D(5))
model.add(GlobalMaxPool1D())
# model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))

Training model.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [18]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          2000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 296, 64)           32064     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total params: 2,040,771
Trainable params: 2,040,771
Non-trainable params: 0
____________________________________________________

## Train the model

In [22]:
model.fit(train_data, labels_train,
          batch_size=64,
          epochs=2,
          validation_data=(test_data, labels_test))

Train on 29000 samples, validate on 1000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0xcce6d10>

In [24]:
model.evaluate(test_data, labels_test)
# model.predict_classes(test_data)



[0.016397827221484477, 0.995]

## Keras layers

In [26]:
model.layers

[<keras.layers.embeddings.Embedding at 0xd044a90>,
 <keras.layers.convolutional.Conv1D at 0xcce6e10>,
 <keras.layers.pooling.GlobalMaxPooling1D at 0xd044f50>,
 <keras.layers.core.Dense at 0xe077d90>,
 <keras.layers.core.Dense at 0xb084d90>]

In [25]:
# print(model.layers)
for layer in model.layers:
    print(layer)

<keras.layers.embeddings.Embedding object at 0xd044a90>
<keras.layers.convolutional.Conv1D object at 0xcce6e10>
<keras.layers.pooling.GlobalMaxPooling1D object at 0xd044f50>
<keras.layers.core.Dense object at 0xe077d90>
<keras.layers.core.Dense object at 0xb084d90>


## Keras API for output of an intermediate layer

In [30]:
import keras.backend as K
emd = K.function(inputs=[model.layers[0].input], 
                 outputs=[model.layers[0].output])

In [31]:
print(train_data[0:2].shape)
out = emd([train_data[0:2]])[0]
print(out.shape)

(2, 300)
(2, 300, 100)
