# SPAM CLASSIFICATION with LSTM Network in Keras

In [2]:
# Import the necessary libraries, modules
import pandas as pd # Pandas library for reading '.csv' files as dataframes
import numpy as np  # Numpy library for creating and modifying arrays.
from keras.layers import Dense, SimpleRNN, GRU, LSTM, Embedding # Import layers from Keras
from keras.models import Sequential
import os
# Switch to the directory holding train.csv / test.csv, which are read by relative name below.
# Set the SPAM_HAM_DATA_DIR environment variable to use another location; when it is unset,
# the original author's local path is used, exactly as before.
os.chdir(os.environ.get(
    "SPAM_HAM_DATA_DIR",
    "F:\\Class material\\ANN\\Deep Learning\\RNN\\Batch49_CSE7321c_RNN\\spam_ham",
))

### Reading the data

In [3]:
# Read the train and test CSV files into DataFrames (both decoded as latin-1)
raw_data, raw_test_data = (
    pd.read_csv(csv_name, encoding='latin-1') for csv_name in ('train.csv', 'test.csv')
)

# Show the train frame's dimensions and column names, followed by a blank spacer
for summary_item in (raw_data.shape, raw_data.columns, '\n'):
    print(summary_item)
raw_data.head(5)  # Preview the top few records

(29000, 2)
Index(['Label', 'Message'], dtype='object')




Unnamed: 0,Label,Message
0,ham,oh how abt 2 days before Christmas
1,info,"Welcome to OVATION HOLD R.No. 184, 114, 395, 3..."
2,info,Thank you for using your ICICI bank CREDITcard...
3,ham,schedule a meeting with the entire team in the...
4,ham,Tommy is my brother


### Check the labels and their frequencies

In [4]:
# np.unique with return_counts=True returns a (class names, counts) tuple
classes = np.unique(raw_data['Label'], return_counts=True)
for class_info in classes:  # first the unique classes, then their frequencies
    print(class_info)

['ham' 'info' 'spam']
[ 9666 12916  6418]


In [5]:
# Label frequencies, most common first. The Series method replaces the
# deprecated top-level pd.value_counts function.
raw_data['Label'].value_counts()

info    12916
ham      9666
spam     6418
Name: Label, dtype: int64

### Converting unstructured text to structured numeric form
This includes:
1. Tokenizing
2. Converting sequence of words to sequence of word indices
3. Converting varying-length sequences to fixed-length sequences through padding

In [6]:
# Tokenizer num_words setting, padded sequence length, and embedding vector size
max_num_words, seq_len, embedding_size = 1000, 100, 100

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Create a tokenizer configured with num_words=max_num_words and fit it on the training messages
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(raw_data['Message'])


In [8]:
# Check how often each vocabulary word occurs in the messages the tokenizer was fitted on
A = tokenizer.word_counts
sorted_by_value = sorted(A.items(), key=lambda word_count: word_count[1], reverse=True)
print(sorted_by_value[:100])   # the 100 most frequent (word, count) pairs
print(len(sorted_by_value))    # number of distinct words seen


[('for', 18686), ('your', 15182), ('at', 12313), ('on', 11836), ('is', 10770), ('the', 8981), ('to', 8388), ('you', 7629), ('in', 6008), ('of', 5096), ('10', 4973), ('a', 4655), ('no', 4391), ('12', 4146), ('pnr', 4011), ('with', 3929), ('from', 3885), ('dear', 3648), ('rs', 3479), ('id', 3456), ('and', 3449), ('customer', 3430), ('c', 3271), ('t', 3256), ('11', 3179), ('will', 3163), ('please', 3150), ('car', 3004), ('00', 2981), ('thank', 2860), ('2011', 2682), ('2014', 2636), ('2010', 2621), ('2013', 2563), ('2012', 2563), ('by', 2479), ('booking', 2406), ('card', 2390), ('ksrtc', 2235), ('com', 2184), ('order', 2167), ('http', 2165), ('service', 2158), ('hi', 2093), ('www', 2071), ('book', 2065), ('visit', 2064), ('due', 2028), ('apply', 2021), ('mins', 1993), ('auto', 1984), ('reach', 1971), ('01', 1968), ('07', 1965), ('delivered', 1960), ('agency', 1959), ('otp', 1958), ('yourbus', 1946), ('boarding', 1945), ('gate', 1945), ('txn', 1943), ('carry', 1941), ('repair', 1937), ('cho

In [8]:
# Check how the tokenizer indexes the vocabulary (index 1 is the most frequent word)
A = tokenizer.word_index
sorted_by_value = sorted(A.items(), key=lambda kv: kv[1])
# Show only the first 100 (word, index) pairs instead of dumping the whole
# vocabulary into the cell output; the total size is printed right after.
print(sorted_by_value[:100])
print(len(sorted_by_value))


[('for', 1), ('your', 2), ('at', 3), ('on', 4), ('is', 5), ('the', 6), ('to', 7), ('you', 8), ('in', 9), ('of', 10), ('10', 11), ('a', 12), ('no', 13), ('12', 14), ('pnr', 15), ('with', 16), ('from', 17), ('dear', 18), ('rs', 19), ('id', 20), ('and', 21), ('customer', 22), ('c', 23), ('t', 24), ('11', 25), ('will', 26), ('please', 27), ('car', 28), ('00', 29), ('thank', 30), ('2011', 31), ('2014', 32), ('2010', 33), ('2013', 34), ('2012', 35), ('by', 36), ('booking', 37), ('card', 38), ('ksrtc', 39), ('com', 40), ('order', 41), ('http', 42), ('service', 43), ('hi', 44), ('www', 45), ('book', 46), ('visit', 47), ('due', 48), ('apply', 49), ('mins', 50), ('auto', 51), ('reach', 52), ('01', 53), ('07', 54), ('delivered', 55), ('agency', 56), ('otp', 57), ('yourbus', 58), ('boarding', 59), ('gate', 60), ('txn', 61), ('carry', 62), ('repair', 63), ('choosing', 64), ('09', 65), ('ch', 66), ('06', 67), ('08', 68), ('03', 69), ('04', 70), ('05', 71), ('2015', 72), ('i', 73), ('we', 74), ('02',

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words)  # Tokenizer is used to tokenize text
tokenizer.fit_on_texts(raw_data.Message)        # Fit it on the training messages

# texts_to_sequences converts each message to a list of word indices;
# pad_sequences then makes every list exactly seq_len long by padding with 0s.
# seq_len is used (rather than a literal 100) so the arrays always agree with
# the Embedding layer's input_length.
x_train = pad_sequences(tokenizer.texts_to_sequences(raw_data.Message), maxlen=seq_len)
x_test = pad_sequences(tokenizer.texts_to_sequences(raw_test_data.Message), maxlen=seq_len)

x_train.shape, x_test.shape  # Check the dimensions of x_train and x_test

((29000, 100), (1000, 100))

In [10]:
# Compare one raw message with its padded sequence of word indices
sample_idx = 3
print(raw_data.Message[sample_idx])
print(x_train[sample_idx])

schedule a meeting with the entire team in the office tomorrow
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 154  12 250  16   6 836   9   6 440 223]


### Prepare the target vectors for the network

In [11]:
# Distinct label values as a plain list; list positions are used later as class ids
unique_labels = raw_data['Label'].unique().tolist()
print(unique_labels)

['ham', 'info', 'spam']


In [12]:
from keras.utils import to_categorical # This convers the labels to one-hot vectors(Dummies)

# Map each label string to its position in unique_labels. A dict gives one
# lookup per row instead of a linear list.index scan; a label missing from
# unique_labels now raises KeyError rather than ValueError.
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
num_classes = len(unique_labels)

# Convert the word labels to indices, then one-hot encode them. Passing
# num_classes explicitly keeps y_train and y_test the same width even if one
# split happens to lack the highest-indexed class.
y_train = to_categorical(np.array([label_to_index[i] for i in raw_data.Label]),
                         num_classes=num_classes)
y_test = to_categorical(np.array([label_to_index[i] for i in raw_test_data.Label]),
                        num_classes=num_classes)

In [13]:
import keras.backend as K # This 'K' can be used to create user defined functions in keras

# Define a custom function in keras to compute recall.
# Arguments:
# y_true - Actual labels
# y_pred - Predicted labels
def recall(y_true, y_pred):
    """Recall: true positives divided by the number of actual positives.

    Arguments:
        y_true - Actual labels
        y_pred - Predicted labels

    Each count is obtained by clipping to [0, 1], rounding, and summing over
    all elements. K.epsilon() is added to the denominator so the division is
    safe when there are no positives.
    """
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    return true_positives / (actual_positives + K.epsilon())

### Building and training an LSTM model

In [14]:
# Building an LSTM model: hand the layer stack straight to Sequential
model = Sequential([
    # Embedding layer: represents each unique token as a vector
    Embedding(input_dim=max_num_words, output_dim=embedding_size, input_length=seq_len),
    LSTM(10, return_sequences=True),    # first LSTM layer, returns the full sequence
    LSTM(10, return_sequences=False),   # second LSTM layer, returns only the last output
    Dense(3, activation='softmax'),     # output layer: 3 nodes for the 3 classes
])

In [15]:
# Print each layer's output shape and parameter count
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          100000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 10)           4440      
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                840       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 33        
Total params: 105,313
Trainable params: 105,313
Non-trainable params: 0
_________________________________________________________________


In [37]:
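# Parameter counts, layer by layer (these match model.summary() above):
# Embedding: 1000*100                 = 100,000
# LSTM 1:    (100*10 + 10*10 + 10)*4  = 4,440
# LSTM 2:    (10*10 + 10*10 + 10)*4   = 840
# Dense:     (10*3) + 3               = 33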

In [16]:
from keras.optimizers import Adam
# Adam optimizer with a learning rate of 0.001. The rate is passed positionally
# because the keyword was renamed from `lr` to `learning_rate` in newer Keras
# releases; the first positional argument is the learning rate in both.
adam = Adam(0.001)

In [17]:
# Configure training: the optimizer, the loss function and the metrics to compute
model.compile(
    loss='categorical_crossentropy',  # categorical_crossentropy for multi-class classification
    optimizer=adam,                   # the Adam optimizer instance created above
    metrics=['accuracy'],             # computed for evaluation and stored in history
)

# Train for 2 epochs, holding out 25% of the training data for validation
model.fit(x=x_train, y=y_train, validation_split=0.25, epochs=2)

Train on 21750 samples, validate on 7250 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x27252ba1ba8>

### Prediction and evaluation on test data
1. Check the network output on test data. What do these values represent?
2. Predict the class labels on test data
3. Evaluate the model on test data

Hint: Check model.predict, model.predict_classes, model.evaluate in keras

In [18]:
# Run the network on the padded test sequences; the result has one row per
# test message and one column per output node (3 here).
test_prob = model.predict(x_test)
test_prob.shape

(1000, 3)

In [19]:
# Inspect the softmax outputs for the first five test messages
test_prob[:5]

array([[9.9688303e-01, 8.1337802e-04, 2.3036185e-03],
       [4.8663354e-04, 9.9893636e-01, 5.7696429e-04],
       [4.9995334e-04, 9.9884027e-01, 6.5981323e-04],
       [4.9083063e-01, 4.1742055e-03, 5.0499511e-01],
       [6.2253419e-04, 9.9847716e-01, 9.0032269e-04]], dtype=float32)

In [20]:
# Predicted class = index of the largest softmax output. This is what
# Sequential.predict_classes computed for a multi-node softmax output; that
# method has been removed from newer Keras/TensorFlow releases, so the argmax
# is taken explicitly.
test_classes = np.argmax(model.predict(x_test), axis=-1)
test_classes.shape

(1000,)

In [21]:
from sklearn.metrics import accuracy_score

# Predicted class = index of the largest probability in each row;
# true class = index of the 1 in each one-hot row of y_test.
test_classes = np.argmax(test_prob, axis=1)
true_classes = np.argmax(y_test, axis=1)

# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the arguments are now passed in the documented order.
accuracy_score(true_classes, test_classes)

0.994

### Understanding an intermediate layer in keras
Study the code below, which gets the output of an intermediate layer in Keras. You can do this for every layer to fully understand how the tensors/arrays flow through the layers.

In [22]:
# List the model's layer objects in order (Embedding, LSTM, LSTM, Dense)
model.layers

[<keras.layers.embeddings.Embedding at 0x2724aa85080>,
 <keras.layers.recurrent.LSTM at 0x2724fb3aba8>,
 <keras.layers.recurrent.LSTM at 0x2724fb3ab38>,
 <keras.layers.core.Dense at 0x2724fb3bf28>]

In [23]:
import keras.backend as K 
# Build a backend function that maps the first layer's input to the outputs of
# the first two layers (the Embedding and the first LSTM).
# Calling this function returns a list with one array per requested output.
first_two_layers = model.layers[:2]
eo = K.function([first_two_layers[0].input],
                [layer.output for layer in first_two_layers])

out = eo([x_train[:5]])  # evaluate on the first five training sequences
for summary in (type(out), len(out), out[0].shape):
    print(summary)

<class 'list'>
2
(5, 100, 100)


In [24]:
# Shape of the first LSTM layer's output for the five sample sequences
print(out[1].shape)

(5, 100, 10)


In [25]:
# Embedding-layer output for the five sample sequences
print(out[0])

[[[ 0.09128542 -0.01787114 -0.10190784 ...  0.05319512  0.06658543
    0.02552317]
  [ 0.09128542 -0.01787114 -0.10190784 ...  0.05319512  0.06658543
    0.02552317]
  [ 0.09128542 -0.01787114 -0.10190784 ...  0.05319512  0.06658543
    0.02552317]
  ...
  [-0.01734999 -0.0346527   0.01665212 ...  0.08058941  0.0228016
   -0.02947346]
  [ 0.04850977 -0.10366695 -0.01529533 ...  0.07064736  0.07124909
    0.03664686]
  [ 0.03806077 -0.03191898  0.01156641 ...  0.04118325  0.06875876
   -0.03486183]]

 [[ 0.09128542 -0.01787114 -0.10190784 ...  0.05319512  0.06658543
    0.02552317]
  [ 0.09128542 -0.01787114 -0.10190784 ...  0.05319512  0.06658543
    0.02552317]
  [ 0.09128542 -0.01787114 -0.10190784 ...  0.05319512  0.06658543
    0.02552317]
  ...
  [ 0.03126795 -0.02687435  0.08895392 ...  0.03932009 -0.07602625
   -0.01967862]
  [ 0.05288116 -0.01075801  0.01138811 ...  0.05650621  0.00928571
   -0.04701073]
  [ 0.0478854  -0.00526749  0.00696993 ...  0.03383213  0.01366613
   -0.0