# Reading dataset

In [55]:
import pandas as pd
data = pd.read_csv('D:\MLWorkshop\SMSSpamCollection.csv',sep = '\t',  names = ['label', 'message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [56]:
data.shape

(5572, 2)

In [57]:
text= data['message']
print (text[0:5])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object


In [58]:
class_label=data['label']
print (class_label[0:5])

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object


# Converting labels into one-hot index vector

In [59]:
import numpy as np
from keras.utils.np_utils import to_categorical
classes_list= ["ham","spam"]
label_index= class_label.apply(classes_list.index)
label1= np.asarray(label_index)
lebel= to_categorical(label1)

In [60]:
print (label_index[0:5])
print (lebel[0:5])

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64
[[1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]]


## Processing Text
1. Tokenization
2. Create index for each token
3. Create Vocabulary
3. Create text to sequence vector or encoding
4. padding

# Tokenization

In [61]:
from keras.preprocessing.text import Tokenizer
tk= Tokenizer(filters= '#$%@!\n\t', lower= True, split= " " )
tk.fit_on_texts(text)

# Creating index for each token

In [62]:
index= tk.word_index
print (index)



# Creating Vocabulary for tokens

In [63]:
vocabsize=len(index)
print (vocabsize)

13146


# Encoding

In [64]:
encoded_text= tk.texts_to_sequences(text)
print (text[0])
print (encoded_text[0])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
[44, 400, 5437, 5438, 5439, 685, 63, 9, 1707, 85, 124, 446, 3417, 147, 5440, 1487, 87, 55, 5441, 844]


# Padding each sentence into equal length (post-padding)

In [65]:
from keras.preprocessing.sequence import pad_sequences
max_length=30
padded_docs=pad_sequences(encoded_text, maxlen=max_length, padding='post')
print (text[2])
print (encoded_text[2])
print (padded_docs[2])

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
[51, 417, 9, 21, 4, 726, 992, 1, 168, 2055, 1072, 569, 2056, 2548, 248, 2549, 69, 2055, 1, 2057, 1, 288, 417, 3419, 80, 3420, 600, 3421]
[  51  417    9   21    4  726  992    1  168 2055 1072  569 2056 2548
  248 2549   69 2055    1 2057    1  288  417 3419   80 3420  600 3421
    0    0]


# Data split into training and testing

In [66]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split (padded_docs,lebel, test_size=0.30, random_state=42)

In [67]:
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(3900, 30) (3900, 2)
(1672, 30) (1672, 2)


# Building CNN deep learning Network

In [68]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten
from keras.layers import Conv1D, MaxPool1D
from sklearn.metrics import classification_report

In [69]:
model=Sequential()
model.add(Embedding(input_dim=vocabsize+1, output_dim=100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=3, strides=1, activation='relu'))
model.add(MaxPool1D(2))
model.add(Flatten())
model.add(Dense(20))
model.add(Dense(2))
model.add(Activation('sigmoid'))
model.summary() # max(x,0) Relu

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 30, 100)           1314700   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 28, 32)            9632      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 14, 32)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 448)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 20)                8980      
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 42        
_________________________________________________________________
activation_4 (Activation)    (None, 2)                

In [70]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
model.fit(x_train, y_train, batch_size=32, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1b6d5c91438>

In [71]:
predictions_test_CNN=model.predict(x_test)
print (predictions_test_CNN)
predictions_test1_CNN=np.zeros_like(predictions_test_CNN)
predictions_test1_CNN[np.arange(len(predictions_test_CNN)),predictions_test_CNN.argmax(1)]=1
#print(predictions_test1)
print (classification_report(y_test, predictions_test1_CNN))

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1448
           1       0.91      0.95      0.93       224

   micro avg       0.98      0.98      0.98      1672
   macro avg       0.95      0.97      0.96      1672
weighted avg       0.98      0.98      0.98      1672
 samples avg       0.98      0.98      0.98      1672

