In [48]:
# Load, explore and plot data
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Text pre-processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
# Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional

In [49]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [50]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# rename the columns
df = df[['v1','v2']]
df.rename(columns={'v1':'label', 'v2':'message'}, inplace=True)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
df.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [52]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [53]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [54]:
ham_df = df[df['label']==0]
ham_df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
6,0,Even my brother is not like to speak with me. ...
...,...,...
5565,0,Huh y lei...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [55]:
spam_df = df[df['label']==1]
spam_df

Unnamed: 0,label,message
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
5,1,FreeMsg Hey there darling it's been 3 week's n...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...
11,1,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [56]:
ham_msg_df = ham_df.sample(n = len(spam_df), random_state = 44)
spam_msg_df = spam_df

In [57]:
# Get length column for each text
df['text_length'] = df['message'].apply(len)
msg_label = df['label'].values
df.head()

Unnamed: 0,label,message,text_length
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [58]:
x_train, x_test, y_train, y_test = train_test_split(df['message'], msg_label, test_size=0.2, random_state=434)

In [59]:
# Defining pre-processing parameters
max_len = 50
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # out of vocabulary token
vocab_size = 500

In [60]:
tokenizer = Tokenizer(num_words = vocab_size,
                      char_level = False,
                      oov_token = oov_tok)
tokenizer.fit_on_texts(x_train)

In [61]:
# Get the word_index
word_index = tokenizer.word_index
total_words = len(word_index)
total_words

8004

In [62]:
training_sequences = tokenizer.texts_to_sequences(x_train)
training_padded = pad_sequences(training_sequences,
                                maxlen = max_len,
                                padding = padding_type,
                                truncating = trunc_type)

In [63]:
testing_sequences = tokenizer.texts_to_sequences(x_test)
testing_padded = pad_sequences(testing_sequences,
                               maxlen = max_len,
                               padding = padding_type,
                               truncating = trunc_type)

In [64]:
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)

Shape of training tensor:  (4457, 50)
Shape of testing tensor:  (1115, 50)


In [65]:
# Define parameter
vocab_size = 500
embedding_dim = 16
drop_value = 0.2
n_dense = 24
# Define Dense Model Architecture
model = Sequential()
model.add(Embedding(vocab_size,
                    embedding_dim,
                    input_length = max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(1, activation='sigmoid'))

In [66]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 16)            8000      
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_5 (Dense)             (None, 24)                408       
                                                                 
 dropout_4 (Dropout)         (None, 24)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 25        
                                                                 
Total params: 8433 (32.94 KB)
Trainable params: 8433 (32.94 KB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [67]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])

In [68]:
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(training_padded,
                    y_train,
                    epochs=num_epochs,
                    validation_data=(testing_padded, y_test),
                    callbacks =[early_stop],
                    verbose=2)

Epoch 1/30
140/140 - 2s - loss: 0.4848 - accuracy: 0.8557 - val_loss: 0.3186 - val_accuracy: 0.8771 - 2s/epoch - 11ms/step
Epoch 2/30
140/140 - 1s - loss: 0.2970 - accuracy: 0.8629 - val_loss: 0.2318 - val_accuracy: 0.8816 - 534ms/epoch - 4ms/step
Epoch 3/30
140/140 - 0s - loss: 0.1878 - accuracy: 0.9264 - val_loss: 0.1400 - val_accuracy: 0.9623 - 492ms/epoch - 4ms/step
Epoch 4/30
140/140 - 0s - loss: 0.1169 - accuracy: 0.9650 - val_loss: 0.1027 - val_accuracy: 0.9677 - 453ms/epoch - 3ms/step
Epoch 5/30
140/140 - 0s - loss: 0.0820 - accuracy: 0.9773 - val_loss: 0.0867 - val_accuracy: 0.9713 - 348ms/epoch - 2ms/step
Epoch 6/30
140/140 - 0s - loss: 0.0668 - accuracy: 0.9800 - val_loss: 0.0797 - val_accuracy: 0.9722 - 318ms/epoch - 2ms/step
Epoch 7/30
140/140 - 0s - loss: 0.0562 - accuracy: 0.9836 - val_loss: 0.0764 - val_accuracy: 0.9731 - 304ms/epoch - 2ms/step
Epoch 8/30
140/140 - 0s - loss: 0.0482 - accuracy: 0.9845 - val_loss: 0.0738 - val_accuracy: 0.9758 - 302ms/epoch - 2ms/step
Ep

In [69]:
model.evaluate(testing_padded, y_test)



[0.074632927775383, 0.9784753322601318]

In [70]:
train_dense_results = model.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 0s - loss: 0.0256 - accuracy: 0.9917 - 52ms/epoch - 3ms/step
5/5 - 0s - loss: 0.0746 - accuracy: 0.9785 - 30ms/epoch - 6ms/step
Train accuracy: 99.17
Valid accuracy: 97.85


In [71]:
# Define parameter
n_lstm = 128
drop_lstm = 0.2
# Define LSTM Model
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model1.add(SpatialDropout1D(drop_lstm))
model1.add(LSTM(n_lstm, return_sequences=False))
model1.add(Dropout(drop_lstm))
model1.add(Dense(1, activation='sigmoid'))

In [72]:
model1.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 16)            8000      
                                                                 
 spatial_dropout1d_2 (Spati  (None, 50, 16)            0         
 alDropout1D)                                                    
                                                                 
 lstm_2 (LSTM)               (None, 128)               74240     
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                                 
Total params: 82369 (321.75 KB)
Trainable params: 82369 (321.75 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [73]:
model1.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics = ['accuracy'])

In [74]:
num_epochs = 5
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model1.fit(training_padded,
                     y_train,
                     epochs=num_epochs,
                     validation_data=(testing_padded, y_test),
                     callbacks =[early_stop],
                     verbose=2)

Epoch 1/5
140/140 - 13s - loss: 0.2419 - accuracy: 0.9206 - val_loss: 0.0971 - val_accuracy: 0.9686 - 13s/epoch - 93ms/step
Epoch 2/5
140/140 - 11s - loss: 0.0772 - accuracy: 0.9796 - val_loss: 0.1348 - val_accuracy: 0.9516 - 11s/epoch - 80ms/step
Epoch 3/5
140/140 - 10s - loss: 0.0835 - accuracy: 0.9776 - val_loss: 0.0956 - val_accuracy: 0.9731 - 10s/epoch - 68ms/step
Epoch 4/5
140/140 - 10s - loss: 0.0663 - accuracy: 0.9791 - val_loss: 0.0968 - val_accuracy: 0.9686 - 10s/epoch - 69ms/step
Epoch 5/5
140/140 - 11s - loss: 0.0525 - accuracy: 0.9879 - val_loss: 0.1053 - val_accuracy: 0.9776 - 11s/epoch - 75ms/step


In [75]:
train_dense_results = model1.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model1.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 2s - loss: 0.0364 - accuracy: 0.9926 - 2s/epoch - 86ms/step
5/5 - 0s - loss: 0.1053 - accuracy: 0.9776 - 401ms/epoch - 80ms/step
Train accuracy: 99.26
Valid accuracy: 97.76


In [76]:
model2 = Sequential()
model2.add(Embedding(vocab_size,
                     embedding_dim,
                     input_length = max_len))
model2.add(Bidirectional(LSTM(n_lstm,
                              return_sequences = False)))
model2.add(Dropout(drop_lstm))
model2.add(Dense(1, activation='sigmoid'))

In [77]:
model2.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 50, 16)            8000      
                                                                 
 bidirectional_1 (Bidirecti  (None, 256)               148480    
 onal)                                                           
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_8 (Dense)             (None, 1)                 257       
                                                                 
Total params: 156737 (612.25 KB)
Trainable params: 156737 (612.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [78]:
model2.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics=['accuracy'])

In [79]:
num_epochs = 5
early_stop = EarlyStopping(monitor = 'val_loss',
                           patience = 2)
history = model2.fit(training_padded,
                     y_train,
                     epochs = num_epochs,
                     validation_data = (testing_padded, y_test),
                     callbacks = [early_stop],
                     verbose = 2)

Epoch 1/5
140/140 - 24s - loss: 0.2439 - accuracy: 0.9138 - val_loss: 0.1266 - val_accuracy: 0.9587 - 24s/epoch - 171ms/step
Epoch 2/5
140/140 - 17s - loss: 0.0823 - accuracy: 0.9744 - val_loss: 0.0851 - val_accuracy: 0.9794 - 17s/epoch - 118ms/step
Epoch 3/5
140/140 - 18s - loss: 0.0582 - accuracy: 0.9836 - val_loss: 0.0963 - val_accuracy: 0.9767 - 18s/epoch - 129ms/step
Epoch 4/5
140/140 - 17s - loss: 0.0432 - accuracy: 0.9890 - val_loss: 0.0807 - val_accuracy: 0.9767 - 17s/epoch - 121ms/step
Epoch 5/5
140/140 - 17s - loss: 0.0390 - accuracy: 0.9901 - val_loss: 0.0773 - val_accuracy: 0.9821 - 17s/epoch - 122ms/step


In [80]:
train_dense_results = model2.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model2.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 2s - loss: 0.0277 - accuracy: 0.9937 - 2s/epoch - 139ms/step
5/5 - 1s - loss: 0.0773 - accuracy: 0.9821 - 644ms/epoch - 129ms/step
Train accuracy: 99.37
Valid accuracy: 98.21


In [81]:
model3 = Sequential()
model3.add(Embedding(vocab_size,
                     embedding_dim,
                     input_length = max_len))
model3.add(SpatialDropout1D(0.2))
model3.add(GRU(128, return_sequences = False))
model3.add(Dropout(0.2))
model3.add(Dense(1, activation = 'sigmoid'))

In [82]:
model3.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 50, 16)            8000      
                                                                 
 spatial_dropout1d_3 (Spati  (None, 50, 16)            0         
 alDropout1D)                                                    
                                                                 
 gru_1 (GRU)                 (None, 128)               56064     
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 1)                 129       
                                                                 
Total params: 64193 (250.75 KB)
Trainable params: 64193 (250.75 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [83]:
model3.compile(loss = 'binary_crossentropy',
                       optimizer = 'adam',
                       metrics=['accuracy'])

In [84]:
num_epochs = 5
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model3.fit(training_padded,
                     y_train,
                     epochs=num_epochs,
                     validation_data=(testing_padded, y_test),
                     callbacks =[early_stop],
                     verbose=2)

Epoch 1/5
140/140 - 11s - loss: 0.4195 - accuracy: 0.8602 - val_loss: 0.3098 - val_accuracy: 0.8771 - 11s/epoch - 76ms/step
Epoch 2/5
140/140 - 8s - loss: 0.1441 - accuracy: 0.9587 - val_loss: 0.1233 - val_accuracy: 0.9614 - 8s/epoch - 56ms/step
Epoch 3/5
140/140 - 7s - loss: 0.0796 - accuracy: 0.9809 - val_loss: 0.0885 - val_accuracy: 0.9749 - 7s/epoch - 47ms/step
Epoch 4/5
140/140 - 8s - loss: 0.0554 - accuracy: 0.9859 - val_loss: 0.1136 - val_accuracy: 0.9605 - 8s/epoch - 58ms/step
Epoch 5/5
140/140 - 7s - loss: 0.0504 - accuracy: 0.9868 - val_loss: 0.0939 - val_accuracy: 0.9722 - 7s/epoch - 48ms/step


In [85]:
train_dense_results = model3.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model3.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 1s - loss: 0.0463 - accuracy: 0.9861 - 1s/epoch - 60ms/step
5/5 - 0s - loss: 0.0939 - accuracy: 0.9722 - 293ms/epoch - 59ms/step
Train accuracy: 98.61
Valid accuracy: 97.22


In [86]:
# Comparing the four different models
print(f"Dense model loss and accuracy: {model.evaluate(testing_padded, y_test)} " )
print(f"LSTM model loss and accuracy: {model1.evaluate(testing_padded, y_test)} " )
print(f"Bi-LSTM model loss and accuracy: {model2.evaluate(testing_padded, y_test)} " )
print(f"GRU model loss and accuracy: {model3.evaluate(testing_padded, y_test)}")

Dense model loss and accuracy: [0.074632927775383, 0.9784753322601318] 
LSTM model loss and accuracy: [0.10526051372289658, 0.9775784611701965] 
Bi-LSTM model loss and accuracy: [0.07731813937425613, 0.9820627570152283] 
GRU model loss and accuracy: [0.09390531480312347, 0.9721972942352295]


In [87]:
predict_msg = ["Have friends and colleagues who could benefit from these weekly updates? Send them to this link to subscribe",
               "Call me"]
def predict_spam(predict_msg):
  new_seq = tokenizer.texts_to_sequences(predict_msg)
  padded = pad_sequences(new_seq,
                         maxlen = max_len,
                         padding = padding_type,
                         truncating = trunc_type)
  return(model3.predict(padded))
predict_spam(predict_msg)





array([[0.9475514 ],
       [0.00902579]], dtype=float32)

In [88]:
predict_msg = ["Have friends and colleagues who could benefit from these weekly updates? Send them to this link to subscribe",
               "Call me"]
def predict_spam(predict_msg):
  new_seq = tokenizer.texts_to_sequences(predict_msg)
  padded = pad_sequences(new_seq,
                         maxlen = max_len,
                         padding = padding_type,
                         truncating = trunc_type)
  return(model2.predict(padded))
predict_spam(predict_msg)





array([[0.14027189],
       [0.00414612]], dtype=float32)

In [89]:
predict_msg = ["Have friends and colleagues who could benefit from these weekly updates? Send them to this link to subscribe",
               "Call me"]
def predict_spam(predict_msg):
  new_seq = tokenizer.texts_to_sequences(predict_msg)
  padded = pad_sequences(new_seq,
                         maxlen = max_len,
                         padding = padding_type,
                         truncating = trunc_type)
  return(model1.predict(padded))
predict_spam(predict_msg)



array([[0.11663651],
       [0.00160044]], dtype=float32)

In [91]:
predict_msg = ["Have friends and colleagues who could benefit from these weekly updates? Send them to this link to subscribe",
               "Call me"]
def predict_spam(predict_msg):
  new_seq = tokenizer.texts_to_sequences(predict_msg)
  padded = pad_sequences(new_seq,
                         maxlen = max_len,
                         padding = padding_type,
                         truncating = trunc_type)
  return(model.predict(padded))
predict_spam(predict_msg)



array([[0.24758899],
       [0.00672089]], dtype=float32)