# SPAM DETECTION USING GRU



In [1]:
!pip install matplotlib



In [2]:
from keras.layers import SimpleRNN,GRU, Embedding, Dense, Flatten
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.utils import plot_model

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

Using TensorFlow backend.


## Read data

In [81]:
data = pd.read_csv("./spams.csv")
data1 = pd.read_csv("./spam_text_mails_data.csv")
print(data.tail())
print(data1.head())
print(data1.tail())


                                                   Mail Category
5724   4 color printing special  request additional ...     spam
5725  naturally irresistible your corporate identity...     spam
5726  you got this mail that you have received a sch...      ham
5727  the stock trading gunslinger  fanny is merrill...     spam
5728  unbelievable new homes made easy  im wanting t...     spam
  Category                                               Mail
0     spam  You got this mail because you have won 1lakh d...
1      ham  Go until jurong point, crazy.. Available only ...
2      ham                      Ok lar... Joking wif u oni...
3     spam  Free entry in 2 a wkly comp to win FA Cup fina...
4      ham  U dun say so early hor... U c already then say...
     Category                                               Mail
5569     spam  This is the 2nd time we have tried 2 contact u...
5570      ham               Will ü b going to esplanade fr home?
5571      ham  Pity, * was in mood for that

In [50]:
#SEPARATING MAILS AND LABELS(HAM OR SPAM)
mails = []
labels = []
for index, row in data.iterrows():
    mails.append(row['Mail'])
    if row['Category'] == 'ham':
        labels.append(0)
    else:
        labels.append(1)
for index, row in data1.iterrows():
    mails.append(row['Mail'])
    if row['Category'] == 'ham':
        labels.append(0)
    else:
        labels.append(1)
mails = np.asarray(mails)
labels = np.asarray(labels)
   

print("Number of mails: ", len(mails))
print("Number of labels: ", len(labels))


Number of mails:  11303
Number of labels:  11303


## TOKENIZATION


In [70]:
max_vocab = 10000
max_len = 500

# Ignore all words except the 10000 most common words
tokenizer = Tokenizer(num_words=max_vocab)
# Calculate the frequency of words
tokenizer.fit_on_texts(mails)
# Convert array of mails to list of sequences of integers
sequences = tokenizer.texts_to_sequences(mails)

# Dict keeping track of words to integer index
word_index = tokenizer.word_index

## PADDING

In [71]:
# Convert the array of sequences(of integers) to 2D array with padding
# maxlen specifies the maximum length of sequence (truncated if longer, padded if shorter)
data = pad_sequences(sequences, maxlen=max_len)

print("data shape: ", data.shape)


data shape:  (11303, 500)


## SEPARATING THE DATA FOR TRAINING AND TESTING

In [72]:
# We will use 80% of data for training & validation(80% train, 20% validation) and 20% for testing
train_samples = int(len(mails)*0.8)

mails_train = data[:train_samples]
labels_train = labels[:train_samples]

mails_test = data[train_samples:len(mails)-2]
labels_test = labels[train_samples:len(mails)-2]

## CREATING GRU MODEL

In [73]:
embedding_mat_columns=32
# Construct the SimpleRNN model
model = Sequential()
    ## Add embedding layer to convert integer encoding to word embeddings(the model learns the
    ## embedding matrix during training), embedding matrix has max_vocab as no. of rows and chosen
    ## no. of columns
model.add(Embedding(input_dim=max_vocab, output_dim=embedding_mat_columns, input_length=max_len))


model.add(GRU(units=embedding_mat_columns))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
gru_4 (GRU)                  (None, 32)                6240      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 326,273
Trainable params: 326,273
Non-trainable params: 0
_________________________________________________________________


## TRAINING AND TESTING THE MODEL

In [78]:
 #plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

    # Training the model
model.fit(mails_train, labels_train, epochs=5, batch_size=60, validation_split=0.31)

    # Testing the model
pred = model.predict_classes(mails_test)
acc = model.evaluate(mails_test, labels_test)
print("Test loss is {0:.2f} accuracy is {1:.2f}  ".format(acc[0],acc[1]))

Train on 6238 samples, validate on 2804 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss is 0.23 accuracy is 0.93  


## TESTING WITH CUSTOM MESSAGE

In [79]:
# Constructing a custom mail to check model
custom_mail = 'Congratulations you have won a prize of million dollars and world tour'

custom_mail = custom_mail.lower().split(' ')
test_seq = np.array([word_index[word] for word in custom_mail])

test_seq = np.pad(test_seq, (500-len(test_seq), 0), 'constant', constant_values=(0))
test_seq = test_seq.reshape(1, 500)
        

pred = model.predict_classes(test_seq)
print(pred)

[[1]]


In [80]:
# Constructing a custom mail to check model
custom_mai = 'Congratulations brother for your results in education'
custom_mai = custom_mai.lower().split(' ')
test_seq = np.array([word_index[word] for word in custom_mai])

test_seq = np.pad(test_seq, (500-len(test_seq), 0), 'constant', constant_values=(0))
test_seq = test_seq.reshape(1, 500)
        

pred = model.predict_classes(test_seq)
print(pred)

[[0]]
