In [277]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import pickle
pd.options.mode.chained_assignment = None

In [278]:
from keras.models import load_model
model = load_model('model.h5')

In [279]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [280]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 64)           1280000   
                                                                 
 dropout (Dropout)           (None, 150, 64)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 150, 64)          25088     
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 150, 64)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 150, 128)         49920     
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 150, 128)          0

In [281]:
benchmark = pd.read_csv('SpamAssassin.csv')
benchmark

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [282]:
benchmark.rename(columns={'email': 'Message', 'label': 'Label'}, inplace=True)
benchmark.dropna(inplace=True)
benchmark.reset_index(drop=True, inplace=True)

In [283]:
spam = benchmark[benchmark['Label'] == 1]
ham = benchmark[benchmark['Label'] == 0]
ham_500 = ham.iloc[0:500, :]
bench = pd.concat([spam, ham_500], axis=0)

In [284]:
benchmark_message = benchmark.Message.astype(str)

seq = tokenizer.texts_to_sequences(benchmark_message)
pad = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=150, padding='post', truncating='post')

prediction = model.predict(pad)

benchmark['Prediction'] = pd.DataFrame(prediction)
benchmark.loc[(benchmark['Prediction'] > 0.5), 'Classification'] = 'spam'
benchmark.fillna('not spam', inplace=True)

In [285]:
benchmark

Unnamed: 0,Message,Label,Prediction,Classification
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,0.005452,not spam
1,martin a posted tassos papadopoulos the greek ...,0,0.009881,not spam
2,man threatens explosion in moscow thursday aug...,0,0.011408,not spam
3,klez the virus that won t die already the most...,0,0.009555,not spam
4,in adding cream to spaghetti carbonara which ...,0,0.018249,not spam
...,...,...,...,...
2994,abc s good morning america ranks it the NUMBE...,1,0.995949,spam
2995,hyperlink hyperlink hyperlink let mortgage le...,1,0.996177,spam
2996,thank you for shopping with us gifts for all ...,1,0.996081,spam
2997,the famous ebay marketing e course learn to s...,1,0.996218,spam


In [286]:
benchmark.to_csv('Benchmark Results_Combined.csv')

In [287]:
benchmark['Label'].value_counts()

0    2500
1     499
Name: Label, dtype: int64

In [288]:
benchmark['Classification'].value_counts()

not spam    2413
spam         586
Name: Classification, dtype: int64

In [289]:
spam_message = spam.Message.astype(str)

seq1 = tokenizer.texts_to_sequences(spam_message)
pad1 = tf.keras.preprocessing.sequence.pad_sequences(seq1, maxlen=150, padding='post', truncating='post')

prediction = model.predict(pad1)

spam['Prediction'] = pd.DataFrame(prediction)
spam.loc[(spam['Prediction'] > 0.5), 'Classification'] = 'spam'
spam.fillna('not spam', inplace=True)

In [290]:
spam

Unnamed: 0,Message,Label,Prediction,Classification
2500,save up to NUMBER on life insurance why spend...,1,not spam,not spam
2501,NUMBER fight the risk of cancer URL NUMBER sli...,1,not spam,not spam
2502,NUMBER fight the risk of cancer URL NUMBER sli...,1,not spam,not spam
2503,adult club offers free membership instant acc...,1,not spam,not spam
2504,i thought you might like these NUMBER slim dow...,1,not spam,not spam
...,...,...,...,...
2994,abc s good morning america ranks it the NUMBE...,1,not spam,not spam
2995,hyperlink hyperlink hyperlink let mortgage le...,1,not spam,not spam
2996,thank you for shopping with us gifts for all ...,1,not spam,not spam
2997,the famous ebay marketing e course learn to s...,1,not spam,not spam


In [291]:
spam.to_csv('Benchmark Results_Spam.csv')

In [292]:
spam['Label'].value_counts()

1    499
Name: Label, dtype: int64

In [293]:
spam['Classification'].value_counts()

not spam    499
Name: Classification, dtype: int64

In [294]:
ham_message = ham.Message.astype(str)

seq2 = tokenizer.texts_to_sequences(ham_message)
pad2 = tf.keras.preprocessing.sequence.pad_sequences(seq2, maxlen=150, padding='post', truncating='post')

prediction = model.predict(pad2)

ham['Prediction'] = pd.DataFrame(prediction)
ham.loc[(ham['Prediction'] > 0.5), 'Classification'] = 'spam'
ham.fillna('not spam', inplace=True)

In [295]:
ham

Unnamed: 0,Message,Label,Prediction,Classification
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,0.005452,not spam
1,martin a posted tassos papadopoulos the greek ...,0,0.009881,not spam
2,man threatens explosion in moscow thursday aug...,0,0.011408,not spam
3,klez the virus that won t die already the most...,0,0.009555,not spam
4,in adding cream to spaghetti carbonara which ...,0,0.018249,not spam
...,...,...,...,...
2495,man killed trying to surf on tube train ananov...,0,0.006994,not spam
2496,hi gianni a very good resource for this is URL...,0,0.005793,not spam
2497,gianni ponzi wrote i have a prob when trying t...,0,0.004669,not spam
2498,neale pickett neale woozle org writes skip mon...,0,0.004650,not spam


In [296]:
ham.to_csv('Benchmark Results_Ham.csv')

In [297]:
ham['Label'].value_counts()

0    2500
Name: Label, dtype: int64

In [298]:
ham['Classification'].value_counts()

not spam    2394
spam         106
Name: Classification, dtype: int64

In [299]:
bench_message = bench.Message.astype(str)

seq3 = tokenizer.texts_to_sequences(bench_message)
pad3 = tf.keras.preprocessing.sequence.pad_sequences(seq3, maxlen=150, padding='post', truncating='post')

prediction = model.predict(pad3)

bench['Prediction'] = pd.DataFrame(prediction)
bench.loc[(bench['Prediction'] > 0.5), 'Classification'] = 'spam'
bench.fillna('not spam', inplace=True)

In [300]:
bench

Unnamed: 0,Message,Label,Prediction,Classification
2500,save up to NUMBER on life insurance why spend...,1,not spam,not spam
2501,NUMBER fight the risk of cancer URL NUMBER sli...,1,not spam,not spam
2502,NUMBER fight the risk of cancer URL NUMBER sli...,1,not spam,not spam
2503,adult club offers free membership instant acc...,1,not spam,not spam
2504,i thought you might like these NUMBER slim dow...,1,not spam,not spam
...,...,...,...,...
495,on NUMBER NUMBER NUMBER NUMBER NUMBER am gary ...,0,0.996177,spam
496,robert harley gordon mohr wrote definitional n...,0,0.996081,spam
497,on sun NUMBER sep NUMBER gordon mohr wrote ok ...,0,0.996218,spam
498,j james rogers jamesr best com writes j an ex...,0,0.996104,spam


In [301]:
bench.to_csv('Benchmark Results_Balanced.csv')

In [302]:
bench['Label'].value_counts()

0    500
1    499
Name: Label, dtype: int64

In [303]:
bench['Classification'].value_counts()

not spam    519
spam        480
Name: Classification, dtype: int64