In [1]:
import tensorflow as tf
from tensorflow import keras

In [58]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [61]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [19]:
import numpy as np
import pandas as pd
import re

In [187]:
np.random.seed(0)

In [225]:
dataset = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [226]:
dataset = dataset[['v2','v1']]

In [227]:
dataset.columns = ['features', 'target']

In [26]:
from nltk.corpus import stopwords

In [27]:
stp = stopwords.words('english')

In [28]:
dataset['features'] = dataset['features'].apply(lambda x: x.lower())
dataset['features'] = dataset['features'].apply(lambda x: ' '.join([word for word in x.split() if word not in stp]))
dataset['features'] = dataset['features'].apply(lambda x: re.sub(r'[^\w\s]+',' ', x))
dataset['features'] = dataset['features'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]+',' ', x))

In [33]:
X_train, X_test, y_train, y_test = train_test_split(dataset['features'],dataset['target'],test_size=0.2)

In [39]:
len(set(' '.join(dataset['features'].tolist()).split()))

8638

In [44]:
l = [sent.split() for sent in dataset['features'].tolist()]

In [45]:
d = {k:len(v) for k,v in enumerate(l)}

In [52]:
np.argmax(list(d.values()))

1084

In [53]:
d[1084]

97

In [54]:
MAX_WORDS = 8638
MAX_SEQUENCE_LENGTH = 100
OUTPUT_DIM = 50

In [55]:
tokenizer = Tokenizer(num_words=MAX_WORDS)

In [57]:
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [70]:
X_train = pad_sequences(X_train,maxlen=MAX_SEQUENCE_LENGTH,padding='post')
X_test = pad_sequences(X_test,maxlen=MAX_SEQUENCE_LENGTH,padding='post')

In [62]:
lbl_enc = LabelEncoder()
y_train = lbl_enc.fit_transform(y_train)
y_test = lbl_enc.transform(y_test)

In [63]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [64]:
# Model Building

In [188]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [194]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_WORDS, output_dim=OUTPUT_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.Flatten(),
    keras.layers.Dense(units=2, activation='softmax')
])

In [195]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [196]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4457, 100), (4457, 2), (1115, 100), (1115, 2))

In [197]:
history = model.fit(X_train,y_train, batch_size=15, epochs=5, validation_data=(X_test,y_test), callbacks=[callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [198]:
model.evaluate(X_test,y_test)



[0.05046442896127701, 0.9874439239501953]

In [174]:
lbl_enc.classes_

array(['ham', 'spam'], dtype=object)

In [175]:
X_test[0]

array([ 354, 7320, 2386,  442, 2271, 1643,   17,    1,  294,   15,    1,
        469,   32,   17,   17,  469,   32,  486,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [176]:
y_pred = model.predict(X_test)[0]

In [177]:
y_pred.round()

array([1., 0.], dtype=float32)

In [178]:
np.argmax(y_pred)

0

In [179]:
mapping = {0:'ham',1:'spam'}

In [180]:
mapping[np.argmax(y_pred)]

'ham'

In [201]:
# saving and loading model

In [200]:
model.save('model.h5')

In [204]:
from tensorflow.keras.models import load_model

In [205]:
k = load_model('model.h5')

In [208]:
k.predict(X_test)[0].round()

array([1., 0.], dtype=float32)

In [210]:
from tensorflow.keras.models 

ImportError: cannot import name 'plot_model' from 'tensorflow.keras.models' (C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\keras\models\__init__.py)

In [214]:
class Dummy:
    def __init__(self):
        self.a = 10
        #self.result = self.A_plus_10()
        
    def A_plus_10(self):
        self.a = self.a + 10
    
    def p(self):
        return self.a

In [215]:
d = Dummy()

In [216]:
d.p()

10

In [221]:
tokenizer.sequences_to_texts(X_test[:50])

['look amy ure beautiful intelligent woman like u lot know u don t like like don t worry',
 'wishing family merry x mas happy new year advance',
 'ur cash balance currently 500 pounds maximize ur cash in send collect 83600 150p msg cc 08718720201 po box 114 14 tcr w1',
 'you deep sigh fucking love much barely stand',
 'lol no need cash nitros hurry come out',
 'disturb u anymore jia you',
 'hey what s charles sorry late reply',
 'then',
 'wait 4 sch finish ard 5',
 'found way get another app phone eh go net cafe take job geeee need babe crave see',
 'alex knows guy sells mids he s south tampa think could set like 8',
 'message some text missing sender name missing number missing sent date missing missing u lot thats everything missing sent via fullonsms com',
 'todays voda numbers ending selected receive match please call 08712300220 quoting claim code 3100 standard rates app',
 'huh got lesson 4 lei n thinkin going sch earlier n tot kent vale',
 'god bless get good sleep dear i pray',

In [237]:
model.predict(X_test)[:50].round()

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [248]:
dataset[dataset['target'] == 'spam']['features'].tolist()

["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv",
 'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
 'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030',
 'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info',
 'URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18',
 'XXXMobileMovieClub: To use your credit, click the WAP link in

In [246]:
dataset[dataset['target'] == 'ham'].count()

features    4825
target      4825
dtype: int64

In [259]:
dataset[dataset['target'] == 'spam'].sample(4825, replace=True)

Unnamed: 0,features,target
5066,83039 62735=å£450 UK Break AccommodationVouche...,spam
4147,Please call Amanda with regard to renewing or ...,spam
4656,PRIVATE! Your 2003 Account Statement for shows...,spam
1306,Enjoy the jamster videosound gold club with yo...,spam
5228,PRIVATE! Your 2003 Account Statement for <fone...,spam
...,...,...
4584,U have a Secret Admirer who is looking 2 make ...,spam
1780,BIG BROTHER ALERT! The computer has selected u...,spam
1886,Dear 0776xxxxxxx U've been invited to XCHAT. T...,spam
2099,"SMS SERVICES. for your inclusive text credits,...",spam


In [260]:
dataset = pd.concat([dataset[dataset['target'] == 'ham'],dataset[dataset['target'] == 'spam'].sample(4825, replace=True)])

In [261]:
dataset

Unnamed: 0,features,target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
6,Even my brother is not like to speak with me. ...,ham
...,...,...
4861,**FREE MESSAGE**Thanks for using the Auction S...,spam
946,Ur cash-balance is currently 500 pounds - to m...,spam
899,Your free ringtone is waiting to be collected....,spam
158,Customer service annoncement. You have a New Y...,spam


In [263]:
dataset[dataset['target'] == 'ham'].shape == dataset[dataset['target'] == 'spam'].shape

True

In [265]:
from sklearn.utils import shuffle

In [266]:
dataset = shuffle(dataset)

Unnamed: 0,features,target
260,Yup,ham
1459,Bought one ringtone and now getting texts cost...,spam
5485,Also fuck you and your family for going to rho...,ham
1406,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD...",spam
1373,"Bears Pic Nick, and Tom, Pete and ... Dick. In...",spam
...,...,...
478,"K, can I pick up another 8th when you're done?",ham
126,"Just so that you know,yetunde hasn't sent mone...",ham
3877,What you need. You have a person to give na.,ham
5205,Had your mobile 11mths ? Update for FREE to Or...,spam


In [269]:
dataset['features'] = dataset['features'].apply(lambda x: x.lower())
dataset['features'] = dataset['features'].apply(lambda x: ' '.join([word for word in x.split() if word not in stp]))
dataset['features'] = dataset['features'].apply(lambda x: re.sub(r'[^\w\s]+',' ', x))
dataset['features'] = dataset['features'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]+',' ', x))

X_train, X_test, y_train, y_test = train_test_split(dataset['features'],dataset['target'],test_size=0.2)

In [270]:
MAX_WORDS = 8638
MAX_SEQUENCE_LENGTH = 100
OUTPUT_DIM = 50

tokenizer = Tokenizer(num_words=MAX_WORDS)

tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train,maxlen=MAX_SEQUENCE_LENGTH,padding='post')
X_test = pad_sequences(X_test,maxlen=MAX_SEQUENCE_LENGTH,padding='post')

lbl_enc = LabelEncoder()
y_train = lbl_enc.fit_transform(y_train)
y_test = lbl_enc.transform(y_test)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [271]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_WORDS, output_dim=OUTPUT_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.Flatten(),
    keras.layers.Dense(units=2, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

X_train.shape, y_train.shape, X_test.shape, y_test.shape

history = model.fit(X_train,y_train, batch_size=15, epochs=5, validation_data=(X_test,y_test), callbacks=[callback])

model.evaluate(X_test,y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.001861187512986362, 0.9994818568229675]

In [274]:
y_pred = model.predict(X_test)

In [275]:
y_pred.round()

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [276]:
y_test

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [283]:
model.predict(X_test)[2:3].round()

array([[0., 1.]], dtype=float32)

In [284]:
tokenizer.sequences_to_texts(X_test[2:3])

['hungry gay guys feeling hungry 4 it now call 08718730555 10p min stop texts call 08712460324 10p min']

In [285]:
label_mapping = {'ham': 0, 'spam': 1}
inverse_mapping = {0: 'ham', 1: 'spam'}

In [302]:
def predict_label(sentence):
    # cleaning sentence
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w\s]+', ' ', sentence)
    sentence = re.sub(r'[^a-zA-Z0-9]', ' ', sentence)
    seq = tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    prediction = model.predict(seq).round()
    prediction = np.argmax(prediction)
    return inverse_mapping[prediction]

In [303]:
predict_label('hungry gay guys feeling hungry 4 it now call 08718730555 10p min stop texts call 08712460324 10p min')

'spam'

In [304]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 100, 50)           431900    
_________________________________________________________________
flatten_5 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 2)                 10002     
Total params: 441,902
Trainable params: 441,902
Non-trainable params: 0
_________________________________________________________________


In [None]:
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_12 (Embedding)     (None, 100, 50)           431900    
_________________________________________________________________
flatten_5 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 2)                 10002     
=================================================================
Total params: 441,902
Trainable params: 441,902
Non-trainable params: 0
_________________________________________________________________