In [5]:
#@author: Praveen Dominic
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
# Fetch only the NLTK resources this notebook uses instead of the whole
# "all" collection: tokenizer models for nltk.word_tokenize, the stop-word
# lists, and the WordNet data behind WordNetLemmatizer.
# "punkt_tab" / "omw-1.4" are requested as well because some NLTK releases
# look for them; an id unknown to the installed release is reported by the
# downloader rather than raising.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_resource, quiet=True)

In [12]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named here: the class label and the raw message text.
# NOTE(review): with pandas' default quoting, stray double quotes inside
# messages may cause neighbouring lines to be merged — consider checking the
# row count against the file's line count (quoting=csv.QUOTE_NONE) — confirm.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"],
)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Binarise the label column: one-hot encode it and keep the LAST indicator
# column, which for the ham/spam labels is the "spam" indicator (spam -> 1,
# ham -> 0, as the preview below shows).
# dtype=int is passed explicitly because newer pandas releases default the
# indicator columns to bool, which would turn the labels into True/False.
# Re-running this cell on the already-encoded 0/1 column yields the same
# values, so the cell stays safe to execute more than once.
df.label = pd.get_dummies(df.label, dtype=int).iloc[:, -1].values
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Pull the raw message texts out of the frame as a plain Python list so they
# can be cleaned element by element in the next cell.
sentences = df["text"].tolist()
sentences[0:3]

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

In [21]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Loop-invariant helpers, built once: the English stop-word list as a set
# (constant-time membership tests) and a single lemmatizer instance.
# Previously both were re-created for every word of every message.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

for i in range(len(sentences)):
    # Replace every non-letter character with a space and lower-case the text.
    cleaned = re.sub("[^a-zA-Z]", " ", sentences[i]).lower()
    # Tokenise, drop stop words, and lemmatise what remains.
    words = [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(cleaned)
        if word not in stop_words
    ]
    # Store the cleaned message back as a single space-separated string.
    sentences[i] = " ".join(words)

sentences[:3]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply']

In [42]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Conv1D, MaxPool1D, GlobalMaxPooling1D, Activation, Embedding

In [49]:
# Fixed sequence length every message is padded to.
maxlen = 250

# Fit the word -> integer-index vocabulary on the cleaned messages, then
# convert each message into its list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
seq = tokenizer.texts_to_sequences(sentences)

# padding='post' puts the filler zeros after the word indices, as the array
# preview below shows.
X = pad_sequences(seq, maxlen=maxlen, padding='post')
X


array([[   6, 3568,  279, ...,    0,    0,    0],
       [   8,  246, 1255, ...,    0,    0,    0],
       [  10,  351,  628, ...,    0,    0,    0],
       ...,
       [7115, 1100, 3565, ...,    0,    0,    0],
       [ 134, 7116, 3523, ...,    0,    0,    0],
       [2201,  332,  156, ...,    0,    0,    0]])

In [79]:
# Size of each learned word vector.
word_vec_dim = 300
# One more than the number of known words: the padded sequences use 0 as
# filler, so the embedding table needs a row for it as well.
vocab_size = len(tokenizer.word_index) + 1

# Layer stack: embedding -> 1-D convolution + pooling -> dense layer applied
# along the sequence -> global max over the sequence -> single sigmoid unit
# giving the spam probability.
LAYERS = [
    Embedding(input_dim=vocab_size, output_dim=word_vec_dim, input_length=maxlen),
    Conv1D(filters=32, kernel_size=8, activation="relu"),
    MaxPool1D(pool_size=2),
    Dense(units=16, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(units=1, activation="sigmoid"),
]

model = tf.keras.models.Sequential(LAYERS)

In [82]:
# Binary classification setup: Adam optimiser with a 1e-3 learning rate,
# binary cross-entropy loss, and accuracy reported during training.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
              

In [83]:
from sklearn.model_selection import train_test_split

# Plain-list copy of the labels. Not used by the split below; kept so any
# later cell that refers to `y` keeps working.
y = df.label.tolist()

# Hold out 20% of the messages.
# random_state makes the split (and therefore the reported metrics)
# reproducible across kernel restarts; stratify keeps the spam/ham ratio the
# same in both partitions, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [84]:
# Train for 5 epochs in mini-batches of 32, evaluating on the held-out split
# after each epoch; verbose=2 prints one summary line per epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    verbose=2,
    validation_data=(X_test, y_test),
)

140/140 - 24s - loss: 0.2327 - accuracy: 0.9186 - val_loss: 0.0467 - val_accuracy: 0.9874 - 24s/epoch - 169ms/step


<keras.callbacks.History at 0x1e8c7f14490>

In [98]:
test_sent = ["How are you"]

def encode_sent(sent):
    """Convert raw message text into the padded index matrix the model takes.

    The messages go through the same steps as the training data: non-letters
    are replaced by spaces, text is lower-cased, tokenised, stripped of
    English stop words and lemmatised. They are then mapped to integers with
    the tokenizer that was fitted on the training corpus, so a word gets the
    same index here as it had during training. (Fitting a fresh tokenizer on
    the input, as was done before, assigns unrelated indices.)

    Args:
        sent: one message string, or a list of message strings.

    Returns:
        Integer array with one row per message, post-padded to `maxlen`
        columns. Words the fitted tokenizer has never seen are left out of
        the sequence, since no out-of-vocabulary token was configured.
    """
    # Accept a bare string as a convenience; the original call style (a list)
    # keeps working unchanged.
    if isinstance(sent, str):
        sent = [sent]

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for text in sent:
        tokens = nltk.word_tokenize(re.sub("[^a-zA-Z]", " ", text).lower())
        kept = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
        cleaned.append(" ".join(kept))

    # Reuse the training-time vocabulary and sequence length.
    encoded = tokenizer.texts_to_sequences(cleaned)
    padded_seq = pad_sequences(encoded, padding='post', maxlen=maxlen)

    return padded_seq

# Threshold the predicted spam probability once; `pred` is a boolean array
# with one row per input message.
pred = model.predict(encode_sent(test_sent)) > 0.5

# Report the verdict for the (single) test message.
if pred[0, 0]:
    print("Spam")
else:
    print("Not Spam")

Not Spam
