In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# NN 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout,Embedding, Flatten

# preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

# Location of the SMS spam dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/spamspam/spam.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
# Peek at the first rows to check the column names and the label values.
df.head()

Unnamed: 0,Category,Message
0,not spam,"Go until jurong point, crazy.. Available only ..."
1,not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,not spam,U dun say so early hor... U c already then say...
4,not spam,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Inputs are the raw message texts; the target is the category label column.
x, y = df["Message"], df["Category"]

In [4]:
# Encode the string categories as integers (in the outputs shown here,
# "not spam" becomes 0 and "spam" becomes 1).
# The encoder is fitted on the DataFrame column rather than on `y` itself so
# that re-running this cell never re-fits it on already-encoded integers,
# which would make le.inverse_transform return numbers instead of labels.
le = LabelEncoder()
y = le.fit_transform(df["Category"])
y

array([0, 0, 1, ..., 0, 0, 0])

In [5]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=1
)

In [6]:
# Build the word index from the training messages only (xtest is not used here).
tok = Tokenizer()
tok.fit_on_texts(xtrain)

In [7]:
# index -> word lookup produced by the fitted tokenizer
vocabulary = tok.index_word
# number of distinct tokens found in the training messages
vocab_length = len(vocabulary)
vocab_length

7382

In [8]:
# Convert every training message into a list of integer token ids
train_sequence = tok.texts_to_sequences(xtrain)

In [9]:
# Number of tokens in each training document
doc_length = [len(tokens) for tokens in train_sequence]

In [10]:
# Longest training document, measured in tokens
max(doc_length)

189

In [11]:
# 90% quantile:
# 90% of the training documents have a length of at most 30 tokens
np.quantile(doc_length, 0.9)

30.0

In [12]:
# 99% quantile:
# 99% of the training documents have a length of at most about 51 tokens
np.quantile(doc_length, 0.99)

51.00999999999976

In [13]:
# Common sequence length used for padding: the 99% quantile of the training
# document lengths, rounded down (51 for this split). It is computed from
# doc_length instead of being typed in by hand so that it stays in sync if
# the dataset or the train/test split changes.
max_length = int(np.quantile(doc_length, 0.99))

In [14]:
# Padding: bring every training sequence to exactly max_length token ids.
# Shorter sequences are filled with 0 (the output below shows the zeros at the front).
train_matrix = sequence.pad_sequences(train_sequence,maxlen=max_length)
train_matrix

array([[   0,    0,    0, ...,  111,  500, 1013],
       [   0,    0,    0, ...,   74,   13, 3494],
       [   0,    0,    0, ...,   52,   39,  850],
       ...,
       [   0,    0,    0, ...,  121,  741, 7381],
       [   0,    0,    0, ..., 1790, 7382, 1919],
       [   0,    0,    0, ...,  267,   31,   10]], dtype=int32)

In [15]:
# Testing data: reuse the tokenizer fitted on the training messages and the
# same max_length, so the test inputs have the same form as the training inputs.
test_sequence = tok.texts_to_sequences(xtest)
test_matrix = sequence.pad_sequences(test_sequence,maxlen=max_length)
test_matrix

array([[   0,    0,    0, ...,   72,    5,  719],
       [   0,    0,    0, ...,  142,   10, 1592],
       [   0,    0,    0, ..., 5282, 2962,   69],
       ...,
       [   0,    0,    0, ...,    0,  205, 1753],
       [   0,    0,    0, ...,  171,   12,    5],
       [   0,    0,    0, ...,   78,   16,   90]], dtype=int32)

In [16]:
# model
# Embedding -> Flatten -> two hidden Dense layers -> single sigmoid output.
# mask_zero is deliberately not set on the Embedding layer: the Flatten and
# Dense layers that follow do not make use of a padding mask, so padded
# positions are fed to the network like any other token.
model = Sequential()
model.add(Embedding(input_dim=vocab_length+1, # vocabulary size, +1 because id 0 is the padding value
                    input_length=max_length,  # number of token ids per padded document
                    output_dim=100))          # hyperparameter -> vector length of each token

model.add(Flatten()) # (max_length, 100) per document -> one flat vector
model.add(Dense(32,activation="relu"))
model.add(Dense(16,activation="relu"))
model.add(Dense(1,activation="sigmoid")) # one probability, because it is a binary classification problem

In [17]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(loss="binary_crossentropy", optimizer="adam")
# 20 passes over the padded training matrix in mini-batches of 32.
model.fit(x=train_matrix, y=ytrain, batch_size=32, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f36166f2250>

In [18]:
# prediction
# Predicted probabilities for the held-out messages
test_probs = model.predict(test_matrix)
# Label 1 when the probability reaches 0.5, otherwise label 0
ypred = (test_probs >= 0.5).astype(int)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1442
           1       0.99      0.92      0.95       230

    accuracy                           0.99      1672
   macro avg       0.99      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [19]:
# Two hand-written messages for a quick manual check of the trained model:
# an ordinary chat message and a prize/link message.
sms1 = "Hey how are you ? let's catchup"
sms2 = "FREE FREE, claim your prize worth $20000 and click on the following link http:fake.com"

In [20]:
def predict_sms(sms, threshold=0.5):
  """Classify a single SMS text and return its category label.

  The text is tokenized with the fitted tokenizer and padded to max_length,
  exactly like the training data, before the model scores it.

  Parameters
  ----------
  sms : str
      Raw message text.
  threshold : float, default 0.5
      Minimum predicted probability for the message to receive the label
      encoded as 1; anything below gets the label encoded as 0.

  Returns
  -------
  The category label decoded by the fitted LabelEncoder.
  """
  data_seq = tok.texts_to_sequences([sms])
  data_matrix = sequence.pad_sequences(data_seq,maxlen=max_length)
  prob = model.predict(data_matrix)
  # Flatten the (1, 1) prediction so the encoder receives a 1-D array of labels.
  label = (prob >= threshold).astype(int).ravel()
  return le.inverse_transform(label)[0]

In [21]:
# Manual check on the ordinary chat message
predict_sms(sms1)



'not spam'

In [22]:
# Manual check on the prize/link message
predict_sms(sms2)



'spam'