In [24]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [3]:
file_content = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [4]:
dataset = file_content[['v2','v1']]

In [5]:
dataset.columns = ['features', 'target']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(dataset['features'],dataset['target'],test_size=0.2)

In [8]:
MAX_SEQUENCE_LENGTH = 300

In [9]:
MAX_NB_WORDS = 20000

In [10]:
EMB_DIM = 100

In [11]:
# Tokenize for word Embeddings

In [12]:
import tensorflow as tf
from tensorflow import keras

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [14]:
# Tokenize features

In [15]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

In [16]:
tokenizer.fit_on_texts(X_train)

In [17]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [18]:
# Label encode targets and convert to categorical

In [19]:
lbl = LabelEncoder()
lbl.fit(y_train)

LabelEncoder()

In [20]:
y_train.shape

(4457,)

In [21]:
y_train = lbl.transform(y_train)
y_test = lbl.transform(y_test)

In [22]:
y_train.shape

(4457,)

In [25]:
y_train = to_categorical(np.asarray(y_train))

In [26]:
y_train.shape

(4457, 2)

In [27]:
y_test = to_categorical(np.asarray(y_test))

In [28]:
# Padding

In [29]:
X_train = pad_sequences(X_train,maxlen=MAX_SEQUENCE_LENGTH)

In [30]:
X_test = pad_sequences(X_test,maxlen=MAX_SEQUENCE_LENGTH)

In [31]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457, 300), (1115, 300), (4457, 2), (1115, 2))

# Model Building

## 1. CNN

In [32]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_NB_WORDS, output_dim=EMB_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.Dropout(0.5),
    keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    keras.layers.MaxPooling1D(5),
    keras.layers.Dropout(0.5),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    keras.layers.MaxPooling1D(5),
    keras.layers.Dropout(0.5),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=2, activation='softmax')
])

In [33]:
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

In [34]:
history = model.fit(X_train, y_train, batch_size=64 ,epochs=5, validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
y_pred = model.predict(X_test)

In [36]:
print(classification_report(y_test,y_pred.round()))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       967
           1       1.00      0.74      0.85       148

   micro avg       0.97      0.97      0.97      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.97      0.96      1115
 samples avg       0.97      0.97      0.97      1115



## 2. RNN

In [38]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_NB_WORDS, output_dim=EMB_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.SimpleRNN(units=2, input_shape=(-1,1)),
    keras.layers.Dense(units=2,activation='softmax')
])

In [39]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
y_pred = model.predict(X_test)

In [42]:
print(classification_report(y_test,y_pred.round()))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       967
           1       0.84      0.57      0.68       148

   micro avg       0.93      0.93      0.93      1115
   macro avg       0.89      0.78      0.82      1115
weighted avg       0.93      0.93      0.92      1115
 samples avg       0.93      0.93      0.93      1115



## 3. LSTM

In [48]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_NB_WORDS, output_dim=EMB_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.LSTM(units=2, activation='relu', return_sequences=True),
    keras.layers.Flatten(),
    keras.layers.Dense(units=2,activation='softmax')
])

In [49]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [50]:
history = model.fit(X_train,y_train, batch_size=16, epochs=5, validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [51]:
y_pred = model.predict(X_test)

In [52]:
print(classification_report(y_test,y_pred.round()))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       967
           1       0.99      0.89      0.94       148

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115
 samples avg       0.98      0.98      0.98      1115

