# Import Modules

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



# Report the installed TensorFlow version so results can be tied to a release
print("Tensorflow Version",tf.__version__)

# Data Collection

In [None]:
# NOTE(review): the Kaggle "SMS Spam Collection" spam.csv is commonly reported
# to be Latin-1 rather than UTF-8; with the default codec read_csv raises
# UnicodeDecodeError on such files. Latin-1 decodes every byte, so the load
# cannot fail on encoding. Confirm against the actual file.
data = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)
data.head()

In [None]:
# Discard the three unnamed columns; the cells below only read v1 and v2.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = data.drop(columns=unused_columns)
data.head()

In [None]:
# Count the rows per v1 label and show the counts as a bar chart.
val_count = data["v1"].value_counts()

plt.figure(figsize=(8, 4))
plt.bar(x=val_count.index, height=val_count.values)
plt.title("Spam/Ham Data Distribution")

# Data Preprocessing

## Data Cleaning

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

# Patterns are compiled once instead of being rebuilt on every call. Raw
# strings keep the regex escapes (\[, \w, \d) from being parsed as (invalid)
# Python string escapes.
_BRACKETED_TEXT_RE = re.compile(r'\[.*?\]')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text_round1(text):
    """Apply the first round of text cleaning to a single message.

    Steps, in order:
      1. lowercase the text;
      2. remove bracketed spans such as ``[link]`` (non-greedy, single line);
      3. remove every ASCII punctuation character (``string.punctuation``);
      4. remove every word that contains at least one digit.

    Whitespace is not collapsed, so removed tokens leave their surrounding
    spaces behind.

    Args:
        text: the message to clean; must be a ``str``.

    Returns:
        The cleaned message.
    """
    text = text.lower()
    text = _BRACKETED_TEXT_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    return text


# Alias kept for the cells that call ``round1``; a lambda wrapper adds nothing.
round1 = clean_text_round1

In [None]:
# Run the first cleaning round over every message, then preview the result
data["v2"] = data["v2"].apply(round1)
data.head()

## Data Analysis

In [None]:
# Encode the v1 label column as integers: ham -> 0, spam -> 1.
# Only v1 is touched: a DataFrame-wide replace() would also rewrite any
# message in v2 whose entire text is exactly "ham" or "spam".
# Labels other than ham/spam become NaN rather than passing through silently.
data["v1"] = data["v1"].map({"ham": 0, "spam": 1})

In [None]:
# Summarize each column's dtype and non-null count to spot missing values
data.info()

In [None]:
# Count the NaN values in each column
data.isna().sum()

In [None]:
# Count the rows that exactly repeat an earlier row
data.duplicated().sum()

In [None]:
# Keep the first occurrence of each row, then renumber the index from 0
deduplicated = data.drop_duplicates()
data = deduplicated.reset_index(drop=True)

In [None]:
# Display the DataFrame as it stands before splitting
data

## Splitting Dataset

In [None]:
# Message text is the input, the encoded label is the target.
x, y = data['v2'], data['v1']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
)
print("Train Data size:", len(x_train))
print("Test Data size", len(x_test))

## Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer

# Build the word -> integer index mapping from the training messages only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index
# Tokenizer assigns indices 1..len(word_index) and reserves 0 (used for
# padding), so an embedding needs exactly len(word_index) + 1 rows. The
# previous "+ 1000" over-reported the vocabulary and allocated unused rows.
vocab_size = len(word_index) + 1
print("Vocabulary Size :", vocab_size)

In [None]:
# Preview the first training messages; reset_index() moves the index into a column
x_train.head().reset_index()

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Turn each message into a list of word indices using the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad or truncate every sequence to a fixed length of 50.
x_train = pad_sequences(train_sequences, maxlen=50)
x_test = pad_sequences(test_sequences, maxlen=50)

print("Training X Shape:", x_train.shape)
print("Testing X Shape:", x_test.shape)

In [None]:
# Inspect the first padded training sequence
x_train[0]

# LSTM

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

# LSTM hyperparameters
MAX_SEQUENCE_LENGTH = 50  # must equal the maxlen used when padding the inputs
n_lstm = 200              # units in each LSTM layer
drop_lstm = 0.2           # dropout applied to the inputs of each LSTM layer
embeding_dim = 16         # size of each word embedding vector
drop_value = 0.2
n_dense = 24
num_epochs = 5

# LSTM spam detection architecture
model1 = Sequential()
model1.add(Embedding(vocab_size, embeding_dim, input_length=MAX_SEQUENCE_LENGTH))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model1.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=True))
# The last LSTM returns only its final output, so the model produces one
# prediction per message rather than one per timestep.
model1.add(LSTM(n_lstm, dropout=drop_lstm))
# Sigmoid keeps the output in [0, 1], as binary cross-entropy and the 0.5
# decision threshold require; ReLU is unbounded and can be stuck at 0.
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping monitors val_loss, so fit() must be given validation data and
# the callback itself. validation_split holds out the last 20% of the training
# samples; the test set stays untouched.
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model1.fit(
    x_train,
    np.asarray(y_train),
    epochs=num_epochs,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Score the test messages, then turn the scores into booleans at a 0.5 threshold
lstm_scores = model1.predict(x_test)
y_pred_lstm = lstm_scores > 0.5

In [None]:
# Evaluate on the test set; index 0 is the loss, index 1 the metric passed to compile()
scores_lstm = model1.evaluate(x_test, y_test, verbose=0)

metric_name = model1.metrics_names[1]
metric_percent = scores_lstm[1] * 100
print("Our %s is %.2f%%" % (metric_name, metric_percent))

# Bi-LSTM

In [None]:
# Bi-LSTM spam detection architecture
model2 = Sequential()
model2.add(Embedding(vocab_size, embeding_dim, input_length=MAX_SEQUENCE_LENGTH))
# The bidirectional layer returns only its final output (no return_sequences),
# so the model produces one prediction per message rather than one per timestep.
model2.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm)))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training
num_epochs = 5
# Early stopping monitors val_loss, so fit() needs validation data:
# validation_split holds out the last 20% of the training samples and leaves
# the test set untouched.
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model2.fit(
    x_train,
    np.asarray(y_train),
    epochs=num_epochs,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Evaluate on the test set; index 0 is the loss, index 1 the metric passed to compile()
scores_bilstm = model2.evaluate(x_test, y_test, verbose=0)

metric_name = model2.metrics_names[1]
metric_percent = scores_bilstm[1] * 100
print("Our %s is %.2f%%" % (metric_name, metric_percent))