In [None]:
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Dropout

from imblearn.over_sampling import SMOTE, KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

import seaborn as sns

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources that word_tokenize, stopwords and WordNetLemmatizer
# rely on. 'punkt_tab' is the tokenizer data loaded by word_tokenize on
# NLTK >= 3.8.2; older releases only know 'punkt' and simply report
# 'punkt_tab' as unknown without raising, so requesting both is safe.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(resource)

import re

import string

In [None]:
# Load the raw CSV (ISO-8859-1 encoded).
data = pd.read_csv('./spam.csv', encoding='ISO-8859-1')

# Discard every column that contains at least one missing value.
data.dropna(axis=1, inplace=True)

# Character count of each raw message stored in 'v2'.
data['length'] = data['v2'].map(len)
data.head()

In [None]:
def convert_to_lower(text):
    """Return ``text`` with every cased character converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Replace each run of consecutive digits in ``text`` with a single space."""
    return re.sub(r'\d+', " ", text)


def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a code point to None makes str.translate drop that character.
    deletions = {ord(symbol): None for symbol in string.punctuation}
    cleaned = text.translate(deletions)
    return cleaned

def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split with NLTK's ``word_tokenize``; every token that is an
    exact (case-sensitive) match for an entry of NLTK's English stop-word list
    is discarded.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by single spaces (an empty string when no
        token survives).
    """
    # A set keeps each membership test O(1) instead of scanning a list per token.
    stop_words = frozenset(stopwords.words("english"))
    kept = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept)

def remove_extra_white_spaces(text):
    """Remove single-letter tokens that sit between whitespace.

    Despite its name, this helper does not collapse whitespace in general: it
    replaces every run of one or more whitespace-delimited single ASCII
    letters (together with the whitespace around the run) by one space.
    A single letter at the very start or very end of ``text`` is left alone
    because it has no whitespace on both sides.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with interior single-letter tokens removed.
    """
    # The group is repeated so that adjacent single letters ("a b c") are
    # removed together. Matching one letter at a time would consume the
    # whitespace that the next letter needs in front of it, leaving every
    # second letter behind.
    single_char_run_pattern = r'(?:\s+[a-zA-Z])+\s+'
    return re.sub(pattern=single_char_run_pattern, repl=" ", string=text)

def lemmatizing(text):
    """Lemmatize every token of ``text`` and join the results with spaces.

    Tokens come from ``word_tokenize``; each one is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token) for token in word_tokenize(text)]
    return " ".join(lemmas)


# Cleaning stages, applied to every raw message in exactly this order.
cleaning_steps = (
    convert_to_lower,
    remove_numbers,
    remove_punctuation,
    remove_stopwords,
    remove_extra_white_spaces,
    lemmatizing,
)

# Start from the raw text in 'v2' and run it through each stage in turn.
data['message'] = data['v2']
for step in cleaning_steps:
    data['message'] = data['message'].apply(step)

# Character count of each message once cleaning has finished.
data['length_after_cleaning'] = data['message'].apply(len)

data.head()

In [None]:
# Encode the label column: spam -> 1, ham -> 0.
# The replacement is limited to 'v1' on purpose. A frame-wide replace would
# also turn any cleaned message that is exactly 'spam' or 'ham' into an
# integer, which the text vectorizer cannot process.
data = data.assign(v1=data['v1'].replace({'spam': 1, 'ham': 0}))

# Only the label ('v1') and the cleaned text ('message') are needed from here on.
data = data.drop(columns=['v2', 'length', 'length_after_cleaning'])

In [None]:
# Split the data into input features (X) and target variable (y)
X = data['message'].values
y = data['v1'].values

# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature engineering using TF-IDF.
# The vocabulary and IDF weights are fit on the training split only and then
# reused for the test split; both matrices are converted to dense arrays.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()
print(type(X_train_tfidf))

# Define the pipeline with KMeansSMOTE
pipeline = make_pipeline(KMeansSMOTE(random_state=42))

# Apply SMOTE oversampling to the training data only (the test split stays untouched)
X_train_oversampled, y_train_oversampled = pipeline.fit_resample(X_train_tfidf, y_train)

# Shuffle the over-sampled data before training. Keras carves the
# `validation_split` off the END of the arrays before it shuffles, and the
# over-sampler is expected to return the original rows followed by the
# synthetic ones (NOTE(review): confirm for the installed imblearn version).
# Without this step the validation set would be dominated by synthetic
# minority samples.
shuffle_order = np.random.default_rng(42).permutation(len(y_train_oversampled))
X_train_oversampled = X_train_oversampled[shuffle_order]
y_train_oversampled = y_train_oversampled[shuffle_order]

# Build the model: a fully connected network with one sigmoid output unit
model = Sequential()
model.add(Dense(512, input_shape=(X_train_tfidf.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model; `data1` holds the returned training history object
data1=model.fit(X_train_oversampled, y_train_oversampled, epochs=15, batch_size=32, validation_split=0.3)

# Evaluate the model.
# `y_pred` holds sigmoid outputs in [0, 1]. They are thresholded at 0.5 to get
# class ids; truncating with int() would map every value below 1.0 to class 0.
y_pred = model.predict(X_test_tfidf)
y_pred_class = (y_pred.ravel() > 0.5).astype(int)
y_pred_label = label_encoder.inverse_transform(y_pred_class)
accuracy = accuracy_score(y_test, y_pred_label)
print("Accuracy:", accuracy)


In [None]:
# Two hand-written messages used as a quick sanity check of the trained model.
test_cases = ["Phony �350 award - Todays Voda numbers ending XXXX are selected to receive a �350 award.\
                If you have a match please call 08712300220 quoting claim code 3100 standard rates app",
              "Congratulations, you will now receive notifications. Follow this link to find out about our promotions and discounts"]

for test_case in test_cases:
    # Apply the same cleaning stages, in the same order, as the training data.
    test_case = convert_to_lower(test_case)
    test_case = remove_numbers(test_case)
    test_case = remove_punctuation(test_case)
    test_case = remove_stopwords(test_case)
    test_case = remove_extra_white_spaces(test_case)
    test_case = lemmatizing(test_case)

    # The vectorizer expects an iterable of documents, hence the one-item list.
    test_case=[test_case]
    test_case_tfidf = tfidf_vectorizer.transform(test_case).toarray()

    # The model returns one sigmoid output per input row. Threshold it at 0.5
    # to obtain the class id; int() on the raw output would truncate every
    # value below 1.0 to 0 and always report Ham.
    prediction = model.predict(test_case_tfidf)
    predicted_class = int(prediction[0][0] > 0.5)
    prediction_label = label_encoder.inverse_transform([predicted_class])
    print("Predicted Label: Spam" if prediction_label[0] else "Predicted Label: Ham")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the test-set predictions against the true labels.
confusion_mtx = confusion_matrix(y_true=y_test, y_pred=y_pred_label)
print(confusion_mtx)

In [None]:
# `plt` is not brought into scope by the notebook's setup cell, so import it
# here; otherwise this cell fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Two stacked panels: training loss on top, training accuracy below,
# both read from the history object returned by model.fit (`data1`).
fig, ax = plt.subplots(2,1)

ax[0].plot(data1.history['loss'], color='b', label="Training loss")
ax[0].set_ylabel("Loss")
legend = ax[0].legend(loc='best', shadow=True)

ax[1].plot(data1.history['accuracy'], color='b', label="Training accuracy")
ax[1].set_xlabel("Epoch")
ax[1].set_ylabel("Accuracy")
legend = ax[1].legend(loc='best', shadow=True)

fig.tight_layout()