# SMS Spam Detection Modeling

In [1]:
import string
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, Normalizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/smlovullo2304/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/smlovullo2304/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/smlovullo2304/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading the Data

Import raw data from file

In [3]:
with open('../data/SMSSpamCollection.txt', 'r') as file_stream:
    sms_messages_raw = file_stream.readlines()

Collect labels and messages from imported data

In [4]:
labels = [re.search("(^.*)\t", message).group(1) for message in sms_messages_raw]
sms_messages = [message[message.index('\t')+1:-1] for message in sms_messages_raw]

Stitch the data together in a data frame

In [5]:
messages_df = pd.DataFrame(data={'Labels': labels, 'Messages': sms_messages})
messages_df['Labels'] = messages_df['Labels'].apply(lambda x: 1 if x == 'spam' else 0)
messages_df

Unnamed: 0,Labels,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will ü b going to esplanade fr home?
5571,0,"Pity, * was in mood for that. So...any other s..."
5572,0,The guy did some bitching but I acted like i'd...


In [6]:
def get_punc_count(s: str) -> int:
    count = 0
    for char in s:
        count += 1 if char in string.punctuation else 0
    return count

def get_numeric_count(s: str) -> int:
    count = 0
    tokens = word_tokenize(s)
    for word in tokens:
        count += 1 if word.isnumeric() else 0
    return count

def get_uppercase_count(s: str) -> int:
    count = 0
    for char in s:
        count += 1 if char.isupper() else 0
    return count

Construct dataset of various countable features from SMS Messages

In [7]:
counts_dict = {
    'character_counts': [],
    'punc_counts': [],
    'numeric_counts': [],
    'uppercase_counts': []
}
for message in messages_df['Messages']:
    counts_dict['character_counts'].append(len(message))
    counts_dict['punc_counts'].append(get_punc_count(message))
    counts_dict['numeric_counts'].append(get_numeric_count(message))
    counts_dict['uppercase_counts'].append(get_uppercase_count(message))

In [8]:
counts_df = messages_df[['Labels']].join(pd.DataFrame(counts_dict))
counts_df

Unnamed: 0,Labels,character_counts,punc_counts,numeric_counts,uppercase_counts
0,0,111,9,0,3
1,0,29,6,0,2
2,1,155,6,3,10
3,0,49,6,0,2
4,0,61,2,0,2
...,...,...,...,...,...
5569,1,160,8,3,9
5570,0,36,1,0,1
5571,0,57,7,0,2
5572,0,125,1,0,2


## Model Set 1: Building Models Using String Feature Counts

In [9]:
X = counts_df.drop(labels=['Labels'], axis=1)
y = counts_df['Labels']

In [10]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=256)
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

In [11]:
model_metrics = {'k_folds': n_splits, 'metrics': {}}

In [12]:
clf_lr = Pipeline([
    ('normalize', Normalizer()),
    ('scale', RobustScaler()),
    ('classify', LogisticRegression()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    # remove outliers for logistic regression training
    indices_with_outliars = set()
    for index in X_train[X_train['character_counts'] > X_train['character_counts'].quantile(0.99)].index.to_list():
        indices_with_outliars.add(index)
    for index in X_train[X_train['punc_counts'] > X_train['punc_counts'].quantile(0.99)].index.to_list():
        indices_with_outliars.add(index)
    for index in X_train[X_train['numeric_counts'] > X_train['numeric_counts'].quantile(0.99)].index.to_list():
        indices_with_outliars.add(index)
    for index in X_train[X_train['uppercase_counts'] > X_train['uppercase_counts'].quantile(0.99)].index.to_list():
        indices_with_outliars.add(index)
    indices_with_outliars = list(indices_with_outliars)

    clf_lr.fit(X_train.drop(labels=indices_with_outliars, axis=0), y_train.drop(labels=indices_with_outliars, axis=0))
    y_hat = clf_lr.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Logistic Regression Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")


Logistic Regression Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9031161138037073 and its std: 0.005495514396836686
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.5498761511300105 and its std: 0.025538791597159535


In [13]:
clf_dt = Pipeline([
    ('scale', RobustScaler()),
    ('classify', DecisionTreeClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_dt.fit(X_train, y_train)
    y_hat = clf_dt.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Decision Tree Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Decision Tree Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9446052452778911 and its std: 0.007795479410800483
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.7990096277181007 and its std: 0.025899539177808233


In [14]:
clf_nb = Pipeline([
    ('classify', MultinomialNB())
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_nb.fit(X_train, y_train)
    y_hat = clf_nb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Multinomial Naive Bayes Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Multinomial Naive Bayes Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.910294273074542 and its std: 0.003936748625486309
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.6491286899227834 and its std: 0.026152021965490553


In [15]:
k_neighbor_min = 1
k_neighbor_max = 20

knn_classifier_results = {}
max_accuracy = 0
max_f1_score = 0 
for k_neighbors in range(k_neighbor_min,k_neighbor_max+1):
    clf_knn = Pipeline([
        ('scale', RobustScaler()),
        ('classify', KNeighborsClassifier(n_neighbors=k_neighbors)),
    ])
    cv_accuracies = np.array([])
    cv_spam_f1_scores = np.array([])
    for train_index, val_index in skf.split(X_train_full, y_train_full):
        X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
        y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

        clf_knn.fit(X_train, y_train)
        y_hat = clf_nb.predict(X_val)
        report_results = classification_report(y_val, y_hat, output_dict=True)
        cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
        cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

    mean_accuracy = cv_accuracies.mean()
    std_accuracy = cv_accuracies.std()
    mean_f1_score = cv_spam_f1_scores.mean()
    std_f1_score = cv_spam_f1_scores.std()

    if max_accuracy <= mean_accuracy:
        max_accuracy = mean_accuracy
    if max_f1_score <= mean_f1_score:
        max_f1_score = mean_f1_score

    knn_classifier_results[k_neighbors] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

for k_neighbor_val, cv_results in knn_classifier_results.items():
    if cv_results['cv_f1_score'] >= max_f1_score:
        model_name = f"K-Nearest Neighbors Classifier (k={k_neighbor_val})"
        model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

        print(model_name)
        print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {cv_results['cv_accuracy']} and its std: {cv_results['cv_accuracy_std']}")
        print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {cv_results['cv_f1_score']} and its std: {cv_results['cv_f1_score_std']}")


K-Nearest Neighbors Classifier (k=13)
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9093964055100079 and its std: 0.011244653382689313
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.6457398827758349 and its std: 0.0416740246792815


In [16]:
clf_svc = Pipeline([
    ('scale', RobustScaler()),
    ('classify', SVC()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_svc.fit(X_train, y_train)
    y_hat = clf_svc.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Support Vector Machine Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Support Vector Machine Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.955595063741551 and its std: 0.006825329726119994
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8158899147953942 and its std: 0.027547681502392497


In [17]:
clf_mlp = Pipeline([
    ('scale', RobustScaler()),
    ('classify', MLPClassifier(max_iter=1000)),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_mlp.fit(X_train, y_train)
    y_hat = clf_mlp.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Multilayer Perceptron Neural Network Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Multilayer Perceptron Neural Network Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9571643188235116 and its std: 0.0027073481205980555
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8268105723239625 and its std: 0.010917818147143508


In [18]:
clf_rf = Pipeline([
    ('scale', RobustScaler()),
    ('classify', RandomForestClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_rf.fit(X_train, y_train)
    y_hat = clf_rf.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Random Forest Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Random Forest Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9580594182985813 and its std: 0.007169250330647891
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8348478747540483 and its std: 0.03286401898325739


In [19]:
clf_et = Pipeline([
    ('scale', RobustScaler()),
    ('classify', ExtraTreesClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_et.fit(X_train, y_train)
    y_hat = clf_et.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Extra Trees Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Extra Trees Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9562684644149517 and its std: 0.009023321874893403
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8341297145903652 and its std: 0.03129939647919094


In [20]:
clf_gb = Pipeline([
    ('scale', RobustScaler()),
    ('classify', GradientBoostingClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_gb.fit(X_train, y_train)
    y_hat = clf_gb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Gradient Boosting Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Gradient Boosting Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9594094910238409 and its std: 0.005680735615527291
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8364816758413353 and its std: 0.024233801042667244


In [21]:
clf_xgb = Pipeline([
    ('scale', RobustScaler()),
    ('classify', XGBClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_xgb.fit(X_train, y_train)
    y_hat = clf_xgb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "XGBoost Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

XGBoost Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9555963219640349 and its std: 0.0075189103983331635
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8250229924446109 and its std: 0.03363168869030436


In [22]:
decision_scores = ['cv_accuracy', 'cv_f1_score']
for score_type in decision_scores:
    models_by_score = {}
    for model_name, metrics in model_metrics['metrics'].items():
        models_by_score[metrics[score_type]] = model_name
    scores = np.array(list(models_by_score.keys()))
    max_score = scores.max()
    best_scoring_model_name = models_by_score[max_score]
    print(f"Model with best {score_type}: {best_scoring_model_name}")
    print(f"\tAverage Accuracy: {model_metrics['metrics'][best_scoring_model_name]['cv_accuracy']} with Standard Deviation: {model_metrics['metrics'][best_scoring_model_name]['cv_accuracy_std']}")
    print(f"\tAverage F1-Score: {model_metrics['metrics'][best_scoring_model_name]['cv_f1_score']} with Standard Deviation: {model_metrics['metrics'][best_scoring_model_name]['cv_f1_score_std']}")


Model with best cv_accuracy: Gradient Boosting Classifier
	Average Accuracy: 0.9594094910238409 with Standard Deviation: 0.005680735615527291
	Average F1-Score: 0.8364816758413353 with Standard Deviation: 0.024233801042667244
Model with best cv_f1_score: Gradient Boosting Classifier
	Average Accuracy: 0.9594094910238409 with Standard Deviation: 0.005680735615527291
	Average F1-Score: 0.8364816758413353 with Standard Deviation: 0.024233801042667244


Before hyperparameter tuning and by only using a dataset that contains various counted metrics about each sms message, it is possible to train a model with a high estimated accuracy and f1-score. Multiple runs of examining model cross-validation results found that the following models often performed better than the rest:

* Random Forest Classifier
* XGBoost Classifier
* Gradient Boosting Classifier
* Extra Trees Classifier

The random forest classifier more often outperformed the other models, so it was the chosen algorithm to construct the model

In [23]:
classifier = Pipeline([
    ('scale', RobustScaler()),
    ('classify', RandomForestClassifier()),
])
param_grid = {
    'classify__n_estimators': [50, 100, 250, 400, 500],
    'classify__max_features': ['sqrt', 'log2'],
    'classify__min_samples_split': [2, 5, 10],
    'classify__min_samples_leaf': [1, 2, 4],
    'classify__bootstrap': [True, False]
}

grid = GridSearchCV(classifier, param_grid, cv=n_splits, scoring='f1')
grid.fit(X_train_full, y_train_full)

print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

Best parameters: {'classify__bootstrap': True, 'classify__max_features': 'sqrt', 'classify__min_samples_leaf': 1, 'classify__min_samples_split': 2, 'classify__n_estimators': 500}
Best cross-validation score: 0.8488186232560245


In [24]:
classifier = Pipeline([
    ('scale', RobustScaler()),
    ('classify', RandomForestClassifier(
        n_estimators=grid.best_params_['classify__n_estimators'],
        max_features=grid.best_params_['classify__max_features'],
        min_samples_split=grid.best_params_['classify__min_samples_split'],
        min_samples_leaf=grid.best_params_['classify__min_samples_leaf'],
        bootstrap=grid.best_params_['classify__bootstrap']
    ))
])
classifier.fit(X_train_full, y_train_full)
y_hat = classifier.predict(X_test)
report_results = classification_report(y_test, y_hat, output_dict=True)
print(confusion_matrix(y_test, y_hat))
print(classification_report(y_test, y_hat))
print(f"Accuracy: {report_results['accuracy']}")
print(f"F1-Score: {report_results['1']['f1-score']}")

[[951  15]
 [ 24 125]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       966
           1       0.89      0.84      0.87       149

    accuracy                           0.97      1115
   macro avg       0.93      0.91      0.92      1115
weighted avg       0.96      0.97      0.96      1115

Accuracy: 0.9650224215246637
F1-Score: 0.8650519031141869


In [25]:
classifier = Pipeline([
    ('scale', RobustScaler()),
    ('classify', GradientBoostingClassifier()),
])
param_grid = {
    'classify__n_estimators': [50, 100, 150],
    'classify__learning_rate': [0.01, 0.1, 1],
    'classify__max_depth': [1, 3, 5, 7]
}

grid = GridSearchCV(classifier, param_grid, cv=n_splits, scoring='f1')
grid.fit(X_train_full, y_train_full)

print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

Best parameters: {'classify__learning_rate': 0.1, 'classify__max_depth': 3, 'classify__n_estimators': 150}
Best cross-validation score: 0.8396048177490301


In [26]:
classifier = Pipeline([
    ('scale', RobustScaler()),
    ('classify', GradientBoostingClassifier(
        learning_rate=grid.best_params_['classify__learning_rate'],
        n_estimators=grid.best_params_['classify__n_estimators'],
        max_depth=grid.best_params_['classify__max_depth']
    )),
])
classifier.fit(X_train_full, y_train_full)
y_hat = classifier.predict(X_test)
report_results = classification_report(y_test, y_hat, output_dict=True)
print(confusion_matrix(y_test, y_hat))
print(classification_report(y_test, y_hat))
print(f"Accuracy: {report_results['accuracy']}")
print(f"F1-Score: {report_results['1']['f1-score']}")

[[947  19]
 [ 25 124]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       966
           1       0.87      0.83      0.85       149

    accuracy                           0.96      1115
   macro avg       0.92      0.91      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Accuracy: 0.9605381165919282
F1-Score: 0.8493150684931506


## Model Set 2: Building Models Based on Word Content Using Count Vectorizers

In [27]:
class TextCleaner():

    def __init__(self):
        self.punct_table = str.maketrans('', '', string.punctuation)
        self.stopwords = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.transformations = [
            self._strip_punct,
            self._convert_to_lowercase,
            self._remove_stopwords,
            self._remove_numbers,
            self._remove_special_characters,
            self._lemmatize
        ]

    def fit(self, X, y):
        return self

    def fit_transform(self, X, y):
        return self.clean_text(X)

    def transform(self, X):
        return self.clean_text(X)

    def clean_text(self, X):
        cleaned_text = []
        for text in X:
            for transformation in self.transformations:
                text = transformation(text)
            cleaned_text.append(text)
        return cleaned_text

    def _strip_punct(self, text: str) -> str:
        return text.translate(self.punct_table)

    def _convert_to_lowercase(self, text: str) -> str:
        return text.lower()

    def _remove_stopwords(self, text: str) -> str:
        words = nltk.word_tokenize(text)
        words = [w for w in words if w not in self.stopwords]
        return ' '.join(words)

    def _remove_numbers(self, text: str) -> str:
        words = nltk.word_tokenize(text)
        words = [w for w in words if not re.search(r'\d', w)]
        return ' '.join(words)

    def _remove_special_characters(self, text: str) -> str:
        pattern = r'[^a-zA-Z0-9\s]'
        return re.sub(pattern, '', text)

    def _lemmatize(self, text: str) -> str:
        words = nltk.word_tokenize(text)
        words = [self.lemmatizer.lemmatize(w) for w in words]
        return ' '.join(words)

In [28]:
text_df = messages_df[['Labels','Messages']]
text_df

Unnamed: 0,Labels,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will ü b going to esplanade fr home?
5571,0,"Pity, * was in mood for that. So...any other s..."
5572,0,The guy did some bitching but I acted like i'd...


In [29]:
X = text_df['Messages']
y = text_df['Labels']

In [30]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=256)
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

In [31]:
model_metrics = {'k_folds': n_splits, 'metrics': {}}

In [32]:
clf_lr = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('normalize', Normalizer()),
    ('classify', LogisticRegression()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_lr.fit(X_train, y_train)
    y_hat = clf_lr.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Logistic Regression Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")


Logistic Regression Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.957613000961282 and its std: 0.005471407807306032
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8139606254612863 and its std: 0.027601870294266553


In [33]:
clf_dt = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classify', DecisionTreeClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_dt.fit(X_train, y_train)
    y_hat = clf_dt.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Decision Tree Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Decision Tree Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9475190369061819 and its std: 0.006580278723761395
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.7979445539217128 and its std: 0.02695581217757847


In [34]:
clf_nb = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classify', MultinomialNB())
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_nb.fit(X_train, y_train)
    y_hat = clf_nb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Multinomial Naive Bayes Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Multinomial Naive Bayes Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9603035839209232 and its std: 0.004801481163606562
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8257880687618349 and its std: 0.02461974826260868


In [35]:
k_neighbor_min = 1
k_neighbor_max = 20

knn_classifier_results = {}
max_accuracy = 0
max_f1_score = 0 
for k_neighbors in range(k_neighbor_min,k_neighbor_max+1):
    clf_knn = Pipeline([
        ('text_cleaner', TextCleaner()),
        ('vectorizer', TfidfVectorizer()),
        ('scale', RobustScaler(with_centering=False)),
        ('classify', KNeighborsClassifier(n_neighbors=k_neighbors)),
    ])
    cv_accuracies = np.array([])
    cv_spam_f1_scores = np.array([])
    for train_index, val_index in skf.split(X_train_full, y_train_full):
        X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
        y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

        clf_knn.fit(X_train, y_train)
        y_hat = clf_nb.predict(X_val)
        report_results = classification_report(y_val, y_hat, output_dict=True)
        cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
        cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

    mean_accuracy = cv_accuracies.mean()
    std_accuracy = cv_accuracies.std()
    mean_f1_score = cv_spam_f1_scores.mean()
    std_f1_score = cv_spam_f1_scores.std()

    if max_accuracy <= mean_accuracy:
        max_accuracy = mean_accuracy
    if max_f1_score <= mean_f1_score:
        max_f1_score = mean_f1_score

    knn_classifier_results[k_neighbors] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

for k_neighbor_val, cv_results in knn_classifier_results.items():
    if cv_results['cv_f1_score'] >= max_f1_score:
        model_name = f"K-Nearest Neighbors Classifier (k={k_neighbor_val})"
        model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

        print(model_name)
        print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {cv_results['cv_accuracy']} and its std: {cv_results['cv_accuracy_std']}")
        print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {cv_results['cv_f1_score']} and its std: {cv_results['cv_f1_score_std']}")


K-Nearest Neighbors Classifier (k=20)
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9699483625492593 and its std: 0.0019296100013589
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8737314769701712 and its std: 0.009275010232237887


In [36]:
clf_svc = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classify', SVC()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_svc.fit(X_train, y_train)
    y_hat = clf_svc.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Support Vector Machine Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Support Vector Machine Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9748821045532555 and its std: 0.0015233037251525922
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.897413292655861 and its std: 0.006500453797215195


In [37]:
clf_mlp = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classify', MLPClassifier(max_iter=1000)),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_mlp.fit(X_train, y_train)
    y_hat = clf_mlp.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Multilayer Perceptron Neural Network Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Multilayer Perceptron Neural Network Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9820592572461033 and its std: 0.0030880450019682903
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.9294262803300576 and its std: 0.013069477234270873


In [38]:
clf_rf = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classify', RandomForestClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_rf.fit(X_train, y_train)
    y_hat = clf_rf.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Random Forest Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Random Forest Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.974207445657371 and its std: 0.006232975088793353
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8932841777999251 and its std: 0.02835684799360344


In [39]:
clf_et = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classify', ExtraTreesClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_et.fit(X_train, y_train)
    y_hat = clf_et.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Extra Trees Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Extra Trees Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9739849919222117 and its std: 0.003431433899103548
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8944985584853645 and its std: 0.015588336041731963


In [40]:
clf_gb = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('scale', RobustScaler(with_centering=False)),
    ('classify', GradientBoostingClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_gb.fit(X_train, y_train)
    y_hat = clf_gb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Gradient Boosting Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Gradient Boosting Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.960754027570171 and its std: 0.0023437170098915628
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8319414437487234 and its std: 0.013390609538186339


In [41]:
clf_xgb = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('scale', RobustScaler(with_centering=False)),
    ('classify', XGBClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_xgb.fit(X_train, y_train)
    y_hat = clf_xgb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "XGBoost Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

XGBoost Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9686038260029293 and its std: 0.005348017582151105
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8731281141412838 and its std: 0.023246521627147913


In [42]:
decision_scores = ['cv_accuracy', 'cv_f1_score']
for score_type in decision_scores:
    models_by_score = {}
    for model_name, metrics in model_metrics['metrics'].items():
        models_by_score[metrics[score_type]] = model_name
    scores = np.array(list(models_by_score.keys()))
    max_score = scores.max()
    best_scoring_model_name = models_by_score[max_score]
    print(f"Model with best {score_type}: {best_scoring_model_name}")
    print(f"\tAverage Accuracy: {model_metrics['metrics'][best_scoring_model_name]['cv_accuracy']} with Standard Deviation: {model_metrics['metrics'][best_scoring_model_name]['cv_accuracy_std']}")
    print(f"\tAverage F1-Score: {model_metrics['metrics'][best_scoring_model_name]['cv_f1_score']} with Standard Deviation: {model_metrics['metrics'][best_scoring_model_name]['cv_f1_score_std']}")


Model with best cv_accuracy: Multilayer Perceptron Neural Network Classifier
	Average Accuracy: 0.9820592572461033 with Standard Deviation: 0.0030880450019682903
	Average F1-Score: 0.9294262803300576 with Standard Deviation: 0.013069477234270873
Model with best cv_f1_score: Multilayer Perceptron Neural Network Classifier
	Average Accuracy: 0.9820592572461033 with Standard Deviation: 0.0030880450019682903
	Average F1-Score: 0.9294262803300576 with Standard Deviation: 0.013069477234270873


A multilayer perceptron neural network ultimately showed the most consistent best performance when constructing a model based on the word content of the sms messages. It will be chosen for hyperparameter tuning.

In [43]:
classifier = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MLPClassifier(max_iter=1000)),
])

param_grid = {
    'classifier__hidden_layer_sizes': [(100,), (50, 50)],
    'classifier__activation': ['relu'],
    'classifier__solver': ['adam'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}

grid = GridSearchCV(classifier, param_grid, cv=n_splits, scoring='f1')
grid.fit(X_train_full, y_train_full)

print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

Best parameters: {'classifier__activation': 'relu', 'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (100,), 'classifier__learning_rate': 'constant', 'classifier__solver': 'adam'}
Best cross-validation score: 0.9296886003484083


In [44]:
classifier = Pipeline([
    ('text_cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MLPClassifier(
        max_iter=1000,
        hidden_layer_sizes=grid.best_params_['classifier__hidden_layer_sizes'],
        activation=grid.best_params_['classifier__activation'],
        solver=grid.best_params_['classifier__solver'],
        alpha=grid.best_params_['classifier__alpha'],
        learning_rate=grid.best_params_['classifier__learning_rate']
    )),
])
classifier.fit(X_train_full, y_train_full)
y_hat = classifier.predict(X_test)
report_results = classification_report(y_test, y_hat, output_dict=True)
print(confusion_matrix(y_test, y_hat))
print(classification_report(y_test, y_hat))
print(f"Accuracy: {report_results['accuracy']}")
print(f"F1-Score: {report_results['1']['f1-score']}")

[[961   5]
 [ 15 134]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.96      0.90      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.9820627802690582
F1-Score: 0.9305555555555556


The word-based model appears to perform better than the counts-based model. Let's examine the performance of models combining these two feature sets and see how they perform.

## Model Set 3: Combination of String Feature Counts and Word Counts

In [45]:
class TextCounter():

    def __init__(self):
        self.count_types = [
            CountTypes.CHAR,
            CountTypes.PUNC,
            CountTypes.NUM,
            CountTypes.UPPER
        ]
        self.count_methods = [
            self._get_char_count,
            self._get_punc_count,
            self._get_numeric_count,
            self._get_uppercase_count
        ]

    def fit(self, X, y):
        return self

    def fit_transform(self, X, y):
        return self.count_features(X)

    def transform(self, X):
        return self.count_features(X)

    def count_features(self, X):
        # generates features dict to store values in lists organized by count type
        features = {}
        for count_type in self.count_types:
            features[count_type] = []

        # for each body of text, each count method is performed and values are stored in lists in the features dict organized by count type
        for text in X:
            for count_method in self.count_methods:
                count_type, count_val = count_method(text)
                features[count_type].append(count_val)

        # converts dict into 2d list
        features_as_2d_list = []
        for count_vals in features.values():
            features_as_2d_list.append(count_vals)

        # returns array with data organized by row instead of by column
        return np.array(features_as_2d_list).transpose()

    def _get_char_count(self, s: str) -> int:
        return (CountTypes.CHAR, len(s))

    def _get_punc_count(self, s: str) -> int:
        count = 0
        for char in s:
            count += 1 if char in string.punctuation else 0
        return (CountTypes.PUNC, count)

    def _get_numeric_count(self, s: str) -> int:
        count = 0
        tokens = word_tokenize(s)
        for word in tokens:
            count += 1 if word.isnumeric() else 0
        return (CountTypes.NUM, count)

    def _get_uppercase_count(self, s: str) -> int:
        count = 0
        for char in s:
            count += 1 if char.isupper() else 0
        return (CountTypes.UPPER, count)


class CountTypes:
    CHAR = 'char_count'
    PUNC = 'punc_count'
    NUM = 'numeric_count'
    UPPER = 'upper_count'


In [46]:
class TextCleaner():

    def __init__(self):
        self.punct_table = str.maketrans('', '', string.punctuation)
        self.stopwords = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.transformations = [
            self._strip_punct,
            self._convert_to_lowercase,
            self._remove_stopwords,
            self._remove_numbers,
            self._remove_special_characters,
            self._lemmatize
        ]

    def fit(self, X, y):
        return self

    def fit_transform(self, X, y):
        return self.clean_text(X)

    def transform(self, X):
        return self.clean_text(X)

    def clean_text(self, X):
        cleaned_text = []
        for text in X:
            for transformation in self.transformations:
                text = transformation(text)
            cleaned_text.append(text)
        return cleaned_text

    def _strip_punct(self, text: str) -> str:
        return text.translate(self.punct_table)

    def _convert_to_lowercase(self, text: str) -> str:
        return text.lower()

    def _remove_stopwords(self, text: str) -> str:
        words = word_tokenize(text)
        words = [w for w in words if w not in self.stopwords]
        return ' '.join(words)

    def _remove_numbers(self, text: str) -> str:
        words = word_tokenize(text)
        words = [w for w in words if not re.search(r'\d', w)]
        return ' '.join(words)

    def _remove_special_characters(self, text: str) -> str:
        pattern = r'[^a-zA-Z0-9\s]'
        return re.sub(pattern, '', text)

    def _lemmatize(self, text: str) -> str:
        words = word_tokenize(text)
        words = [self.lemmatizer.lemmatize(w) for w in words]
        return ' '.join(words)


In [47]:
messages_df

Unnamed: 0,Labels,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will ü b going to esplanade fr home?
5571,0,"Pity, * was in mood for that. So...any other s..."
5572,0,The guy did some bitching but I acted like i'd...


In [48]:
feature_counts_pipe = Pipeline([
    ('counter', TextCounter()),
    ('scaler', RobustScaler())
])

vectorizer_pipe = Pipeline([
    ('cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer())
])

In [49]:
str_feature_counts = pd.DataFrame(feature_counts_pipe.fit_transform(messages_df['Messages']), columns=[CountTypes.CHAR, CountTypes.PUNC, CountTypes.NUM, CountTypes.UPPER])
str_feature_counts

Unnamed: 0,char_count,punc_count,numeric_count,upper_count
0,0.569767,1.50,0.0,0.333333
1,-0.383721,0.75,0.0,0.000000
2,1.081395,0.75,3.0,2.666667
3,-0.151163,0.75,0.0,0.000000
4,-0.011628,-0.25,0.0,0.000000
...,...,...,...,...
5569,1.139535,1.25,3.0,2.333333
5570,-0.302326,-0.50,0.0,-0.333333
5571,-0.058140,1.00,0.0,0.000000
5572,0.732558,-0.50,0.0,0.000000


In [50]:
word_counts = pd.DataFrame(vectorizer_pipe.fit_transform(messages_df['Messages']).toarray(), columns=vectorizer_pipe.named_steps['vectorizer'].get_feature_names_out())
word_counts

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,zebra,zed,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
X = str_feature_counts.join(word_counts)
y = messages_df['Labels']

In [52]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=256)
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

In [53]:
clf_lr = Pipeline([
    ('normalize', Normalizer()),
    ('classify', LogisticRegression()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_lr.fit(X_train, y_train)
    y_hat = clf_lr.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Logistic Regression Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")


Logistic Regression Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9562682127704549 and its std: 0.008359373664032017
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8292108572756243 and its std: 0.03199913229674043


In [54]:
clf_dt = Pipeline([
    ('classify', DecisionTreeClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_dt.fit(X_train, y_train)
    y_hat = clf_dt.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Decision Tree Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Decision Tree Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9695014419229666 and its std: 0.004269172183391407
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8831848731406394 and its std: 0.017429786650813954


In [55]:
clf_nb = Pipeline([
    ('classify', GaussianNB())
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_nb.fit(X_train, y_train)
    y_hat = clf_nb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Gaussian Naive Bayes Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Gaussian Naive Bayes Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.8824855430236596 and its std: 0.01000423966951353
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.6565170645486595 and its std: 0.023798473321950585


In [56]:
k_neighbor_min = 1
k_neighbor_max = 20

knn_classifier_results = {}
max_accuracy = 0
max_f1_score = 0 
for k_neighbors in range(k_neighbor_min,k_neighbor_max+1):
    clf_knn = Pipeline([
        ('classify', KNeighborsClassifier(n_neighbors=k_neighbors)),
    ])
    cv_accuracies = np.array([])
    cv_spam_f1_scores = np.array([])
    for train_index, val_index in skf.split(X_train_full, y_train_full):
        X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
        y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

        clf_knn.fit(X_train, y_train)
        y_hat = clf_nb.predict(X_val)
        report_results = classification_report(y_val, y_hat, output_dict=True)
        cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
        cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

    mean_accuracy = cv_accuracies.mean()
    std_accuracy = cv_accuracies.std()
    mean_f1_score = cv_spam_f1_scores.mean()
    std_f1_score = cv_spam_f1_scores.std()

    if max_accuracy <= mean_accuracy:
        max_accuracy = mean_accuracy
    if max_f1_score <= mean_f1_score:
        max_f1_score = mean_f1_score

    knn_classifier_results[k_neighbors] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

for k_neighbor_val, cv_results in knn_classifier_results.items():
    if cv_results['cv_f1_score'] >= max_f1_score:
        model_name = f"K-Nearest Neighbors Classifier (k={k_neighbor_val})"
        model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

        print(model_name)
        print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {cv_results['cv_accuracy']} and its std: {cv_results['cv_accuracy_std']}")
        print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {cv_results['cv_f1_score']} and its std: {cv_results['cv_f1_score_std']}")


K-Nearest Neighbors Classifier (k=9)
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9324940989365503 and its std: 0.013638437071826843
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.7941413827877633 and its std: 0.03415718218810415


In [57]:
clf_svc = Pipeline([
    ('classify', SVC()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_svc.fit(X_train, y_train)
    y_hat = clf_svc.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Support Vector Machine Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Support Vector Machine Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9598521336936884 and its std: 0.009835475584386926
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8334421594734494 and its std: 0.04532670826393174


In [58]:
clf_mlp = Pipeline([
    ('classify', MLPClassifier(max_iter=1000)),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_mlp.fit(X_train, y_train)
    y_hat = clf_mlp.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Multilayer Perceptron Neural Network Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Multilayer Perceptron Neural Network Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9874409264543793 and its std: 0.003123829947718625
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.9517370420860048 and its std: 0.012402514341603539


In [59]:
clf_rf = Pipeline([
    ('classify', RandomForestClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_rf.fit(X_train, y_train)
    y_hat = clf_rf.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Random Forest Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Random Forest Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9793666611304879 and its std: 0.0035230439313148126
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.916398806329908 and its std: 0.015494963354653805


In [60]:
clf_et = Pipeline([
    ('classify', ExtraTreesClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_et.fit(X_train, y_train)
    y_hat = clf_et.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Extra Trees Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Extra Trees Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9773487239107569 and its std: 0.003283409197855285
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.9079467322462392 and its std: 0.014082362105129118


In [61]:
clf_gb = Pipeline([
    ('classify', GradientBoostingClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_gb.fit(X_train, y_train)
    y_hat = clf_gb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "Gradient Boosting Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

Gradient Boosting Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9746566310841347 and its std: 0.004180883451861918
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.8989856626825181 and its std: 0.01779338809931539


In [62]:
clf_xgb = Pipeline([
    ('classify', XGBClassifier()),
])
cv_accuracies = np.array([])
cv_spam_f1_scores = np.array([])
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    clf_xgb.fit(X_train, y_train)
    y_hat = clf_xgb.predict(X_val)
    report_results = classification_report(y_val, y_hat, output_dict=True)
    cv_accuracies = np.append(cv_accuracies, report_results['accuracy'])
    cv_spam_f1_scores = np.append(cv_spam_f1_scores, report_results['1']['f1-score'])

mean_accuracy = cv_accuracies.mean()
std_accuracy = cv_accuracies.std()
mean_f1_score = cv_spam_f1_scores.mean()
std_f1_score = cv_spam_f1_scores.std()

model_name = "XGBoost Classifier"
model_metrics['metrics'][model_name] = {'cv_accuracy': mean_accuracy, 'cv_accuracy_std': std_accuracy, 'cv_f1_score': mean_f1_score, 'cv_f1_score_std': std_f1_score}

print(model_name)
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Accuracy: {mean_accuracy} and its std: {std_accuracy}")
print(f"Stratified K-Fold ({n_splits}) Cross Validation Average Spam F1-Score: {mean_f1_score} and its std: {std_f1_score}")

XGBoost Classifier
Stratified K-Fold (5) Cross Validation Average Accuracy: 0.9771240053751263 and its std: 0.00392839316736369
Stratified K-Fold (5) Cross Validation Average Spam F1-Score: 0.9108522042033937 and its std: 0.017396327304070192


In [63]:
decision_scores = ['cv_accuracy', 'cv_f1_score']
for score_type in decision_scores:
    models_by_score = {}
    for model_name, metrics in model_metrics['metrics'].items():
        models_by_score[metrics[score_type]] = model_name
    scores = np.array(list(models_by_score.keys()))
    max_score = scores.max()
    best_scoring_model_name = models_by_score[max_score]
    print(f"Model with best {score_type}: {best_scoring_model_name}")
    print(f"\tAverage Accuracy: {model_metrics['metrics'][best_scoring_model_name]['cv_accuracy']} with Standard Deviation: {model_metrics['metrics'][best_scoring_model_name]['cv_accuracy_std']}")
    print(f"\tAverage F1-Score: {model_metrics['metrics'][best_scoring_model_name]['cv_f1_score']} with Standard Deviation: {model_metrics['metrics'][best_scoring_model_name]['cv_f1_score_std']}")


Model with best cv_accuracy: Multilayer Perceptron Neural Network Classifier
	Average Accuracy: 0.9874409264543793 with Standard Deviation: 0.003123829947718625
	Average F1-Score: 0.9517370420860048 with Standard Deviation: 0.012402514341603539
Model with best cv_f1_score: Multilayer Perceptron Neural Network Classifier
	Average Accuracy: 0.9874409264543793 with Standard Deviation: 0.003123829947718625
	Average F1-Score: 0.9517370420860048 with Standard Deviation: 0.012402514341603539


In [64]:
classifier = Pipeline([
    ('classifier', MLPClassifier(max_iter=1000)),
])

param_grid = {
    'classifier__hidden_layer_sizes': [(100,), (50, 50)],
    'classifier__activation': ['relu'],
    'classifier__solver': ['adam'],
    'classifier__alpha': [0.0001, 0.001],
    'classifier__learning_rate': ['constant', 'adaptive']
}

grid = GridSearchCV(classifier, param_grid, cv=n_splits, scoring='f1')
grid.fit(X_train_full, y_train_full)

print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

Best parameters: {'classifier__activation': 'relu', 'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (50, 50), 'classifier__learning_rate': 'constant', 'classifier__solver': 'adam'}
Best cross-validation score: 0.952645581406794


In [65]:
classifier = Pipeline([
    ('classifier', MLPClassifier(
        max_iter=1000,
        hidden_layer_sizes=grid.best_params_['classifier__hidden_layer_sizes'],
        activation=grid.best_params_['classifier__activation'],
        solver=grid.best_params_['classifier__solver'],
        alpha=grid.best_params_['classifier__alpha'],
        learning_rate=grid.best_params_['classifier__learning_rate']
    )),
])
classifier.fit(X_train_full, y_train_full)
y_hat = classifier.predict(X_test)
report_results = classification_report(y_test, y_hat, output_dict=True)
print(confusion_matrix(y_test, y_hat))
print(classification_report(y_test, y_hat))
print(f"Accuracy: {report_results['accuracy']}")
print(f"F1-Score: {report_results['1']['f1-score']}")

[[958   8]
 [ 10 139]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.95      0.93      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.9838565022421525
F1-Score: 0.9391891891891893


## Results

Between the counts-based model and the word-based model, the word-based model shows a better f1-score and accuracy. For the combined model