# Model Experiments

## Setup

In [1]:
# Setting up execution path
import os

print(f"Current working directory: {os.path.basename(os.getcwd())}")

# Change to root directory
os.chdir("../")
print(f"Current working directory (Changed): {os.path.basename(os.getcwd())}")

Current working directory: notebooks
Current working directory (Changed): SMS-Spam-Detector


In [2]:
from os.path import dirname, normpath

In [3]:
import numpy as np
import pandas as pd

In [4]:
from src.constants import CONFIGS, SCHEMA
from src.exception import CustomException
from src.logger import logger
from src.utils.basic_utils import create_directories, read_yaml

In [5]:
# Read the configuration files
configs = read_yaml(CONFIGS).data_standardizer
schema = read_yaml(SCHEMA)

# Column names, class names
col_names = schema.column_names
class_names = schema.class_names

# Train and test data paths
train_filepath = normpath(configs.train_data_path)
test_filepath = normpath(configs.test_data_path)

[2024-02-23 11:42:02 PM]:ProjectLogger INFO:basic_utils 44 - yaml file: conf\configs.yaml loaded successfully
[2024-02-23 11:42:02 PM]:ProjectLogger INFO:basic_utils 44 - yaml file: conf\schema.yaml loaded successfully


In [6]:
list(col_names.values())

['label', 'message']

In [7]:
col_names.label_col

'label'

In [8]:
list(class_names)

['spam', 'ham']

In [9]:
# Load the training set array
train_df, test_df = pd.read_csv(train_filepath), pd.read_csv(test_filepath)

# Get text and label columns
msg_col = col_names.text_col
label_col = col_names.label_col

# Split train_array into features and target
x_train, y_train = train_df[msg_col], train_df[label_col]
x_test, y_test = test_df[msg_col], test_df[label_col]


# Log the shapes
print(f"The shape of x_train: {x_train.shape}")
print(f"The shape of y_train: {y_train.shape}")
print(f"The shape of x_test: {x_test.shape}")
print(f"The shape of y_test: {y_test.shape}")

The shape of x_train: (1195,)
The shape of y_train: (1195,)
The shape of x_test: (299,)
The shape of y_test: (299,)


In [10]:
import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\quant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def extract_tokens(msg_str: str) -> list:
    """_summary_

    Args:
        msg_str (str): _description_

    Returns:
        list: _description_
    """
    en_stopwords = stopwords.words("english")

    nopunc_words = "".join(
        [char for char in msg_str if char not in string.punctuation]
    ).split()
    tokens = [word for word in nopunc_words if word.lower() not in en_stopwords]
    tokens_str = " ".join(tokens)
    return tokens_str


def get_tokens(messages: pd.Series) -> np.array:
    """_summary_

    Args:
        messages (pd.Series): _description_

    Returns:
        np.array: _description_
    """
    return np.array([extract_tokens(message) for message in messages])


def get_length(messages: pd.Series) -> np.array:
    """_summary_

    Args:
        messages (np.array): _description_

    Returns:
        np.array: _description_
    """
    return np.array([len(message) for message in messages]).reshape(-1, 1)

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [13]:
tfidf_pipeline = Pipeline(
    [
        ("tokenization", FunctionTransformer(get_tokens, validate=False)),
        ("bag_of_words", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
    ]
)

# Pipeline for scaling the length feature
length_pipeline = Pipeline(
    [
        ("length_extractor", FunctionTransformer(get_length, validate=False)),
        ("min_max_scaler", MinMaxScaler()),
    ]
)

# Combine the pipelines
standardizer = FeatureUnion(
    [("tfidf_transform", tfidf_pipeline), ("length_transform", length_pipeline)]
)

# Define ColumnTransformer
# column_transformer = ColumnTransformer(
#     [
#         ("tfidf_transform", tfidf_pipeline, "message"),
#         ("length_transform", length_pipeline, "message"),
#     ]
# )

In [14]:
type(standardizer)

sklearn.pipeline.FeatureUnion

In [15]:
x_train_normalized = standardizer.fit_transform(x_train)
x_test_normalized = standardizer.transform(x_test)

In [16]:
x_train_normalized.shape, x_test_normalized.shape

((1195, 4010), (299, 4010))

In [17]:
def ordinal_labels(class_names, data: pd.Series):
    class_map = class_names.to_dict()
    data_ordinal = list(map(lambda x: class_map.get(x), data))
    return np.array(data_ordinal)

In [18]:
y_train_ordinal = ordinal_labels(class_names, y_train)
y_test_ordinal = ordinal_labels(class_names, y_test)

In [19]:
y_train_arr = np.array(y_train_ordinal).reshape(-1, 1)
y_test_arr = np.array(y_test_ordinal).reshape(-1, 1)

In [20]:
y_train_arr.shape, y_test_arr.shape

((1195, 1), (299, 1))

In [21]:
from scipy.sparse import hstack

In [22]:
# Stack the arrays horizontally
train_sparse_matrix = hstack((x_train_normalized, y_train_arr)).tocsr()
test_sparse_matrix = hstack((x_test_normalized, y_test_arr)).tocsr()

In [23]:
train_sparse_matrix.shape, test_sparse_matrix.shape

((1195, 4011), (299, 4011))

In [24]:
type(train_sparse_matrix), type(train_sparse_matrix)

(scipy.sparse._csr.csr_matrix, scipy.sparse._csr.csr_matrix)

In [25]:
from scipy.sparse import save_npz, load_npz

In [26]:
# Output file paths
train_matrix_path = normpath(configs.train_matrix_path)
test_matrix_path = normpath(configs.test_matrix_path)

In [27]:
save_npz(train_matrix_path, train_sparse_matrix)
save_npz(test_matrix_path, test_sparse_matrix)

In [28]:
train_mat = load_npz(train_matrix_path)
test_mat = load_npz(test_matrix_path)

In [29]:
train_mat, type(train_mat)

(<1195x4011 sparse matrix of type '<class 'numpy.float64'>'
 	with 15160 stored elements in Compressed Sparse Row format>,
 scipy.sparse._csr.csr_matrix)

In [30]:
test_mat, type(test_mat)

(<299x4011 sparse matrix of type '<class 'numpy.float64'>'
 	with 3308 stored elements in Compressed Sparse Row format>,
 scipy.sparse._csr.csr_matrix)

In [31]:
x_train_mat = train_mat[:, :-1]
y_train_mat = train_mat[:, -1]

In [32]:
x_train_mat.shape, y_train_mat.shape

((1195, 4010), (1195, 1))

In [33]:
type(x_train_mat), type(y_train_mat)

(scipy.sparse._csr.csr_matrix, scipy.sparse._csr.csr_matrix)

In [34]:
np.array(y_train_mat.toarray().squeeze(), dtype="int32")

array([1, 1, 0, ..., 1, 1, 1])

## Another Preprocessor for Test

In [35]:
def extract_tokens_list(msg_str):
    return extract_tokens(msg_str).split(" ")

In [36]:
tfidf_pipeline_2 = Pipeline(
    [
        ("bag_of_words", CountVectorizer(analyzer=extract_tokens_list)),
        ("tfidf", TfidfTransformer()),
    ]
)

# Pipeline for scaling the length feature
length_pipeline_2 = Pipeline(
    [
        ("length_extractor", FunctionTransformer(get_length, validate=False)),
        ("min_max_scaler", MinMaxScaler()),
    ]
)

# Combine the pipelines
standardizer_2 = FeatureUnion(
    [("tfidf_transform", tfidf_pipeline_2), ("length_transform", length_pipeline_2)]
)

In [37]:
x_train_normalized_2 = standardizer_2.fit_transform(x_train)
x_test_normalized_2 = standardizer_2.transform(x_test)

In [38]:
x_train_normalized_2.shape

(1195, 4785)

In [39]:
# Stack the arrays horizontally
train_sparse_matrix_2 = hstack((x_train_normalized_2, y_train_arr)).tocsr()
test_sparse_matrix_2 = hstack((x_test_normalized_2, y_test_arr)).tocsr()

In [40]:
train_sparse_matrix_2.shape, test_sparse_matrix_2.shape

((1195, 4786), (299, 4786))

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
nb_classifier = MultinomialNB()
nb_classifier.fit(x_train_normalized, y_train_arr)

  y = column_or_1d(y, warn=True)


In [43]:
nb_classifier_2 = MultinomialNB()
nb_classifier_2.fit(x_train_normalized_2, y_train_arr)

  y = column_or_1d(y, warn=True)


In [44]:
preds_a = nb_classifier.predict(x_test_normalized)
preds_b = nb_classifier_2.predict(x_test_normalized_2)

In [45]:
from sklearn.metrics import (
    classification_report,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
)

In [46]:
print(classification_report(y_test_arr, preds_a))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       155
           1       0.96      0.93      0.94       144

    accuracy                           0.95       299
   macro avg       0.95      0.95      0.95       299
weighted avg       0.95      0.95      0.95       299



In [47]:
print(classification_report(y_test_arr, preds_b))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94       155
           1       0.96      0.91      0.94       144

    accuracy                           0.94       299
   macro avg       0.94      0.94      0.94       299
weighted avg       0.94      0.94      0.94       299



In [48]:
print(round(accuracy_score(y_test_arr, preds_a), 2))
print(round(precision_score(y_test_arr, preds_a), 2))
print(round(recall_score(y_test_arr, preds_a), 2))
print(round(f1_score(y_test_arr, preds_a), 2))

0.95
0.96
0.93
0.94


In [49]:
print(round(accuracy_score(y_test_arr, preds_b), 2))
print(round(precision_score(y_test_arr, preds_b), 2))
print(round(recall_score(y_test_arr, preds_b), 2))
print(round(f1_score(y_test_arr, preds_b), 2))

0.94
0.96
0.91
0.94


In [None]:
def get_tokens(messages: pd.Series) -> np.array:
    return np.array([extract_tokens(message) for message in messages])


def get_length(messages: pd.Series) -> np.array:
    """_summary_

    Args:
        messages (np.array): _description_

    Returns:
        np.array: _description_
    """
    return np.array([len(message) for message in messages]).reshape(-1, 1)

In [None]:
x_train.shape

(1195,)

In [None]:
# class ordinal mapping dictionary
class_map = class_names.to_dict()

# Convert target labels to numpy arrays
y_train_ord_arr = np.array([class_map.get(i) for i in y_train]).reshape(-1, 1)
y_test_ord_arr = np.array([class_map.get(i) for i in y_test]).reshape(-1, 1)

In [None]:
y_train_ord_arr

array([[1],
       [1],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [52]:
nb_classifier.classes_

array([0, 1])

In [61]:
class_names_rev = {v: k for k, v in class_names.to_dict().items()}
class_names_rev

{0: 'spam', 1: 'ham'}

In [62]:
classes = []
for cls_id in nb_classifier.classes_:
    if cls_id in class_names_rev:
        classes.append(class_names_rev[cls_id])

In [64]:
classes

['spam', 'ham']

In [58]:
for item in zip(class_names.to_dict().items(), nb_classifier.classes_):
    print(item)

(('spam', 0), 0)
(('ham', 1), 1)


In [69]:
def get_class_names(model_class, class_map):
    cls_names = {v: k for k, v in class_map.to_dict().items()}
    return [cls_names[cls_id] for cls_id in model_class if cls_id in cls_names]

In [70]:
get_class_names(nb_classifier.classes_, class_names)

['spam', 'ham']