In [1]:
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import pandas as pd

In [2]:
def preprocess_sms(df, random_state=42):
    """Shuffle the SMS dataframe and split it into texts and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing a ``text`` column and a ``spam`` column. The caller's
        frame is not modified; a shuffled copy is used internally.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample``. The default reproduces the
        shuffle this function always performed; pass another value (or
        ``None``) to get a different ordering.

    Returns
    -------
    X : pandas.Series
        The ``text`` column in shuffled order, re-indexed from 0.
    Y : numpy.ndarray
        The ``spam`` column in the same shuffled order.
    """
    # frac=1 draws every row once, i.e. a full permutation of the frame;
    # ignore_index=True renumbers the rows 0..n-1 so positional slicing works.
    shuffled = df.sample(frac=1, ignore_index=True, random_state=random_state)
    return shuffled.text, shuffled.spam.to_numpy()

In [3]:
def preprocess_text(X):
    """Tokenize, lower-case and filter one message or a collection of messages.

    Each message is split with ``word_tokenize``; every token is lower-cased
    and dropped if it is an NLTK English stop word or a single character from
    ``string.punctuation``.

    Parameters
    ----------
    X : str or iterable of str
        A single message, or any iterable of messages (list, pandas Series,
        numpy array, ...).

    Returns
    -------
    numpy.ndarray or list of numpy.ndarray
        For a ``str`` input, one 1-D object array of tokens. For any other
        input, a list with one such array per message, in input order.

    Notes
    -----
    The single-vs-batch decision is based on the *type* of the input, not on
    its length: a batch that happens to contain exactly one message still
    comes back as a one-element list, so callers that iterate over the result
    always see messages rather than the tokens of the only message.
    """
    stop = set(stopwords.words('english') + list(string.punctuation))

    single_message = isinstance(X, str)
    messages = [X] if single_message else X

    X_preprocessed = []
    for sms in messages:
        lowered = (token.lower() for token in word_tokenize(sms))
        kept = [token for token in lowered if token not in stop]
        # Object dtype keeps tokens as full-length Python strings regardless
        # of what container/dtype the messages arrived in.
        X_preprocessed.append(np.array(kept, dtype=object))

    if single_message:
        return X_preprocessed[0]
    return X_preprocessed

In [4]:
def get_word_list(X):
    """Build the vocabulary (word -> column index) of a tokenized corpus.

    Parameters
    ----------
    X : iterable of iterables of hashable tokens
        One token sequence per message.

    Returns
    -------
    indexed_word_list : dict
        Maps every distinct token to a unique index in
        ``range(word_list_length)``. Indices follow first appearance when
        scanning messages, and tokens within a message, in order, so the
        mapping is identical from run to run for the same input.
    word_list_length : int
        Number of distinct tokens.
    """
    indexed_word_list = {}
    for sms in X:
        for word in sms:
            # Dict membership is O(1); the next free index is simply the
            # current vocabulary size.
            if word not in indexed_word_list:
                indexed_word_list[word] = len(indexed_word_list)

    return indexed_word_list, len(indexed_word_list)

In [5]:
def sms_matrix(word_list_length, indexed_word_list, X_treated):
    """Turn tokenized messages into a dense word-count matrix.

    Parameters
    ----------
    word_list_length : int
        Number of columns of the result (vocabulary size).
    indexed_word_list : dict
        Maps a token to its column index.
    X_treated : sequence of iterables of tokens
        One token sequence per message.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(X_treated), word_list_length)`` whose
        entry ``[r, c]`` counts occurrences of the token with index ``c`` in
        message ``r``. Tokens missing from ``indexed_word_list`` are ignored.
    """
    counts = np.zeros((len(X_treated), word_list_length), dtype=int)

    for row, tokens in enumerate(X_treated):
        # Resolve known tokens to their columns first, then tally them.
        columns = [indexed_word_list[tok] for tok in tokens if tok in indexed_word_list]
        for col in columns:
            counts[row, col] += 1

    return counts

In [6]:
def sms_vector(word_list_length, indexed_word_list, X_treated):
    """Turn one tokenized message into a word-count vector.

    Parameters
    ----------
    word_list_length : int
        Length of the result (vocabulary size).
    indexed_word_list : dict
        Maps a token to its position in the vector.
    X_treated : iterable of tokens
        The tokens of a single message.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``word_list_length`` with the number of
        occurrences of each known token; unknown tokens are ignored.
    """
    counts = np.zeros(word_list_length, dtype=int)

    known_positions = (
        indexed_word_list[tok] for tok in X_treated if tok in indexed_word_list
    )
    for pos in known_positions:
        counts[pos] += 1

    return counts

In [7]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like of numbers

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``z``, each value in ``[0, 1]``.
    """
    # log(1 + exp(-z)) is computed by logaddexp(0, -z) without ever forming
    # exp(-z) directly, so very negative z no longer overflows (the naive form
    # evaluates exp(+large) -> inf and emits a RuntimeWarning).
    return np.exp(-np.logaddexp(0.0, np.negative(z)))
def initialize_parameters(word_list_length, seed=None):
    """Create the initial weights and bias of the logistic-regression model.

    Parameters
    ----------
    word_list_length : int
        Number of weights (one per vocabulary entry).
    seed : int or None, default None
        When ``None`` the weights are drawn from numpy's global random state,
        exactly as before. When given, a dedicated generator seeded with this
        value is used so the initialisation is reproducible.

    Returns
    -------
    w : numpy.ndarray
        ``word_list_length`` standard-normal draws scaled by 0.01.
    b : float
        Bias, initialised to ``0.0``.
    """
    if seed is None:
        w = np.random.randn(word_list_length) * 0.01
    else:
        w = np.random.default_rng(seed).standard_normal(word_list_length) * 0.01
    b = 0.0
    return w, b

In [8]:
def forward_pass(X_matrix, w, b):
    """Compute the model output for every row of ``X_matrix``.

    The linear score ``X_matrix . w + b`` is passed through ``sigmoid`` and
    the result is returned.
    """
    return sigmoid(np.dot(X_matrix, w) + b)

In [9]:
def cost_func(Y_hat, Y, eps=1e-15):
    """Mean binary cross-entropy between predictions and labels.

    Parameters
    ----------
    Y_hat : numpy.ndarray
        Predicted probabilities.
    Y : numpy.ndarray
        Labels; its first dimension is used as the number of examples.
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns
    -------
    float
        ``sum(loss) / m`` where ``m = Y.shape[0]``.
    """
    m = Y.shape[0]

    # A prediction of exactly 0 or 1 would make log() return -inf, and
    # 0 * -inf is nan, which would poison the whole sum. Clipping keeps every
    # term finite while leaving ordinary probabilities effectively unchanged.
    Y_hat = np.clip(Y_hat, eps, 1 - eps)

    loss = -(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat))
    cost = np.sum(loss) / m

    return cost

In [10]:
def gradient_descent(w, b, X_matrix, Y, Y_hat, learning_rate):
    """Apply one gradient-descent update to the weights and the bias.

    Parameters
    ----------
    w, b
        Current weights and bias.
    X_matrix : numpy.ndarray
        Feature matrix, one row per example.
    Y, Y_hat : numpy.ndarray
        Labels and current predictions.
    learning_rate : float
        Step size.

    Returns
    -------
    tuple
        The updated ``(w, b)``; the inputs are not modified in place.
    """
    n_examples = Y.shape[0]

    grad_w = np.dot(X_matrix.T, (Y_hat - Y)) / n_examples
    grad_b = np.sum(Y_hat - Y) / n_examples

    return w - learning_rate * grad_w, b - learning_rate * grad_b

In [11]:
# Load the raw SMS data and encode the label column numerically
# ('spam' -> 1, 'ham' -> 0) in a single expression.
dataframe_sms = pd.read_csv('sms_dataset.csv').assign(
    spam=lambda frame: frame['spam'].map({'spam': 1, 'ham': 0})
)
dataframe_sms.head()

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Dataset size and class balance; the spam share is computed once and reused.
num_sms = len(dataframe_sms)
spam_share = dataframe_sms.spam.sum()/num_sms
print(f"Number of SMS: {num_sms}")
print(f"Proportion of spam SMSs: {spam_share:.4f}")
print(f"Proportion of ham SMSs: {1 - spam_share:.4f}")

Number of SMS: 5572
Proportion of spam SMSs: 0.1341
Proportion of ham SMSs: 0.8659


In [13]:
# Shuffle the dataset and separate message texts (X) from labels (Y) ...
X, Y = preprocess_sms(dataframe_sms)
# ... then tokenize and filter every message; X_treated holds one token
# array per message, in the same order as Y.
X_treated = preprocess_text(X)

In [14]:
# Positional 80/20 split: the first TRAIN_SIZE messages train the model, the
# remainder is held out for evaluation.
TRAIN_SIZE = int(0.80 * len(X_treated))

X_train, X_test = X_treated[:TRAIN_SIZE], X_treated[TRAIN_SIZE:]
Y_train, Y_test = Y[:TRAIN_SIZE], Y[TRAIN_SIZE:]

In [15]:
# Check that both partitions have a similar share of spam (label == 1).
print(f"Proportion of spam in train dataset: {(Y_train == 1).mean():.4f}")
print(f"Proportion of spam in test dataset: {(Y_test == 1).mean():.4f}")

Proportion of spam in train dataset: 0.1324
Proportion of spam in test dataset: 0.1408


In [16]:
# Build the vocabulary from the TRAINING messages only. A word that appears
# solely in held-out messages would otherwise get a column that is all zeros
# in the training matrix: its gradient is always zero, so its weight stays at
# the random initial value and only adds noise to the test predictions.
# sms_matrix / sms_vector already skip tokens that are not in the vocabulary.
indexed_word_list, word_list_length = get_word_list(X_train)

In [17]:
# Word-count matrix for the training messages: one row per message, one
# column per vocabulary entry.
X_train_matrix = sms_matrix(word_list_length, indexed_word_list, X_train)

In [18]:
def logistic_regression(word_list_length, X_train_matrix, Y_train, num_epochs = 1500, learning_rate = 0.1, log_every = 1):
    """Fit the logistic-regression model with full-batch gradient descent.

    Parameters
    ----------
    word_list_length : int
        Number of features / weights.
    X_train_matrix : numpy.ndarray
        Training feature matrix, one row per example.
    Y_train : numpy.ndarray
        Training labels; flattened to 1-D before use.
    num_epochs : int, default 1500
        Number of gradient-descent updates.
    learning_rate : float, default 0.1
        Step size of each update.
    log_every : int or None, default 1
        Print the training loss every ``log_every`` epochs. The default
        reports every epoch, as before; a larger value thins the log and a
        falsy value (``0`` / ``None``) silences it. The loss is only computed
        on epochs where it is printed.

    Returns
    -------
    w, b
        The weights and bias after the final update.
    """
    Y_train = Y_train.flatten()

    w, b = initialize_parameters(word_list_length)

    for epoch in range(num_epochs):
        Y_hat = forward_pass(X_train_matrix, w, b)
        if log_every and epoch % log_every == 0:
            # The cost is purely informational: the update below depends only
            # on Y_hat, so skipping it on silent epochs changes nothing else.
            cost = cost_func(Y_hat, Y_train)
            print(f"Loss at {epoch} epoch is {cost}")
        w, b = gradient_descent(w, b, X_train_matrix, Y_train, Y_hat, learning_rate)

    return w, b

In [19]:
# Train with the default settings (1500 epochs, learning rate 0.1); the loss
# is printed once per epoch.
w, b = logistic_regression(word_list_length, X_train_matrix, Y_train)

Loss at 0 epoch is 0.6928515734919057
Loss at 1 epoch is 0.675604325541917
Loss at 2 epoch is 0.659389842769489
Loss at 3 epoch is 0.6441411094005074
Loss at 4 epoch is 0.6297944751637482
Loss at 5 epoch is 0.6162897546236273
Loss at 6 epoch is 0.6035702570423886
Loss at 7 epoch is 0.5915827589994036
Loss at 8 epoch is 0.5802774315082676
Loss at 9 epoch is 0.5696077323974694
Loss at 10 epoch is 0.559530273443177
Loss at 11 epoch is 0.5500046703326357
Loss at 12 epoch is 0.5409933821270027
Loss at 13 epoch is 0.532461545574011
Loss at 14 epoch is 0.5243768084459434
Loss at 15 epoch is 0.5167091650692728
Loss at 16 epoch is 0.5094307963705316
Loss at 17 epoch is 0.5025159160774855
Loss at 18 epoch is 0.49594062416799495
Loss at 19 epoch is 0.48968276823117723
Loss at 20 epoch is 0.48372181307694784
Loss at 21 epoch is 0.4780387186826056
Loss at 22 epoch is 0.47261582638292565
Loss at 23 epoch is 0.4674367530796995
Loss at 24 epoch is 0.4624862931565648
Loss at 25 epoch is 0.4577503277260

In [20]:
# Word-count matrix for the held-out messages, using the same vocabulary (and
# therefore the same column layout) as the training matrix.
X_test_matrix = sms_matrix(word_list_length, indexed_word_list, X_test)

In [52]:
# Predicted spam probability for every held-out message.
Y_hat_test = forward_pass(X_test_matrix, w, b)
# Classify as spam (1) when the probability reaches 0.25. Of the cut-offs
# compared in the threshold sweep further down, 0.25 gives the highest F1.
# NOTE(review): that sweep is scored on this same test set, so the reported
# metrics are likely optimistic - consider choosing the cut-off on a separate
# validation split.
Y_pred_test = (Y_hat_test >= 0.25).astype(int)

In [53]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the thresholded test predictions with the four standard metrics in a
# single pass; every scorer receives the same (Y_test, Y_pred_test) pair.
accuracy, precision, recall, f1 = (
    scorer(Y_test, Y_pred_test)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy on test dataset: {accuracy:.4f}")
print(f"Precision on test dataset: {precision:.4f}")
print(f"Recall on test dataset: {recall:.4f}")
print(f"F1 Score on test dataset: {f1:.4f}")

Accuracy on test dataset: 0.9650
Precision on test dataset: 0.9214
Recall on test dataset: 0.8217
F1 Score on test dataset: 0.8687


In [54]:
# Compare precision / recall / F1 across several decision thresholds.
# Loop-local names (suffix _t) are used on purpose: reusing Y_pred_test,
# precision, recall and f1 here would silently replace the values computed by
# the earlier evaluation cell with those of the last threshold in the list.
# NOTE(review): this comparison is scored on the test set; a threshold picked
# from it should ideally be confirmed on data not used for the selection.
thresholds = [0.1, 0.25, 0.5, 0.75]
for t in thresholds:
    Y_pred_t = (Y_hat_test >= t).astype(int)
    precision_t = precision_score(Y_test, Y_pred_t)
    recall_t = recall_score(Y_test, Y_pred_t)
    f1_t = f1_score(Y_test, Y_pred_t)
    print(f"Threshold: {t}, Precision: {precision_t:.4f}, Recall: {recall_t:.4f}, F1 Score: {f1_t:.4f}")

Threshold: 0.1, Precision: 0.6890, Recall: 0.9172, F1 Score: 0.7869
Threshold: 0.25, Precision: 0.9214, Recall: 0.8217, F1 Score: 0.8687
Threshold: 0.5, Precision: 0.9402, Recall: 0.7006, F1 Score: 0.8029
Threshold: 0.75, Precision: 0.9865, Recall: 0.4650, F1 Score: 0.6320
