## Creating the Model

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prais\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prais\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
dataframe_sms = pd.read_csv('sms_dataset.csv')
dataframe_sms['spam'] = dataframe_sms['spam'].map({'spam': 1, 'ham': 0})
dataframe_sms.head()

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print(f"Number of SMS: {len(dataframe_sms)}")
print(f"Proportion of spam SMSs: {dataframe_sms.spam.sum()/len(dataframe_sms):.4f}")
print(f"Proportion of ham SMSs: {1 - dataframe_sms.spam.sum()/len(dataframe_sms):.4f}")

Number of SMS: 5572
Proportion of spam SMSs: 0.1341
Proportion of ham SMSs: 0.8659


In [4]:
def preprocess_sms(df):
    df = df.sample(frac = 1, ignore_index = True, random_state = 42)
    X = df.text
    Y = df.spam.to_numpy()
    return X, Y

In [5]:
X, Y = preprocess_sms(dataframe_sms)

In [6]:
def preprocess_text(X):
    stop = set(stopwords.words('english') + list(string.punctuation))
    if isinstance(X, str):
        X = np.array([X])

    X_preprocessed = []
    for i, sms in enumerate(X):
        sms = np.array([i.lower() for i in word_tokenize(sms) if i.lower() not in stop]).astype(X.dtype)
        X_preprocessed.append(sms)
        
    if len(X) == 1:
        return X_preprocessed[0]
    return X_preprocessed

In [7]:
X_treated = preprocess_text(X)

In [8]:
TRAIN_SIZE = int(0.80*len(X_treated)) 

X_train = X_treated[:TRAIN_SIZE]
Y_train = Y[:TRAIN_SIZE]
X_test = X_treated[TRAIN_SIZE:]
Y_test = Y[TRAIN_SIZE:]

In [9]:
print(f"Proportion of spam in train dataset: {sum(Y_train == 1)/len(Y_train):.4f}")
print(f"Proportion of spam in test dataset: {sum(Y_test == 1)/len(Y_test):.4f}")

Proportion of spam in train dataset: 0.1324
Proportion of spam in test dataset: 0.1408


In [10]:
def get_word_frequency(X,Y):
    word_dict = {}

    num_sms = len(X)

    for i in range(num_sms):
        sms = X[i] 
        cls = Y[i] 
        sms = set(sms) 

        for word in sms:
            if word not in word_dict.keys():
                word_dict[word] = {"spam": 1, "ham": 1}

            if cls == 0:    
                word_dict[word]["ham"] += 1
            if cls == 1:
                word_dict[word]["spam"] += 1

    return word_dict

In [11]:
word_frequency = get_word_frequency(X_train,Y_train)
class_frequency = {'ham': sum(Y_train == 0), 'spam': sum(Y_train == 1)}

In [12]:
proportion_spam = class_frequency['spam']/(class_frequency['ham'] + class_frequency['spam'])
print(f"The proportion of spam SMS in training is: {proportion_spam:.4f}")
print(f"The proportion of ham SMS in training is: {1 - proportion_spam:.4f}")

The proportion of spam SMS in training is: 0.1324
The proportion of ham SMS in training is: 0.8676


In [13]:
def prob_word_given_class(word, cls, word_frequency, class_frequency):
    amount_word_and_class = word_frequency[word][cls]
    p_word_given_class = amount_word_and_class/class_frequency[cls]

    return p_word_given_class

In [14]:
def prob_sms_given_class(treated_sms, cls, word_frequency, class_frequency):
    prob = 1

    for word in treated_sms:
        if word in word_frequency.keys(): 
            prob *= prob_word_given_class(word, cls, word_frequency, class_frequency)

    return prob

In [15]:
def log_prob_sms_given_class(treated_sms, cls, word_frequency, class_frequency):
    prob = 0

    for word in treated_sms: 
        if word in word_frequency.keys(): 
            prob += np.log(prob_word_given_class(word, cls,word_frequency, class_frequency))

    return prob

In [16]:
def log_naive_bayes(treated_sms, word_frequency, class_frequency, return_likelihood = False):    
    log_prob_sms_given_spam = log_prob_sms_given_class(treated_sms, cls = 'spam',word_frequency = word_frequency, class_frequency = class_frequency) 
    log_prob_sms_given_ham = log_prob_sms_given_class(treated_sms, cls = 'ham',word_frequency = word_frequency, class_frequency = class_frequency) 

    p_spam = class_frequency['spam']/(class_frequency['ham'] + class_frequency['spam']) 
    p_ham = class_frequency['ham']/(class_frequency['ham'] + class_frequency['spam']) 

    log_spam_likelihood = np.log(p_spam) + log_prob_sms_given_spam 
    log_ham_likelihood = np.log(p_ham) + log_prob_sms_given_ham 

    if return_likelihood == True:
        return (log_spam_likelihood, log_ham_likelihood)

    if log_spam_likelihood >= log_ham_likelihood:
        return 1
    else:
        return 0

## Model Testing

In [17]:
def get_true_positives(Y_true, Y_pred):
    if len(Y_true) != len(Y_pred):
        return "Number of true labels and predict labels must match!"
    n = len(Y_true)
    true_positives = 0

    for i in range(n):
        true_label_i = Y_true[i]
        predicted_label_i = Y_pred[i]

        if true_label_i == 1 and predicted_label_i == 1:
            true_positives += 1
    return true_positives
        
def get_true_negatives(Y_true, Y_pred):
    if len(Y_true) != len(Y_pred):
        return "Number of true labels and predict labels must match!"
    n = len(Y_true)
    true_negatives = 0

    for i in range(n):
        true_label_i = Y_true[i]
        predicted_label_i = Y_pred[i]
        
        if true_label_i == 0 and predicted_label_i == 0:
            true_negatives += 1
    return true_negatives

In [18]:
Y_pred = []

for sms in X_test:
    prediction = log_naive_bayes(sms, word_frequency, class_frequency)
    Y_pred.append(prediction)

print(f"Y_test and Y_pred matches in length? Answer: {len(Y_pred) == len(Y_test)}")

Y_test and Y_pred matches in length? Answer: True


In [19]:
true_positives = get_true_positives(Y_test, Y_pred)
true_negatives = get_true_negatives(Y_test, Y_pred)
print(f"The number of true positives is: {true_positives}\nThe number of true negatives is: {true_negatives}")
accuracy = (true_positives + true_negatives)/len(Y_test)
print(f"Accuracy is: {accuracy:.4f}")

The number of true positives is: 155
The number of true negatives is: 831
Accuracy is: 0.8843


## Exporting Parameters

In [20]:
import pickle

with open('parameters.pkl', 'wb') as f:
    pickle.dump((word_frequency, class_frequency), f)

## Here's how to load these parameters as variables

with open('parameters.pkl', 'rb') as f:
    word_frequency, class_frequency = pickle.load(f)