In [None]:
###### IMPORTS ######
import pandas as pd
import nltk
import ssl

from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split

#Work around SSL certificate errors when NLTK fetches its data files.
#NOTE(security): this replaces the default HTTPS context factory for the whole
#process, so certificate verification is disabled for every later HTTPS request
#made through the ssl module's default context, not only for the NLTK download.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

#nltk downloads
#'punkt' is the tokenizer model used by older NLTK releases; newer releases load
#'punkt_tab' instead, so fetch both to keep word_tokenize working on either.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)


In [None]:
#Load the spam dataset from the dataset/ folder (path is relative to the working directory).
spam_csv_path = 'dataset/spam.csv'
df = pd.read_csv(spam_csv_path)

In [None]:
#Quick look at the raw data: the first 5 rows, the (rows, columns) shape,
#and how many rows fall into each Category.
for overview in (df.head(), df.shape, df['Category'].value_counts()):
    print(overview)


In [None]:
###### CLEANING THE DATA ######

#Drop rows that are missing the label or the message text. Restricting the check
#to the two columns the model uses avoids discarding rows because of a NaN in
#some unrelated column. The explicit copy makes the result an independent frame,
#so adding the 'tokens' column below cannot trigger a SettingWithCopyWarning.
df = df.dropna(subset=['Category', 'Message']).copy()

#Tokenise each message with NLTK, keep only purely alphabetic tokens (this drops
#punctuation and anything containing digits) and lowercase them, in one pass.
df['tokens'] = df['Message'].apply(
    lambda message: [
        token.lower()
        for token in nltk.word_tokenize(message)
        if token.isalpha()
    ]
)


In [None]:
class NaiveBayesClassifierNoSmoothing:
    """Naive Bayes classifier over tokenised documents, without smoothing.

    Each document is a sequence of word tokens. Because no smoothing is applied,
    a word that was seen during training but never in a given class has
    probability 0 for that class, which drives that class's log-score to -inf.

    Attributes:
        priors: maps label -> fraction of training documents with that label.
        conditionals: maps label -> word -> (count of word in the label's
            documents) / (total number of words in the label's documents).
        vocab: set of every word seen in any training document.
    """

    def __init__(self) -> None:
        #dictionary for prior probabilities
        self.priors = defaultdict(float)
        #dictionary for conditional probabilities
        self.conditionals = defaultdict(lambda: defaultdict(float))
        #vocab
        self.vocab = set()

    def train_model(self, x, y):
        """Estimate priors and per-class word probabilities from training data.

        Any state left over from a previous call is discarded first, so the
        model always reflects only the most recent training set.

        Args:
            x: sized iterable of documents, each an iterable of word tokens
                (e.g. a list of lists or a pandas Series of lists).
            y: sized iterable of labels; documents and labels are paired by
                position.

        Raises:
            ValueError: if x and y have different lengths.
        """
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )

        #start from a clean slate so retraining does not mix in stale statistics
        self.priors = defaultdict(float)
        self.conditionals = defaultdict(lambda: defaultdict(float))
        self.vocab = set()

        #single pass over the data: count documents per label and words per label
        class_counts = defaultdict(int)
        word_counts = defaultdict(lambda: defaultdict(int))
        for tokens, label in zip(x, y):
            class_counts[label] += 1
            label_word_counts = word_counts[label]
            for word in tokens:
                label_word_counts[word] += 1
                self.vocab.add(word)

        total_docs = len(y)
        for label, doc_count in class_counts.items():
            #prior = share of training documents carrying this label
            self.priors[label] = doc_count / total_docs

            #conditional = word frequency relative to all words of this label;
            #a label whose documents are all empty has no words, so the loop
            #body (and the division) is never reached for it
            label_word_counts = word_counts[label]
            total_words = sum(label_word_counts.values())
            for word, count in label_word_counts.items():
                self.conditionals[label][word] = count / total_words

    def predict(self, x):
        """Return the most probable label for one tokenised document.

        Words that are not in the training vocabulary are ignored. Scores are
        summed in log space; ties (including the case where every class scores
        -inf) resolve to the label that was encountered first during training.
        The model is not modified by this call.

        Args:
            x: iterable of word tokens.

        Returns:
            The label with the highest log-score.

        Raises:
            ValueError: if the model has not been trained (no known labels).
        """
        if not self.priors:
            raise ValueError(
                "predict() called before train_model(): no classes are known"
            )

        #log-score per label
        probs = {}
        for label, prior in self.priors.items():
            #.get() instead of [] so lookups never insert keys into the model
            label_conditionals = self.conditionals.get(label, {})
            score = np.log(prior)
            for word in x:
                if word not in self.vocab:
                    continue
                likelihood = label_conditionals.get(word, 0.0)
                if likelihood <= 0.0:
                    #word never occurred in this class: without smoothing the
                    #class probability is 0, i.e. log-score -inf. Set it
                    #directly rather than calling np.log(0), which would emit a
                    #RuntimeWarning; no later term can change -inf, so stop.
                    score = -np.inf
                    break
                score += np.log(likelihood)
            probs[label] = score

        return max(probs, key=probs.get)

In [None]:
#split the data into training (80%) and testing (20%)
#random_state pins the shuffle so the split, and therefore the accuracy printed
#below, is the same every time the notebook is re-run from a fresh kernel
x_train, x_test, y_train, y_test = train_test_split(
    df['tokens'], df['Category'], test_size=0.2, random_state=42
)

#instantiate the model
model = NaiveBayesClassifierNoSmoothing()

#train the model
model.train_model(x_train, y_train)

#make predictions: one predicted label per test document, in x_test order
predictions = [model.predict(row) for row in x_test]

#count the predictions that match the true label (paired by position, since
#x_test and y_test come from the same split and share the same row order)
correct = sum(predicted == actual for predicted, actual in zip(predictions, y_test))

#calculate the accuracy: fraction of test documents classified correctly
accuracy = correct / len(predictions)

print(accuracy)