<a href="https://colab.research.google.com/github/stanleystzhao/Algorithms/blob/main/Naive_Bayes_Spam_Filter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes Classifier

Performs classification using the Naive Bayes algorithm.

Input: `spam.csv` (read from the notebook's working directory)

We first train a classifier with a pre-built library (scikit-learn's `MultinomialNB`),
then code up the Naive Bayes part ourselves,
and finally compare our result with the pre-built library's.

## 1. Libraries and data import

In [None]:
import pandas as pd
import numpy as np

# function to split the data into training and testing
from sklearn.model_selection import train_test_split
# function to convert a collection of text documents to a matrix of token counts
# this is called feature extraction
from sklearn.feature_extraction.text import CountVectorizer
# for testing purposes and to compare our results with the pre-built library
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the CSV (latin-1 encoded) and discard the three trailing "Unnamed"
# columns, which the original author notes are empty.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data = pd.read_csv('spam.csv', encoding='latin-1').drop(columns=unused_columns)

# Give the two remaining columns meaningful names.
data.columns = ['label', 'text']

# Inspect the frame, then summarise the messages per label.
print(data)

label_summary = data.groupby('label').describe()
print(label_summary)


     label                                               text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]
       text                                                               
      count unique                                                top freq
label                                                                     
ham   

## 2. Text preprocessing and feature extraction

In [None]:
# Encode the label as a binary target: 1 where the label is "spam", 0 otherwise.
data["spam"] = (data["label"] == "spam").astype("int64")

In [None]:
# Split the data into training and testing sets: 75% for training, 25% for testing.
# random_state pins the shuffle so that Restart Kernel -> Run All reproduces the
# same split (and therefore the same accuracy figures further down).
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["spam"], test_size=0.25, random_state=42
)

# Inspect the training messages.
X_train.describe()

Unnamed: 0,text
count,4179
unique,3926
top,"Sorry, I'll call later"
freq,25


In [None]:
# Feature extraction: learn the vocabulary from the training messages and
# encode every message as a row of per-word counts.
cv = CountVectorizer()
X_train_counts = cv.fit_transform(X_train)

X_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## 3. Training with Multinomial Naive Bayes

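Why multinomial? Our features are word counts, which are discrete, and Multinomial NB models discrete counts better than Gaussian NB does.

By contrast, features such as the length and width of Iris flowers are continuous, so Gaussian NB would be the natural choice there.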


In [None]:
# Fit scikit-learn's multinomial Naive Bayes on the training word counts;
# fit() returns the estimator itself, which is then displayed as the cell output.
model = MultinomialNB().fit(X_train_counts, y_train)
model

### Pre-testing

In [None]:
# Two hand-written messages for a quick sanity check of the fitted model.
sample_spam = ["money cheap click reward"]
sample_ham = ["hey wanna meet tomorrow"]

# Encode them with the vocabulary learned from the training set.
sample_spam_counts = cv.transform(sample_spam)
sample_ham_counts = cv.transform(sample_ham)

# Each sample holds a single message, so the first prediction is the only one.
spam_flagged = model.predict(sample_spam_counts)[0] == 1
print("spam successfully detected" if spam_flagged else "spam not detected")

ham_passed = model.predict(sample_ham_counts)[0] == 0
print("ham successfully detected" if ham_passed else "ham not detected")

spam successfully detected
ham successfully detected


### Testing

In [None]:
# Encode the held-out messages with the vocabulary learned during training.
X_test_counts = cv.transform(X_test)

test_accuracy = model.score(X_test_counts, y_test)
print('Test results: Model has a' , test_accuracy*100, '% accuracy\n')


def confusion_matrix(y_test, y_pred):
    """Cross-tabulate actual labels (rows) against predicted labels (columns)."""
    return pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])


test_predictions = model.predict(X_test_counts)
print("Confusion matrix:")
print(confusion_matrix(y_test, test_predictions))

Test results: Model has a 98.49246231155779 % accuracy

Confusion matrix:
Predicted     0    1
Actual              
0          1199    9
1            12  173


## 4. Multinomial NB coded up: roughly what happens when you fit and use `MultinomialNB`


In [None]:
from collections import defaultdict

# Works directly on raw message strings (X_train / X_test) and their labels;
# it tokenizes and counts words itself, so no CountVectorizer output is needed.
class NaiveBayes:
    """Hand-written multinomial Naive Bayes spam/ham classifier.

    Messages are lower-cased and split on whitespace. Word likelihoods use
    Laplace (add-one) smoothing, and scoring is done in log space.

    Labels may be given either as the binary encoding (1 = spam, anything
    else = ham) or as the strings 'spam' / 'ham'. Predictions are always
    returned as 1 (spam) or 0 (ham).

    Attributes:
        priors: class name -> fraction of training documents in that class.
        likelihoods: word -> {'spam': count, 'ham': count} over training data.
        vocab: set of every distinct training word.
        total_words: class name -> total number of word tokens seen.
        total_docs: class name -> number of training documents seen.
    """

    def __init__(self):
        self.priors = {}
        self.likelihoods = defaultdict(lambda: {'spam': 0, 'ham': 0})
        self.vocab = set()
        self.total_words = {'spam': 0, 'ham': 0}
        self.total_docs = {'spam': 0, 'ham': 0}

    @staticmethod
    def _is_spam(label):
        """Return True when `label` denotes spam (the string 'spam' or the value 1)."""
        if isinstance(label, str):
            return label == 'spam'
        return label == 1

    def preprocess(self, message):
        """Tokenize a message: lower-case it and split on whitespace."""
        return message.lower().split()

    def train(self, X_train, y_train):
        """Accumulate word/document counts and compute the class priors.

        Counts are added to whatever the instance already holds, so calling
        train() again continues training rather than starting over.

        Args:
            X_train: iterable of message strings.
            y_train: iterable of labels aligned with X_train (1/0 or 'spam'/'ham').

        Raises:
            ZeroDivisionError: if no training documents have been seen at all.
        """
        for label, message in zip(y_train, X_train):
            # Without this normalisation, string labels would all fall into
            # the 'ham' bucket because 'spam' == 1 is False.
            label = 'spam' if self._is_spam(label) else 'ham'
            self.total_docs[label] += 1

            words = self.preprocess(message)
            self.vocab.update(words)
            self.total_words[label] += len(words)

            for word in words:
                self.likelihoods[word][label] += 1

        # Compute priors
        total_docs = sum(self.total_docs.values())
        self.priors['spam'] = self.total_docs['spam'] / total_docs
        self.priors['ham'] = self.total_docs['ham'] / total_docs

    def predict(self, X_test):
        """Classify each message and return a list of 1 (spam) / 0 (ham).

        Ties go to ham. The trained state is not modified by prediction.
        """
        # Laplace-smoothing denominators do not depend on the message.
        vocab_size = len(self.vocab)
        spam_denominator = self.total_words['spam'] + vocab_size
        ham_denominator = self.total_words['ham'] + vocab_size
        unseen_counts = {'spam': 0, 'ham': 0}

        # A class with no training documents has prior 0; log(0) = -inf is the
        # intended result there, so silence numpy's divide warning.
        with np.errstate(divide='ignore'):
            log_prior_spam = np.log(self.priors['spam'])
            log_prior_ham = np.log(self.priors['ham'])

        predictions = []
        for message in X_test:
            log_prob_spam = log_prior_spam
            log_prob_ham = log_prior_ham

            for word in self.preprocess(message):
                # .get() avoids the defaultdict inserting every unseen test
                # word into the trained likelihood table.
                counts = self.likelihoods.get(word, unseen_counts)
                log_prob_spam += np.log((counts['spam'] + 1) / spam_denominator)
                log_prob_ham += np.log((counts['ham'] + 1) / ham_denominator)

            # Predict the class with the higher log probability
            predictions.append(1 if log_prob_spam > log_prob_ham else 0)
        return predictions

    def score(self, X_test, y_test):
        """Return the fraction of messages in X_test classified correctly.

        y_test may be any iterable of labels (list, array, pandas Series) in
        either encoding accepted by train().

        Raises:
            ValueError: if X_test and y_test have different lengths.
        """
        y_pred = np.asarray(self.predict(X_test))
        # Normalise to 0/1 element by element; comparing a plain list against
        # a plain list with == would yield one bool instead of per-item results.
        y_true = np.array([1 if self._is_spam(label) else 0 for label in y_test])
        if len(y_pred) != len(y_true):
            raise ValueError("X_test and y_test must have the same length")
        return np.mean(y_pred == y_true)


In [None]:
# Testing with the given dataset.
# NaiveBayes.train treats a label as spam only when it equals 1, so the binary
# 'spam' column is used as the target rather than the string 'label' column.
# random_state pins the shuffle so the reported accuracy is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['spam'], test_size=0.25, random_state=42
)

model = NaiveBayes()
model.train(X_train, y_train)

In [None]:
# Accuracy of the hand-written classifier on the held-out messages, in percent.
accuracy_pct = model.score(X_test, y_test) * 100
print('Test results: Model has a' , accuracy_pct, '% accuracy\n')

Test results: Model has a 97.5592246949031 % accuracy

