In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

In [2]:

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sahit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
class NaiveBayes:
    def __init__(self, alpha=1):
        """
        Naive Bayes classifier for spam filtering using both multinomial and Bernoulli models.

        Args:
            alpha (float): Smoothing parameter. Default is 1.
        """
        self.alpha = alpha
        self.vectorizer = TfidfVectorizer()
        self.kf = KFold(n_splits=10, shuffle=True, random_state=42)

    def preprocess_text(self, text):
        """
        Preprocesses input text for feature extraction.

        Args:
            text (str): Input text.

        Returns:
            str: Preprocessed text.
        """
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word.isalnum()]
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed_tokens)

    def fit_predict(self, data):
        """
        Fits the Naive Bayes models, predicts and evaluates their performance.

        Args:
            data (pd.DataFrame): Input data containing 'Message' and 'Category' columns.

        Returns:
            float, float: Mean accuracy of multinomial Naive Bayes and Bernoulli Naive Bayes.
        """
        X = self.vectorizer.fit_transform(data['Message'])
        y = data['Category']

        mnb_scores = []
        bnb_scores = []

        for train_index, test_index in self.kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            mnb_y_pred = self.multinomial_naive_bayes(X_train, y_train, X_test)
            mnb_score = np.mean(mnb_y_pred == y_test)
            mnb_scores.append(mnb_score)

            bnb_y_pred = self.bernoulli_naive_bayes(X_train, y_train, X_test)
            bnb_score = np.mean(bnb_y_pred == y_test)
            bnb_scores.append(bnb_score)

            print("Debug Mode:")
            print("Multinomial Naive Bayes - Predicted:", mnb_y_pred)
            print("Bernoulli Naive Bayes - Predicted:", bnb_y_pred)
            print("True Labels:", y_test)
            print()

        mean_mnb_accuracy = np.mean(mnb_scores)
        mean_bnb_accuracy = np.mean(bnb_scores)

        return mean_mnb_accuracy, mean_bnb_accuracy

    def multinomial_naive_bayes(self, X_train, y_train, X_test):
        """
        Applies multinomial Naive Bayes model for classification.

        Args:
            X_train (sparse matrix): Training features.
            y_train (pd.Series): Training labels.
            X_test (sparse matrix): Testing features.

        Returns:
            np.array: Predicted labels.
        """
        X_spam = X_train[y_train == 'spam']
        X_ham = X_train[y_train == 'ham']

        # class priors
        spam_prior = X_spam.shape[0] / X_train.shape[0]
        ham_prior = X_ham.shape[0] / X_train.shape[0]

        # likelihoods in vocabulary
        spam_likelihood = np.array(X_spam.sum(axis=0))[0]
        ham_likelihood = np.array(X_ham.sum(axis=0))[0]

        # some smoothing ig
        spam_likelihood += self.alpha
        ham_likelihood += self.alpha

        # conditional probabilities
        spam_probs = np.log(spam_likelihood / (np.sum(spam_likelihood) + len(self.vectorizer.vocabulary_)))
        ham_probs = np.log(ham_likelihood / (np.sum(ham_likelihood) + len(self.vectorizer.vocabulary_)))

        spam_log_likelihoods = X_test.dot(spam_probs)
        ham_log_likelihoods = X_test.dot(ham_probs)

        spam_log_posterior = np.log(spam_prior) + spam_log_likelihoods
        ham_log_posterior = np.log(ham_prior) + ham_log_likelihoods

        y_pred = np.where(spam_log_posterior > ham_log_posterior, 'spam', 'ham')

        return y_pred

    def bernoulli_naive_bayes(self, X_train, y_train, X_test):
        """
        Applies Bernoulli Naive Bayes model for classification.

        Args:
            X_train (sparse matrix): Training features.
            y_train (pd.Series): Training labels.
            X_test (sparse matrix): Testing features.

        Returns:
            np.array: Predicted labels.
        """
        X_spam = X_train[y_train == 'spam']
        X_ham = X_train[y_train == 'ham']

        spam_prior = X_spam.shape[0] / X_train.shape[0]
        ham_prior = X_ham.shape[0] / X_train.shape[0]

        spam_likelihood = np.array(X_spam.sum(axis=0))[0]
        ham_likelihood = np.array(X_ham.sum(axis=0))[0]

        spam_likelihood += self.alpha
        ham_likelihood += self.alpha

        spam_probs = np.log(spam_likelihood / (np.sum(spam_likelihood)))
        ham_probs = np.log(ham_likelihood / (np.sum(ham_likelihood)))

        spam_log_likelihoods = X_test.dot(spam_probs)
        ham_log_likelihoods = X_test.dot(ham_probs)

        spam_log_posterior = np.log(spam_prior) + spam_log_likelihoods
        ham_log_posterior = np.log(ham_prior) + ham_log_likelihoods

        y_pred = np.where(spam_log_posterior > ham_log_posterior, 'spam', 'ham')

        return y_pred

In [4]:
def main():
    data = pd.read_csv(r'C:/Users/sahit/sip/spamham.csv')

    spam_filter = NaiveBayes(alpha=1)
    mean_mnb_accuracy, mean_bnb_accuracy = spam_filter.fit_predict(data)

    print("Multinomial Naive Bayes - Mean Accuracy:", mean_mnb_accuracy)
    print("Bernoulli Naive Bayes - Mean Accuracy:", mean_bnb_accuracy)

In [5]:

if __name__ == "__main__":
    main()

Debug Mode:
Multinomial Naive Bayes - Predicted: ['spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham'
 'spam' 'ham' 'spam' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham'
 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam'
 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham'
 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham