In [5]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from math import log

class MultinomialNaiveBayes:
    """Multinomial Naive Bayes text classifier with additive smoothing.

    Texts are tokenised by `preprocess_text`; per-class word probabilities
    are estimated from word counts and combined in log space at prediction
    time.

    Attributes:
        alpha: Additive smoothing constant. With ``alpha == 0`` a word that a
            class never produced gets probability 0, and ``log`` raises when
            that word is scored, so a positive value is expected.
        class_priors: ``{class: P(class)}`` estimated from label frequencies.
        feature_probs: ``{class: {word: P(word | class)}}`` over the vocabulary.
        vocabulary: Set of every token seen during the last `fit`.
        classes: Class labels in the order they first appear in ``y``.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_priors = {}
        self.feature_probs = {}
        self.vocabulary = set()
        self.classes = []

    def preprocess_text(self, text):
        """Lower-case `text`, drop non-letter characters and split on whitespace.

        Characters other than ASCII letters and whitespace are removed without
        inserting a separator, so ``"don't"`` becomes ``"dont"``.

        Returns:
            list[str]: The tokens, in order of appearance.
        """
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text.split()

    def fit(self, X, y):
        """Estimate class priors and smoothed word probabilities.

        Any state from a previous call to `fit` is discarded first, so the
        model always reflects only the data passed to the latest call.

        Args:
            X: Sequence of raw text strings.
            y: Sequence of hashable class labels, same length as `X`.

        Raises:
            ValueError: If `X` and `y` have different lengths.
        """
        n_samples = len(X)
        if n_samples != len(y):
            raise ValueError(
                f"X and y must have the same length, got {n_samples} and {len(y)}"
            )

        # Start from a clean slate so refitting does not inherit the old
        # vocabulary or stale per-class entries.
        self.class_priors = {}
        self.feature_probs = {}
        self.vocabulary = set()
        # First-seen order is deterministic (unlike iterating a set), which
        # makes tie-breaking in predict_single reproducible.
        self.classes = list(dict.fromkeys(y))

        class_counts = Counter(y)
        for cls in self.classes:
            self.class_priors[cls] = class_counts[cls] / n_samples

        class_word_counts = defaultdict(Counter)
        class_total_words = Counter()
        for text, label in zip(X, y):
            words = self.preprocess_text(text)
            self.vocabulary.update(words)
            class_word_counts[label].update(words)
            class_total_words[label] += len(words)

        vocab_size = len(self.vocabulary)
        for cls in self.classes:
            word_counts = class_word_counts[cls]
            # The smoothed denominator is the same for every word of a class.
            denominator = class_total_words[cls] + self.alpha * vocab_size
            self.feature_probs[cls] = {
                word: (word_counts[word] + self.alpha) / denominator
                for word in self.vocabulary
            }

    def _log_scores(self, text):
        """Return ``{class: log P(class) + sum(count * log P(word | class))}``.

        Words that are not in the training vocabulary are ignored.

        Raises:
            ValueError: If the model has no classes (it was never fitted).
        """
        if not self.classes:
            raise ValueError("MultinomialNaiveBayes must be fitted before predicting")

        word_counts = Counter(self.preprocess_text(text))
        scores = {}
        for cls in self.classes:
            word_probs = self.feature_probs[cls]
            score = log(self.class_priors[cls])
            for word, count in word_counts.items():
                if word in self.vocabulary:
                    score += count * log(word_probs[word])
            scores[cls] = score
        return scores

    def predict_single(self, text):
        """Return the class with the highest log score for one text.

        On a tie, the class that comes first in `self.classes` wins.
        """
        class_scores = self._log_scores(text)
        return max(class_scores, key=class_scores.get)

    def predict(self, X):
        """Return a list with the predicted class for each text in `X`."""
        return [self.predict_single(text) for text in X]

    def predict_proba(self, X):
        """Return one ``{class: probability}`` dict per text in `X`.

        The log scores are normalised with a softmax; the maximum score is
        subtracted before exponentiating so that ``exp`` cannot overflow.
        """
        probabilities = []
        for text in X:
            class_scores = self._log_scores(text)
            max_score = max(class_scores.values())
            exp_scores = {
                cls: np.exp(score - max_score) for cls, score in class_scores.items()
            }
            total = sum(exp_scores.values())
            probabilities.append({cls: exp_scores[cls] / total for cls in self.classes})
        return probabilities

def train_test_split(X, y, test_size=0.2, random_state=42):
    """Randomly split parallel sequences into train and test parts.

    A private ``RandomState`` seeded with `random_state` drives the shuffle,
    so the global NumPy random state is left untouched. For a given seed the
    split is the same one that seeding the global generator would produce.

    Args:
        X: Indexable sequence of samples.
        y: Indexable sequence of labels, same length as `X`.
        test_size: Fraction of samples (between 0 and 1) put in the test
            part; the test count is ``int(len(X) * test_size)``.
        random_state: Seed for the shuffle; ``None`` seeds from the OS.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as lists.

    Raises:
        ValueError: If `X` and `y` differ in length or `test_size` is
            outside ``[0, 1]``.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # Local generator: same legacy stream as np.random.seed + np.random.permutation,
    # but without reseeding the process-wide RNG as a side effect.
    rng = np.random.RandomState(random_state)
    n_test = int(n_samples * test_size)
    indices = rng.permutation(n_samples)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train = [X[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_train = [y[i] for i in train_indices]
    y_test = [y[i] for i in test_indices]
    return X_train, X_test, y_train, y_test

def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where `y_pred` equals `y_true`.

    Args:
        y_true: Iterable of reference labels.
        y_pred: Iterable of predicted labels, same length as `y_true`.

    Returns:
        float: Accuracy in ``[0, 1]``; ``0.0`` when both inputs are empty,
        mirroring the zero-denominator convention of `precision_recall_f1`.

    Raises:
        ValueError: If the two inputs have different lengths.
    """
    # Materialise so one-shot iterables work and lengths can be compared.
    y_true = list(y_true)
    y_pred = list(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )
    if not y_true:
        return 0.0

    correct = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    return correct / len(y_true)

def precision_recall_f1(y_true, y_pred, pos_label='spam'):
    """Compute precision, recall and F1 for one positive class.

    Args:
        y_true: Iterable of reference labels.
        y_pred: Iterable of predicted labels, same length as `y_true`.
        pos_label: The label treated as the positive class.

    Returns:
        tuple[float, float, float]: ``(precision, recall, f1)``. Any metric
        whose denominator is zero is reported as ``0.0``.

    Raises:
        ValueError: If the two inputs have different lengths.
    """
    # Materialise once: the counts below need every pair, and one-shot
    # iterables would otherwise be exhausted after the first pass.
    y_true = list(y_true)
    y_pred = list(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )

    tp = fp = fn = 0
    for true, pred in zip(y_true, y_pred):
        if pred == pos_label:
            if true == pos_label:
                tp += 1
            else:
                fp += 1
        elif true == pos_label:
            fn += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

def load_and_preprocess_data(file_path):
    """Read a labelled-text CSV and return ``(texts, labels)`` as lists.

    The file is decoded as latin-1. Columns are resolved in this order:

    1. ``v1`` / ``v2`` are taken as label / text.
    2. Otherwise existing ``label`` / ``text`` columns are used.
    3. Otherwise the first two columns are taken, by position, as
       label / text.

    Rows with a missing label or text are dropped.
    """
    table = pd.read_csv(file_path, encoding='latin-1')
    wanted = ['label', 'text']
    header = set(table.columns)

    if {'v1', 'v2'} <= header:
        table = table[['v1', 'v2']]
        table.columns = wanted
    elif set(wanted) <= header:
        table = table[wanted]
    else:
        # Positional fallback: name the first two columns, pad the rest.
        n_extra = len(table.columns) - 2
        table.columns = wanted + [f'col_{i + 2}' for i in range(n_extra)]
        table = table[wanted]

    complete = table.dropna()
    texts = complete['text'].tolist()
    labels = complete['label'].tolist()
    return texts, labels

def main(file_path='spam.csv', test_size=0.2, random_state=42, alpha=1.0, pos_label='spam'):
    """Train and evaluate a Naive Bayes classifier on a labelled-text CSV.

    Loads the data, splits it into train and test parts, fits a
    `MultinomialNaiveBayes`, and prints dataset sizes plus accuracy,
    precision, recall and F1 on the test part. Calling it with no arguments
    uses the defaults shown in the signature.

    Args:
        file_path: CSV file passed to `load_and_preprocess_data`.
        test_size: Fraction of samples held out for testing.
        random_state: Seed for the train/test shuffle.
        alpha: Smoothing constant for the classifier.
        pos_label: Label treated as positive for precision/recall/F1.

    Returns:
        MultinomialNaiveBayes: The fitted classifier.
    """
    X, y = load_and_preprocess_data(file_path)

    print(f"dataset has a total of {len(X)} samples")
    print(f"classes are: {set(y)}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print(f"training samples: {len(X_train)}")
    print(f"test samples: {len(X_test)}")

    nb_classifier = MultinomialNaiveBayes(alpha=alpha)
    nb_classifier.fit(X_train, y_train)

    y_pred = nb_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1 = precision_recall_f1(y_test, y_pred, pos_label=pos_label)

    print(f"accuracy: {accuracy:.4f}")
    print(f"precision: {precision:.4f}")
    print(f"recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    return nb_classifier


# Run the full train/evaluate pipeline and keep the fitted model in the
# module-level name `classifier`, which the sample-prediction code below reads.
if __name__ == "__main__":
    classifier = main()


dataset has a total of 5572 samples
classes are: {'spam', 'ham'}
training samples: 4458
test samples: 1114
accuracy: 0.9811
precision: 0.9706
recall: 0.8859
F1-Score: 0.9263


In [6]:
# Hand-written messages used to spot-check the fitted classifier.
samples = [
    "Congratulations! You just won a lottery!",
    "The results for the KU entrance exam will be published next week.",
    "URGENT: Your account will be closed",
    "Get your Kathmandu University degree without attending classes!",
]

for text in samples:
    prediction = classifier.predict_single(text)
    probabilities = classifier.predict_proba([text])[0]
    # One print call: three report lines followed by a blank separator line.
    print(
        f"text: '{text}'",
        f"prediction: {prediction}",
        f"probabilities: {probabilities}",
        "",
        sep="\n",
    )


text: 'Congratulations! You just won a lottery!'
prediction: spam
probabilities: {'ham': 0.004531476796176658, 'spam': 0.9954685232038233}

text: 'The results for the KU entrance exam will be published next week.'
prediction: ham
probabilities: {'ham': 0.9309508096046316, 'spam': 0.06904919039536837}

text: 'URGENT: Your account will be closed'
prediction: spam
probabilities: {'ham': 0.11454321028037505, 'spam': 0.885456789719625}

text: 'Get your Kathmandu University degree without attending classes!'
prediction: ham
probabilities: {'ham': 0.9669097708531139, 'spam': 0.033090229146886134}

