<a href="https://colab.research.google.com/github/student-monika/Marvel_tasks_Level_2/blob/main/Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

# Read the CSV from the working directory, decoding it as latin-1
file_path = "spam.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Retain only the 'v1' and 'v2' columns, under descriptive names
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Turn the textual class into a number: ham -> 0, spam -> 1
# (any other value is mapped to NaN by Series.map)
df = df.assign(label=df['label'].map({'ham': 0, 'spam': 1}))

# Pull out the underlying arrays consumed by the training/evaluation code
labels = df['label'].values
messages = df['message'].values

# Text normalisation + tokenisation
def preprocess(text):
    """Turn raw text into a list of lowercase word tokens.

    Every character that is neither an ASCII letter nor whitespace is
    deleted outright (it is not replaced by a space), so ``"you've"``
    becomes ``"youve"`` and digits disappear.  What is left is lowercased
    and split on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty if no letters remain.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower().split()

# Split data into training and testing sets
def train_test_split(data, labels, test_size=0.3):
    """Split ``data`` and ``labels`` positionally into train and test parts.

    No shuffling is performed: the leading
    ``floor(len(data) * (1 - test_size))`` items form the training set and
    the remaining trailing items form the test set.

    Args:
        data: Sliceable sequence of samples (list, NumPy array, ...).
        labels: Sliceable sequence of labels, same length as ``data``.
        test_size: Fraction of the samples reserved for testing, in [0, 1].

    Returns:
        A tuple ``(train_data, test_data, train_labels, test_labels)``.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length, or if
            ``test_size`` is not within [0, 1].
    """
    n_samples = len(data)
    if len(labels) != n_samples:
        raise ValueError(
            f"data and labels must have the same length "
            f"(got {n_samples} and {len(labels)})"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # ``1 - test_size`` is rarely exact in binary floating point
    # (1 - 0.9 == 0.09999999999999998), so a product that should be a whole
    # number can land just below it and be truncated one too low.  Rounding
    # away that noise first keeps floor semantics for genuinely fractional
    # products while fixing the off-by-one.
    split_index = int(round(n_samples * (1 - test_size), 10))
    return data[:split_index], data[split_index:], labels[:split_index], labels[split_index:]

# Train the classifier
def train_naive_bayes(messages, labels):
    """Collect the statistics of a word-count Naive Bayes spam/ham model.

    Messages labelled ``1`` are treated as spam and messages labelled ``0``
    as ham.  Messages carrying any other label contribute no word counts,
    but they are still included in the denominator of the class priors.

    Args:
        messages: Sequence of raw message strings.
        labels: Sequence of labels aligned with ``messages``.

    Returns:
        A dict with the keys:
            ``p_spam`` / ``p_ham``: class priors (class count / total messages).
            ``word_counts_spam`` / ``word_counts_ham``: ``defaultdict(int)``
                mapping each token to its number of occurrences in the class.
            ``total_spam_words`` / ``total_ham_words``: total token count
                per class.
            ``vocabulary``: set of every token seen in either class.

    Raises:
        ValueError: If ``messages`` and ``labels`` differ in length, or if
            there are no training messages (the priors would be undefined).
    """
    n_messages = len(messages)
    if n_messages != len(labels):
        raise ValueError(
            f"messages and labels must have the same length "
            f"(got {n_messages} and {len(labels)})"
        )
    if n_messages == 0:
        raise ValueError("cannot train on an empty set of messages")

    word_counts_spam = defaultdict(int)
    word_counts_ham = defaultdict(int)
    n_spam = 0
    n_ham = 0

    # Single pass: tally the class sizes and per-class token frequencies.
    for message, label in zip(messages, labels):
        if label == 1:
            n_spam += 1
            class_counts = word_counts_spam
        elif label == 0:
            n_ham += 1
            class_counts = word_counts_ham
        else:
            # Unrecognised label: no word counts, but the message still
            # counts toward n_messages in the priors below.
            continue
        for word in preprocess(message):
            class_counts[word] += 1

    return {
        'p_spam': n_spam / n_messages,
        'p_ham': n_ham / n_messages,
        'word_counts_spam': word_counts_spam,
        'word_counts_ham': word_counts_ham,
        'total_spam_words': sum(word_counts_spam.values()),
        'total_ham_words': sum(word_counts_ham.values()),
        'vocabulary': set(word_counts_spam) | set(word_counts_ham),
    }

# Predict if a message is spam or ham
def predict(message, model, alpha=1):
    """Classify a message as spam (1) or ham (0) with a trained model.

    Each class score is the log prior plus the sum, over every token of the
    message, of the log of the additively smoothed likelihood
    ``(count + alpha) / (total_class_words + alpha * |vocabulary|)``.
    Tokens never seen in training have a count of 0 and therefore still
    contribute ``alpha / denominator`` to both classes.

    Args:
        message: Raw message text.
        model: Dict as returned by ``train_naive_bayes``.
        alpha: Additive smoothing constant.

    Returns:
        ``1`` if the spam score is strictly greater than the ham score,
        otherwise ``0`` (ties are classified as ham).
    """
    words = preprocess(message)

    spam_counts = model['word_counts_spam']
    ham_counts = model['word_counts_ham']

    # The smoothing denominators do not depend on the word; compute them once.
    vocabulary_size = len(model['vocabulary'])
    spam_denominator = model['total_spam_words'] + alpha * vocabulary_size
    ham_denominator = model['total_ham_words'] + alpha * vocabulary_size

    # Work in log space so long messages do not underflow.
    log_p_spam = np.log(model['p_spam'])
    log_p_ham = np.log(model['p_ham'])

    for word in words:
        # .get() instead of [] so that looking up an unseen word does not
        # insert a zero entry into the model's defaultdict counters (which
        # would mutate and grow the trained model on every prediction), and
        # so that plain-dict counters work as well.
        spam_likelihood = (spam_counts.get(word, 0) + alpha) / spam_denominator
        ham_likelihood = (ham_counts.get(word, 0) + alpha) / ham_denominator

        log_p_spam += np.log(spam_likelihood)
        log_p_ham += np.log(ham_likelihood)

    return 1 if log_p_spam > log_p_ham else 0

# Accuracy of the model on held-out data
def evaluate_model(test_messages, test_labels, model):
    """Return the fraction of test messages whose prediction matches its label.

    Args:
        test_messages: Iterable of raw message strings.
        test_labels: Labels aligned with ``test_messages``.
        model: Trained model dict passed through to ``predict``.

    Returns:
        The mean of the element-wise equality between the predicted and the
        expected labels.
    """
    predicted = np.array([predict(text, model) for text in test_messages])
    expected = np.array(test_labels)
    return np.mean(predicted == expected)

# Partition the loaded arrays with the default test fraction
(
    train_messages,
    test_messages,
    train_labels,
    test_labels,
) = train_test_split(data=messages, labels=labels)

# Fit the classifier on the training portion only
naive_bayes_model = train_naive_bayes(messages=train_messages, labels=train_labels)

# Score it on the held-out portion and report the result as a percentage
accuracy = evaluate_model(
    test_messages=test_messages,
    test_labels=test_labels,
    model=naive_bayes_model,
)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Sanity check on a hand-written example
test_message = "Congratulations! You've won a free prize."
print(f"Message: '{test_message}' -> Prediction: {'Spam' if predict(test_message, naive_bayes_model) == 1 else 'Ham'}")


Model Accuracy: 97.25%
Message: 'Congratulations! You've won a free prize.' -> Prediction: Spam
