# Implementing Naive Bayes Algorithm for Data Classification

##### Fatemeh Razaqnejad
Develop and test a Naive Bayes classifier to categorize data points in both continuous and discrete datasets using Gaussian, uniform, and binomial distributions.

#### Import necessary libraries

In [None]:
import pandas as pd
from collections import defaultdict
import numpy as np

#### Load the data

In [1]:
# Load the labelled email dataset into a DataFrame.
# Point the path below at your own copy of the CSV if it lives elsewhere.
emails_data = pd.read_csv(filepath_or_buffer='emails.csv')

#### Function to calculate word probabilities

In [None]:
def calculate_word_probs(data, label):
    """Estimate smoothed per-word likelihoods for one class of emails.

    Args:
        data: DataFrame with a 'text' column (email body) and a 'spam'
            column (class label).
        label: Value of the 'spam' column selecting the class to model.

    Returns:
        A ``(word_probs, total_words)`` tuple. ``word_probs`` maps each
        whitespace-separated token seen in the class to
        ``(count + 1) / (total_words + vocabulary_size)`` (add-one
        smoothing over the class's own vocabulary). ``total_words`` is the
        number of tokens counted for the class.
    """
    word_counts = defaultdict(int)

    for email in data[data['spam'] == label]['text']:
        # Empty CSV cells load as NaN/None rather than str; they carry no
        # tokens, so skip them instead of crashing on .split().
        if not isinstance(email, str):
            continue
        for word in email.split():
            word_counts[word] += 1

    total_words = sum(word_counts.values())
    # The smoothing denominator is the same for every word; compute it once.
    denominator = total_words + len(word_counts)

    word_probs = {word: (count + 1) / denominator for word, count in word_counts.items()}
    return word_probs, total_words

# Build one word-likelihood table per class: label 1 is spam, label 0 is not spam
spam_word_probs, spam_total_words = calculate_word_probs(data=emails_data, label=1)
not_spam_word_probs, not_spam_total_words = calculate_word_probs(data=emails_data, label=0)

#### Function to classify an email based on word probabilities

In [None]:
def classify_email(email, spam_word_probs, not_spam_word_probs, spam_total_words, not_spam_total_words):
    """Label an email 'spam' or 'not_spam' using Naive Bayes word scores.

    Args:
        email: Raw email text; it is tokenised on whitespace.
        spam_word_probs: Mapping of word -> likelihood under the spam class.
        not_spam_word_probs: Mapping of word -> likelihood under the
            not-spam class.
        spam_total_words: Accepted for interface compatibility; not used
            by the scoring.
        not_spam_total_words: Accepted for interface compatibility; not
            used by the scoring.

    Returns:
        A ``(result, spam_prob, not_spam_prob, word_probs)`` tuple.
        ``result`` is 'spam' when the spam score is strictly larger,
        otherwise 'not_spam'. ``spam_prob`` / ``not_spam_prob`` are the
        unnormalised scores (class weight times the product of word
        likelihoods); for long emails they may underflow to 0.0 even
        though the decision is still made correctly. ``word_probs`` maps
        each distinct word to its two per-class likelihoods.
    """
    import math  # local import keeps this cell self-contained

    epsilon = 1e-10  # Likelihood used for words missing from a class table

    def safe_log(p):
        # log(0) is undefined; -inf keeps "zero probability" ordered below
        # every real score without raising.
        return math.log(p) if p > 0 else float('-inf')

    # Class weights are proportional to each class's vocabulary size (NOT
    # equal priors). With no vocabulary at all there is nothing to weight
    # by, so fall back to 50/50 instead of dividing by zero.
    spam_vocab = len(spam_word_probs)
    not_spam_vocab = len(not_spam_word_probs)
    vocab_total = spam_vocab + not_spam_vocab
    if vocab_total:
        spam_prior = spam_vocab / vocab_total
        not_spam_prior = not_spam_vocab / vocab_total
    else:
        spam_prior = not_spam_prior = 0.5

    # Accumulate in log space: multiplying many small likelihoods directly
    # underflows to 0.0 for both classes and makes every long email tie.
    spam_log = safe_log(spam_prior)
    not_spam_log = safe_log(not_spam_prior)
    word_probs = {}

    for word in email.split():
        spam_prob_word = spam_word_probs.get(word, epsilon)
        not_spam_prob_word = not_spam_word_probs.get(word, epsilon)

        spam_log += safe_log(spam_prob_word)
        not_spam_log += safe_log(not_spam_prob_word)
        word_probs[word] = {
            'spam_prob': spam_prob_word,
            'not_spam_prob': not_spam_prob_word
        }

    result = 'spam' if spam_log > not_spam_log else 'not_spam'
    # Convert back to linear space so callers still receive plain scores.
    return result, math.exp(spam_log), math.exp(not_spam_log), word_probs

#### Function to get a sample email from the user

In [None]:
def get_sample_email():
    """Prompt on standard input and return the text the user types."""
    prompt = "Please enter the sample email text: "
    email_text = input(prompt)
    return email_text

# Get sample email from user input (get_sample_email calls input(), so this
# statement waits until the user has typed the text and pressed Enter)
sample_email = get_sample_email()

#### Classify the sample email and print the results

In [None]:
# Score the sample email against both per-class word tables
predicted_label, spam_prob, not_spam_prob, word_probs = classify_email(
    sample_email,
    spam_word_probs,
    not_spam_word_probs,
    spam_total_words,
    not_spam_total_words,
)

# Summary: predicted label followed by the two overall scores
print(
    f'The predicted label for the email is: {predicted_label}\n',
    f'Spam Probability: {spam_prob}',
    f'Not Spam Probability: {not_spam_prob}\n',
    'Detailed probabilities for each word:',
    sep='\n',
)

# Per-word breakdown, one group of three lines (plus a blank line) per word
for word, probs in word_probs.items():
    print(
        f'Word "{word}":',
        f"  Spam Probability: {probs['spam_prob']}",
        f"  Not Spam Probability: {probs['not_spam_prob']}\n",
        sep='\n',
    )