## Experiment 6
---
Implement a Naive Bayes Classifier and evaluate its accuracy.

In [54]:
import random

import pandas as pd
from faker import Faker
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Set random seeds for reproducibility.
# Faker draws from its own shared random generator, which the stdlib
# `random.seed` call does not affect, so both sources are seeded here.
SEED = 42
random.seed(SEED)

# Initialize Faker object and seed its shared generator
faker = Faker()
Faker.seed(SEED)

# Generate fake emails
n_emails = 10000
spam_prop = 0.5  # probability that a generated email is labelled 'spam'
emails = []
for i in range(n_emails):
    if random.random() < spam_prop:
        # Generate spam email: free-form filler text, capped by character count
        subject = faker.text(max_nb_chars=50)
        body = faker.text(max_nb_chars=500)
        label = 'spam'
    else:
        # Generate ham email: one short sentence plus a three-sentence paragraph
        subject = faker.sentence(nb_words=5)
        body = faker.paragraph(nb_sentences=3)
        # NOTE(review): this marker is added to ham bodies only, so the text
        # itself encodes the label (label leakage). Both classes are otherwise
        # random Faker filler, so any accuracy measured on this data presumably
        # reflects the marker rather than learned content - confirm this is
        # intended for the exercise.
        body = 'ham_' + body
        label = 'ham'
    emails.append({'subject': subject, 'body': body, 'label': label})

# Convert emails to pandas DataFrame (columns: subject, body, label)
emails_df = pd.DataFrame(emails)

# Split data into training and testing sets:
# draw a fixed-seed random sample of rows for training, then keep every row
# that was not drawn as the test set.
train_prop = 0.6
train_size = int(n_emails * train_prop)
train_emails = emails_df.sample(n=train_size, random_state=42)
held_out_mask = ~emails_df.index.isin(train_emails.index)
test_emails = emails_df[held_out_mask]

# Preprocess email text data:
# each email becomes "<subject> <body>" and is converted to word counts.
# The vocabulary is fitted on the training emails only; the test emails are
# transformed with that same fitted vectorizer.
vectorizer = CountVectorizer(stop_words='english')

train_text = train_emails['subject'].str.cat(train_emails['body'], sep=' ')
test_text = test_emails['subject'].str.cat(test_emails['body'], sep=' ')

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

y_train = train_emails['label']
y_test = test_emails['label']

# Fit Naive Bayes classifier on the training word counts
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict labels for test data
y_pred = clf.predict(X_test)

# Calculate accuracy and confusion matrix from the predictions made above.
# accuracy_score reuses y_pred; clf.score would run a second prediction pass
# over X_test.
acc = accuracy_score(y_test, y_pred)
# Passing the label order explicitly fixes the matrix layout: rows are true
# labels, columns are predicted labels, both ordered as in clf.classes_. It
# also keeps the matrix square if a class is missing from the test data.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)

print('Accuracy:', acc)
print('Confusion matrix:')
print(cm)


Accuracy: 0.96175
Confusion matrix:
[[1840  153]
 [   0 2007]]
