In [1]:
# Step 1: Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib  # for saving the trained model

# Step 2: Load dataset from 'ham' and 'spam' folders
def load_data(directory):
    emails = []
    labels = []
    
    for folder in ['ham', 'spam']:
        folder_path = os.path.join(directory, folder)
        for filename in os.listdir(folder_path):
            with open(os.path.join(folder_path, filename), 'r', encoding="latin-1") as file:
                emails.append(file.read())
                labels.append(folder)
    
    return emails, labels

# Load the dataset (update the path to where your 'ham' and 'spam' folders are stored)
emails, labels = load_data('../data/')

# Step 3: Convert to DataFrame
df = pd.DataFrame({'email': emails, 'label': labels})
df.head()

# Step 4: Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['email'], df['label'], test_size=0.3, random_state=42)

# Step 5: Feature extraction using CountVectorizer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Step 6: Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

# Step 7: Test the classifier on the test set
y_pred = nb_classifier.predict(X_test_counts)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Step 9: Save the model to 'nb_model.pkl'
joblib.dump(nb_classifier, '../models/nb_model.pkl')

# Save the vectorizer too (useful for future predictions)
joblib.dump(vectorizer, '../models/vectorizer.pkl')

# Visualize the confusion matrix (optional)
import matplotlib.pyplot as plt
import seaborn as sns

sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

Accuracy: 98.13%
[[1107   13]
 [  16  416]]


Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'seaborn'