# Naive Bayes Classifier with One-Hot Encoding on a PCA preprocessed dataset

In [1]:
import os
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# SADR: path to the dataset.
dataset_path = os.path.join("preprocessed_datasets", "dataset_one_hot.pkl")

# SADR: loading training data.
with open(dataset_path, "rb") as f:
    dataset_one_hot = pickle.load(f)

# SADR: getting the training, validation, and testing data.
X_train, y_train = dataset_one_hot["X_train"], dataset_one_hot["y_train"]
X_val, y_val = dataset_one_hot["X_val"], dataset_one_hot["y_val"]
X_test, y_test = dataset_one_hot["X_test"], dataset_one_hot["y_test"]

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")

# SADR: rejoining the training and validation data.
# Important to do k-fold cross-validation.
X_train = np.concat((X_train, X_val))
y_train = np.concat((y_train, y_val))
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

X_train: (2560, 60000)
X_test: (800, 60000)
X_val: (640, 60000)
X_train: (3200, 60000), y_train: (3200,)


In [2]:
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

# Combine train and validation sets
X_combined = np.concatenate((X_train, X_val), axis=0)
y_combined = np.concatenate((y_train, y_val), axis=0)

# Convert labels if needed (assuming -1/1 encoding)
y_combined = np.where(y_combined == -1, 0, y_combined)

# Initialize PCA
pca = PCA(n_components=120, random_state=42)

# Define models to compare
models = {
    "GaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB()
}

# Manual 5-fold cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

results = {}

for name, model in models.items():
    fold_scores = []
    
    for train_idx, val_idx in kf.split(X_combined):
        # Split data
        X_train_fold, X_val_fold = X_combined[train_idx], X_combined[val_idx]
        y_train_fold, y_val_fold = y_combined[train_idx], y_combined[val_idx]
        
        # Apply PCA
        X_train_pca = pca.fit_transform(X_train_fold)
        X_val_pca = pca.transform(X_val_fold)
        
        # Apply MinMaxScaler for MultinomialNB/BernoulliNB
        if name in ["MultinomialNB", "BernoulliNB"]:
            scaler = MinMaxScaler()
            X_train_pca = scaler.fit_transform(X_train_pca)
            X_val_pca = scaler.transform(X_val_pca)
        
        # Train and predict
        model.fit(X_train_pca, y_train_fold)
        y_pred = model.predict(X_val_pca)
        
        # Calculate F1 score
        fold_f1 = f1_score(y_val_fold, y_pred)
        fold_scores.append(fold_f1)
    
    # Store results
    results[name] = {
        "mean_f1": np.mean(fold_scores),
        "std_f1": np.std(fold_scores),
        "all_scores": fold_scores
    }
    
    print(f"----- {name} -----")
    print(f"Mean F1: {results[name]['mean_f1']:.4f} (±{results[name]['std_f1']:.4f})")
    print(f"Fold scores: {[f'{s:.4f}' for s in fold_scores]}\n")

# Select best model
best_model_name = max(results, key=lambda x: results[x]["mean_f1"])
print(f"\nBest model: {best_model_name} (Mean F1: {results[best_model_name]['mean_f1']:.4f})")

# Train best model on full data with PCA
pca_full = PCA(n_components=120, random_state=42)
X_combined_pca = pca_full.fit_transform(X_combined)

# Apply MinMaxScaler if best model is MultinomialNB/BernoulliNB
if best_model_name in ["MultinomialNB", "BernoulliNB"]:
    scaler_full = MinMaxScaler()
    X_combined_pca = scaler_full.fit_transform(X_combined_pca)

best_model = models[best_model_name].fit(X_combined_pca, y_combined)

# Evaluate on test set
X_test_pca = pca_full.transform(X_test)
if best_model_name in ["MultinomialNB", "BernoulliNB"]:
    X_test_pca = scaler_full.transform(X_test_pca)
y_test = np.where(y_test == -1, 0, y_test)  # Convert labels if needed
y_pred = best_model.predict(X_test_pca)

print("\n=== Test Set Performance ===")
print(f"Test F1 Score: {f1_score(y_test, y_pred):.4f}")

----- GaussianNB -----
Mean F1: 0.8615 (±0.0090)
Fold scores: ['0.8487', '0.8683', '0.8731', '0.8538', '0.8635']

----- MultinomialNB -----
Mean F1: 0.9354 (±0.0233)
Fold scores: ['0.9446', '0.9416', '0.9521', '0.9492', '0.8894']

----- BernoulliNB -----
Mean F1: 0.6700 (±0.0141)
Fold scores: ['0.6684', '0.6743', '0.6919', '0.6678', '0.6477']


Best model: MultinomialNB (Mean F1: 0.9354)

=== Test Set Performance ===
Test F1 Score: 0.9456


In [3]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report

# Ensure test labels are in {0, 1} (if encoded as -1/1)
y_test = np.where(y_test == -1, 0, y_test)

# Predict on test set
y_pred = best_model.predict(X_test_pca)

# Compute metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
support = np.bincount(y_test)  # Counts of each class in test set

# Print detailed report
print("\n=== Test Set Evaluation ===")
print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Support: {support}\n")



=== Test Set Evaluation ===
              precision    recall  f1-score   support

     Class 0       0.94      0.94      0.94       386
     Class 1       0.95      0.94      0.95       414

    accuracy                           0.94       800
   macro avg       0.94      0.94      0.94       800
weighted avg       0.94      0.94      0.94       800

Accuracy: 0.9437
Precision: 0.9467
Recall: 0.9444
F1-Score: 0.9456
Support: [386 414]



In [7]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

# Assuming you've already trained your best_model (MultinomialNB) and have X_test_pca

# Get feature log probabilities from the trained model
# These are log(P(feature|class))
log_prob = best_model.feature_log_prob_  # Shape: (n_classes, n_features)

# Calculate the absolute difference between class log probabilities
# This shows which features have the largest difference between classes
feature_importance = np.abs(log_prob[1] - log_prob[0])

# Get indices of top N most important features
top_n = 10  # Number of top features you want to see
top_indices = np.argsort(feature_importance)[-top_n:][::-1]

# Create a DataFrame for visualization
importance_df = pd.DataFrame({
    'PC_Component': top_indices,
    'Importance_Score': feature_importance[top_indices],
    'Class_0_LogProb': log_prob[0][top_indices],
    'Class_1_LogProb': log_prob[1][top_indices]
})

print("\n=== Top {} Most Important PCA Components ===".format(top_n))
print(importance_df)




=== Top 10 Most Important PCA Components ===
   PC_Component  Importance_Score  Class_0_LogProb  Class_1_LogProb
0             9          0.690239        -5.239005        -5.929245
1             1          0.612279        -4.747316        -4.135037
2            10          0.290542        -5.227305        -5.517847
3             0          0.283684        -4.791873        -4.508189
4             2          0.170034        -4.500307        -4.330273
5             8          0.060897        -4.978840        -5.039737
6             6          0.045046        -4.640229        -4.595183
7            20          0.044210        -4.740774        -4.784983
8            15          0.042181        -4.792479        -4.750298
9            26          0.040056        -5.048699        -5.088755
