In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [9]:
# Read the dataset from a CSV located relative to the working directory.
file_path = 'cleaned_german_credit_data_updated.csv'
df = pd.read_csv(file_path)

# Positional convention: the final column is treated as the label and every
# column before it as a predictor. Nothing in this cell verifies that layout.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gaussian Naive Bayes estimator with default settings; tuned in a later cell.
gnb = GaussianNB()

In [10]:
# Candidate smoothing values: one per decade from 1e-10 up to 1e-3.
param_grid = dict(
    var_smoothing=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
)

# Exhaustive search over the grid, ranking candidates by accuracy under
# 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(
    estimator=gnb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'var_smoothing': 1e-08}


In [12]:
# Score the held-out split with the estimator exposed by the grid search
# (GridSearchCV refits on the training data by default, refit=True).
best_gnb = grid_search.best_estimator_
y_pred = best_gnb.predict(X_test)

# Overall fraction of correct predictions, shown to two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class breakdown: each header and its body are emitted by one print
# call, separated by a newline.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.53
Confusion Matrix:
[[18 32]
 [17 38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.36      0.42        50
           1       0.54      0.69      0.61        55

    accuracy                           0.53       105
   macro avg       0.53      0.53      0.52       105
weighted avg       0.53      0.53      0.52       105

