In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB


In [None]:
# Read the pre-processed dataset from disk
file_path = '../data/processed/new_cleaning.csv'
df = pd.read_csv(file_path)

# Target vector: the "Risk_bad" column as a NumPy array
y = df["Risk_bad"].values

# Feature matrix: every column except the target, then discard the first
# remaining column by position.
# NOTE(review): presumably that first column is a saved row index -- confirm
# against the CSV, since a positional drop silently removes whatever is there.
X = df.drop(columns="Risk_bad").iloc[:, 1:]

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y -- consider `stratify=y`
# if the class balance matters for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Unfitted Gaussian Naive Bayes estimator, tuned by the grid search below
gnb = GaussianNB()

In [3]:
# Candidate values for GaussianNB's variance-smoothing term, one per decade.
# NOTE(review): the recorded run selected 0.001, the largest candidate, so the
# optimum may lie above this range -- consider extending the grid upward.
param_grid = dict(
    var_smoothing=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
)

# Exhaustive 5-fold cross-validated search over the grid, scored on accuracy
grid_search = GridSearchCV(
    gnb,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'var_smoothing': 0.001}


In [4]:
# Score the winning estimator from the grid search on the held-out test split
best_gnb = grid_search.best_estimator_
y_pred = best_gnb.predict(X_test)

# Overall accuracy, reported to two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class breakdown: confusion matrix first, then precision/recall/F1
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.70
Confusion Matrix:
[[150  28]
 [ 46  26]]

Classification Report:
              precision    recall  f1-score   support

       False       0.77      0.84      0.80       178
        True       0.48      0.36      0.41        72

    accuracy                           0.70       250
   macro avg       0.62      0.60      0.61       250
weighted avg       0.68      0.70      0.69       250



In [5]:

# Persist the fitted GridSearchCV object to disk so it can be reloaded later
# for inference. `joblib` is imported in the notebook's import cell, so the
# dependency is visible up front instead of being introduced mid-notebook.
joblib.dump(grid_search, "gnb_model.pkl")
print("Model saved as 'gnb_model.pkl'")

Model saved as 'gnb_model.pkl'


In [6]:

# Function to load the model and predict on new data
def predict_new_data(new_data, model=None, model_path="gnb_model.pkl"):
    """Predict class labels and class probabilities for new observations.

    Parameters
    ----------
    new_data : pandas.DataFrame, numpy.ndarray or array-like
        One or more observations. A DataFrame is passed to the model
        unchanged so its column names stay available to the estimator.
        Anything else is converted to a NumPy array, and a 1-D input is
        treated as a single observation.
    model : object, optional
        An already-loaded estimator exposing ``predict`` and
        ``predict_proba``. When omitted, the model is loaded from
        ``model_path``.
    model_path : str, optional
        Location of the saved model; only read when ``model`` is None.

    Returns
    -------
    tuple
        ``(predictions, probabilities)`` exactly as returned by the model's
        ``predict`` and ``predict_proba`` methods.
    """
    import numpy as np
    import pandas as pd

    if model is None:
        # Imported lazily so callers that supply their own model do not need
        # joblib at all.
        import joblib

        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load model files from a trusted source.
        model = joblib.load(model_path)

    if isinstance(new_data, pd.DataFrame):
        # Keep the DataFrame intact: converting it to a bare array would drop
        # the column names the estimator uses for feature-name validation.
        features = new_data
    else:
        features = np.asarray(new_data)
        # A flat sequence is one observation, so make it a single-row 2-D array.
        if features.ndim == 1:
            features = features.reshape(1, -1)

    # Predict and return the results
    predictions = model.predict(features)
    probabilities = model.predict_proba(features)
    return predictions, probabilities


In [7]:
# Demonstrate the prediction helper on a single hand-written observation.
# Swap the values in `new_input` for your own data point(s); they must be
# listed in the same order as the training feature columns.
feature_names = list(X.columns)
new_input = [
    67, 2, 7.063903961472068, 6,
    False, False, False, False, True,
    False, False, True, True, False,
    False, True, False, False, False,
    False, False, False, False, True,
]

# Wrap the observation in a one-row DataFrame labelled with the feature names
new_input_df = pd.DataFrame(data=[new_input], columns=feature_names)

predictions, probabilities = predict_new_data(new_input_df)

print("Predictions:", predictions)
print("Probabilities:", probabilities)


Predictions: [False]
Probabilities: [[0.98483941 0.01516059]]


