In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [10]:
# Read the dataset CSV into a DataFrame
file_path = 'cleaned_german_credit_data_updated.csv'
df = pd.read_csv(file_path)

# Convention used here: the final column is the label, every column before it is a feature
y = df.iloc[:, -1]   # Target variable
X = df.iloc[:, :-1]  # Features

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Base XGBoost classifier (seeded); it is the estimator handed to the grid search below
model_xg = XGBClassifier(random_state=2)

In [11]:
# Hyper-parameter candidates explored by the exhaustive grid search.
# 'reg_alpha' candidates are currently disabled: [1e-5, 1e-2, 0.1, 1, 10]
param_test1 = dict(
    max_depth=[3, 5, 6, 10],
    min_child_weight=[3, 5, 10],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    subsample=[0.75, 0.8, 0.85],
    colsample_bytree=[0.75, 0.8, 0.85],
)

# 5-fold cross-validated search over every combination, ranked by recall
grid_search = GridSearchCV(
    model_xg,
    param_grid=param_test1,
    cv=5,
    scoring='recall',
)
grid_search.fit(X_train, y_train)

In [12]:
# Score the best estimator found by the grid search on the held-out test split
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

# Overall accuracy on the test split, shown to two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Confusion matrix followed by the per-class precision / recall / f1 report;
# sep="\n" puts each heading on its own line directly above its table
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.64
Confusion Matrix:
[[34 26]
 [21 50]]

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.57      0.59        60
           1       0.66      0.70      0.68        71

    accuracy                           0.64       131
   macro avg       0.64      0.64      0.64       131
weighted avg       0.64      0.64      0.64       131



In [13]:

# Persist the fitted model to disk so it can be reloaded later for inference
import joblib

# The whole fitted GridSearchCV object is serialized (not only its best estimator)
joblib.dump(value=grid_search, filename="xgboost_model.pkl")
print("Model saved as 'xgboost_model.pkl'")

Model saved as 'xgboost_model.pkl'


In [17]:

# Function to load the model and predict on new data
def predict_new_data(new_data, model=None, model_path="xgboost_model.pkl"):
    """Predict class labels and class probabilities for new observations.

    Parameters
    ----------
    new_data : array-like
        Either a single observation (1-D sequence of feature values) or a
        batch of observations (2-D, one row per observation). A 1-D input is
        treated as one row.
    model : object, optional
        An already-loaded, fitted model exposing ``predict`` and
        ``predict_proba``. Passing it avoids re-reading the file on every
        call. When omitted, the model is loaded from ``model_path``.
    model_path : str, optional
        Path of the joblib file to load when ``model`` is not supplied.
        Defaults to ``"xgboost_model.pkl"``.

    Returns
    -------
    tuple
        ``(predictions, probabilities)`` as returned by ``model.predict`` and
        ``model.predict_proba`` for the 2-D input.

    Raises
    ------
    ValueError
        If ``new_data`` is empty, is not 1-D or 2-D, or has a number of
        columns different from the model's ``n_features_in_`` (when the model
        exposes that attribute).
    """
    import numpy as np

    if model is None:
        import joblib

        # NOTE: joblib files are pickles; loading one can execute arbitrary
        # code, so only load model files that come from a trusted source.
        model = joblib.load(model_path)

    # Normalise the input to a 2-D array: one row per observation
    features = np.asarray(new_data)
    if features.ndim == 1:
        features = features.reshape(1, -1)
    if features.ndim != 2 or features.size == 0:
        raise ValueError(
            "new_data must be a non-empty 1-D or 2-D array-like, "
            f"got shape {features.shape}"
        )

    # Fail early with a readable message instead of an opaque backend error.
    # Models that do not expose n_features_in_ skip this check.
    expected = getattr(model, "n_features_in_", None)
    if expected is not None and features.shape[1] != expected:
        raise ValueError(
            f"Feature shape mismatch: model expects {expected} features per "
            f"observation, got {features.shape[1]}"
        )

    # Predict and return the results
    predictions = model.predict(features)
    probabilities = model.predict_proba(features)
    return predictions, probabilities


In [None]:

# Demonstration: score one hand-written observation with the saved model.
# Substitute your own data point(s); the number of values must match the
# number of features the model was trained on.
new_input = [
    22, 1, 2, 0, 1, 5951, 48, 1,
    0, 0, 0, 0, 0, 1, 0, 0,
]  # Example input (adjust to your feature size)
predictions, probabilities = predict_new_data(new_input)

# Show the predicted label(s), then the per-class probabilities
print("Predictions:", predictions)
print("Probabilities:", probabilities)


ValueError: Feature shape mismatch, expected: 16, got 14