# 1️⃣ Separate Features & Target

In [1]:
import pandas as pd

dataset = pd.read_csv("german_credit_cleaned.csv")

X = dataset.drop("risk", axis=1)
y = dataset["risk"]


In [31]:
X.shape

(1000, 15)

In [30]:
dataset.columns

Index(['age', 'sex', 'job', 'housing', 'saving_accounts', 'checking_account',
       'credit_amount', 'duration', 'purpose', 'risk', 'credit_per_month',
       'age_risk_band', 'job_stability', 'housing_stability',
       'has_saving_account', 'has_checking_account'],
      dtype='object')

# 2️⃣ Identify Column Types

In [2]:
# ➡️ NO encoding / scaling needed
bin_cols  = [ col for col in X.columns  if X[col].dropna().nunique() == 2 
                                                and set(X[col].dropna().unique()).issubset({0, 1}) ]
bin_cols

['job_stability',
 'housing_stability',
 'has_saving_account',
 'has_checking_account']

In [3]:
engineered_binary = [
    'job_stability',
    'housing_stability'
]

account_flags = [
    'has_saving_account',
    'has_checking_account'
]


In [4]:
num_cols =[ col for col in X.select_dtypes(include='number').columns if col not in bin_cols]
num_cols

['age', 'job', 'credit_amount', 'duration', 'credit_per_month']

In [5]:
cat_cols =X.select_dtypes(include='object').columns.to_list()
cat_cols 

['sex',
 'housing',
 'saving_accounts',
 'checking_account',
 'purpose',
 'age_risk_band']

# 3️⃣ Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
        ("bin", "passthrough", bin_cols)
    ]
)


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [29]:
X.columns

Index(['age', 'sex', 'job', 'housing', 'saving_accounts', 'checking_account',
       'credit_amount', 'duration', 'purpose', 'credit_per_month',
       'age_risk_band', 'job_stability', 'housing_stability',
       'has_saving_account', 'has_checking_account'],
      dtype='object')

In [28]:
X_train.columns

Index(['age', 'sex', 'job', 'housing', 'saving_accounts', 'checking_account',
       'credit_amount', 'duration', 'purpose', 'credit_per_month',
       'age_risk_band', 'job_stability', 'housing_stability',
       'has_saving_account', 'has_checking_account'],
      dtype='object')

In [9]:
# For imbalanced data
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline




In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, average_precision_score


In [11]:
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ),

    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        class_weight="balanced",
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
        eval_metric="logloss",
        random_state=42
    )
}

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

results = []

for name, model in models.items():
    pipe = ImbPipeline(steps=[
        ("preprocessing", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", model)
    ])

    # Fit pipeline
    pipe.fit(X_train, y_train)

    # Predict probabilities
    y_pred_proba = pipe.predict_proba(X_test)[:, 1]

    # Threshold for classification
    threshold = 0.5
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n{name} Confusion Matrix:\n", cm)
    print(f"{name} Classification Report:\n", classification_report(y_test, y_pred))

    results.append({
        "Model": name,
        "ROC_AUC": roc_auc,
        "PR_AUC": pr_auc
    })



Logistic Regression Confusion Matrix:
 [[101  39]
 [ 20  40]]
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.72      0.77       140
           1       0.51      0.67      0.58        60

    accuracy                           0.70       200
   macro avg       0.67      0.69      0.67       200
weighted avg       0.74      0.70      0.71       200


Random Forest Confusion Matrix:
 [[106  34]
 [ 22  38]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79       140
           1       0.53      0.63      0.58        60

    accuracy                           0.72       200
   macro avg       0.68      0.70      0.68       200
weighted avg       0.74      0.72      0.73       200


XGBoost Confusion Matrix:
 [[100  40]
 [ 19  41]]
XGBoost Classification Report:
               precision    recall  f1-score   support

      

In [19]:
results_df = pd.DataFrame(results).sort_values(by="PR_AUC", ascending=False)
results_df


Unnamed: 0,Model,ROC_AUC,PR_AUC
1,Random Forest,0.767381,0.644419
2,XGBoost,0.756071,0.620629
0,Logistic Regression,0.744524,0.581556


#  Random Forest save

In [27]:
import joblib
# Random Forest pipeline ko save karna
rf_pipe = ImbPipeline(steps=[
    ("preprocessing", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", models["Random Forest"])
])

# Train
rf_pipe.fit(X_train, y_train)

# Save
joblib.dump(rf_pipe, "random_forest_pipeline.pkl")
print("Random Forest pipeline saved successfully!")


Random Forest pipeline saved successfully!


In [21]:
loaded_rf = joblib.load("random_forest_pipeline.pkl")

# Prediction karna
y_pred = loaded_rf.predict(X_test)


In [22]:
y_test1=y_test.copy()
y_test1.to_xarray()

In [23]:
y_pred

array([0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0])

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Full classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.72
Precision: 0.5277777777777778
Recall: 0.6333333333333333
F1 Score: 0.5757575757575758

Confusion Matrix:
 [[106  34]
 [ 22  38]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79       140
           1       0.53      0.63      0.58        60

    accuracy                           0.72       200
   macro avg       0.68      0.70      0.68       200
weighted avg       0.74      0.72      0.73       200



In [25]:
y_pred_proba = rf_pipe.predict_proba(X_test)[:,1]

# Risk-averse bank strategy
y_pred = (y_pred_proba >= 0.35).astype(int)

confusion_matrix(y_test, y_pred)


array([[82, 58],
       [13, 47]])

In [26]:
FN = 13
FP = 58

total_loss = FN*100000 + FP*10000
print("Total Business Loss:", total_loss)


Total Business Loss: 1880000
