<a href="https://colab.research.google.com/github/staerkjoe/ML_colab/blob/main/CreditRisk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predictive Maintenance Data
## Set up environment and read data

In [12]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data

In [13]:
pip install ucimlrepo



In [14]:
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

In [15]:

# fetch dataset
statlog_german_credit_data = fetch_ucirepo(id=144)

# data (as pandas dataframes)
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets

df = pd.concat([X, y], axis=1)


In [16]:
df.head()

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,...,Attribute12,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [17]:
attribute_mapping = {
    "Attribute1": "checking_status",        # Status of existing checking account
    "Attribute2": "duration",              # Duration (months)
    "Attribute3": "credit_history",        # Credit history
    "Attribute4": "purpose",               # Purpose
    "Attribute5": "credit_amount",         # Credit amount
    "Attribute6": "savings",               # Savings account/bonds
    "Attribute7": "employment",            # Present employment since
    "Attribute8": "installment_rate",      # Installment rate in % of disposable income
    "Attribute9": "personal_status_sex",   # Personal status and sex
    "Attribute10": "other_debtors",        # Other debtors / guarantors
    "Attribute11": "residence_since",      # Present residence since
    "Attribute12": "property",             # Property
    "Attribute13": "age",                  # Age (years)
    "Attribute14": "other_installment",    # Other installment plans
    "Attribute15": "housing",              # Housing
    "Attribute16": "existing_credits",     # Number of existing credits at this bank
    "Attribute17": "job",                  # Job
    "Attribute18": "people_liable",        # Number of people liable to provide maintenance
    "Attribute19": "telephone",            # Telephone
    "Attribute20": "foreign_worker",       # Foreign worker
    "class": "credit_risk"                 # Target: 1 = Good, 2 = Bad
}

# Apply mapping
df = df.rename(columns=attribute_mapping)


In [18]:
df["credit_risk"] = df["credit_risk"].map({1: 0, 2: 1})

In [19]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings,employment,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment,housing,existing_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1


In [27]:
from sklearn.model_selection import train_test_split, GridSearchCV

X = df.drop("credit_risk", axis=1)
y = df["credit_risk"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import OrdinalEncoder

# Identify categorical vs numerical
ordinal_features = ["checking_status", "credit_history", "savings", "employment", "job"]

nominal_features = ["purpose", "personal_status_sex", "other_debtors", "property", "other_installment", "housing", "telephone", "foreign_worker"]

num_features = ["duration","credit_amount","installment_rate","residence_since", "age","existing_credits","people_liable"]

# Transformers
preprocessor_logreg = ColumnTransformer([
    ("nominal", OneHotEncoder(handle_unknown="ignore"), nominal_features),
    ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ordinal_features),
    ("num", StandardScaler(), num_features)
])


logreg_pipeline = ImbPipeline(steps=[
    ("preprocess", preprocessor_logreg),
    ("smote", SMOTE(random_state=42)),
    ("clf", LogisticRegression(max_iter=1000, solver="liblinear"))
])


# GridSearch hyperparameters

param_grid = {
    "clf__penalty": ["l1", "l2"],            # regularization type
    "clf__C": [0.01, 0.1, 1, 10, 100],      # inverse regularization strength
    "clf__solver": ["liblinear"]            # liblinear supports l1 + l2
}

grid_search = GridSearchCV(
    logreg_pipeline,
    param_grid,
    cv=5,
    scoring="f1",   # or "roc_auc", depending on your goal
    n_jobs=-1
)


# 6. Fit & Evaluate

grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)
print("Test score:", grid_search.score(X_test, y_test))

Best params: {'clf__C': 0.01, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
Best CV score: 0.6066575227554792
Test score: 0.6666666666666666


## XGBoost

In [29]:
from sklearn.preprocessing import OrdinalEncoder

preprocessor_tree = ColumnTransformer([
    ("nominal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), nominal_features),
    ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ordinal_features),
    ("num", "passthrough", num_features)  # no scaling needed
])

In [30]:
from xgboost import XGBClassifier

xgb_pipeline = ImbPipeline(steps=[
    ("preprocess", preprocessor_tree),   # your encoder/transformer
    ("smote", SMOTE(random_state=42)),
    ("clf", XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        tree_method="gpu_hist",   # GPU acceleration
        predictor="gpu_predictor", # ensure GPU is used for inference
        random_state=42
    ))
])

In [31]:
# 5. GridSearch hyperparameters

param_grid_xgb = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [3, 5, 7],
    "clf__learning_rate": [0.01, 0.1, 0.2],
    "clf__subsample": [0.8, 1.0],
    "clf__colsample_bytree": [0.8, 1.0]
}

grid_search_xgb = GridSearchCV(
    xgb_pipeline,
    param_grid_xgb,
    cv=5,
    scoring="f1",   # or "roc_auc", depending on your goal
    n_jobs=-1,
    verbose=2
)


# 6. Fit & Evaluate

grid_search_xgb.fit(X_train, y_train)

print("Best params:", grid_search_xgb.best_params_)
print("Best CV score:", grid_search_xgb.best_score_)
print("Test score:", grid_search_xgb.score(X_test, y_test))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params: {'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 5, 'clf__n_estimators': 100, 'clf__subsample': 1.0}
Best CV score: 0.5988845838845839
Test score: 0.5428571428571428



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
