In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


df = pd.read_csv("../data/german_credit_data.csv")
print(df.shape)
df.head()


(1000, 11)


Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [2]:
target_col = "Risk" if "Risk" in df.columns else df.columns[-1]
print("Target column:", target_col)
print(df[target_col].value_counts())

Target column: Risk
Risk
good    700
bad     300
Name: count, dtype: int64


In [3]:
X = df.drop(columns=[target_col])
y = df[target_col]

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("Target values:", y.unique())


Features shape: (1000, 10)
Target shape: (1000,)
Target values: ['good' 'bad']


In [4]:
bad_class=y.value_counts().idxmin()
y_bin=(y==bad_class).astype(int)
print("Bad/Default",bad_class)
print( y_bin.value_counts())

Bad/Default bad
Risk
0    700
1    300
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bin,
    test_size=0.2,
    random_state=42,
    stratify=y_bin
)

print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))


Risk
0    0.7
1    0.3
Name: proportion, dtype: float64
Risk
0    0.7
1    0.3
Name: proportion, dtype: float64


In [6]:
num_cols=X_train.select_dtypes(include=["int64","float64"]).columns
cat_cols=X_train.select_dtypes(exclude=["int64","float64"]).columns
print("Numerical Col",list(num_cols))
print("Categorical Col",list(cat_cols))

Numerical Col ['Unnamed: 0', 'Age', 'Job', 'Credit amount', 'Duration']
Categorical Col ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [8]:
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model)
])


In [10]:
clf.fit(X_train, y_train)


In [11]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]


In [12]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


ROC-AUC: 0.7608333333333334

Confusion Matrix:
 [[125  15]
 [ 38  22]]

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.89      0.83       140
           1       0.59      0.37      0.45        60

    accuracy                           0.73       200
   macro avg       0.68      0.63      0.64       200
weighted avg       0.72      0.73      0.71       200



In [15]:
import numpy as np

threshold = 0.3
y_pred_03 = (y_proba >= threshold).astype(int)

print("Threshold:", threshold)
print("ROC-AUC",roc_auc_score(y_test,y_pred_03))
print(confusion_matrix(y_test, y_pred_03))
print(classification_report(y_test, y_pred_03))


Threshold: 0.3
ROC-AUC 0.7369047619047618
[[99 41]
 [14 46]]
              precision    recall  f1-score   support

           0       0.88      0.71      0.78       140
           1       0.53      0.77      0.63        60

    accuracy                           0.73       200
   macro avg       0.70      0.74      0.70       200
weighted avg       0.77      0.72      0.74       200

