# Credit Risk Triage – Statistics Workflow

# Load and Prepare Data


### 1. Import dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import statsmodels.api as sm

### 2. Load dataset and assign column names

In [18]:

# Statlog German Credit data from the UCI repository (space-delimited, no header row)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"

# column names in file order; the file itself carries none
cols = (
    "status duration credit_history purpose amount "
    "savings employment_duration installment_rate "
    "personal_status_sex other_debtors present_residence "
    "property age other_installment_plans housing "
    "number_credits job people_liable telephone "
    "foreign_worker credit_risk"
).split()

df = pd.read_csv(
    url,
    sep=" ",
    header=None,
    names=cols,
)

In [19]:
# preview the first five rows to confirm the column names line up with the data
df.head(n=5)

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [22]:
# convert numeric labels into Good/Bad for business clarity
# Values that are not raw codes (i.e. already "Good"/"Bad") pass through unchanged,
# so re-running this cell is safe. A plain .map() with this dict would turn the
# already-converted labels into NaN on a second run.
risk_labels = {1: "Good", 2: "Bad"}
df['credit_risk'] = df['credit_risk'].map(lambda code: risk_labels.get(code, code))

# fail loudly if anything other than the two expected labels is present
assert df['credit_risk'].isin(list(risk_labels.values())).all(), "unexpected credit_risk value"

df['credit_risk'].value_counts()


Unnamed: 0_level_0,count
credit_risk,Unnamed: 1_level_1
Good,700
Bad,300


# Statistical Tests and Visuals

In [23]:
# keep only the core lending variables (plus the target) for the simple triage analysis
triage_columns = [
    'duration',
    'amount',
    'age',
    'purpose',
    'savings',
    'employment_duration',
    'housing',
    'credit_risk',
]

df_model = df.loc[:, triage_columns]
df_model.head()

Unnamed: 0,duration,amount,age,purpose,savings,employment_duration,housing,credit_risk
0,6,1169,67,A43,A65,A75,A152,Good
1,48,5951,22,A43,A61,A73,A152,Bad
2,12,2096,49,A46,A61,A74,A152,Good
3,42,7882,45,A42,A61,A74,A153,Good
4,24,4870,53,A40,A61,A73,A153,Bad


In [24]:
# count missing values per column in the modelling frame
df_model.isnull().sum()


Unnamed: 0,0
duration,0
amount,0
age,0
purpose,0
savings,0
employment_duration,0
housing,0
credit_risk,0


### Boxplots and Chi-Square Tests

In [25]:
# boxplot: amount vs credit_risk
# Create the axes first and hand them to pandas. With `by=`, DataFrame.boxplot
# opens its own figure when no `ax` is passed, which left the figure from
# plt.figure() empty and unclosed, and the saved plot ignored the 6x4 size.
fig, ax = plt.subplots(figsize=(6, 4))
df_model.boxplot(column='amount', by='credit_risk', ax=ax)
ax.set_title('Loan Amount by Credit Risk')
fig.suptitle('')  # clear the super-title that boxplot(by=...) adds
fig.savefig('box_amount.png', bbox_inches='tight')
plt.close(fig)  # the figure is only written to disk, so release it explicitly


<Figure size 600x400 with 0 Axes>

In [None]:
# boxplot: duration vs credit_risk
# Create the axes first and hand them to pandas. With `by=`, DataFrame.boxplot
# opens its own figure when no `ax` is passed, which would leave the figure from
# plt.figure() empty and unclosed, and the saved plot would ignore the 6x4 size.
fig, ax = plt.subplots(figsize=(6, 4))
df_model.boxplot(column='duration', by='credit_risk', ax=ax)
ax.set_title('Loan Duration by Credit Risk')
fig.suptitle('')  # clear the super-title that boxplot(by=...) adds
fig.savefig('box_duration.png', bbox_inches='tight')
plt.close(fig)  # the figure is only written to disk, so release it explicitly


In [27]:
# chi-square test of independence between each selected categorical
# variable and credit_risk; prints the variable name and its p-value
cat_vars = ['purpose', 'savings', 'employment_duration']
risk = df_model['credit_risk']

for feature in cat_vars:
    contingency = pd.crosstab(df_model[feature], risk)
    # chi2_contingency returns (statistic, p-value, dof, expected); only the p-value is reported
    p_value = chi2_contingency(contingency)[1]
    print(feature, p_value)


purpose 0.00011574910079691586
savings 2.761214238568249e-07
employment_duration 0.0010454523491402541


# Logistic Regression

In [29]:
# pull the target out before encoding so it is never one-hot encoded;
# "Bad" is the positive class (1), "Good" the negative class (0)
y = df_model['credit_risk'].map({'Good': 0, 'Bad': 1})

# one-hot encode the categorical predictors, dropping the first level of each
X = pd.get_dummies(df_model.drop(columns='credit_risk'), drop_first=True)

# hold out 25% for testing; stratify on y so both splits keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)



In [31]:
# basic evaluation metrics
# NOTE(review): no cell in this notebook fits a model or defines `y_prob`, yet the
# ROC cell that follows reads it, so a fresh-kernel Run All stops with a NameError.
# The fit below is a minimal reconstruction. Confirm the hyperparameters against
# the run that produced the saved report before relying on the numbers.
model = LogisticRegression(max_iter=1000)  # above the default of 100 because the features are unscaled
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # probability of class 1, i.e. "Bad"

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.76      0.89      0.82       175
           1       0.57      0.33      0.42        75

    accuracy                           0.72       250
   macro avg       0.66      0.61      0.62       250
weighted avg       0.70      0.72      0.70       250

ROC AUC: 0.6978285714285715


In [32]:
# compute the ROC curve on the held-out set and write it to roc_curve.png
fpr, tpr, _ = roc_curve(y_test, y_prob)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], '--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
fig.savefig('roc_curve.png', bbox_inches='tight')
plt.close(fig)
