# Credit Approval Risk Model

## Executive Summary
This notebook builds a structured credit approval decision-support model using the UCI Credit Approval dataset.

It demonstrates:
- Data cleaning & preprocessing
- Mixed-type feature handling
- Logistic Regression baseline
- Model comparison
- Risk trade-off interpretation


In [None]:
# ===== Parameters =====
# --- Input data ---
DATA_PATH = "cc_approvals.data"

# --- Hold-out split and resampling ---
TEST_SIZE = 0.2      # fraction of rows passed to train_test_split as test_size
RANDOM_STATE = 42    # seed for the train/test split
CV_FOLDS = 5         # number of folds given to cross_val_score

# --- Decision rule ---
# Probability cutoff for labelling an application as approved.
# NOTE(review): confirm where this is consumed downstream.
THRESHOLD = 0.5

# --- Figure export ---
# NOTE(review): export switch and target directory; confirm against the
# plotting cells.
SAVE_FIGURES = True
FIG_DIR = "visualisations"

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score,
    ConfusionMatrixDisplay, RocCurveDisplay
)


## 1) Load Dataset (No Header + '?' as missing)

In [None]:
# Load dataset
# The file has no header row and encodes missing values as '?'.
# Declaring '?' as an NA marker at parse time (instead of replacing it
# afterwards) lets pandas infer numeric dtypes for columns that contain
# missing entries, so the dtype overview reflects the real column types
# rather than reporting them as generic `object`.
df = pd.read_csv(DATA_PATH, header=None, na_values="?")

# Assign generic column names (A0, A1, ...); the last one is the target
df.columns = [f"A{i}" for i in range(df.shape[1])]

df.head()

## 2) Identify Target Column

In [None]:
# Last column is target
target_col = df.columns[-1]

# Convert target: '+' = approved, '-' = rejected
label_map = {'+': 1, '-': 0}
encoded_target = df[target_col].map(label_map)

# `.map` turns every label that is not in `label_map` into NaN. Validate
# BEFORE overwriting the column so that unexpected labels (or re-running this
# cell on an already-encoded target) fail loudly instead of silently wiping
# the target.
unmapped = encoded_target.isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, target_col].astype(str).unique())
    raise ValueError(
        f"Unexpected target labels in column {target_col!r}: {bad_labels}; "
        f"expected only {sorted(label_map)}"
    )

df[target_col] = encoded_target

df[target_col].value_counts()

## 3) Data Overview

In [None]:
# Dimensions and per-column dtypes of the loaded frame
print("Shape:", df.shape)
display(df.dtypes)

# Per-column count of missing cells, largest first; show only the columns
# that actually have gaps
missing = df.isna().sum().sort_values(ascending=False)
has_gaps = missing.gt(0)
display(missing.loc[has_gaps])

## 4) Feature Preparation

In [None]:
# Split the frame into predictors and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Attempt numeric conversion, column by column.
# `pd.to_numeric(..., errors='ignore')` is deprecated; the documented
# replacement is to catch the conversion error explicitly. The outcome is the
# same: a column is converted only if every non-missing value parses as a
# number, otherwise it is left untouched and treated as categorical.
for col in X.columns:
    try:
        X[col] = pd.to_numeric(X[col])
    except (ValueError, TypeError):
        continue

# Route columns to the numeric / categorical preprocessing branches by dtype
numeric_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['number']).columns.tolist()

numeric_features, categorical_features

## 5) Preprocessing Pipeline

In [None]:
# Stratified hold-out split: class proportions of `y` are kept in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Numeric branch: fill gaps with the column median, then standardise
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; levels unseen during fit are ignored at transform time
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Apply each branch to its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


## 6) Logistic Regression Model

In [None]:
# Preprocessing and classifier are chained in one pipeline so that imputation,
# scaling and encoding are fitted on the training data only.
log_model = Pipeline([
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=2000))
])

log_model.fit(X_train, y_train)

# Probability of the positive class (1 = approved)
y_prob = log_model.predict_proba(X_test)[:, 1]

# Derive hard predictions from the configurable THRESHOLD rather than from
# `predict`, which always cuts at 0.5. A strict '>' keeps the default
# THRESHOLD = 0.5 equivalent to the `predict` decision rule.
y_pred = (y_prob > THRESHOLD).astype(int)


## 7) Cross-Validation

In [None]:
# Score the whole pipeline with CV_FOLDS-fold cross-validation on the full
# dataset, then report the mean fold score
cv_scores = cross_val_score(estimator=log_model, X=X, y=y, cv=CV_FOLDS)
mean_cv_accuracy = cv_scores.mean()
print("Cross-validation accuracy:", round(mean_cv_accuracy, 3))

## 8) Evaluation Metrics

In [None]:
# Metrics computed from the hard test-set predictions
label_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}
metrics = {
    metric_name: scorer(y_test, y_pred)
    for metric_name, scorer in label_scorers.items()
}

# ROC AUC is the only metric here that needs the predicted probabilities
metrics["ROC_AUC"] = roc_auc_score(y_test, y_prob)

# One-row summary table, rounded for display
pd.DataFrame([metrics]).round(3)

## 9) Confusion Matrix & ROC

In [None]:
# Honour the notebook-level export settings: when SAVE_FIGURES is on, each
# plot is also written to FIG_DIR (created on demand).
if SAVE_FIGURES:
    os.makedirs(FIG_DIR, exist_ok=True)

# Confusion matrix of the hard test-set predictions
cm_display = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title("Confusion Matrix")
if SAVE_FIGURES:
    # Save before plt.show(): some backends release the figure once shown
    cm_display.figure_.savefig(
        os.path.join(FIG_DIR, "confusion_matrix.png"), bbox_inches="tight"
    )
plt.show()

# ROC curve computed from the fitted pipeline's scores on the test set
roc_display = RocCurveDisplay.from_estimator(log_model, X_test, y_test)
plt.title("ROC Curve")
if SAVE_FIGURES:
    roc_display.figure_.savefig(
        os.path.join(FIG_DIR, "roc_curve.png"), bbox_inches="tight"
    )
plt.show()

## 10) Business Interpretation

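- False Positive: approving a high-risk applicant → financial loss
- False Negative: rejecting a creditworthy applicant → lost revenue

The decision threshold can be adjusted to match institutional risk appetite. Because the positive class is "approved", raising the threshold above 0.5 approves fewer applicants (fewer false positives, more false negatives), while lowering it does the opposite.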