In [28]:
!pip -q install xgboost==1.7.6 --no-cache-dir

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m170.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
from google.colab import files
uploaded = files.upload()
CSV_PATH = list(uploaded.keys())[0]
print("Using:", CSV_PATH)


Saving Loan_Default.csv to Loan_Default (3).csv
Using: Loan_Default (3).csv


In [30]:
import pandas as pd
import numpy as np

def clean_cols(df):
    df = df.copy()
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(r'[^0-9a-zA-Z]+', '_', regex=True)
        .str.strip('_')
    )
    return df

df = clean_cols(pd.read_csv(CSV_PATH))
print("Shape:", df.shape)
df.head()


Shape: (148670, 34)


Unnamed: 0,id,year,loan_limit,gender,approv_in_adv,loan_type,loan_purpose,credit_worthiness,open_credit,business_or_commercial,...,credit_type,credit_score,co_applicant_credit_type,age,submission_of_application,ltv,region,security_type,status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [33]:
TARGET = 'status'

In [34]:
positive_values = {'charged_off','default','bad_loan','yes','1',1}
def to_binary(s):
    if s.dtype == 'O':
        s2 = s.astype(str).str.lower().str.strip()
        return s2.isin({str(v).lower() for v in positive_values}).astype(int)
    else:
        return s.astype(int).apply(lambda v: 1 if v in positive_values else 0)

y = to_binary(df[TARGET])
X = df.drop(columns=[TARGET])
print("Positive rate (defaults):", y.mean().round(4))

Positive rate (defaults): 0.2464


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [36]:
# Split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [37]:
# Column types
num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
cat_cols = [c for c in X.columns if c not in num_cols]

In [38]:
# Preprocess
preprocess = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols)
    ],
    remainder='drop'
)

print(f"Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}")

Numeric: 12, Categorical: 21


In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [40]:
def evaluate_trained(pipe, X_te, y_te, name="Model"):

    y_prob = None
    try:
        y_prob = pipe.predict_proba(X_te)[:,1]
    except Exception:
        try:
            from scipy.special import expit
            y_prob = expit(pipe.decision_function(X_te))
        except Exception:
            pass
    y_pred = pipe.predict(X_te)

    metrics = {
        'model': name,
        'accuracy': accuracy_score(y_te, y_pred),
        'precision': precision_score(y_te, y_pred, zero_division=0),
        'recall': recall_score(y_te, y_pred, zero_division=0),
        'f1': f1_score(y_te, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_te, y_prob) if y_prob is not None else np.nan,
    }
    return metrics

In [41]:

#logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=2000, class_weight='balanced')

pipe_logreg = Pipeline([
    ('prep', preprocess),
    ('clf', logreg)
])
pipe_logreg.fit(X_train_raw, y_train)
metrics_logreg = evaluate_trained(pipe_logreg, X_test_raw, y_test, "Logistic Regression")
metrics_logreg


{'model': 'Logistic Regression',
 'accuracy': 0.8388713257550279,
 'precision': 0.6607935099505641,
 'recall': 0.7113810043668122,
 'f1': 0.6851547611224289,
 'roc_auc': np.float64(0.8674573044026339)}

In [42]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(
    max_depth=None,
    random_state=42,
    class_weight='balanced'
)

pipe_tree = Pipeline([
    ('prep', preprocess),
    ('clf', tree)
])
pipe_tree.fit(X_train_raw, y_train)
metrics_tree = evaluate_trained(pipe_tree, X_test_raw, y_test, "Decision Tree")
metrics_tree


{'model': 'Decision Tree',
 'accuracy': 0.9998991054012242,
 'precision': 0.9998635184932442,
 'recall': 0.9997270742358079,
 'f1': 0.9997952917093142,
 'roc_auc': np.float64(0.9998412216666854)}

In [43]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced_subsample'
)

pipe_rf = Pipeline([
    ('prep', preprocess),
    ('clf', rf)
])
pipe_rf.fit(X_train_raw, y_train)
metrics_rf = evaluate_trained(pipe_rf, X_test_raw, y_test, "Random Forest")
metrics_rf


{'model': 'Random Forest',
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'roc_auc': np.float64(1.0)}