In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [18]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///clean_dataset.db')

In [19]:
import pandas as pd

In [20]:
df = pd.read_sql("select * from modeling_dataset", engine)

In [21]:
# Separate target and features
y = df["default"].astype(int)
X = df.drop(columns=["default", "customer_id", "application_id"])

In [22]:
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [24]:
# Preprocessing
numeric_transformer = "passthrough"
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Simple baseline model
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_leaf=50,
    random_state=42,
    n_jobs=-1,
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", clf),
    ]
)

model.fit(X_train, y_train)

y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
print(classification_report(y_test, y_pred))

ROC-AUC: 0.7711268698034118
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     17081
           1       0.00      0.00      0.00       919

    accuracy                           0.95     18000
   macro avg       0.47      0.50      0.49     18000
weighted avg       0.90      0.95      0.92     18000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [25]:
df

Unnamed: 0,customer_id,application_id,application_hour,application_day_of_week,account_open_year,preferred_contact,referral_code,account_status_code,random_noise_1,num_login_sessions,...,previous_zip_code,loan_type,loan_amount,loan_term,interest_rate,loan_purpose,loan_to_value_ratio,origination_channel,loan_officer_id,marketing_campaign
0,10000,620515,5,6,2013,Mail,REF0000,ACT-2,1.137099,13,...,451,Personal,17700.0,36,12.50,Debt Consolidation,0.000,Direct Mail,1045,W
1,10001,624978,4,2,2015,Phone,REF0000,ACT-3,-0.164932,6,...,537,Mortgage,114000.0,180,6.83,Refinance,0.774,Branch,1011,B
2,10002,564658,10,3,2020,Phone,REF0000,ACT-3,0.526700,1,...,679,Personal,9300.0,36,13.99,Major Purchase,0.000,Online,1084,K
3,10003,621493,7,5,2010,Email,REF0000,A01,-0.709779,4,...,719,Personal,8700.0,48,13.26,Medical,0.000,Online,1048,A
4,10004,637785,1,2,2020,Mail,REF0000,ACT-3,-0.603132,6,...,933,Personal,7200.0,24,10.77,Debt Consolidation,0.000,Branch,1055,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89994,99994,585752,13,5,2016,Email,REF0000,A01,1.628917,10,...,933,Personal,11300.0,24,9.70,Home Improvement,0.000,Branch,1050,O
89995,99995,595205,13,5,2014,Mail,REF9754,ACTIVE,-0.501960,12,...,536,Personal,16500.0,36,11.67,Other,0.000,Branch,1071,D
89996,99996,544796,7,5,2010,Mail,REF0000,A01,-0.964956,4,...,193,Personal,17800.0,36,14.99,Major Purchase,0.000,Online,1046,J
89997,99997,560885,8,3,2021,Email,REF0000,ACT-1,0.328372,5,...,555,CreditCard,11700.0,0,19.10,Revolving Credit,0.000,Branch,997,K


In [27]:
import pandas as pd
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report

from xgboost import XGBClassifier

# 1. Load data from SQLite
engine = create_engine("sqlite:///clean_dataset.db")
df = pd.read_sql("SELECT * FROM modeling_dataset", engine)

# 2. Target and features
y = df["default"].astype(int)
X = df.drop(columns=["default", "customer_id", "application_id"])

# 3. Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

# 4. Train / test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 5. Preprocessing: passthrough numeric, one‑hot encode categorical
numeric_transformer = "passthrough"
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# 6. Handle class imbalance for XGBoost
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
scale_pos_weight = neg / pos  # > 1 since defaults are rare

# 7. Define XGBoost classifier
xgb_clf = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
    tree_method="hist",      # change to "gpu_hist" if you have GPU
    random_state=42,
    n_jobs=-1,
)

# 8. Full pipeline: preprocessing + model
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", xgb_clf),
    ]
)

# 9. Train
model.fit(X_train, y_train)

# 10. Evaluate
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
print(classification_report(y_test, y_pred))

ROC-AUC: 0.8054958837553056
              precision    recall  f1-score   support

           0       0.98      0.80      0.88     17081
           1       0.15      0.67      0.25       919

    accuracy                           0.79     18000
   macro avg       0.56      0.73      0.56     18000
weighted avg       0.94      0.79      0.85     18000

