In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [7]:
import os
from pathlib import Path

import cupy as cp
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from loanapprovalprediction.constants import (
    CATEGORICAL_VARIABLES,
    CONTINUOUS_VARIABLES,
    TARGET,
)

sns.set_theme()

DATA_DIR = Path(os.getcwd()).parent / "data"

In [3]:
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

## Preprocess data

In [40]:
def preprocess_data(
    data_: pd.DataFrame, 
    scaler: StandardScaler, 
    cont_features: list[str],
    cat_features: list[str],
    fit:bool=False
) -> pd.DataFrame:
    data = data_.reset_index(drop=True)
    if fit:
        X_cont = scaler.fit_transform(data[cont_features])
    else:
        X_cont = scaler.transform(data[cont_features])

    return pd.concat([
        pd.DataFrame(X_cont, columns=cont_features), 
        data[cat_features].apply(pd.Categorical)], axis=1
    )

scaler = StandardScaler()
cont_features = list(set(CONTINUOUS_VARIABLES) - set(["person_age", "cb_person_cred_hist_length"]))
# cont_features = CONTINUOUS_VARIABLES
training, valid = train_test_split(train, test_size=0.2, random_state=42)
X_train = preprocess_data(training, scaler, cont_features=cont_features, cat_features=CATEGORICAL_VARIABLES, fit=True)
X_valid = preprocess_data(valid, scaler, cont_features=cont_features, cat_features=CATEGORICAL_VARIABLES,)
y_train = training[TARGET]
y_valid = valid[TARGET]

X_test = preprocess_data(test, scaler, cont_features=cont_features, cat_features=CATEGORICAL_VARIABLES,)

In [43]:
from xgboost import XGBClassifier

clf = XGBClassifier(
    enable_categorical=True
)
clf.fit(
    X_train, 
    y_train, 
    eval_set=[(X_valid, y_valid)],
    verbose=False
)
print(clf.score(X_train, y_train))
print(clf.score(X_valid, y_valid))

0.9636584534060875
0.9488447437974252


In [39]:
from xgboost import XGBClassifier

clf = XGBClassifier(enable_categorical=True)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_valid, y_valid))

0.9602054736124137
0.9473953448716855


In [None]:
graph = xgb.to_graphviz(clf, num_trees=1)
graph

In [17]:
preds = clf.predict(X_test)

In [20]:
preds = clf.predict(X_test)
test.assign(loan_status = preds)[["id", TARGET]].to_csv(DATA_DIR / "submission.csv", index=False)

In [22]:
!kaggle competitions submit -c playground-series-s4e10 -f ../data/submission.csv -m "Initial submission"

100%|█████████████████████████████████████████| 305k/305k [00:00<00:00, 659kB/s]
Successfully submitted to Loan Approval Prediction

In [None]:
# Score on initial submission: 0.86906
# Score on submission using Ax-optimized hyperparameters: 0.86842 :(

In [47]:
import pandas as pd
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from sklearn.metrics import accuracy_score
import numpy as np

def train_evaluate_xgboost(params):
    """Training function that Ax will optimize"""
    # Convert ax parameters to XGBoost parameters
    xgb_params = {
        "max_depth": int(params["max_depth"]),
        "learning_rate": params["learning_rate"],
        "min_child_weight": params["min_child_weight"],
        "colsample_bytree": params["colsample_bytree"],
        "subsample": params["subsample"],
        "n_estimators": int(params["n_estimators"]),
        "enable_categorical": True,
        "early_stopping_rounds":20
    }
    
    # Initialize and train model
    clf = XGBClassifier(**xgb_params)
    clf.fit(
        X_train, 
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
        
    )
    
    # Get predictions on validation set
    y_pred = clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    
    # Ax maximizes the objective, so return accuracy
    return accuracy

# Define the parameters to optimize
parameters = [
    {
        "name": "max_depth",
        "type": "range",
        "bounds": [3, 12],
        "value_type": "int",
    },
    {
        "name": "learning_rate",
        "type": "range",
        "bounds": [0.01, 0.3],
        "log_scale": True,
    },
    {
        "name": "min_child_weight",
        "type": "range",
        "bounds": [1, 7],
    },
    {
        "name": "colsample_bytree",
        "type": "range",
        "bounds": [0.3, 1.0],
    },
    {
        "name": "subsample",
        "type": "range",
        "bounds": [0.6, 1.0],
    },
    {
        "name": "n_estimators",
        "type": "range",
        "bounds": [100, 1000],
        "value_type": "int",
    },
]

# Run optimization
best_parameters, values, experiment, model = optimize(
    parameters=parameters,
    evaluation_function=train_evaluate_xgboost,
    objective_name='accuracy',
    total_trials=50,  # Number of iterations
    random_seed=42,
)

# Print best parameters and score
print("\nBest parameters:")
for param, value in best_parameters.items():
    print(f"{param}: {value}")
print(f"\nBest accuracy: {values[0]['accuracy']}")

# Train final model with best parameters
best_params = {
    "max_depth": int(best_parameters["max_depth"]),
    "learning_rate": best_parameters["learning_rate"],
    "min_child_weight": best_parameters["min_child_weight"],
    "colsample_bytree": best_parameters["colsample_bytree"],
    "subsample": best_parameters["subsample"],
    "n_estimators": int(best_parameters["n_estimators"]),
    "enable_categorical": True,
    "early_stopping_rounds":20
}

final_model = XGBClassifier(**best_params)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

# Print final model performance
print("\nFinal model performance:")
print(f"Training accuracy: {final_model.score(X_train, y_train):.4f}")
print(f"Validation accuracy: {final_model.score(X_valid, y_valid):.4f}")

[INFO 10-27 19:43:11] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter learning_rate. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 10-27 19:43:11] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter min_child_weight. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 10-27 19:43:11] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter colsample_bytree. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 10-27 19:43:11] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter subsample. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 


Encountered exception in computing model fit quality: RandomModelBridge does not support prediction.

[INFO 10-27 19:43:13] ax.service.managed_loop: Running optimization trial 2...

Encountered exception in computing model fit quality: RandomModelBridge does not support prediction.

[INFO 10-27 19:43:14] ax.service.managed_loop: Running optimization trial 3...

Encountered exception in computing model fit quality: RandomModelBridge does not support prediction.

[INFO 10-27 19:43:17] ax.service.managed_loop: Running optimization trial 4...

Encountered exception in computing model fit quality: RandomModelBridge does not support prediction.

[INFO 10-27 19:43:17] ax.service.managed_loop: Running optimization trial 5...

Encountered exception in computing model fit quality: RandomModelBridge does not support prediction.

[INFO 10-27 19:43:18] ax.service.managed_loop: Running optimization trial 6...

Encountered exception in computing model fit quality: RandomModelBridge does not support 


Best parameters:
max_depth: 3
learning_rate: 0.09969525121932622
min_child_weight: 1
colsample_bytree: 0.8565528234193323
subsample: 0.7054890206594646
n_estimators: 575

Best accuracy: 0.9511766553078507

Final model performance:
Training accuracy: 0.9517
Validation accuracy: 0.9514


In [48]:
best_params

{'max_depth': 3,
 'learning_rate': 0.09969525121932622,
 'min_child_weight': 1,
 'colsample_bytree': 0.8565528234193323,
 'subsample': 0.7054890206594646,
 'n_estimators': 575,
 'enable_categorical': True,
 'early_stopping_rounds': 20}

In [51]:
preds = clf.predict(X_test)
test_with_preds = test.assign(loan_status = preds)[["id", TARGET]]
#.to_csv(DATA_DIR / "submission2.csv", index=False)

In [60]:
test_with_preds.to_csv(DATA_DIR / "submission2.csv", index=False)

In [61]:
!kaggle competitions submit -c playground-series-s4e10 -f ../data/submission2.csv -m "Used optimized hyperparameters"

100%|█████████████████████████████████████████| 305k/305k [00:00<00:00, 437kB/s]
Successfully submitted to Loan Approval Prediction

In [50]:
initial_submission = pd.read_csv(DATA_DIR / "submission.csv")

In [52]:
initial_submission

Unnamed: 0,id,loan_status
0,58645,1
1,58646,0
2,58647,1
3,58648,0
4,58649,0
...,...,...
39093,97738,0
39094,97739,0
39095,97740,0
39096,97741,0


In [56]:
comparison = (
    initial_submission
        .join(test_with_preds.rename(columns={"loan_status":"new_pred"}).set_index("id"), on="id")
        .assign(same=lambda x: x[TARGET] == x["new_pred"])
)

In [58]:
comparison[~comparison["same"]]

Unnamed: 0,id,loan_status,new_pred,same
59,58704,1,0,False
165,58810,0,1,False
196,58841,0,1,False
239,58884,0,1,False
246,58891,0,1,False
...,...,...,...,...
38830,97475,1,0,False
38876,97521,1,0,False
38942,97587,1,0,False
38970,97615,1,0,False


In [59]:
comparison[comparison["same"]]

Unnamed: 0,id,loan_status,new_pred,same
0,58645,1,1,True
1,58646,0,0,True
2,58647,1,1,True
3,58648,0,0,True
4,58649,0,0,True
...,...,...,...,...
39093,97738,0,0,True
39094,97739,0,0,True
39095,97740,0,0,True
39096,97741,0,0,True
