In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import optuna 
from optuna.samplers import TPESampler
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df


In [3]:

train_basetable = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\train\\train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\train\\train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\train\\train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\csv\\bankcredit-riskCOMPETITION\\csv_files\\train\\train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\train\\train_person_1.csv").pipe(set_table_dtypes) 
train_credit_bureau_b_2 = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\train\\train_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [4]:
test_basetable = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\test\\test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\test\\test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\test\\test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\test\\test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\test\\test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\test\\test_person_1.csv").pipe(set_table_dtypes) 
test_credit_bureau_b_2 = pl.read_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\csv_files\\test\\test_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [5]:
# We need to use aggregation functions in tables with depth > 1, so tables that contain num_group1 column or 
# also num_group2 column.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

# Join all tables together.
data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [6]:
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

data_submission = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [7]:
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)

def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    filtered_data = data.filter(pl.col("case_id").is_in(case_ids))
    return (
        filtered_data[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        filtered_data[cols_pred].to_pandas(),
        filtered_data["target"].to_pandas()
    )

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_val, X_val, y_val = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_val, X_test]:
    df = convert_strings(df)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [8]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_val.shape}")
print(f"Test: {X_test.shape}")

Train: (915995, 48)
Valid: (305332, 48)
Test: (305332, 48)


In [9]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_val, label=y_val, reference=lgb_train)


#The tuning process has been commented out due to its time-consuming nature.

# Define the objective function for Optuna optimization
def objective(trial, X_train, X_val, y_train, y_val):
    #Define parameters to be optimized for the LGBMClassifier
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",#gbdt
        "random_state": 42,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
        "n_estimators": trial.suggest_int("n_estimators", 400, 600),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.015),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 0.08),
        "max_depth": trial.suggest_int("max_depth", 5, 14),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
    }


    lgbm_classifier = LGBMClassifier(**param)
    lgbm_classifier.fit(X_train, y_train)
    score = lgbm_classifier.score(X_val, y_val)

    return score


# Set up the sampler for Optuna optimization
sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# Create a study object for Optuna optimization
study = optuna.create_study(direction="maximize", sampler=sampler)

# Run the optimization process
study.optimize(lambda trial: objective(trial, X_train, X_val, y_train, y_val), n_trials=10)

# Get the best parameters after optimization
best_params = study.best_params

print('='*50)
print(best_params)

[I 2024-03-01 04:05:06,710] A new study created in memory with name: no-name-921cc4d5-5a36-40b5-b2ab-14b2696a3f8f
[I 2024-03-01 04:05:22,890] Trial 0 finished with value: 0.9690697339289691 and parameters: {'learning_rate': 0.0249816047538945, 'n_estimators': 591, 'lambda_l1': 0.012319939418114049, 'lambda_l2': 0.05190609389379257, 'max_depth': 6, 'colsample_bytree': 0.3935967122017216, 'subsample': 0.5290418060840998, 'min_child_samples': 45, 'num_leaves': 64}. Best is trial 0 with value: 0.9690697339289691.
[I 2024-03-01 04:05:34,627] Trial 1 finished with value: 0.9690730090524413 and parameters: {'learning_rate': 0.03832290311184182, 'n_estimators': 404, 'lambda_l1': 0.014699098521619942, 'lambda_l2': 0.06827098485602953, 'max_depth': 7, 'colsample_bytree': 0.4090949803242604, 'subsample': 0.5917022549267169, 'min_child_samples': 22, 'num_leaves': 57}. Best is trial 1 with value: 0.9690730090524413.
[I 2024-03-01 04:05:47,531] Trial 2 finished with value: 0.9690599085585526 and par

{'learning_rate': 0.03493192507310232, 'n_estimators': 466, 'lambda_l1': 0.0056355835028602365, 'lambda_l2': 0.031768762520096354, 'max_depth': 8, 'colsample_bytree': 0.7377637070028384, 'subsample': 0.8187787356776066, 'min_child_samples': 46, 'num_leaves': 52}


In [10]:
best_params_lgbm = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    'learning_rate': 0.03493192507310232, 'n_estimators': 466, 'lambda_l1': 0.0056355835028602365, 'lambda_l2': 0.031768762520096354, 'max_depth': 8, 'colsample_bytree': 0.7377637070028384, 'subsample': 0.8187787356776066, 'min_child_samples': 46, 'num_leaves': 52}



lgbm_classifier = LGBMClassifier(**best_params_lgbm)
lgbm_classifier.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 28872, number of negative: 887123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8980
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031520 -> initscore=-3.425111
[LightGBM] [Info] Start training from score -3.425111


In [11]:
for base, X in [(base_train, X_train), (base_val, X_val), (base_test, X_test)]:
    y_pred = lgbm_classifier.predict(X)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc_auc_score(base_val["target"], base_val["score"])}') 
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')  

The AUC score on the train set is: 0.5031164889686818
The AUC score on the valid set is: 0.501068117285037
The AUC score on the test set is: 0.5012524252323455


In [12]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_val)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')

The stability score on the train set is: 0.0031918685290863234
The stability score on the valid set is: -0.0006921044334633223
The stability score on the test set is: -0.0008989638505342598


In [13]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    train_categories = set(X_train[col].cat.categories)
    submission_categories = set(X_submission[col].cat.categories)
    new_categories = submission_categories - train_categories
    X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
    X_train[col] = X_train[col].astype(new_dtype)
    X_submission[col] = X_submission[col].astype(new_dtype)

y_submission_pred = lgbm_classifier.predict(X_submission)



In [14]:
submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION\\bankcredit-riskCOMPETITIONsub\\sub2.csv")