In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

2024-04-25 23:44:03.993091: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 23:44:03.993248: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 23:44:04.129514: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

In [3]:
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv(dataPath + "csv_files/train/train_person_1.csv").pipe(set_table_dtypes) 
train_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [4]:
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv(dataPath + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes) 
test_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [5]:
# We need to use aggregation functions in tables with depth > 1, so tables that contain num_group1 column or 
# also num_group2 column.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

# Join all tables together.
data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [6]:
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

data_submission = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [None]:
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)

def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [None]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

In [None]:
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['category']).columns

In [None]:
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

In [None]:
X_train_numerical = pd.DataFrame(numerical_imputer.fit_transform(X_train[numerical_cols]), columns=numerical_cols)
X_test_numerical = pd.DataFrame(numerical_imputer.transform(X_test[numerical_cols]), columns=numerical_cols)
X_valid_numerical = pd.DataFrame(numerical_imputer.transform(X_valid[numerical_cols]), columns=numerical_cols)

In [None]:
X_train_categorical = pd.DataFrame(categorical_imputer.fit_transform(X_train[categorical_cols]), columns=categorical_cols)
X_test_categorical = pd.DataFrame(categorical_imputer.transform(X_test[categorical_cols]), columns=categorical_cols)
X_valid_categorical = pd.DataFrame(categorical_imputer.transform(X_valid[categorical_cols]), columns=categorical_cols)

In [None]:
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train_categorical)
X_test_encoded = encoder.transform(X_test_categorical)
X_valid_encoded = encoder.transform(X_valid_categorical)

In [None]:
X_train_final = hstack([X_train_numerical, X_train_encoded])
X_test_final = hstack([X_test_numerical, X_test_encoded])
X_valid_final = hstack([X_valid_numerical, X_valid_encoded])
# X_train_final = pd.DataFrame(numerical_imputer.fit_transform(X_train[numerical_cols]), columns=numerical_cols)
# X_test_final = pd.DataFrame(numerical_imputer.transform(X_test[numerical_cols]), columns=numerical_cols)

In [None]:
smote = SMOTE(sampling_strategy=1.0) 
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_final, y_train)

In [None]:
X_train_resampled_dense = X_train_resampled.toarray()
column_names = [f'feature_{i}' for i in range(X_train_resampled_dense.shape[1])]

In [None]:
X_train_resampled_df = pd.DataFrame(X_train_resampled_dense, columns=column_names)
del X_train_resampled_dense
del X_train_resampled

In [None]:
X_valid_resampled, y_valid_resampled = smote.fit_resample(X_valid_final, y_valid)

In [None]:
X_valid_resampled_dense = X_valid_resampled.toarray()
column_names_valid = [f'feature_{i}' for i in range(X_valid_resampled_dense.shape[1])]
X_valid_resampled_df = pd.DataFrame(X_valid_resampled_dense, columns=column_names_valid)
del X_valid_resampled_dense
del X_valid_resampled

In [None]:
X_test_resampled, y_test_resampled = smote.fit_resample(X_test_final, y_test)

In [None]:
X_test_resampled_dense = X_test_resampled.toarray()
column_names_test = [f'feature_{i}' for i in range(X_test_resampled_dense.shape[1])]
X_test_resampled_df = pd.DataFrame(X_test_resampled_dense, columns=column_names_test)
del X_test_resampled_dense
del X_test_resampled

In [None]:
X_valid_resampled_df

In [24]:
def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='macro'),
        'Recall': recall_score(y_test, y_pred, average='macro'),
        'AUC-ROC': roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    }


results = {}

In [25]:
rf = RandomForestClassifier(n_estimators=10)
lr = LogisticRegression()
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
lgb = LGBMClassifier()

In [None]:
results['Random Forest'] = train_evaluate_model(rf, X_train_resampled_df, y_train_resampled, X_test_resampled_df, y_test_resampled)
results['Logistic Regression'] = train_evaluate_model(lr, X_train_resampled_df, y_train_resampled, X_test_resampled_df, y_test_resampled)
results['XGBoost'] = train_evaluate_model(xgb, X_train_resampled_df, y_train_resampled, X_test_resampled_df, y_test_resampled)
results['LightGBM'] = train_evaluate_model(lgb, X_train_resampled_df, y_train_resampled, X_test_resampled_df, y_test_resampled)

In [35]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(len(np.unique(y_train)), activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_scaled, to_categorical(y_train), epochs=10, batch_size=32)
nn_pred = model.predict(X_test_scaled)
nn_results = {
    'Accuracy': accuracy_score(y_test, np.argmax(nn_pred, axis=1)),
    'Precision': precision_score(y_test, np.argmax(nn_pred, axis=1), average='macro'),
    'Recall': recall_score(y_test, np.argmax(nn_pred, axis=1), average='macro'),
    'AUC-ROC': roc_auc_score(y_test, nn_pred[:, 1])
}
results['Neural Network'] = nn_results

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9685 - loss: 0.1387
Epoch 2/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 2ms/step - accuracy: 0.9685 - loss: 0.1324
Epoch 3/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 2ms/step - accuracy: 0.9686 - loss: 0.1315
Epoch 4/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2ms/step - accuracy: 0.9683 - loss: 0.1316
Epoch 5/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 2ms/step - accuracy: 0.9683 - loss: 0.1314
Epoch 6/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 2ms/step - accuracy: 0.9682 - loss: 0.1312
Epoch 7/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2ms/step - accuracy: 0.9686 - loss: 0.1301
Epoch 8/10
[1m28625/28625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 2ms/step - accuracy: 0.9682 - loss: 0.1310


In [36]:
results

{'Random Forest': {'Accuracy': 0.9673633945999764,
  'Precision': 0.545212184412824,
  'Recall': 0.5018162533168044,
  'AUC-ROC': 0.5830174684375969},
 'Logistic Regression': {'Accuracy': 0.9682476779374583,
  'Precision': 0.5367822702051923,
  'Recall': 0.5000745557682122,
  'AUC-ROC': 0.6391508931988725},
 'XGBoost': {'Accuracy': 0.9682116515792646,
  'Precision': 0.6771970175358124,
  'Recall': 0.5021543448501494,
  'AUC-ROC': 0.7290069890564326},
 'LightGBM': {'Accuracy': 0.9682837042956519,
  'Precision': 0.6984523154439042,
  'Recall': 0.5005927758712975,
  'AUC-ROC': 0.7280186623250275},
 'Neural Network': {'Accuracy': 0.96815924960371,
  'Precision': 0.6270463223063523,
  'Recall': 0.5013278985691576,
  'AUC-ROC': 0.7132694455638207}}

In [43]:
 y_train.mean()

0.03151982270645582