# Install packages

In [1]:
!pip install /kaggle/input/kaggle-home-credit-risk-model-stability-lib/kaggle_home_credit_risk_model_stability-0.3-py3-none-any.whl --force-reinstall

Processing /kaggle/input/kaggle-home-credit-risk-model-stability-lib/kaggle_home_credit_risk_model_stability-0.3-py3-none-any.whl
Installing collected packages: kaggle-home-credit-risk-model-stability
Successfully installed kaggle-home-credit-risk-model-stability-0.3


# Import packages

In [2]:
import polars as pl
import gc
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import copy
import lightgbm as lgb
import importlib

import kaggle_home_credit_risk_model_stability.libs as hcr
from kaggle_home_credit_risk_model_stability.libs.input.dataset import Dataset
from kaggle_home_credit_risk_model_stability.libs.input.data_loader import DataLoader
from kaggle_home_credit_risk_model_stability.libs.preprocessor.preprocessor import Preprocessor
from kaggle_home_credit_risk_model_stability.libs.preprocessor.steps import *
from kaggle_home_credit_risk_model_stability.libs.feature_description.feature_description import FreatureDescriptionGetter

from glob import glob
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostClassifier
from enum import Enum

In [3]:
# Loader object used later to read the train dataset.
data_loader = DataLoader()

# Named preprocessing steps, written as (name, step) pairs and handed to
# Preprocessor as a dict that keeps this order. Commented-out pairs are
# steps that are currently disabled.
preprocessor = Preprocessor(dict([
    ("set_types", SetTypesStep()),
    ("process_categorical", ProcessCategoricalStep()),
    ("pairwise_diff_raw_dates", PairwiseDateDiffStep()),
    # ("generate_cum_features", GenerateCumFeaturesStep()),
    ("aggregate_depth_table", AggregateDepthTableStep()),
    ("drop_columns", DropColumnsStep()),
    ("join_table", JoinTablesStep()),
    ("fill_nulls", FillNullStep()),
    ("reduce_memory_usage", ReduceMemoryUsageStep()),
    # ("drop_non_important_features", DropNonImportantFeaturesStep()),
    # ("drop_dates_columns", DropDatesColumnsStep()),
]))

In [4]:
# train_dataset = data_loader.load_train_dataset()
# case_id_set = train_dataset.get_base().filter(train_dataset.get_base()["WEEK_NUM"] < 10)["case_id"]
# train_dataset = train_dataset.filter(lambda table: table.filter(pl.col("case_id").is_in(case_id_set)))

In [5]:
# Load the raw training dataset and report the range of WEEK_NUM in its base table.
train_dataset = data_loader.load_train_dataset()
first_week = train_dataset.get_base()["WEEK_NUM"].min()
last_week = train_dataset.get_base()["WEEK_NUM"].max()
print(f"Train dataset week range: {first_week}-{last_week}")
gc.collect()

# Run the configured preprocessing steps to produce the training frame.
train_df = preprocessor.process_train_dataset(train_dataset)

# Drop the reference to the raw dataset and trigger a collection so it can be reclaimed.
del train_dataset
gc.collect()

Train dataset week range: 0-91
Step: set_types, execution_time: 20.203157424926758
Step: process_categorical, execution_time: 26.184550523757935
Create 169 new columns as pairwise dates diff
Step: pairwise_diff_raw_dates, execution_time: 1.330657720565796
Step: aggregate_depth_table, execution_time: 18.238659620285034
Columns to drop: 1188
Step: drop_columns, execution_time: 29.541739463806152
Step: join_table, execution_time: 7.974450349807739
Step: fill_nulls, execution_time: 0.12775135040283203
Step: reduce_memory_usage, execution_time: 5.70292329788208


0

In [6]:
# Persist the preprocessed training frame to the notebook's working directory.
train_df.write_parquet(Path("/kaggle/working") / "train_df.parquet")

In [7]:
# Estimated size of the preprocessed frame, scaled down by 2**20
# (bytes -> MiB, assuming estimated_size() reports bytes — the polars default).
train_df.estimated_size() / 2**20

2970.9914045333862

# Data visualization

In [8]:
# def bucket_mean_split(x, y, buckets=100):
#     x_buckets = []
#     y_buckets = []

#     x = np.array(x)
#     y = np.array(y)
#     x_quantiles = [np.quantile(x, i / buckets) for i in range(0, buckets + 1)]
#     for i in range(0, buckets):
#         filter = (x >= x_quantiles[i]) & (x <= x_quantiles[i + 1])
#         x_buckets.append(np.mean(x[filter]))
#         y_buckets.append(np.mean(y[filter]))
#     return x_buckets, y_buckets

In [9]:
# train["target"].mean()

In [10]:
# print("Mean target={} where actualdpdtolerance_344P != 0".format(train["target"].filter(train["actualdpdtolerance_344P"] != 0).mean()))
# print("Mean target={} where actualdpdtolerance_344P == 0".format(train["target"].filter(train["actualdpdtolerance_344P"] == 0).mean()))
# print("Mean target={} where actualdpdtolerance_344P is null".format(train["target"].filter(train["actualdpdtolerance_344P"].is_null()).mean()))

In [11]:
# columns = [column for column in train.columns if (train[column].dtype == pl.Float64)]

# figure, axis = plt.subplots(len(columns) // 5 + 1, 5, figsize=(15, 30))
# for index, column in enumerate(columns):
#     current_plot = axis[index // 5, index % 5]
#     x = train[column]
#     y = train["target"]
#     if (x.dtype == pl.String or x.dtype == pl.Boolean):
#         continue
#     filter = x.is_not_null()
#     x = x.filter(filter)
#     y = y.filter(filter)
#     if (x.is_empty()):
#         continue
#     x_bucket, y_bucket = bucket_mean_split(x.to_numpy(), y.to_numpy(), buckets=100)
#     current_plot.plot(x_bucket, y_bucket)
#     current_plot.set_title(column)
# plt.show()

In [12]:
# features = train_df.columns
# features.remove("WEEK_NUM")
# features.remove("case_id")
# features.remove("target")

In [13]:
# gc.collect()

### Feature Importance

In [14]:
# def train_rf(X, Y):    
#     params = {
#         "boosting_type": "gbdt",
#         "objective": "binary",
#         "metric": "auc",
#         "max_depth": 8,
#         "max_bin": 255,
#         "learning_rate": 0.05,
#         "n_estimators": 200,
#         "colsample_bytree": 0.8, 
#         "colsample_bynode": 0.8,
#         "verbose": -1,
#         "random_state": 42,
#         "device": "gpu",
#     }
    
#     model = lgb.LGBMClassifier(**params)
    
#     model.fit(X.to_pandas(), Y.to_pandas())
    
#     # Get feature importances
#     return pl.DataFrame({
#         "feature": list(X.columns),
#         "feature_importance": model.feature_importances_,
#         "train_score": roc_auc_score(Y.to_pandas(), model.predict(X.to_pandas()))
#     })

In [15]:
# def get_feature_importance(X, Y):
#     COUNT_ITER = 50
    
#     dfs = []
#     for i in range(COUNT_ITER):
#         gc.collect()
        
#         start_time = time.time()
#         shuffled_Y = Y.clone().sample(fraction=1.0, shuffle=True)
#         current_df = train_rf(X, shuffled_Y)
#         finish_time = time.time()
#         print(f"Finish iteration: {i + 1}/{COUNT_ITER}, time: {finish_time - start_time}")
#         current_df = current_df.with_columns(pl.lit(i).alias("iteration"))
#         dfs.append(current_df)
        
#     return pl.concat(dfs, rechunk=True), train_rf(X, Y)

In [16]:
# shuffled_feature_importance_df, feature_importance_df = get_feature_importance(train_df[features], train_df["target"])
# shuffled_feature_importance_df.write_csv("shuffled_feature_importance_df.csv", separator=",")
# feature_importance_df.write_csv("feature_importance_df.csv", separator=",")

In [17]:
# important_features = []
# for column in feature_importance_df["feature"]:
#     shuffled_feature_importance = shuffled_feature_importance_df.filter(shuffled_feature_importance_df["feature"] == column)["feature_importance"].to_numpy()
#     feature_importance = feature_importance_df.filter(feature_importance_df["feature"] == column)["feature_importance"][0]
    
#     if (np.percentile(shuffled_feature_importance, 90) <= feature_importance):
#         important_features.append(column)
# print(len(important_features), important_features)

# Training

In [18]:
# class VotingModel(BaseEstimator, RegressorMixin):
#     def __init__(self, estimators):
#         super().__init__()
#         self.estimators = estimators
        
#     def fit(self, X, y=None):
#         return self
    
#     def predict(self, X):
#         y_preds = [estimator.predict(X) for estimator in self.estimators]
#         return np.mean(y_preds, axis=0)
    
#     def predict_proba(self, X):
#         y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
#         return np.mean(y_preds, axis=0)
    
#     def get_feature_importance(self):
#         feature_importances = np.array([0 for feature in features])

#         for model in self.estimators:
#             feature_importances += model.feature_importances_

#         return feature_importances

In [19]:
# for column in features:
#     if (train_dataset[column].dtype != pl.Categorical) and \
#         (train_dataset[column].dtype != pl.Float64) and \
#         (train_dataset[column].dtype != pl.Float64):
#         print(column, train_dataset[column].dtype)

In [20]:
# def train_lgb_model(X_train, Y_train, X_test, Y_test):
#     params = {
#         "boosting_type": "gbdt",
#         "objective": "binary",
#         "metric": "auc",
#         "max_depth": 8,
#         "max_bin": 250,
#         "learning_rate": 0.05,
#         "n_estimators": 1000,
#         "colsample_bytree": 0.8, 
#         "colsample_bynode": 0.8,
#         "verbose": -1,
#         "random_state": 42,
#         "device": "gpu",
#     }
    
#     model = lgb.LGBMClassifier(**params)
    
#     model.fit(
#         X_train, Y_train,
#         eval_set=[(X_test, Y_test)],
#         callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
#     )
        
#     return model

In [21]:
# gc.collect()

In [22]:
# weeks = train_df["WEEK_NUM"]
# cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# fitted_models = []

# oof_predicted = np.zeros(train_df.shape[0])

# for idx_train, idx_test in cv.split(train_df[features], train_df["target"], groups=weeks):        
# #     X_train = train_df[features][idx_train].to_pandas()
# #     Y_train = train_df["target"][idx_train].to_pandas()
    
#     X_test = train_df[features][idx_test].to_pandas()
#     Y_test = train_df["target"][idx_test].to_pandas()
    
#     start = time.time()
#     params = {
#         "boosting_type": "gbdt",
#         "objective": "binary",
#         "metric": "auc",
#         "max_depth": 8,
#         "max_bin": 250,
#         "learning_rate": 0.05,
#         "n_estimators": 1000,
#         "colsample_bytree": 0.8, 
#         "colsample_bynode": 0.8,
#         "verbose": -1,
#         "random_state": 42,
#         "device": "gpu",
#     }
    
#     model = lgb.LGBMClassifier(**params)
    
#     model.fit(
#         train_df[features][idx_train].to_pandas(), train_df["target"][idx_train].to_pandas(),
#         eval_set=[(X_test, Y_test)],
#         callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
#     )
        

# #    model = train_lgb_model(X_train, Y_train, X_test, Y_test)
#     finish = time.time()
    
#     print("fit time: {}".format(finish - start))
#     fitted_models.append(model)
    
#     test_pred = model.predict_proba(X_test)[:, 1]
#     oof_predicted[idx_test] = test_pred
#     gc.collect()

# model = VotingModel(fitted_models)
# gc.collect()

In [23]:
# roc_auc_oof = roc_auc_score(train_df["target"], oof_predicted)
# print("CV roc_auc_oof: ", roc_auc_oof)

# Predict

In [24]:
# del train_df
# gc.collect()

In [25]:
# test_dataset = data_loader.load_test_dataset()
# print("Test dataset week range: {}-{}".format(test_dataset.base["WEEK_NUM"].min(), test_dataset.base["WEEK_NUM"].max()))
# gc.collect()
# test_df = preprocessor.process_test_dataset(test_dataset)
# del test_dataset
# gc.collect()

In [26]:
# print([name for value, name in reversed(sorted(zip(model.get_feature_importance(), features)))][0:200])

In [27]:
# CV roc_auc_oof:  0.8353789451006361 without feature selection ~500 features
# CV roc_auc_oof:  0.8329322959533371 with feature selection ~200 features

In [28]:
# def gini_stability(dataset, w_fallingrate=88.0, w_resstd=-0.5):
#     gini_in_time = dataset\
#         .sort_values("WEEK_NUM")\
#         .groupby("WEEK_NUM")[["true", "predicted"]]\
#         .apply(lambda x: 2 * roc_auc_score(x["true"], x["predicted"]) - 1).tolist()
    
#     x = np.arange(len(gini_in_time))
#     y = gini_in_time
#     a, b = np.polyfit(x, y, 1)
#     y_hat = a * x + b
#     residuals = y - y_hat
#     res_std = np.std(residuals)
#     avg_gini = np.mean(gini_in_time)
    
#     print(avg_gini, min(0, a), res_std)
    
#     return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

In [29]:
# # Y_train_predicted = model.predict_proba(train_df[features].to_pandas())
# Y_test_predicted = model.predict_proba(test_df[features].to_pandas())

In [30]:
# result = pd.DataFrame({
#     "WEEK_NUM": train_df["WEEK_NUM"],
#     "true": train_df["target"],
#     "predicted": Y_train_predicted[:, 1],
# })

# train_result = gini_stability(result)
# print("train_score: {}".format(train_result))

# if MODE == Mode.Train:
#     result = pd.DataFrame({
#         "WEEK_NUM": test_df["WEEK_NUM"],
#         "true": test_df["target"],
#         "predicted": Y_test_predicted[:, 1],
#     })

#     test_result = gini_stability(result)
#     print("test_score: {}".format(test_result))

In [31]:
# Version 1, test_score: 0.583319926590042, public score 0.48
# 100 iters

# Version 2, test_score: 0.6062624701357268, public score 0.517
# 300 iters

# Version 3, test_score: ?, public score 0.545
# 2000 iters

# Version 4, test_score: ?, public score ?
# ? iters, Improve aggregate functions for depth_1, depth_2


# Submission

In [32]:
# case_id = test_df["case_id"]
# X = test_df[features].to_pandas()

# Y = model.predict_proba(X)

# submission = pd.DataFrame({
#     "case_id": case_id.to_numpy(),
#     "score": Y[:, 1]
# }).set_index('case_id')
# submission.to_csv("./submission.csv")