In [1]:
import os
import gc
from glob import glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, ClassifierMixin

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def predict_proba_in_batches(model, data, batch_size=100000, predict_mode="base"):
    """Score ``data`` slice by slice and return one score per row.

    Parameters
    ----------
    model : object
        With ``predict_mode="base"`` it must expose ``predict_proba`` returning a
        2-D array; column 1 is kept. With ``predict_mode="lightautoml"`` it must
        expose ``predict`` returning an object whose ``.data`` array is squeezed.
    data : object supporting ``len()`` and ``.iloc`` row slicing
        The rows to score.
    batch_size : int, default 100000
        Maximum number of rows handed to the model per call.
    predict_mode : {"base", "lightautoml"}, default "base"
        Selects which prediction interface of ``model`` is used.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(data)``.

    Raises
    ------
    ValueError
        If ``predict_mode`` is not one of the supported modes or ``batch_size``
        is not positive.
    """
    # Validate up front: previously an unknown mode left ``batch_probs`` unbound
    # inside the loop, and a non-positive batch size divided by zero or
    # silently returned all zeros.
    if predict_mode not in ("base", "lightautoml"):
        raise ValueError(
            f"Unknown predict_mode {predict_mode!r}; expected 'base' or 'lightautoml'"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    num_samples = len(data)
    num_batches = int(np.ceil(num_samples / batch_size))
    probabilities = np.zeros((num_samples,))

    for batch_idx, start_idx in enumerate(range(0, num_samples, batch_size)):
        print(f"Processing batch: {batch_idx+1}/{num_batches}")
        end_idx = min(start_idx + batch_size, num_samples)
        X_batch = data.iloc[start_idx:end_idx]
        if predict_mode == "base":
            batch_probs = model.predict_proba(X_batch)[:, 1]
        else:  # "lightautoml"
            batch_probs = model.predict(X_batch).data.squeeze()
        probabilities[start_idx:end_idx] = batch_probs
        # Release the batch's temporaries before the next slice is materialised.
        gc.collect()

    return probabilities

class DataPreprocessor:
    """Stateless polars helpers: dtype normalisation, date conversion, column pruning."""

    @staticmethod
    def transform_data_types(df):
        """Cast columns to a dtype selected from the column name.

        ``case_id``, ``WEEK_NUM``, ``num_group1`` and ``num_group2`` become Int32;
        ``date_decision`` becomes Date; names ending in ``P``/``A`` become Float64,
        in ``M`` String and in ``D`` Date. Other columns are left untouched.
        """
        integer_keys = ("case_id", "WEEK_NUM", "num_group1", "num_group2")

        casts = []
        for name in df.columns:
            suffix = name[-1]
            if name in integer_keys:
                dtype = pl.Int32
            elif name == "date_decision":
                dtype = pl.Date
            elif suffix == "P" or suffix == "A":
                dtype = pl.Float64
            elif suffix == "M":
                dtype = pl.String
            elif suffix == "D":
                dtype = pl.Date
            else:
                continue
            casts.append(pl.col(name).cast(dtype))

        # All casts are independent, so they are applied in a single pass.
        return df.with_columns(casts)

    @staticmethod
    def process_dates(df):
        """Replace every ``*D`` column by its day offset from ``date_decision``.

        The offset is stored as Float32. ``date_decision`` and ``MONTH`` are
        dropped from the returned frame.
        """
        day_offsets = [
            (pl.col(name) - pl.col("date_decision")).dt.total_days().cast(pl.Float32)
            for name in df.columns
            if name[-1] == "D"
        ]
        return df.with_columns(day_offsets).drop("date_decision", "MONTH")

    @staticmethod
    def filter_columns(df):
        """Drop sparse columns and uninformative / high-cardinality string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. A column is
        removed when more than 95% of its values are null; a String column is
        additionally removed when it has exactly 1 or more than 200 distinct values.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        # Pass 1: mostly-null columns.
        for name in df.columns:
            if name in protected:
                continue
            if df[name].is_null().mean() > 0.95:
                df = df.drop(name)

        # Pass 2: string columns that are constant or have too many levels.
        for name in df.columns:
            if name in protected or df[name].dtype != pl.String:
                continue
            distinct = df[name].n_unique()
            if distinct == 1 or distinct > 200:
                df = df.drop(name)

        return df

# Data aggregation class
class DataAggregator:
    """Builds the aggregation expressions applied per ``case_id`` group."""

    @staticmethod
    def get_expressions(df):
        """Return ``max`` aggregations, aliased ``max_<column>``, for selected columns.

        Columns are picked by name in this order: suffix ``P``/``A``, suffix ``D``,
        suffix ``M``, suffix ``T``/``L``, and finally any name containing
        ``num_group``. Within each group the frame's column order is kept.
        """
        selectors = (
            lambda name: name[-1] in ("P", "A"),   # numeric
            lambda name: name[-1] == "D",          # dates
            lambda name: name[-1] == "M",          # strings
            lambda name: name[-1] in ("T", "L"),   # other
            lambda name: "num_group" in name,      # group counters
        )

        return [
            pl.max(name).alias(f"max_{name}")
            for is_selected in selectors
            for name in df.columns
            if is_selected(name)
        ]


# Data-reading functions
def read_data_file(file_path, depth=None):
    """Read one parquet file, normalise its dtypes and optionally aggregate it.

    When ``depth`` is 1 or 2 the frame is grouped by ``case_id`` and reduced
    with the expressions from ``DataAggregator.get_expressions``; for any other
    ``depth`` the type-normalised frame is returned as is.
    """
    frame = DataPreprocessor.transform_data_types(pl.read_parquet(file_path))

    if depth not in (1, 2):
        return frame

    return frame.group_by("case_id").agg(DataAggregator.get_expressions(frame))
def read_files(file_pattern, depth=None):
    """Read every parquet file matching ``file_pattern`` into one frame.

    Each matching file is loaded through ``read_data_file`` (same dtype
    normalisation and, for ``depth`` 1 or 2, the same per-``case_id``
    aggregation). The pieces are concatenated with ``how="vertical_relaxed"``
    and reduced to unique ``case_id`` rows.

    Parameters
    ----------
    file_pattern : str or path-like
        Glob pattern of the files to read.
    depth : int or None
        Forwarded to ``read_data_file``.

    Raises
    ------
    FileNotFoundError
        If no file matches ``file_pattern``.
    """
    # Sort so the read order does not depend on the file system's listing order.
    matched_paths = sorted(glob(str(file_pattern)))
    if not matched_paths:
        # Fail with the pattern in the message instead of an opaque concat error.
        raise FileNotFoundError(f"No parquet files match pattern: {file_pattern}")

    chunks = [read_data_file(path, depth) for path in matched_paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])

    return df


# Main part of the notebook: data locations.
# The data root can be overridden with the CREDIT_RISK_DATA_ROOT environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local path is used.
ROOT            = Path(
    os.environ.get(
        "CREDIT_RISK_DATA_ROOT",
        "C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION",
    )
)
TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

def _load_data_store(data_dir, split):
    """Read all tables of one split (``"train"`` or ``"test"``) from ``data_dir``.

    Returns a dict with the keys ``df_base``, ``depth_0``, ``depth_1`` and
    ``depth_2``. File names follow ``<split>_<table>.parquet`` for single files
    and ``<split>_<table>_*.parquet`` for sharded tables. The list order is
    significant: it determines the join order and column suffixes downstream.
    """
    def single(table, depth=None):
        # One parquet file per table.
        return read_data_file(data_dir / f"{split}_{table}.parquet", depth)

    def sharded(table, depth=None):
        # Table split across several numbered parquet files.
        return read_files(data_dir / f"{split}_{table}_*.parquet", depth)

    return {
        "df_base": single("base"),
        "depth_0": [
            single("static_cb_0"),
            sharded("static_0"),
        ],
        "depth_1": [
            sharded("applprev_1", 1),
            single("tax_registry_a_1", 1),
            single("tax_registry_b_1", 1),
            single("tax_registry_c_1", 1),
            sharded("credit_bureau_a_1", 1),
            single("credit_bureau_b_1", 1),
            single("other_1", 1),
            single("person_1", 1),
            single("deposit_1", 1),
            single("debitcard_1", 1),
        ],
        "depth_2": [
            single("credit_bureau_b_2", 2),
            sharded("credit_bureau_a_2", 2),
        ],
    }


# Read and preprocess the training set.
train_data_store = _load_data_store(TRAIN_DIR, "train")

# Read and preprocess the test set.
test_data_store = _load_data_store(TEST_DIR, "test")

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Join all depth tables onto the base table and convert date columns.

    Adds ``month_decision`` and ``weekday_decision`` derived from
    ``date_decision``, left-joins every table from ``depth_0``, ``depth_1`` and
    ``depth_2`` (in that order) on ``case_id`` using ``_<position>`` as the
    suffix for clashing names, then applies ``DataPreprocessor.process_dates``.
    """
    decision_date = pl.col("date_decision")
    merged = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    side_tables = [*depth_0, *depth_1, *depth_2]
    for position, table in enumerate(side_tables):
        merged = merged.join(table, how="left", on="case_id", suffix=f"_{position}")

    return DataPreprocessor.process_dates(merged)


# Assemble the feature tables for the training and test sets.
df_train = feature_eng(**train_data_store)
df_test = feature_eng(**test_data_store)

# Prune columns on the training table, then keep the same columns
# (except the target) in the test table.
df_train = DataPreprocessor.filter_columns(df_train)
df_test = df_test.select(
    [name for name in df_train.columns if name != "target"]
)

def to_pandas(df_data, cat_cols=None):
    """Convert ``df_data`` to pandas and cast the categorical columns.

    Parameters
    ----------
    df_data : object with a ``to_pandas()`` method
        The frame to convert.
    cat_cols : list of str or None
        Columns to cast to ``category``. When ``None`` every ``object``-dtype
        column of the converted frame is used.

    Returns
    -------
    tuple
        ``(pandas_frame, cat_cols)`` where ``cat_cols`` is the list actually used.
    """
    frame = df_data.to_pandas()

    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()

    as_category = frame[cat_cols].astype("category")
    frame[cat_cols] = as_category

    return frame, cat_cols
# Train decides which columns are categorical; test reuses that list.
df_train, cat_cols = to_pandas(df_train)
df_test, cat_cols = to_pandas(df_test, cat_cols)

# Free memory: the raw per-table stores are no longer needed.
del train_data_store, test_data_store
gc.collect()



0

In [3]:

from sklearn.model_selection import GroupShuffleSplit,train_test_split
# Feature matrix, target and the week label used as the grouping key.
X = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
# Cast the categorical columns to plain strings on X itself. The only other
# conversion in the notebook acts on df_train in a cell located after this
# one, so on a top-to-bottom run it would never reach X_train / X_val.
# If df_train was already converted this is a no-op.
X[cat_cols] = X[cat_cols].astype(str)
y = df_train["target"]
weeks = df_train["WEEK_NUM"]

# Single group-aware split: all rows of a given week land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, val_index = next(gss.split(X, y, groups=weeks))

X_train, X_val = X.iloc[train_index], X.iloc[val_index]
y_train, y_val = y.iloc[train_index], y.iloc[val_index]
weeks_train, weeks_val = weeks.iloc[train_index], weeks.iloc[val_index]

In [2]:
# Convert the categorical columns of df_train to plain strings.
# NOTE(review): this cell is numbered In [2] but sits after the In [3] split
# cell in the file. X / X_train / X_val are built from df_train earlier in the
# file, so when cells run top to bottom they are unaffected by this statement.
# Confirm the intended cell order.
df_train[cat_cols] = df_train[cat_cols].astype(str)

In [4]:
# CatBoost hyper-parameters.
# NOTE(review): the training log recorded under this cell ends with
# "Training has stopped (degenerate solution ... probably too small
# l2-regularization, try to increase it)" — l2_leaf_reg likely needs raising.
params = dict(
    eval_metric="AUC",
    boosting_type="Plain",
    random_seed=42,
    iterations=4500,
    learning_rate=0.01505646379545813,
    min_data_in_leaf=32,
    l2_leaf_reg=4.955169551336579e-08,
    random_strength=1.5828504506752107e-07,
)

from sklearn.model_selection import GroupShuffleSplit, train_test_split
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from catboost import Pool, CatBoostClassifier

# Wrap the training split in a CatBoost Pool, declaring cat_cols as categorical features.
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
# Build the classifier from the `params` dict defined above in this cell.
CatBoostClassifierm = CatBoostClassifier(**params)
# The validation split is passed as eval_set; verbose=100 logs every 100th iteration.
CatBoostClassifierm.fit(train_pool, eval_set=(X_val, y_val), verbose=100)

# Positive-class probabilities on the validation split and their ROC AUC.
predictions = CatBoostClassifierm.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, predictions)
print("AUC on validation set:", auc_score)


0:	test: 0.7202152	best: 0.7202152 (0)	total: 3.61s	remaining: 4h 30m 42s
100:	test: 0.8088639	best: 0.8092508 (91)	total: 3m 35s	remaining: 2h 36m 45s
200:	test: 0.8152960	best: 0.8152960 (200)	total: 7m 5s	remaining: 2h 31m 44s
300:	test: 0.8226839	best: 0.8226974 (298)	total: 10m 39s	remaining: 2h 28m 35s
400:	test: 0.8277173	best: 0.8277173 (400)	total: 14m 10s	remaining: 2h 24m 56s
500:	test: 0.8314860	best: 0.8314860 (500)	total: 17m 38s	remaining: 2h 20m 52s
600:	test: 0.8340490	best: 0.8340490 (600)	total: 21m 10s	remaining: 2h 17m 24s
700:	test: 0.8362111	best: 0.8362111 (700)	total: 24m 44s	remaining: 2h 14m 6s
800:	test: 0.8376132	best: 0.8376132 (800)	total: 28m 17s	remaining: 2h 10m 36s
900:	test: 0.8387954	best: 0.8387954 (900)	total: 31m 48s	remaining: 2h 7m 2s
1000:	test: 0.8398947	best: 0.8398947 (1000)	total: 35m 18s	remaining: 2h 3m 24s
1100:	test: 0.8407289	best: 0.8407289 (1100)	total: 38m 47s	remaining: 1h 59m 45s
1200:	test: 0.8414548	best: 0.8414548 (1200)	total

Training has stopped (degenerate solution on iteration 1556, probably too small l2-regularization, try to increase it)



bestTest = 0.8434644496
bestIteration = 1555

Shrink model to first 1556 iterations.
AUC on validation set: 0.8434644496223225


In [4]:
from sklearn.model_selection import GroupShuffleSplit,train_test_split  # Importing GroupShuffleSplit for group-based splitting
import optuna  # Importing Optuna for hyperparameter optimization
from lightgbm import LGBMClassifier  # Importing LightGBM classifier
from sklearn.metrics import roc_auc_score  # Importing ROC AUC score metric

# Defining a custom metric to optimize for gini stability
def gini_stability(base, y_true, weeks, w_fallingrate=88.0, w_resstd=-0.5):
    """Mean weekly Gini, penalised for a falling trend and for volatility.

    Parameters
    ----------
    base : pandas.DataFrame
        Must contain a ``predictions`` column; its index is used to look up
        the matching labels in ``y_true``.
    y_true : pandas.Series
        True labels, addressable by the index labels of ``base``.
    weeks : grouping key accepted by ``DataFrame.groupby``
        One Gini value (``2 * AUC - 1``) is computed per group.
    w_fallingrate : float
        Weight applied to ``min(0, slope)`` of the linear fit over the weekly Gini.
    w_resstd : float
        Weight applied to the standard deviation of the fit residuals.

    Returns
    -------
    float
        ``mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)``.
    """
    # One Gini per group, in the (sorted) order groupby yields the groups.
    weekly_gini = [
        2 * roc_auc_score(y_true.loc[chunk.index], chunk["predictions"]) - 1
        for _, chunk in base.groupby(weeks)
    ]

    # Straight-line fit of Gini against the group's position.
    position = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(position, weekly_gini, 1)
    fitted = slope * position + intercept
    residual_std = np.std(weekly_gini - fitted)

    mean_gini = np.mean(weekly_gini)
    return mean_gini + w_fallingrate * min(0, slope) + w_resstd * residual_std

# Defining the objective function for Optuna optimization
def objective(trial, X_train, X_val, y_train, y_val, weeks_train, weeks_val):
    """Optuna objective: train one LightGBM model and score it with gini_stability.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyper-parameter suggestions.
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.
    weeks_train : unused; kept so existing callers keep working.
    weeks_val : week label for each validation row, used to group the metric.

    Returns
    -------
    float
        The gini stability score on the validation split (higher is better).
    """
    # Fixed settings plus the search space. ``suggest_float`` replaces the
    # deprecated ``suggest_uniform`` / ``suggest_loguniform`` with the same ranges.
    param = {
        "objective": "binary",  # binary classification
        "metric": "auc",  # Area Under ROC Curve
        "verbosity": -1,  # silence training output
        "boosting_type": "gbdt",  # gradient boosted decision trees
        "random_state": 42,  # reproducibility
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 70),
        "max_depth": trial.suggest_int("max_depth", 5, 16),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),  # L1
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),  # L2
        "n_estimators": trial.suggest_int("n_estimators", 1200, 2800),
        "min_split_gain": trial.suggest_float("min_split_gain", 1e-4, 0.1, log=True),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "cat_smooth": trial.suggest_int("cat_smooth", 10, 50),
        "max_bin": trial.suggest_int("max_bin", 150, 500),
        "max_delta_step": trial.suggest_int("max_delta_step", 1, 10),
    }

    # Use the trial's own parameters. The previous code unpacked the global
    # ``params`` (the CatBoost settings), so every trial ignored its suggestions.
    lgbm_classifier = LGBMClassifier(**param)
    lgbm_classifier.fit(X_train, y_train)

    # Positive-class probabilities on the validation split.
    predictions = lgbm_classifier.predict_proba(X_val)[:, 1]

    # gini_stability only reads the ``predictions`` column and the index, so a
    # one-column frame aligned to X_val is sufficient (no copy of X_val needed).
    base = pd.DataFrame({"predictions": predictions}, index=X_val.index)

    return gini_stability(base, y_val, weeks_val)

# Seeded TPE sampler so the sequence of suggestions is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)

# Study that maximises the value returned by `objective`.
study = optuna.create_study(sampler=sampler, direction="maximize")

# Run 2000 trials; the lambda binds the data splits to the objective.
study.optimize(
    lambda trial: objective(
        trial, X_train, X_val, y_train, y_val, weeks_train, weeks_val
    ),
    n_trials=2000,
)

# Best parameters found during the optimisation.
best_params = study.best_params
print("=" * 50)
print(best_params)


[I 2024-05-16 14:40:13,381] A new study created in memory with name: no-name-d9c99e2a-253f-40d8-ae54-f6d5b7d94ae4


In [4]:
# Display the contents of `best_params`.
# NOTE(review): the output recorded below lists keys such as feature_fraction,
# bagging_fraction and min_child_weight, and n_estimators = 753, none of which
# the search space in objective() can produce (n_estimators is drawn from
# 1200..2800). The output looks stale; confirm by re-running.
best_params

{'learning_rate': 0.04643712833927281,
 'n_estimators': 753,
 'reg_alpha': 0.07054742991649575,
 'reg_lambda': 0.09472469242094589,
 'max_depth': 9,
 'num_leaves': 64,
 'min_data_in_leaf': 35,
 'feature_fraction': 0.49098130041779536,
 'bagging_fraction': 0.7868937212510314,
 'bagging_freq': 5,
 'min_child_samples': 38,
 'min_split_gain': 0.2273261520885937,
 'min_child_weight': 0.029529028536948946}

In [3]:
from lightgbm import LGBMClassifier 
from sklearn.ensemble import VotingClassifier
# Parameter sets for the five ensemble members.
# The previous placeholders ({1}, {2}, ...) were *set* literals, so
# LGBMClassifier(**best_params_lgbmN) raised TypeError (** needs a mapping).
# Each member now starts from the tuned `best_params` and differs only by its
# random seed, taken from the original placeholder number.
# TODO: replace with individually tuned parameter sets if they exist.
best_params_lgbm1 = {**best_params, "random_state": 1}
best_params_lgbm2 = {**best_params, "random_state": 2}
best_params_lgbm3 = {**best_params, "random_state": 3}
best_params_lgbm4 = {**best_params, "random_state": 4}
best_params_lgbm5 = {**best_params, "random_state": 5}


# One LightGBM model per parameter set, built in order 1..5.
lgbm_model_1, lgbm_model_2, lgbm_model_3, lgbm_model_4, lgbm_model_5 = (
    LGBMClassifier(**param_set)
    for param_set in (
        best_params_lgbm1,
        best_params_lgbm2,
        best_params_lgbm3,
        best_params_lgbm4,
        best_params_lgbm5,
    )
)

# Build a soft-voting ensemble from the LGBM models (named lgbm1..lgbm5).
voting_classifier = VotingClassifier(
    estimators=[
        (f"lgbm{rank}", member)
        for rank, member in enumerate(
            (lgbm_model_1, lgbm_model_2, lgbm_model_3, lgbm_model_4, lgbm_model_5),
            start=1,
        )
    ],
    voting='soft',
)
voting_classifier.fit(X_train, y_train)


