In [1]:
import os
import gc
from glob import glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, ClassifierMixin

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def predict_proba_in_batches(model, data, batch_size=100000, predict_mode="base"):
    """Score ``data`` in consecutive row batches and return one score per row.

    Args:
        model: Fitted estimator. In ``"base"`` mode it must expose
            ``predict_proba`` returning a 2-D array (column 1 is kept); in
            ``"lightautoml"`` mode it must expose ``predict`` returning an
            object whose ``.data`` attribute is an array.
        data: Table supporting ``len`` and positional ``.iloc`` slicing
            (e.g. a pandas DataFrame).
        batch_size: Maximum number of rows passed to the model per call.
        predict_mode: ``"base"`` or ``"lightautoml"``.

    Returns:
        1-D float ``numpy`` array of length ``len(data)``.

    Raises:
        ValueError: If ``predict_mode`` is not recognised or ``batch_size``
            is not positive.
    """
    # Validate up front: previously an unknown mode left ``batch_probs``
    # unbound inside the loop, and a negative batch size returned all zeros.
    if predict_mode not in ("base", "lightautoml"):
        raise ValueError(
            f"Unknown predict_mode {predict_mode!r}; expected 'base' or 'lightautoml'"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    num_samples = len(data)
    num_batches = int(np.ceil(num_samples / batch_size))
    probabilities = np.zeros((num_samples,))

    batch_starts = range(0, num_samples, batch_size)
    for batch_number, start_idx in enumerate(batch_starts, start=1):
        print(f"Processing batch: {batch_number}/{num_batches}")
        end_idx = min(start_idx + batch_size, num_samples)
        X_batch = data.iloc[start_idx:end_idx]
        if predict_mode == "base":
            batch_probs = model.predict_proba(X_batch)[:, 1]
        else:  # "lightautoml"
            batch_probs = model.predict(X_batch).data.squeeze()
        probabilities[start_idx:end_idx] = batch_probs
        # Release per-batch temporaries before scoring the next slice.
        gc.collect()

    return probabilities

class DataPreprocessor:
    """Stateless polars helpers: dtype casting, date offsets and column pruning."""

    @staticmethod
    def transform_data_types(df):
        """Cast columns to a dtype selected from the column name.

        ``case_id``, ``WEEK_NUM``, ``num_group1`` and ``num_group2`` become
        ``Int32``; ``date_decision`` and names ending in ``D`` become ``Date``;
        names ending in ``P`` or ``A`` become ``Float64``; names ending in
        ``M`` become ``String``. Any other column is left untouched.
        """
        integer_columns = ("case_id", "WEEK_NUM", "num_group1", "num_group2")
        dtype_by_suffix = {
            "P": pl.Float64,
            "A": pl.Float64,
            "M": pl.String,
            "D": pl.Date,
        }

        casts = []
        for name in df.columns:
            if name in integer_columns:
                target = pl.Int32
            elif name == "date_decision":
                target = pl.Date
            else:
                target = dtype_by_suffix.get(name[-1])

            if target is not None:
                casts.append(pl.col(name).cast(target))

        return df.with_columns(casts)

    @staticmethod
    def process_dates(df):
        """Turn every ``*D`` column into a day offset from ``date_decision``.

        Each column whose name ends in ``D`` is replaced by
        ``column - date_decision`` expressed in whole days as ``Float32``.
        ``date_decision`` and ``MONTH`` are dropped afterwards.
        """
        day_offsets = [
            (pl.col(name) - pl.col("date_decision"))
            .dt.total_days()
            .cast(pl.Float32)
            for name in df.columns
            if name[-1] == "D"
        ]

        return df.with_columns(day_offsets).drop("date_decision", "MONTH")

    @staticmethod
    def filter_columns(df):
        """Drop sparse columns and uninformative string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. A column
        is removed when more than 95% of its values are null; of the
        remaining columns, a ``String`` column is removed when it has exactly
        one distinct value or more than 200 distinct values.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        mostly_null = [
            name
            for name in df.columns
            if name not in protected and df[name].is_null().mean() > 0.95
        ]
        df = df.drop(mostly_null)

        uninformative = []
        for name in df.columns:
            if name in protected or not (df[name].dtype == pl.String):
                continue
            distinct = df[name].n_unique()
            if distinct == 1 or distinct > 200:
                uninformative.append(name)

        return df.drop(uninformative)

# Data aggregation class
class DataAggregator:
    """Builds the ``max`` aggregation expressions applied per ``case_id``."""

    @staticmethod
    def get_expressions(df):
        """Return one ``pl.max(col).alias("max_<col>")`` per selected column.

        Columns are selected, in this order: names ending in ``P``/``A``,
        names ending in ``D``, names ending in ``M``, names ending in
        ``T``/``L``, and finally names containing ``"num_group"``.
        """
        names = df.columns
        suffix_groups = (("P", "A"), ("D",), ("M",), ("T", "L"))

        selected = []
        for suffixes in suffix_groups:
            selected.extend(name for name in names if name[-1] in suffixes)
        # Group counters are matched by substring rather than by suffix.
        selected.extend(name for name in names if "num_group" in name)

        return [pl.max(name).alias(f"max_{name}") for name in selected]


# Data-reading functions
def read_data_file(file_path, depth=None):
    """Read one parquet file, cast its dtypes and optionally aggregate it.

    Args:
        file_path: Path of the parquet file to load.
        depth: When 1 or 2, rows are grouped by ``case_id`` and reduced with
            the expressions from ``DataAggregator.get_expressions``; for any
            other value the typed frame is returned as is.

    Returns:
        The resulting polars DataFrame.
    """
    frame = DataPreprocessor.transform_data_types(pl.read_parquet(file_path))

    if depth not in (1, 2):
        return frame

    aggregations = DataAggregator.get_expressions(frame)
    return frame.group_by("case_id").agg(aggregations)
def read_files(file_pattern, depth=None):
    """Read every parquet file matching a glob pattern into one frame.

    Each matching file is loaded through ``read_data_file`` (dtype casting
    plus optional per-``case_id`` aggregation), the pieces are concatenated
    with ``how="vertical_relaxed"`` and de-duplicated on ``case_id``.

    Args:
        file_pattern: Glob pattern (string or path-like) of the files to read.
        depth: Forwarded to ``read_data_file``; 1 or 2 enables aggregation.

    Returns:
        A polars DataFrame with at most one row per ``case_id``.

    Raises:
        FileNotFoundError: If no file matches ``file_pattern``.
    """
    # glob() order is filesystem-dependent; sort so runs are reproducible.
    paths = sorted(glob(str(file_pattern)))
    if not paths:
        raise FileNotFoundError(f"No files match pattern: {file_pattern}")

    # Reuse the single-file reader instead of duplicating its logic.
    chunks = [read_data_file(path, depth) for path in paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    # NOTE(review): aggregation happens per file, so a case_id spread over
    # several files would keep only one file's aggregate here. Presumably the
    # shards are disjoint on case_id - confirm against the data.
    df = df.unique(subset=["case_id"])

    return df


# Main part of the script: locations of the parquet data
ROOT = Path("C:\\Users\\Daniil Bokhan\\Desktop\\csv\\bankcredit-riskCOMPETITION")
TRAIN_DIR = ROOT.joinpath("parquet_files", "train")
TEST_DIR = ROOT.joinpath("parquet_files", "test")

def _load_data_store(data_dir, prefix):
    """Read every table of one data split into the dict ``feature_eng`` takes.

    Args:
        data_dir: Directory holding the split's parquet files.
        prefix: File-name prefix of the split (``"train"`` or ``"test"``).

    Returns:
        Dict with keys ``df_base`` (one frame) and ``depth_0``, ``depth_1``,
        ``depth_2`` (lists of frames). Depth-1 and depth-2 tables are
        aggregated per ``case_id`` by the readers.
    """

    def single(table, depth=None):
        # One parquet file: <prefix>_<table>.parquet
        return read_data_file(data_dir / f"{prefix}_{table}.parquet", depth)

    def sharded(table, depth=None):
        # Several numbered parquet files: <prefix>_<table>_*.parquet
        return read_files(data_dir / f"{prefix}_{table}_*.parquet", depth)

    return {
        "df_base": single("base"),
        "depth_0": [
            single("static_cb_0"),
            sharded("static_0"),
        ],
        "depth_1": [
            sharded("applprev_1", 1),
            single("tax_registry_a_1", 1),
            single("tax_registry_b_1", 1),
            single("tax_registry_c_1", 1),
            sharded("credit_bureau_a_1", 1),
            single("credit_bureau_b_1", 1),
            single("other_1", 1),
            single("person_1", 1),
            single("deposit_1", 1),
            single("debitcard_1", 1),
        ],
        "depth_2": [
            single("credit_bureau_b_2", 2),
            sharded("credit_bureau_a_2", 2),
        ],
    }


# Read and preprocess the training set
train_data_store = _load_data_store(TRAIN_DIR, "train")

# Read and preprocess the test set
test_data_store = _load_data_store(TEST_DIR, "test")

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Join all side tables onto the base table and convert date columns.

    Args:
        df_base: Base frame; must contain ``date_decision`` and ``case_id``.
        depth_0: Frames to join, processed first.
        depth_1: Frames to join, processed second.
        depth_2: Frames to join, processed last.

    Returns:
        The base frame with ``month_decision`` and ``weekday_decision``
        added, every side table left-joined on ``case_id`` (clashing column
        names get the suffix ``_<position>``), passed through
        ``DataPreprocessor.process_dates``.
    """
    merged = df_base.with_columns(
        month_decision=pl.col("date_decision").dt.month(),
        weekday_decision=pl.col("date_decision").dt.weekday(),
    )

    side_tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(side_tables):
        merged = merged.join(
            table, how="left", on="case_id", suffix=f"_{position}"
        )

    return DataPreprocessor.process_dates(merged)


# Build the merged feature tables for the training and test sets
df_train = feature_eng(**train_data_store)
df_test = feature_eng(**test_data_store)

# Column filtering is decided on train; test is then aligned to the kept columns
df_train = DataPreprocessor.filter_columns(df_train)
df_test = df_test.select(
    [name for name in df_train.columns if name != "target"]
)

def to_pandas(df_data, cat_cols=None):
    """Convert a frame to pandas and mark categorical columns.

    Args:
        df_data: Object exposing ``to_pandas()`` (e.g. a polars DataFrame).
        cat_cols: Column names to cast to ``category``. When ``None``, every
            ``object``-dtype column of the converted frame is used.

    Returns:
        Tuple ``(pandas_frame, cat_cols)`` where ``cat_cols`` is the list
        that was actually applied.
    """
    frame = df_data.to_pandas()

    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()

    categorical_part = frame[cat_cols].astype("category")
    frame[cat_cols] = categorical_part

    return frame, cat_cols
# Categorical columns are detected on train and re-applied to test
df_train, cat_cols = to_pandas(df_train)
df_test, cat_cols = to_pandas(df_test, cat_cols)

# Free the raw per-table frames now that the merged tables exist
del train_data_store, test_data_store
gc.collect()



In [None]:

from sklearn.model_selection import GroupShuffleSplit,train_test_split
# Features, target and the week label used as the grouping key
X = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
y = df_train["target"]
weeks = df_train["WEEK_NUM"]

# One grouped 70/30 split: all rows of a given week land on the same side
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, val_index = next(gss.split(X, y, groups=weeks))

X_train, X_val = X.iloc[train_index], X.iloc[val_index]
y_train, y_val = y.iloc[train_index], y.iloc[val_index]
weeks_train, weeks_val = weeks.iloc[train_index], weeks.iloc[val_index]

In [None]:




from sklearn.model_selection import GroupShuffleSplit,train_test_split
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score


def gini_stability(base, y_true, weeks, w_fallingrate=88.0, w_resstd=-0.5):
    """Combine per-group Gini, its linear trend and its residual spread.

    Args:
        base: pandas DataFrame with a ``predictions`` column.
        y_true: Labels, looked up by the index labels of ``base``.
        weeks: Grouping key passed to ``base.groupby``.
        w_fallingrate: Weight applied to the fitted slope when it is negative.
        w_resstd: Weight applied to the standard deviation of the residuals
            around the fitted line.

    Returns:
        ``mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)``
        where ``gini = 2 * AUC - 1`` is computed for each group.
    """

    def _group_gini(group):
        auc = roc_auc_score(y_true.loc[group.index], group["predictions"])
        return 2 * auc - 1

    weekly_gini = base.groupby(weeks).apply(_group_gini).tolist()

    # Fit a straight line through the Gini values, in group order.
    positions = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(positions, weekly_gini, 1)
    fitted = slope * positions + intercept

    residual_std = np.std(weekly_gini - fitted)
    mean_gini = np.mean(weekly_gini)

    # Only a downward trend is penalised: min(0, slope) is zero otherwise.
    return mean_gini + w_fallingrate * min(0, slope) + w_resstd * residual_std

def objective(trial, X_train, X_val, y_train, y_val, weeks_train, weeks_val):
    """Optuna objective: fit one LightGBM candidate, return validation ROC AUC.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.
        weeks_train: Accepted for interface compatibility; not used here.
        weeks_val: Accepted for interface compatibility; not used here.

    Returns:
        ROC AUC of the fitted classifier on the validation split.
    """
    param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 42,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
        "n_estimators": trial.suggest_int("n_estimators", 600, 2000),
        # The Optuna names "lambda"/"alpha" are kept so study.best_params keys
        # do not change, but the values are passed under LightGBM's explicit
        # regularisation names. A bare "alpha" is the huber/quantile loss
        # parameter, not L1 regularisation, so it had no effect on a binary
        # objective.
        # NOTE: when reusing best_params, map "lambda" -> reg_lambda and
        # "alpha" -> reg_alpha.
        "reg_lambda": trial.suggest_float("lambda", 0.01, 1),
        "reg_alpha": trial.suggest_float("alpha", 0.01, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        # Row subsampling is only applied when the bagging frequency is
        # non-zero; without this the tuned "subsample" value is ignored.
        "subsample_freq": 1,
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 20),
        "device": "gpu",
    }

    lgbm_classifier = LGBMClassifier(**param)
    lgbm_classifier.fit(X_train, y_train)

    predictions = lgbm_classifier.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, predictions)
def _run_trial(trial):
    """Bind the fixed train/validation split to ``objective`` for Optuna."""
    return objective(
        trial, X_train, X_val, y_train, y_val, weeks_train, weeks_val
    )


# Seeded TPE sampler so the search is repeatable
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(_run_trial, n_trials=500)

best_params = study.best_params
print("=" * 50)
print(best_params)
