In [None]:
import gc
import os
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

# Folder that holds the pickled intermediate feature tables.
DATA_FEATURES_FOLDER = "data/features"
# Maps the frequency code passed to `resample` to the period name used in
# cache file names and feature-column prefixes.
TIME_PERIOD_MAPPING = {"M": "month", "W": "week", "D": "day"}

In [None]:
def reduce_df_size(
    df: pd.DataFrame, force_cat: list | None = None, drop: list | None = None
) -> pd.DataFrame:

    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

    # drops
    if drop:
        for drop_col in drop:
            df = df.drop(drop_col, axis=1)

    float_columns = df.select_dtypes("float").columns
    integer_columns = df.select_dtypes("integer").columns

    # for float all are possible from float to unsigned
    df[float_columns] = df[float_columns].apply(pd.to_numeric, downcast="float")
    df[float_columns] = df[float_columns].apply(pd.to_numeric, downcast="integer")
    df[float_columns] = df[float_columns].apply(pd.to_numeric, downcast="unsigned")

    # for integer two possibilities
    df[integer_columns] = df[integer_columns].apply(pd.to_numeric, downcast="integer")
    df[integer_columns] = df[integer_columns].apply(pd.to_numeric, downcast="unsigned")

    # category
    if force_cat:
        for category_col in force_cat:
            df[category_col] = df[category_col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%")
    return df


def cache_or_generate(filename, generator_function, load, *args, **kwargs):
    """Return data from a pickle cache, generating and caching it on a miss.

    Args:
        filename: Path of the pickle file used as cache.
        generator_function: Callable invoked as
            ``generator_function(*args, **kwargs)`` when the cache is missing.
        load: When falsy the stage is skipped: nothing is read or generated
            and ``None`` is returned.
        *args: Positional arguments forwarded to ``generator_function``.
        **kwargs: Keyword arguments forwarded to ``generator_function``.

    Returns:
        The cached or freshly generated object, or ``None`` when ``load`` is
        falsy.
    """
    if not load:
        print("Not loading")
        return None

    if os.path.exists(filename):
        print(f"Loading data from {filename}")
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # cache files produced by this notebook.
        with open(filename, "rb") as f:
            return pickle.load(f)

    print(f"Generating and saving data to {filename}")
    data = generator_function(*args, **kwargs)

    # The cache folder may not exist yet on a fresh checkout.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a temporary file and rename it, so an interrupted dump never
    # leaves a truncated cache that a later run would try to load.
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, "wb") as f:
            pickle.dump(data, f)
        os.replace(tmp_filename, filename)
    finally:
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
    return data

In [None]:
# Read the raw tables, then shrink them (labels first, transactions second).
df_transactions = pd.read_parquet("data/df_transaction.pa")
df_train = pd.read_parquet("data/train.pa")

df_train = reduce_df_size(df_train)
df_transactions = reduce_df_size(df_transactions, drop=["merchant_name"])

# Optional subsample for quick experiments (kept disabled):
# df_train = df_train[df_train["client_num"] < 500]
# df_transactions = df_transactions[df_transactions["client_num"] < 500]

# Clients that have transactions but no row in df_train form the test set.
clients_with_transactions = set(df_transactions["client_num"].unique().tolist())
clients_with_label = df_train["client_num"].to_list()
test_clients = list(clients_with_transactions.difference(clients_with_label))

is_test_row = df_transactions["client_num"].isin(test_clients)
transactions = {
    "train": df_transactions[~is_test_row],
    "test": df_transactions[is_test_row],
}

Memory usage of dataframe is 1.07 MB
Memory usage after optimization is: 0.33 MB
Decreased by 68.7%
Memory usage of dataframe is 515.30 MB
Memory usage after optimization is: 309.18 MB
Decreased by 40.0%


In [None]:
# When False, cache_or_generate skips the stage and returns None.
LOAD = False
# Aggregations computed on `amount` per resampled period; the same names are
# later passed to tsfresh as the value columns.
AGGS = ["sum", "std", "mean", "min", "max"]


def generate_amount_ts(df, period):
    """Aggregate `amount` per client over fixed time windows.

    Args:
        df: Transactions with `client_num`, `date_time` and `amount` columns.
        period: Frequency string handed to `resample`.

    Returns:
        A memory-reduced frame with `client_num`, `date_time` and one column
        per entry of AGGS.
    """
    per_client = df.set_index("date_time").groupby("client_num")
    amount_stats = per_client.resample(period)["amount"].agg(AGGS)
    return reduce_df_size(amount_stats.reset_index())


# trans_amt_aggs[split][period name] -> aggregated amount series (or None
# when LOAD is False).
trans_amt_aggs = {}

for split, dataset in transactions.items():
    trans_amt_aggs[split] = {
        full_period: cache_or_generate(
            f"{DATA_FEATURES_FOLDER}/{split}_transactions_{full_period}.pkl",
            generate_amount_ts,
            LOAD,
            dataset,
            short_period,
        )
        for short_period, full_period in TIME_PERIOD_MAPPING.items()
    }

Not loading
Not loading
Not loading
Not loading
Not loading
Not loading


In [None]:
# tsfresh feature-calculator settings passed as `default_fc_parameters`.
PARAMS = EfficientFCParameters()
# Value passed to tsfresh as `n_jobs`.
N_JOBS = 16
# When False, cache_or_generate skips the stage and returns None.
LOAD = False


def extract_tsfresh_features(df, column_id, column_sort, column_value, y=None):
    """Run tsfresh on one value column of a long-format time-series frame.

    Args:
        df: Input frame; missing values are replaced with 0 before extraction.
        column_id: Name of the column identifying each series.
        column_sort: Name of the column used to order each series.
        column_value: Name of the column holding the values.
        y: Optional target. When given, `extract_relevant_features` is called
            with it; otherwise `extract_features` is called with `impute` as
            the imputation function.

    Returns:
        The frame returned by the tsfresh call.
    """
    filled = df.fillna(0)
    print("Extracting", column_value, "y=", type(y))

    # Arguments common to both tsfresh entry points.
    shared_kwargs = {
        "column_id": column_id,
        "column_sort": column_sort,
        "column_value": column_value,
        "n_jobs": N_JOBS,
        "default_fc_parameters": PARAMS,
    }
    if y is None:
        result = extract_features(filled, impute_function=impute, **shared_kwargs)
    else:
        result = extract_relevant_features(filled, y=y, **shared_kwargs)

    print("Extracting", column_value, "shape=", result.shape)
    return result


def extract_tsfresh_features_ts_columns(df, column_id, column_sort, ts_columns, y=None):
    """Extract tsfresh features for several value columns and join them.

    Each name in `ts_columns` is processed separately by
    `extract_tsfresh_features`; the per-column results are concatenated
    side by side.
    """
    per_column = [
        extract_tsfresh_features(df, column_id, column_sort, value_column, y)
        for value_column in ts_columns
    ]
    concat_df = pd.concat(per_column, axis=1)
    print("Concat shape", concat_df.shape)
    return concat_df


# Column names handed to tsfresh: series id and ordering column.
column_id = "client_num"
column_sort = "date_time"
# Target as a Series indexed by client_num: the first column left after
# client_num becomes the index.
y = df_train.set_index("client_num").iloc[:, 0]  # series required
# Value column used for the un-aggregated ("full") series.
column_value = "amount"

# trans_amt_aggs_features[split][period name] -> tsfresh feature table.
trans_amt_aggs_features = {}

# Features of the per-period aggregates. Only the train split passes the
# target to tsfresh.
for split, aggs_datasets in trans_amt_aggs.items():
    split_target = y if split == "train" else None
    period_features = trans_amt_aggs_features[split] = {}
    for full_period, dataset in aggs_datasets.items():
        file_path = (
            f"{DATA_FEATURES_FOLDER}/{split}_transactions_{full_period}_features.pkl"
        )
        period_features[full_period] = cache_or_generate(
            file_path,
            extract_tsfresh_features_ts_columns,
            LOAD,
            df=dataset,
            column_id=column_id,
            column_sort=column_sort,
            ts_columns=AGGS,
            y=split_target,
        )

# Features of the raw transaction series, stored under the "full" key.
full_series_kwargs = {
    "column_id": column_id,
    "column_sort": column_sort,
    "column_value": column_value,
}
for split, dataset in transactions.items():
    file_path = f"{DATA_FEATURES_FOLDER}/{split}_transactions_full_features.pkl"
    split_target = y if split == "train" else None

    trans_amt_aggs_features[split]["full"] = cache_or_generate(
        file_path,
        extract_tsfresh_features,
        LOAD,
        df=dataset,
        y=split_target,
        **full_series_kwargs,
    )

Not loading
Not loading
Not loading
Not loading
Not loading
Not loading
Not loading
Not loading


In [None]:
# This stage is enabled: cache_or_generate loads the pickle if it exists and
# otherwise generates and saves it.
LOAD = True


def create_amount_ts_features(split_period_df_dict):
    """Combine the per-period tsfresh tables of one split into one table.

    Columns of each table are prefixed with ``ts_<period>_``, the tables are
    concatenated side by side, then duplicated and constant columns are
    removed and the result is memory-reduced.

    Args:
        split_period_df_dict: Mapping of period name to feature frame. The
            mapping is not modified.

    Returns:
        The combined feature frame.

    Raises:
        ValueError: If any period has no feature frame (``None``), which
            happens when an upstream stage ran with ``LOAD = False``.
    """
    missing_periods = [
        period for period, dataset in split_period_df_dict.items() if dataset is None
    ]
    if missing_periods:
        raise ValueError(
            f"No features available for period(s) {missing_periods}; "
            "run the upstream extraction cells with LOAD = True first."
        )

    # Build new prefixed frames and leave the caller's dict untouched, so
    # re-running the cell cannot stack the prefix twice.
    prefixed_frames = [
        dataset.add_prefix(f"ts_{period}_")
        for period, dataset in split_period_df_dict.items()
    ]
    amount_ts_features = pd.concat(prefixed_frames, axis=1)

    print("Shape before", amount_ts_features.shape)
    # Keep the first of each group of identical columns. Selecting with a
    # mask, rather than transposing the frame back, preserves column dtypes.
    is_duplicate_column = amount_ts_features.T.duplicated().to_numpy()
    amount_ts_features = amount_ts_features.loc[:, ~is_duplicate_column]
    # Constant columns carry no information.
    amount_ts_features = amount_ts_features.loc[:, amount_ts_features.nunique() > 1]
    amount_ts_features = reduce_df_size(amount_ts_features)
    print("Shape after", amount_ts_features.shape)

    return amount_ts_features


# ts_features[split] -> combined, de-duplicated feature table of that split.
ts_features = {
    split: cache_or_generate(
        f"{DATA_FEATURES_FOLDER}/{split}_amount_ts_features.pkl",
        create_amount_ts_features,
        LOAD,
        split_period_df_dict=datasets_features,
    )
    for split, datasets_features in trans_amt_aggs_features.items()
}

Loading data from data/features/train_amount_ts_features.pkl
Loading data from data/features/test_amount_ts_features.pkl


In [None]:
# Keep only the feature columns that exist in both splits.
common_columns = ts_features["train"].columns.intersection(
    ts_features["test"].columns
)
train_features = ts_features["train"][common_columns]
test_features = ts_features["test"][common_columns]

for split_features in (train_features, test_features):
    print(split_features.shape)

(70000, 4505)
(39143, 4505)


In [None]:
def _with_client_num_column(features: pd.DataFrame) -> pd.DataFrame:
    """Return `features` with the client id exposed as a `client_num` column.

    The id is taken from the index whatever the index is called. Frames that
    already have the column are returned unchanged, so re-running this cell
    does not insert a stray `index` column.
    """
    if "client_num" in features.columns:
        return features
    return features.rename_axis("client_num").reset_index()


train_features = _with_client_num_column(train_features)
test_features = _with_client_num_column(test_features)

In [None]:
# Stack the train and test feature tables into one frame.
# NOTE(review): no ignore_index is passed, so row labels of the two frames
# appear to repeat in the result (see the displayed output) - confirm this is
# acceptable for consumers of the saved file.
tsfresh_features = pd.concat([train_features, test_features])
tsfresh_features

Unnamed: 0,client_num,ts_month_sum_sum__sum_values,ts_month_sum_sum__quantile__q_0.3,ts_month_sum_sum__quantile__q_0.4,ts_month_sum_sum__quantile__q_0.6,ts_month_sum_sum__quantile__q_0.7,"ts_month_sum_sum__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","ts_month_sum_sum__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)",ts_month_sum_sum__quantile__q_0.1,"ts_month_sum_sum__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)",...,ts_full_FULL_amount__ar_coefficient__coeff_1__k_10,"ts_full_FULL_amount__cwt_coefficients__coeff_9__w_2__widths_(2, 5, 10, 20)","ts_full_FULL_amount__cwt_coefficients__coeff_6__w_2__widths_(2, 5, 10, 20)","ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_19","ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_8",ts_full_FULL_amount__partial_autocorrelation__lag_1,ts_full_FULL_amount__autocorrelation__lag_1,"ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_16",ts_full_FULL_amount__symmetry_looking__r_0.2,"ts_full_FULL_amount__cwt_coefficients__coeff_10__w_2__widths_(2, 5, 10, 20)"
0,1,863878,223786.2,230229.6,273888.2,311103.4,3.553388e+05,250359.229482,210899.4,179891.523475,...,0.026840,64675.843368,-24414.939894,-2.650545e+04,4342.582521,0.039248,0.039248,1.394325e+05,1,36436.719427
1,2,344108,101008.2,107827.6,121499.8,128352.6,1.030089e+05,73869.259141,87369.4,53207.112289,...,0.008114,818.978370,823.623005,-6.458124e+04,35812.510880,0.001493,0.001493,1.776491e+04,1,-725.189242
2,3,1621825,69126.6,69467.8,352629.8,635450.6,9.377156e+05,600436.513367,68444.2,425395.933829,...,-0.012920,-280816.907484,369164.707969,-1.099448e+06,231174.792984,0.261533,0.261533,-1.293465e+06,1,-152611.379219
3,4,199796,56091.6,59900.8,69252.4,74794.8,8.192955e+04,58714.306632,48473.2,42287.386989,...,0.031599,-9153.776700,-4069.831964,-1.474321e+04,21025.580694,0.039427,0.039427,-5.235801e+04,1,8848.187406
4,5,67359,18945.8,19078.4,21288.8,23366.6,2.568245e+04,18251.089477,18680.6,13129.670410,...,-0.024653,-47.645553,-129.587023,-1.496116e+04,9915.134221,-0.026696,-0.026696,1.471015e+04,1,-178.094475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39138,109127,1960002,162000.0,216000.0,554000.4,838000.8,1.036464e+06,655517.237708,54000.0,463520.683968,...,0.008678,195299.048898,366507.938735,1.757362e+02,254.499047,0.479598,0.479598,2.363635e+02,1,-244150.505170
39139,109128,28357,28357.0,28357.0,28357.0,28357.0,1.739111e+04,10999.100776,28357.0,7777.538746,...,0.008678,-179.559063,-2291.657797,1.757362e+02,11163.000000,0.105864,0.105864,2.363635e+02,1,-407.724002
39140,109130,3588,767.2,852.6,1178.2,1418.4,7.826929e+02,550.341468,596.4,395.326961,...,0.008678,329.717067,-84.429569,1.757362e+02,132.000000,-0.274564,-0.274564,2.363635e+02,1,80.453643
39141,109137,3723,3723.0,3723.0,3723.0,3723.0,2.283284e+03,1444.075614,3723.0,1021.115659,...,0.008678,98.079456,16.601667,1.757362e+02,249.896781,-0.146298,-0.146298,2.363635e+02,1,52.914136


In [None]:
# Sanity check: the combined feature table must not contain missing values.
assert tsfresh_features.isna().sum().sum() == 0

In [None]:
# Persist the combined table next to the other feature caches; the path is
# built from DATA_FEATURES_FOLDER like every other cache file.
tsfresh_features.to_pickle(f"{DATA_FEATURES_FOLDER}/tsfresh_generations.pkl")

In [None]:
# Display the training labels before the features are merged in.
df_train

Unnamed: 0,client_num,target
0,94779,3
1,17279,0
2,5717,2
3,27471,1
4,72725,0
...,...,...
69995,107219,1
69996,108682,1
69997,93497,3
69998,14344,6


In [None]:
# Attach the tsfresh features to the training labels.
# Depending on which cells ran, `train_features` exposes the client id either
# as "client_num" (after the reset_index/rename cell) or as "index". Renaming
# to "index" makes the merge key resolve in both cases and keeps the "index"
# column that the next cell drops.
train_features_keyed = train_features.rename(columns={"client_num": "index"})
df_train = df_train.merge(
    train_features_keyed, left_on="client_num", right_on="index", how="left"
)

# Free the feature frames; the merged copy lives in df_train.
del train_features, train_features_keyed
gc.collect()

0

In [None]:
# Model inputs: every column except the target and the two id columns
# ("index" is the right-hand merge key).
X_train = df_train.drop(columns=["target", "client_num", "index"])
y_train = df_train["target"]

# Expose the client id as a column only if an earlier cell has not already
# done so; an unconditional second reset_index()/rename would create a
# duplicate `client_num` column.
if "client_num" not in test_features.columns:
    test_features = test_features.rename_axis("client_num").reset_index()

# Copy the id column so that adding `target` to `submission` later does not
# raise SettingWithCopyWarning.
submission = test_features[["client_num"]].copy()
X_test = test_features.drop(columns="client_num")

In [None]:
# Display the test feature table, including its client_num column.
test_features

Unnamed: 0,client_num,ts_month_sum_sum__sum_values,ts_month_sum_sum__quantile__q_0.3,ts_month_sum_sum__quantile__q_0.4,ts_month_sum_sum__quantile__q_0.6,ts_month_sum_sum__quantile__q_0.7,"ts_month_sum_sum__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","ts_month_sum_sum__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)",ts_month_sum_sum__quantile__q_0.1,"ts_month_sum_sum__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)",...,ts_full_FULL_amount__ar_coefficient__coeff_1__k_10,"ts_full_FULL_amount__cwt_coefficients__coeff_9__w_2__widths_(2, 5, 10, 20)","ts_full_FULL_amount__cwt_coefficients__coeff_6__w_2__widths_(2, 5, 10, 20)","ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_19","ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_8",ts_full_FULL_amount__partial_autocorrelation__lag_1,ts_full_FULL_amount__autocorrelation__lag_1,"ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_16",ts_full_FULL_amount__symmetry_looking__r_0.2,"ts_full_FULL_amount__cwt_coefficients__coeff_10__w_2__widths_(2, 5, 10, 20)"
0,0,106935,28531.6,35621.8,45562.0,48412.0,2.757517e+04,23607.014952,14351.2,17381.215677,...,0.259413,116.453450,-1128.178013,18129.411403,3097.927303,0.237970,0.237970,4691.062603,1,-3.342568
1,10,3020981,696874.6,737464.8,956008.4,1133961.8,1.338690e+06,930897.197519,615694.2,667648.558364,...,0.112312,-217.529786,5059.134621,-259599.527978,10450.059108,0.113700,0.113700,401290.399130,1,1157.217300
2,11,200840,55898.4,61389.2,72410.8,77941.6,7.939025e+04,60445.451193,44916.8,43884.080014,...,0.106536,159.270796,-229.082339,22039.609374,-13851.753923,0.110089,0.110089,-21748.131402,1,-38.124623
3,14,711788,225890.2,227823.6,236193.8,242630.6,2.539093e+05,185460.814415,222023.4,133917.816075,...,0.014787,2746.182881,-1031.644177,-49410.702395,21441.849854,0.013820,0.013820,-108314.718056,1,2566.103837
4,16,117194,35142.8,44218.4,53832.0,54370.0,5.540983e+04,41105.313620,16991.6,29742.559626,...,0.138884,-150.792070,5.055584,-2224.978177,6991.589609,0.142226,0.142226,1703.497581,1,-171.209523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39138,109127,1960002,162000.0,216000.0,554000.4,838000.8,1.036464e+06,655517.237708,54000.0,463520.683968,...,0.008678,195299.048898,366507.938735,175.736200,254.499047,0.479598,0.479598,236.363474,1,-244150.505170
39139,109128,28357,28357.0,28357.0,28357.0,28357.0,1.739111e+04,10999.100776,28357.0,7777.538746,...,0.008678,-179.559063,-2291.657797,175.736200,11163.000000,0.105864,0.105864,236.363474,1,-407.724002
39140,109130,3588,767.2,852.6,1178.2,1418.4,7.826929e+02,550.341468,596.4,395.326961,...,0.008678,329.717067,-84.429569,175.736200,132.000000,-0.274564,-0.274564,236.363474,1,80.453643
39141,109137,3723,3723.0,3723.0,3723.0,3723.0,2.283284e+03,1444.075614,3723.0,1021.115659,...,0.008678,98.079456,16.601667,175.736200,249.896781,-0.146298,-0.146298,236.363474,1,52.914136


In [None]:
# Sanity checks before training. The expected row counts are hard-coded and
# match the shapes printed earlier for the train and test feature tables.
assert X_train.shape[0] == 70000
assert X_test.shape[0] == 39143
# No missing values: for df_train this also shows that the left merge found
# features for every labelled client.
assert df_train.isna().sum().sum() == 0
assert X_test.isna().sum().sum() == 0

In [None]:
# Object-dtype columns are passed to CatBoost as categorical features.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()
EARLY_STOPPING = 400
EVAL_METRIC = "MAE"

# One fitted model and one validation MAE per fold.
models_list = []
scores_list = []

splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_indices = splitter.split(X_train, y_train)
# `split` returns a generator, so tqdm needs an explicit total to show progress.
for train_index, test_index in tqdm(
    fold_indices, total=splitter.get_n_splits(), desc="CV folds"
):
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        eval_metric=EVAL_METRIC,
        early_stopping_rounds=EARLY_STOPPING,
        task_type="GPU",
    )

    # NOTE(review): the validation fold drives early stopping and is also the
    # fold scored below, so the reported MAE is likely optimistic.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and spread of the validation MAE across folds.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5803788	test: 1.5801113	best: 1.5801113 (0)	total: 380ms	remaining: 1h 3m 14s
1:	total: 415ms	remaining: 34m 34s
2:	total: 452ms	remaining: 25m 5s
3:	total: 490ms	remaining: 20m 24s
4:	total: 522ms	remaining: 17m 23s
5:	learn: 1.5732952	test: 1.5729799	best: 1.5729799 (5)	total: 554ms	remaining: 15m 23s
6:	total: 585ms	remaining: 13m 55s
7:	total: 619ms	remaining: 12m 53s
8:	total: 648ms	remaining: 11m 59s
9:	total: 678ms	remaining: 11m 16s
10:	learn: 1.5664996	test: 1.5660653	best: 1.5660653 (10)	total: 711ms	remaining: 10m 45s
11:	total: 755ms	remaining: 10m 28s
12:	total: 789ms	remaining: 10m 6s
13:	total: 819ms	remaining: 9m 44s
14:	total: 851ms	remaining: 9m 26s
15:	learn: 1.5595995	test: 1.5590960	best: 1.5590960 (15)	total: 883ms	remaining: 9m 10s
16:	total: 913ms	remaining: 8m 56s
17:	total: 945ms	remaining: 8m 43s
18:	total: 974ms	remaining: 8m 31s
19:	total: 1.01s	remaining: 8m 22s
20:	learn: 1.5528205	test: 1.5522363	best: 1.5522363 (20)	total: 1.04s	remaining: 8

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5803616	test: 1.5803786	best: 1.5803786 (0)	total: 54.5ms	remaining: 9m 4s
1:	total: 106ms	remaining: 8m 47s
2:	total: 154ms	remaining: 8m 33s
3:	total: 183ms	remaining: 7m 37s
4:	total: 212ms	remaining: 7m 3s
5:	learn: 1.5732298	test: 1.5734143	best: 1.5734143 (5)	total: 241ms	remaining: 6m 41s
6:	total: 271ms	remaining: 6m 26s
7:	total: 301ms	remaining: 6m 15s
8:	total: 330ms	remaining: 6m 6s
9:	total: 360ms	remaining: 5m 59s
10:	learn: 1.5663682	test: 1.5666134	best: 1.5666134 (10)	total: 388ms	remaining: 5m 52s
11:	total: 420ms	remaining: 5m 49s
12:	total: 451ms	remaining: 5m 46s
13:	total: 479ms	remaining: 5m 41s
14:	total: 509ms	remaining: 5m 38s
15:	learn: 1.5594335	test: 1.5597062	best: 1.5597062 (15)	total: 541ms	remaining: 5m 37s
16:	total: 570ms	remaining: 5m 34s
17:	total: 599ms	remaining: 5m 31s
18:	total: 628ms	remaining: 5m 29s
19:	total: 657ms	remaining: 5m 27s
20:	learn: 1.5524877	test: 1.5528796	best: 1.5528796 (20)	total: 684ms	remaining: 5m 25s
21:	total

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5802656	test: 1.5805340	best: 1.5805340 (0)	total: 44.2ms	remaining: 7m 22s
1:	total: 86.8ms	remaining: 7m 13s
2:	total: 128ms	remaining: 7m 8s
3:	total: 172ms	remaining: 7m 10s
4:	total: 204ms	remaining: 6m 47s
5:	learn: 1.5733661	test: 1.5737543	best: 1.5737543 (5)	total: 237ms	remaining: 6m 35s
6:	total: 269ms	remaining: 6m 23s
7:	total: 303ms	remaining: 6m 18s
8:	total: 334ms	remaining: 6m 10s
9:	total: 366ms	remaining: 6m 5s
10:	learn: 1.5665312	test: 1.5670199	best: 1.5670199 (10)	total: 397ms	remaining: 6m
11:	total: 428ms	remaining: 5m 56s
12:	total: 460ms	remaining: 5m 53s
13:	total: 492ms	remaining: 5m 50s
14:	total: 524ms	remaining: 5m 48s
15:	learn: 1.5595629	test: 1.5600767	best: 1.5600767 (15)	total: 557ms	remaining: 5m 47s
16:	total: 590ms	remaining: 5m 46s
17:	total: 624ms	remaining: 5m 45s
18:	total: 657ms	remaining: 5m 45s
19:	total: 690ms	remaining: 5m 44s
20:	learn: 1.5526840	test: 1.5532341	best: 1.5532341 (20)	total: 726ms	remaining: 5m 44s
21:	total: 

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5803111	test: 1.5803983	best: 1.5803983 (0)	total: 30.6ms	remaining: 5m 5s
1:	total: 61.7ms	remaining: 5m 8s
2:	total: 91.3ms	remaining: 5m 4s
3:	total: 119ms	remaining: 4m 57s
4:	total: 150ms	remaining: 4m 59s
5:	learn: 1.5731366	test: 1.5735770	best: 1.5735770 (5)	total: 180ms	remaining: 5m
6:	total: 213ms	remaining: 5m 4s
7:	total: 244ms	remaining: 5m 4s
8:	total: 277ms	remaining: 5m 7s
9:	total: 313ms	remaining: 5m 12s
10:	learn: 1.5660027	test: 1.5667621	best: 1.5667621 (10)	total: 343ms	remaining: 5m 11s
11:	total: 372ms	remaining: 5m 9s
12:	total: 404ms	remaining: 5m 10s
13:	total: 433ms	remaining: 5m 8s
14:	total: 464ms	remaining: 5m 8s
15:	learn: 1.5589351	test: 1.5600292	best: 1.5600292 (15)	total: 497ms	remaining: 5m 10s
16:	total: 3.14s	remaining: 30m 41s
17:	total: 3.14s	remaining: 30m 41s
18:	total: 5.77s	remaining: 53m 21s
19:	total: 5.77s	remaining: 53m 21s
20:	learn: 1.5518781	test: 1.5533210	best: 1.5533210 (20)	total: 8.41s	remaining: 1h 13m 39s
21:	total

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5803196	test: 1.5802959	best: 1.5802959 (0)	total: 53.5ms	remaining: 8m 54s
1:	total: 107ms	remaining: 8m 53s
2:	total: 155ms	remaining: 8m 36s
3:	total: 202ms	remaining: 8m 25s
4:	total: 246ms	remaining: 8m 12s
5:	learn: 1.5734012	test: 1.5733867	best: 1.5733867 (5)	total: 276ms	remaining: 7m 40s
6:	total: 308ms	remaining: 7m 19s
7:	total: 339ms	remaining: 7m 3s
8:	total: 369ms	remaining: 6m 49s
9:	total: 398ms	remaining: 6m 37s
10:	learn: 1.5665226	test: 1.5664972	best: 1.5664972 (10)	total: 429ms	remaining: 6m 29s
11:	total: 461ms	remaining: 6m 23s
12:	total: 493ms	remaining: 6m 18s
13:	total: 529ms	remaining: 6m 17s
14:	total: 559ms	remaining: 6m 11s
15:	learn: 1.5595550	test: 1.5594916	best: 1.5594916 (15)	total: 589ms	remaining: 6m 7s
16:	total: 618ms	remaining: 6m 2s
17:	total: 646ms	remaining: 5m 58s
18:	total: 674ms	remaining: 5m 53s
19:	total: 705ms	remaining: 5m 51s
20:	learn: 1.5527560	test: 1.5526843	best: 1.5526843 (20)	total: 739ms	remaining: 5m 50s
21:	total

In [None]:
# Ten most important features of the first fold's model
# (column 0 = importance, column 1 = feature name).
pd.DataFrame(
    zip(models_list[0].feature_importances_, X_train.columns, strict=False)
).sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
1383,3.70558,ts_day_sum_sum__number_peaks__n_1
3037,3.460619,ts_full_FULL_amount__quantile__q_0.6
2679,3.449445,ts_day_max_max__number_peaks__n_1
1398,1.85316,ts_day_sum_sum__cid_ce__normalize_True
1590,1.658696,ts_day_sum_sum__number_crossing_m__m_1
197,1.222721,ts_month_min_min__quantile__q_0.1
3039,1.148171,ts_full_FULL_amount__median
3055,1.13832,ts_full_FULL_amount__quantile__q_0.3
2658,1.113551,ts_day_max_max__quantile__q_0.4
3097,1.033507,ts_full_FULL_amount__quantile__q_0.1


In [None]:
# Predictions on the test set: average the fold models.
# The fold models are CatBoostRegressor instances, which expose `predict`
# but not `predict_proba`, so numeric predictions are averaged instead of
# summing class probabilities.
if not models_list:
    raise RuntimeError("models_list is empty; run the cross-validation cell first.")

fold_predictions = np.column_stack([model.predict(X_test) for model in models_list])
y_pred_mean = fold_predictions.mean(axis=1)

# The earlier argmax produced integer labels, so the averaged prediction is
# rounded to the nearest integer and clipped to the target range seen in training.
# NOTE(review): assumes the submission expects integer targets - confirm.
y_pred = np.clip(np.rint(y_pred_mean), y_train.min(), y_train.max()).astype(int)

In [None]:
# `assign` returns a new frame, so no value is set on a slice of
# test_features (the cause of SettingWithCopyWarning).
submission = submission.assign(target=y_pred)

# Make sure the output folder exists before writing.
os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/ts_fresh_features_test_2.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission["target"] = y_pred


In [None]:
#     transaction_span_days=("date_time", lambda x: (x.max() - x.min()).days),

#     # Count of transactions in recent periods
#     transactions_last_month=("date_time", lambda x: df_transactions.loc[x.index, 'is_last_month'].sum()),
#     transactions_last_week=("date_time", lambda x: df_transactions.loc[x.index, 'is_last_week'].sum()),
#     transactions_last_two_weeks=("date_time", lambda x: df_transactions.loc[x.index, 'is_last_two_weeks'].sum()),

#     # Additional statistics for amount during specific periods
#     sum_amount_last_month=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_month']].sum()),
#     avg_amount_last_month=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_month']].mean() if df_transactions.loc[x.index, 'is_last_month'].any() else 0),
#     max_amount_last_month=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_month']].max() if df_transactions.loc[x.index, 'is_last_month'].any() else 0),

#     sum_amount_last_week=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_week']].sum()),
#     avg_amount_last_week=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_week']].mean() if df_transactions.loc[x.index, 'is_last_week'].any() else 0),
#     max_amount_last_week=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_week']].max() if df_transactions.loc[x.index, 'is_last_week'].any() else 0),

#     sum_amount_last_two_weeks=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_two_weeks']].sum()),
#     avg_amount_last_two_weeks=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_two_weeks']].mean() if df_transactions.loc[x.index, 'is_last_two_weeks'].any() else 0),
#     max_amount_last_two_weeks=("amount", lambda x: x[df_transactions.loc[x.index, 'is_last_two_weeks']].max() if df_transactions.loc[x.index, 'is_last_two_weeks'].any() else 0)
# )

In [None]:
# Add the number of days and hours elapsed since each user's first and
# previous transactions.
# NOTE(review): `transactions_df`, `user_id` and `transaction_dttm` are not
# defined anywhere in the visible notebook (which uses `df_transactions`,
# `client_num`, `date_time`) - confirm where this frame comes from.
first_trx = (
    transactions_df.groupby("user_id")["transaction_dttm"]
    .min()
    .rename("first_tr")
    .reset_index()
)
transactions_df = transactions_df.merge(first_trx, on="user_id", how="left")

since_first_tr = transactions_df["transaction_dttm"] - transactions_df["first_tr"]
# The difference is taken within each user, so a user's first transaction
# gets no "previous" delta from another user's row (filled with 0 below).
# NOTE(review): assumes rows are in chronological order within each user - confirm.
since_prev_tr = transactions_df.groupby("user_id")["transaction_dttm"].diff()

one_day = np.timedelta64(1, "D")
one_hour = np.timedelta64(1, "h")

# Day counts are truncated to whole days; hour counts stay fractional.
transactions_df["days_from_first_tr"] = (since_first_tr / one_day).astype("int")
transactions_df["days_from_prev_tr"] = (
    (since_prev_tr / one_day).fillna(0).astype("int")
)
transactions_df["hours_from_first_tr"] = since_first_tr / one_hour
transactions_df["hours_from_prev_tr"] = (since_prev_tr / one_hour).fillna(0)

# The helper column is only needed to compute the deltas.
transactions_df = transactions_df.drop(columns=["first_tr"])