In [None]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

# Load the per-transaction records and the training table (client_num + target).
df_transactions, df_train = (
    pd.read_parquet(path) for path in ("data/df_transaction.pa", "data/train.pa")
)

In [None]:
# Discard the merchant_name column before any feature engineering.
df_transactions = df_transactions.drop(columns=["merchant_name"])

In [None]:
# Time-span features: how each client's first/last transaction sits inside the
# fixed observation window [start_date, current_date].

start_date = datetime.strptime("2024-07-01", "%Y-%m-%d")
current_date = datetime.strptime("2024-10-01", "%Y-%m-%d")

# First ("min") and last ("max") transaction timestamp per client.
_activity_bounds = (
    df_transactions.groupby("client_num")["date_time"].agg(["min", "max"]).reset_index()
)
_first_seen = _activity_bounds["min"]
_last_seen = _activity_bounds["max"]

# Each interval is exported twice: as whole days and as fractional hours.
_intervals = {
    "time_between_first_and_last": _last_seen - _first_seen,
    "time_between_start_and_first": _first_seen - start_date,
    "time_between_last_and_end": current_date - _last_seen,
}

time_features = _activity_bounds[["client_num"]].copy()
for _interval_name, _interval in _intervals.items():
    time_features[f"{_interval_name}_days"] = _interval.dt.days
    time_features[f"{_interval_name}_hours"] = _interval.dt.total_seconds() / 3600

In [None]:
# Preview the per-client time-span features (the bare expression is rendered as the cell output).
time_features

Unnamed: 0,client_num,time_between_first_and_last_days,time_between_first_and_last_hours,time_between_start_and_first_days,time_between_start_and_first_hours,time_between_last_and_end_days,time_between_last_and_end_hours
0,0,74,1777.016667,17,424.066667,0,6.916667
1,1,91,2192.900000,0,9.016667,0,6.083333
2,2,91,2186.733333,0,16.866667,0,4.400000
3,3,91,2197.800000,0,8.133333,0,2.066667
4,4,90,2169.816667,1,34.750000,0,3.433333
...,...,...,...,...,...,...,...
109138,109138,46,1125.816667,44,1066.816667,0,15.366667
109139,109139,16,405.466667,29,710.800000,45,1091.733333
109140,109140,45,1082.400000,5,132.666667,41,992.933333
109141,109141,56,1351.716667,7,178.383333,28,677.900000


In [None]:
# Preview the transaction frame before the calendar columns are derived.
df_transactions

Unnamed: 0,client_num,date_time,mcc_code,amount
0,0,2024-07-18 16:04:00,8099,2900
1,0,2024-07-22 16:31:00,5411,455
2,0,2024-07-24 16:23:00,5541,1003
3,0,2024-07-28 15:51:00,5691,1480
4,0,2024-07-28 18:00:00,5331,88
...,...,...,...,...
13508150,109142,2024-08-19 21:32:00,6011,14000
13508151,109142,2024-08-19 21:40:00,6011,24000
13508152,109142,2024-08-19 21:46:00,6011,23000
13508153,109142,2024-08-19 22:04:00,6011,32000


In [None]:
def classify_time(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Args:
        hour: Hour value; the labelled ranges are half-open, [start, end).

    Returns:
        "Morning" for 5-11, "Afternoon" for 12-16, "Evening" for 17-20,
        and "Night" for every other value.
    """
    day_parts = (
        ("Morning", 5, 12),
        ("Afternoon", 12, 17),
        ("Evening", 17, 21),
    )
    for label, first_hour, end_hour in day_parts:
        if first_hour <= hour < end_hour:
            return label
    # Anything outside the listed ranges (21-23, 0-4, or out-of-range input).
    return "Night"


# Calendar parts of every transaction timestamp, stored as int8 to keep the
# large transaction frame compact.
_timestamps = df_transactions["date_time"]
df_transactions["month"] = _timestamps.dt.month.astype("int8")
df_transactions["hour"] = _timestamps.dt.hour.astype("int8")
df_transactions["day_of_month"] = _timestamps.dt.day.astype("int8")
# dayofweek is 0 (Monday) .. 6 (Sunday); shift to 1..7.
df_transactions["day_of_week"] = (_timestamps.dt.dayofweek + 1).astype("int8")
# Weekend means Saturday (5) or Sunday (6). The previous `weekday // 4` also
# flagged Friday (4) as a weekend day.
df_transactions["is_wknd"] = (_timestamps.dt.weekday >= 5).astype("int8")

# Only 24 distinct hours exist, so label each hour once and map the column
# instead of calling classify_time for every row.
_hour_to_time_of_day = {hour: classify_time(hour) for hour in range(24)}
df_transactions["time_of_day"] = df_transactions["hour"].map(_hour_to_time_of_day)

# Flag transactions whose MCC code is among the ten most frequent codes overall.
top_10_mcc = df_transactions["mcc_code"].value_counts().head(10).index.tolist()
df_transactions["mcc_code_in_top10"] = (
    df_transactions["mcc_code"].isin(top_10_mcc).astype(int)
)

In [None]:
# Number of distinct MCC codes across all transactions.
df_transactions["mcc_code"].nunique()

320

In [None]:
# Preview the transaction frame with the derived calendar columns.
df_transactions

Unnamed: 0,client_num,date_time,mcc_code,amount,month,hour,day_of_month,day_of_week,is_wknd,time_of_day,mcc_code_in_top10
0,0,2024-07-18 16:04:00,8099,2900,7,16,18,4,0,Afternoon,0
1,0,2024-07-22 16:31:00,5411,455,7,16,22,1,0,Afternoon,1
2,0,2024-07-24 16:23:00,5541,1003,7,16,24,3,0,Afternoon,1
3,0,2024-07-28 15:51:00,5691,1480,7,15,28,7,1,Afternoon,0
4,0,2024-07-28 18:00:00,5331,88,7,18,28,7,1,Evening,0
...,...,...,...,...,...,...,...,...,...,...,...
13508150,109142,2024-08-19 21:32:00,6011,14000,8,21,19,1,0,Night,1
13508151,109142,2024-08-19 21:40:00,6011,24000,8,21,19,1,0,Night,1
13508152,109142,2024-08-19 21:46:00,6011,23000,8,21,19,1,0,Night,1
13508153,109142,2024-08-19 22:04:00,6011,32000,8,22,19,1,0,Night,1


In [None]:
# Per-client aggregations, keyed by the transaction column they are computed on.
# The two lambdas take the first value returned by Series.mode() for the group.
_client_aggregations = {
    "amount": ["sum", "mean", "max", "min", "std", "median", "nunique"],
    "mcc_code": ["nunique", "count", lambda codes: codes.mode()[0]],
    "is_wknd": ["sum", "mean"],
    "hour": ["mean", "std", "max", "min"],
    "day_of_week": ["mean", "std", "max", "min"],
    "mcc_code_in_top10": ["sum", "mean"],
    "time_of_day": [lambda labels: labels.mode()[0], "nunique", "count"],
    "day_of_month": ["nunique"],
}

default_features = (
    df_transactions.groupby("client_num").agg(_client_aggregations).reset_index()
)

# Flatten the (column, statistic) header into "<column>_<statistic>"; the key
# column comes out as "client_num_" and is restored to its plain name.
_flat_names = [f"{column}_{statistic}" for column, statistic in default_features.columns]
default_features.columns = [
    "client_num" if name == "client_num_" else name for name in _flat_names
]
default_features

Unnamed: 0,client_num,amount_sum,amount_mean,amount_max,amount_min,amount_std,amount_median,amount_nunique,mcc_code_nunique,mcc_code_count,...,day_of_week_mean,day_of_week_std,day_of_week_max,day_of_week_min,mcc_code_in_top10_sum,mcc_code_in_top10_mean,time_of_day_<lambda_0>,time_of_day_nunique,time_of_day_count,day_of_month_nunique
0,0,106935,810.113636,7322,28,1311.578925,403.0,111,18,132,...,3.969697,2.067337,7,1,112,0.848485,Evening,4,132,31
1,1,863878,3599.491667,100000,6,11704.843812,691.5,215,29,240,...,4.075000,1.846233,7,1,150,0.625000,Evening,4,240,31
2,2,344108,1147.026667,24496,23,2629.178018,456.0,253,33,300,...,4.233333,2.049281,7,1,239,0.796667,Afternoon,4,300,31
3,3,1621825,11032.823129,1000000,1,86498.559476,450.0,116,21,147,...,4.142857,1.861653,7,1,87,0.591837,Morning,4,147,31
4,4,199796,1637.672131,50000,24,4938.356295,449.5,89,9,122,...,3.516393,1.773234,7,1,85,0.696721,Evening,4,122,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109138,109138,236283,14767.687500,59255,1,20613.075561,3105.5,16,4,16,...,2.000000,1.032796,3,1,2,0.125000,Morning,2,16,5
109139,109139,9640,642.666667,1150,25,344.028169,710.0,13,6,15,...,3.333333,1.234427,6,2,14,0.933333,Afternoon,3,15,8
109140,109140,28389,1577.166667,9900,35,2679.431749,184.5,14,5,18,...,3.388889,2.173067,6,1,9,0.500000,Afternoon,2,18,7
109141,109141,61843,3865.187500,22360,170,7251.421398,632.5,16,10,16,...,3.937500,2.489143,7,1,10,0.625000,Afternoon,3,16,11


In [None]:
def get_features(columns, transactions=None):
    """Build per-client amount statistics split by the values of one column.

    Args:
        columns: Name of the transaction column whose distinct values become
            the breakdown (e.g. "month", "hour").
        transactions: Frame with "client_num", "amount" and the ``columns``
            column. Defaults to the notebook-level ``df_transactions`` so
            existing one-argument calls keep working.

    Returns:
        DataFrame with one row per client: a "client_num" column followed by
        columns named "<columns>_<value>_<statistic>" for sum, mean, count,
        std, min, max and median of ``amount``. Combinations with no data are
        filled with 0.
    """
    if transactions is None:
        # Backward-compatible default: fall back to the shared notebook frame.
        transactions = df_transactions

    features = transactions.pivot_table(
        index="client_num",
        columns=columns,
        values="amount",
        aggfunc=["sum", "mean", "count", "std", "min", "max", "median"],
        fill_value=0,
    )
    # With a list of aggfuncs the header is (statistic, value) pairs; flatten it.
    features.columns = [
        f"{columns}_{value}_{statistic}" for statistic, value in features.columns
    ]
    return features.reset_index()


# Amount statistics per client for each calendar breakdown, computed in the
# order the breakdown columns are listed.
(
    month_features,
    day_of_month_features,
    day_of_week_features,
    time_of_day_features,
    hour_features,
    is_wknd_features,
) = map(
    get_features,
    ("month", "day_of_month", "day_of_week", "time_of_day", "hour", "is_wknd"),
)

In [None]:
def get_fraction(columns, aggfunc, transactions=None):
    """Compute, per client, the share of an amount aggregate in each category.

    Args:
        columns: Name of the transaction column whose distinct values become
            the categories.
        aggfunc: Aggregation applied to ``amount`` per (client, category),
            e.g. "sum" or "count".
        transactions: Frame with "client_num", "amount" and the ``columns``
            column. Defaults to the notebook-level ``df_transactions`` so
            existing two-argument calls keep working.

    Returns:
        DataFrame with one row per client: a "client_num" column followed by
        columns named "fraction_<columns>_<aggfunc>_<value>", each holding the
        category's aggregate divided by the client's total across categories.
        A client whose total is 0 gets NaN fractions (0 / 0).
    """
    if transactions is None:
        # Backward-compatible default: fall back to the shared notebook frame.
        transactions = df_transactions

    pt = transactions.pivot_table(
        index="client_num",
        columns=columns,
        values="amount",
        aggfunc=aggfunc,
        fill_value=0,
    )

    # Normalise every row by its own total so the categories sum to 1.
    result = pt.div(pt.sum(axis=1), axis=0)
    result.columns = [f"fraction_{columns}_{aggfunc}_{col}" for col in result.columns]
    return result.reset_index()


# For every breakdown column compute two share tables: by amount sum, then by
# transaction count (each pair is unpacked in that order).
_SHARE_AGGS = ("sum", "count")

month_fraction_sum, month_fraction_count = (
    get_fraction("month", agg) for agg in _SHARE_AGGS
)
day_of_month_fraction_sum, day_of_month_fraction_count = (
    get_fraction("day_of_month", agg) for agg in _SHARE_AGGS
)
day_of_week_fraction_sum, day_of_week_fraction_count = (
    get_fraction("day_of_week", agg) for agg in _SHARE_AGGS
)
time_of_day_fraction_sum, time_of_day_fraction_count = (
    get_fraction("time_of_day", agg) for agg in _SHARE_AGGS
)
hour_fraction_sum, hour_fraction_count = (
    get_fraction("hour", agg) for agg in _SHARE_AGGS
)
is_wknd_fraction_sum, is_wknd_fraction_count = (
    get_fraction("is_wknd", agg) for agg in _SHARE_AGGS
)
mcc_code_fraction_sum, mcc_code_fraction_count = (
    get_fraction("mcc_code", agg) for agg in _SHARE_AGGS
)
mcc_code_in_top10_fraction_sum, mcc_code_in_top10_fraction_count = (
    get_fraction("mcc_code_in_top10", agg) for agg in _SHARE_AGGS
)

In [None]:
# Gaps between consecutive transactions of the same client.
df_transactions["date_time"] = pd.to_datetime(df_transactions["date_time"])
# Order each client's transactions chronologically so diff() measures the gap
# to the previous transaction.
df_transactions = df_transactions.sort_values(["client_num", "date_time"])

# The first transaction of every client has no predecessor, so its gap is NaN.
_seconds_since_previous = (
    df_transactions.groupby("client_num")["date_time"].diff().dt.total_seconds()
)
df_transactions["time_diff"] = _seconds_since_previous
df_transactions["time_diff_hours"] = _seconds_since_previous / 3600

# NOTE(review): the resulting feature columns are named just "max", "min" and
# "mean"; consider a descriptive prefix if downstream consumers allow it.
time_diff_features = (
    df_transactions.groupby("client_num")["time_diff_hours"]
    .agg(["max", "min", "mean"])
    .reset_index()
)

In [None]:
# Join all per-client feature tables into one wide frame.
_feature_frames = [
    default_features,
    time_features,
    month_features,
    day_of_month_features,
    day_of_week_features,
    time_of_day_features,
    hour_features,
    is_wknd_features,
    month_fraction_sum,
    month_fraction_count,
    day_of_month_fraction_sum,
    day_of_month_fraction_count,
    day_of_week_fraction_sum,
    day_of_week_fraction_count,
    time_of_day_fraction_sum,
    time_of_day_fraction_count,
    hour_fraction_sum,
    hour_fraction_count,
    is_wknd_fraction_sum,
    is_wknd_fraction_count,
    mcc_code_fraction_sum,
    mcc_code_fraction_count,
    mcc_code_in_top10_fraction_sum,
    mcc_code_in_top10_fraction_count,
    time_diff_features,
]

# Align rows on client_num rather than on each frame's positional index, so a
# table with a different client order or client set cannot silently attach one
# client's features to another client.
combined_df = (
    pd.concat(
        [frame.set_index("client_num") for frame in _feature_frames],
        axis=1,
    )
    .rename_axis("client_num")
    .reset_index()
)
# Keep the first occurrence of any column name that appears in several tables.
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

In [None]:
# Persist the combined feature table; create the target directory first so the
# cell also works on a fresh checkout where data/features does not exist yet.
_features_path = Path("data/features/heuristics.pkl")
_features_path.parent.mkdir(parents=True, exist_ok=True)
combined_df.to_pickle(_features_path)

In [None]:
# Test clients are those that have transactions but no row in the training table.
_transaction_clients = set(df_transactions["client_num"].unique().tolist())
_train_clients = df_train["client_num"].to_list()
test_clients = list(_transaction_clients.difference(_train_clients))

In [None]:
# Attach features to the labelled clients. df_train drives the left join, so
# only its clients can match; test_clients are by construction absent from
# df_train, which makes pre-filtering combined_df unnecessary.
X_train = df_train.merge(combined_df, on="client_num", how="left")
X_test = combined_df[combined_df["client_num"].isin(test_clients)]

y_train = X_train["target"]
# Copy so that adding the prediction column later modifies an independent
# frame instead of a slice of X_test (avoids SettingWithCopyWarning).
submission = X_test[["client_num"]].copy()

# The identifier and the label must not be used as model inputs.
X_train = X_train.drop(columns=["client_num", "target"])
X_test = X_test.drop(columns=["client_num"])

In [None]:
# Object-typed columns are passed to CatBoost as categorical features.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()
EARLY_STOPPING = 400
EVAL_METRIC = "MAE"

models_list = []
scores_list = []

# NOTE(review): StratifiedKFold stratifies on y_train, which scikit-learn only
# supports for discrete targets - confirm the target is integer/class-like.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() is a generator with no length, so pass the fold count explicitly;
# otherwise the progress bar cannot show how many folds remain.
folds = tqdm(
    splitter.split(X_train, y_train),
    total=splitter.get_n_splits(),
    desc="CV folds",
)
for train_index, test_index in folds:
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        eval_metric=EVAL_METRIC,
        early_stopping_rounds=EARLY_STOPPING,
        task_type="GPU",
    )

    # The held-out fold serves both as the early-stopping set and as the
    # scoring set, so the reported MAE is slightly optimistic.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and spread of the out-of-fold MAE across the folds.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5806078	test: 1.5802698	best: 1.5802698 (0)	total: 99.2ms	remaining: 16m 31s
1:	total: 122ms	remaining: 10m 8s
2:	total: 145ms	remaining: 8m 4s
3:	total: 164ms	remaining: 6m 50s
4:	total: 183ms	remaining: 6m 4s
5:	learn: 1.5735287	test: 1.5731352	best: 1.5731352 (5)	total: 200ms	remaining: 5m 33s
6:	total: 218ms	remaining: 5m 10s
7:	total: 237ms	remaining: 4m 56s
8:	total: 255ms	remaining: 4m 43s
9:	total: 255ms	remaining: 4m 43s
10:	learn: 1.5668777	test: 1.5663503	best: 1.5663503 (10)	total: 2.88s	remaining: 47m 59s
11:	total: 2.9s	remaining: 43m 53s
12:	total: 2.92s	remaining: 40m 31s
13:	total: 2.94s	remaining: 37m 39s
14:	total: 2.96s	remaining: 35m 10s
15:	learn: 1.5600124	test: 1.5592952	best: 1.5592952 (15)	total: 2.96s	remaining: 35m 10s
16:	total: 5.58s	remaining: 1h 1m 56s
17:	total: 5.6s	remaining: 58m 15s
18:	total: 5.62s	remaining: 54m 59s
19:	total: 5.62s	remaining: 54m 59s
20:	learn: 1.5532595	test: 1.5524403	best: 1.5524403 (20)	total: 8.25s	remaining: 1h 1

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5804902	test: 1.5805100	best: 1.5805100 (0)	total: 28.4ms	remaining: 4m 43s
1:	total: 47.4ms	remaining: 3m 56s
2:	total: 68.8ms	remaining: 3m 49s
3:	total: 89.8ms	remaining: 3m 44s
4:	total: 111ms	remaining: 3m 42s
5:	learn: 1.5732616	test: 1.5734523	best: 1.5734523 (5)	total: 131ms	remaining: 3m 38s
6:	total: 153ms	remaining: 3m 38s
7:	total: 176ms	remaining: 3m 40s
8:	total: 196ms	remaining: 3m 37s
9:	total: 216ms	remaining: 3m 35s
10:	learn: 1.5666042	test: 1.5669206	best: 1.5669206 (10)	total: 234ms	remaining: 3m 32s
11:	total: 252ms	remaining: 3m 29s
12:	total: 269ms	remaining: 3m 26s
13:	total: 287ms	remaining: 3m 24s
14:	total: 305ms	remaining: 3m 23s
15:	learn: 1.5595806	test: 1.5600353	best: 1.5600353 (15)	total: 323ms	remaining: 3m 21s
16:	total: 340ms	remaining: 3m 19s
17:	total: 358ms	remaining: 3m 18s
18:	total: 376ms	remaining: 3m 17s
19:	total: 395ms	remaining: 3m 16s
20:	learn: 1.5526592	test: 1.5532183	best: 1.5532183 (20)	total: 413ms	remaining: 3m 16s
21:

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5804527	test: 1.5807489	best: 1.5807489 (0)	total: 29.4ms	remaining: 4m 53s
1:	total: 54ms	remaining: 4m 30s
2:	total: 82ms	remaining: 4m 33s
3:	total: 108ms	remaining: 4m 30s
4:	total: 136ms	remaining: 4m 30s
5:	learn: 1.5733343	test: 1.5736383	best: 1.5736383 (5)	total: 154ms	remaining: 4m 17s
6:	total: 173ms	remaining: 4m 6s
7:	total: 190ms	remaining: 3m 57s
8:	total: 208ms	remaining: 3m 51s
9:	total: 226ms	remaining: 3m 46s
10:	learn: 1.5666981	test: 1.5670735	best: 1.5670735 (10)	total: 244ms	remaining: 3m 41s
11:	total: 262ms	remaining: 3m 37s
12:	total: 280ms	remaining: 3m 34s
13:	total: 298ms	remaining: 3m 32s
14:	total: 315ms	remaining: 3m 29s
15:	learn: 1.5598870	test: 1.5603560	best: 1.5603560 (15)	total: 333ms	remaining: 3m 27s
16:	total: 350ms	remaining: 3m 25s
17:	total: 350ms	remaining: 3m 25s
18:	total: 350ms	remaining: 3m 25s
19:	total: 366ms	remaining: 3m 22s
20:	learn: 1.5529581	test: 1.5534407	best: 1.5534407 (20)	total: 366ms	remaining: 3m 22s
21:	total

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5804212	test: 1.5805008	best: 1.5805008 (0)	total: 27.5ms	remaining: 4m 35s
1:	total: 27.5ms	remaining: 4m 35s
2:	total: 2.65s	remaining: 3h 41m 15s
3:	total: 2.67s	remaining: 2h 28m 30s
4:	total: 2.69s	remaining: 1h 52m 8s
5:	learn: 1.5731303	test: 1.5734118	best: 1.5734118 (5)	total: 2.71s	remaining: 1h 30m 19s
6:	total: 2.73s	remaining: 1h 15m 45s
7:	total: 2.75s	remaining: 1h 5m 21s
8:	total: 2.77s	remaining: 57m 34s
9:	total: 2.78s	remaining: 51m 30s
10:	learn: 1.5662580	test: 1.5667624	best: 1.5667624 (10)	total: 2.8s	remaining: 46m 38s
11:	total: 2.82s	remaining: 42m 43s
12:	total: 2.82s	remaining: 42m 43s
13:	total: 5.45s	remaining: 1h 15m 34s
14:	total: 5.47s	remaining: 1h 9m 59s
15:	learn: 1.5592806	test: 1.5600858	best: 1.5600858 (15)	total: 5.48s	remaining: 1h 5m 11s
16:	total: 5.48s	remaining: 1h 5m 11s
17:	total: 8.11s	remaining: 1h 29m 56s
18:	total: 8.13s	remaining: 1h 24m 29s
19:	total: 8.13s	remaining: 1h 24m 29s
20:	learn: 1.5523965	test: 1.5534637	best: 

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5802932	test: 1.5802853	best: 1.5802853 (0)	total: 22.2ms	remaining: 3m 42s
1:	total: 41.5ms	remaining: 3m 27s
2:	total: 59.9ms	remaining: 3m 19s
3:	total: 79.6ms	remaining: 3m 18s
4:	total: 98.9ms	remaining: 3m 17s
5:	learn: 1.5732872	test: 1.5734674	best: 1.5734674 (5)	total: 126ms	remaining: 3m 30s
6:	total: 149ms	remaining: 3m 32s
7:	total: 167ms	remaining: 3m 29s
8:	total: 185ms	remaining: 3m 24s
9:	total: 202ms	remaining: 3m 21s
10:	learn: 1.5664841	test: 1.5667699	best: 1.5667699 (10)	total: 218ms	remaining: 3m 18s
11:	total: 233ms	remaining: 3m 13s
12:	total: 247ms	remaining: 3m 9s
13:	total: 264ms	remaining: 3m 8s
14:	total: 281ms	remaining: 3m 7s
15:	learn: 1.5595790	test: 1.5599333	best: 1.5599333 (15)	total: 300ms	remaining: 3m 6s
16:	total: 317ms	remaining: 3m 6s
17:	total: 335ms	remaining: 3m 5s
18:	total: 352ms	remaining: 3m 5s
19:	total: 371ms	remaining: 3m 4s
20:	learn: 1.5527047	test: 1.5532146	best: 1.5532146 (20)	total: 371ms	remaining: 3m 4s
21:	total: 

In [None]:
# Ten most important features according to the first fold's model.
# Column 0 holds the importance, column 1 the feature name.
_importance_table = pd.DataFrame(
    list(zip(models_list[0].feature_importances_, X_train.columns, strict=False))
)
_importance_table.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
25,9.825129,day_of_month_nunique
31,5.079247,time_between_last_and_end_hours
34,4.745681,month_9_sum
30,3.728718,time_between_last_and_end_days
27,2.946809,time_between_first_and_last_hours
1203,2.112257,fraction_mcc_code_count_6012
1209,2.082524,fraction_mcc_code_count_6536
534,1.775764,is_wknd_1_median
5,1.583212,amount_median
541,1.523834,fraction_month_count_9


In [None]:
# Average the test predictions of all fold models (simple mean ensemble).
_fold_predictions = (model.predict(X_test) for model in models_list)
y_pred = sum(_fold_predictions, np.zeros(X_test.shape[0])) / len(models_list)

In [None]:
# assign() returns a new frame, so the prediction column is never written onto
# a slice of another DataFrame.
submission = submission.assign(target=y_pred)

# Create the output directory on demand so the export works on a fresh checkout.
_submission_path = Path("submissions/regressor_test.csv")
_submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(_submission_path, index=False)