In [None]:
from functools import partial

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from catboost import CatBoostRegressor
from ptls.data_load.datasets import MemoryMapDataset, inference_data_loader
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesDataset, CoLESModule
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.nn import RnnSeqEncoder, TrxEncoder
from ptls.preprocessing import PandasDataPreprocessor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

# Input locations, kept as named constants so they are easy to repoint.
TRANSACTIONS_PATH = "data/df_transaction.pa"
TRAIN_PATH = "data/train.pa"

# Read the transactions table and the training table from parquet.
df_transactions = pd.read_parquet(TRANSACTIONS_PATH)
df_train = pd.read_parquet(TRAIN_PATH)

In [None]:
# `merchant_name` is not referenced by the preprocessing configured in this
# notebook, so it is removed up front.
# `errors="ignore"` keeps the cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
df_transactions = df_transactions.drop(columns="merchant_name", errors="ignore")

In [None]:
# Preview the transactions frame (pandas truncates the rendering of long frames).
df_transactions

Unnamed: 0,client_num,date_time,mcc_code,amount
0,0,2024-07-18 16:04:00,8099,2900
1,0,2024-07-22 16:31:00,5411,455
2,0,2024-07-24 16:23:00,5541,1003
3,0,2024-07-28 15:51:00,5691,1480
4,0,2024-07-28 18:00:00,5331,88
...,...,...,...,...
13508150,109142,2024-08-19 21:32:00,6011,14000
13508151,109142,2024-08-19 21:40:00,6011,24000
13508152,109142,2024-08-19 21:46:00,6011,23000
13508153,109142,2024-08-19 22:04:00,6011,32000


In [None]:
# Preprocessing settings: `client_num` is the id column, `date_time` the event
# time (converted with "dt_to_timestamp"), `mcc_code` is categorical and
# `amount` is numerical.
preprocessor_settings = dict(
    col_id="client_num",
    col_event_time="date_time",
    event_time_transformation="dt_to_timestamp",
    cols_category=["mcc_code"],
    cols_numerical=["amount"],
    return_records=True,
)
preprocessor = PandasDataPreprocessor(**preprocessor_settings)

In [None]:
# Fit the preprocessor on the transactions and transform them in one call.
# With `return_records=True` the result is presumably a list of per-client
# dicts — TODO confirm against the ptls version in use.
dataset = preprocessor.fit_transform(df_transactions)

In [None]:
# Show the record count and a single example instead of echoing the whole
# collection, which would flood the cell output.
# Assumes `dataset` supports len() and integer indexing (a list of records).
print(f"{len(dataset)} records")
dataset[0]

In [None]:
# Seed the Python, NumPy and PyTorch RNGs (and dataloader workers) before any
# weights are created, so a Restart & Run All yields repeatable training.
SEED = 42
pl.seed_everything(SEED, workers=True)

# Transaction encoder: `amount` is passed through as a numeric value, the
# remaining fields go through embedding tables ("in" = table size, "out" =
# embedding width).
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount": "identity"},
    "embeddings": {
        # NOTE(review): `event_time` comes from the "dt_to_timestamp"
        # transformation, so its values are presumably large epoch timestamps;
        # an 800-row embedding table looks too small for them. Confirm how
        # TrxEncoder treats out-of-range indices, or bucket/drop this feature.
        "event_time": {"in": 800, "out": 16},
        "mcc_code": {"in": 250, "out": 16},
    },
}

# GRU sequence encoder on top of the per-transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type="gru",
)

# CoLES module wrapping the encoder; optimizer and LR scheduler are supplied
# as partials so the module can instantiate them itself.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9
    ),
)

In [None]:
# Source dataset, filtered with a minimum sequence length of 25.
coles_source = MemoryMapDataset(
    data=dataset,
    i_filters=[SeqLenFilter(min_seq_len=25)],
)

# Slice sampler used by CoLES: 5 splits, lengths bounded by 25 and 200.
slice_sampler = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)

# Data module that feeds the training loop.
train_dl = PtlsDataModule(
    train_data=ColesDataset(coles_source, splitter=slice_sampler),
    train_num_workers=16,
    train_batch_size=256,
)

In [None]:
# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    accelerator = "cuda"
else:
    accelerator = "cpu"

trainer = pl.Trainer(
    max_epochs=12,
    accelerator=accelerator,
    enable_progress_bar=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/seara/Desktop/Github/alfa-challenge/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [None]:
# Run the CoLES training, then report the metrics the trainer logged.
trainer.fit(model=model, datamodule=train_dl)
logged = trainer.logged_metrics
print(logged)

/home/seara/Desktop/Github/alfa-challenge/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params | Mode 
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0      | train
1 | _seq_encoder       | RnnSeqEncoder   | 240 K  | train
2 | _validation_metric | BatchRecallTopK | 0      | train
3 | _head              | Head            | 0      | train
---------------------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.962     Total estimated model params size (MB)
19        Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=12` reached.


{'loss': tensor(124.2874), 'seq_len': tensor(73.9533)}


In [None]:
# Persist the sequence encoder's weights (state dict only) after training.
torch.save(seq_encoder.state_dict(), "coles-emb.pt")

In [None]:
# Use a dedicated name for the inference loader: rebinding `train_dl` would
# replace the training data module, so re-running the fit cell afterwards
# would silently hand the trainer the wrong object.
inference_dl = inference_data_loader(dataset, num_workers=0, batch_size=256)

# Stack the per-batch predictions row-wise into a single embeddings tensor.
train_embeds = torch.vstack(trainer.predict(model, inference_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/seara/Desktop/Github/alfa-challenge/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Convert the tensor to a NumPy array explicitly instead of relying on pandas'
# implicit handling of torch tensors: depending on the pandas version a raw
# tensor can end up as object-dtype columns, which the
# `select_dtypes("object")` call further down would treat as categorical.
embed_values = train_embeds.detach().cpu().numpy()
embeds = pd.DataFrame(
    data=embed_values,
    columns=[f"embed_{i}" for i in range(embed_values.shape[1])],
)

In [None]:
# Optional: cache the embeddings on disk for reuse (left disabled).
# embeds.to_pickle("data/features/coles.pkl")

In [None]:
# Take the client ids from the records that produced the embeddings rather
# than from the row position: the positional index only equals `client_num`
# when the ids happen to be exactly 0..N-1 in record order.
# Assumes each record exposes its id under the "client_num" key and that the
# embeddings are in record order — TODO confirm.
client_ids = [record["client_num"] for record in dataset]
assert len(client_ids) == len(embeds), "one embedding row expected per record"

# Dropping a pre-existing column first keeps the cell idempotent (a re-run of
# the original added a second `client_num` column).
embeds = embeds.drop(columns="client_num", errors="ignore")
embeds.insert(0, "client_num", client_ids)

In [None]:
# Clients that have transactions but no row in the training table are treated
# as test clients.
transaction_clients = set(df_transactions["client_num"].unique().tolist())
labelled_clients = df_train["client_num"].to_list()
test_clients = list(transaction_clients.difference(labelled_clients))

In [None]:
# Boolean mask over the embeddings: True for clients without a training row.
is_test_client = embeds["client_num"].isin(test_clients)

# Training features: the training table left-joined with the embeddings of
# non-test clients, so every `df_train` row is kept.
X_train = df_train.merge(embeds[~is_test_client], on="client_num", how="left")

# `.copy()` detaches these frames from `embeds`, so later column assignments
# (e.g. adding predictions to `submission`) do not trigger
# SettingWithCopyWarning.
X_test = embeds[is_test_client].copy()

y_train = X_train["target"]
submission = X_test[["client_num"]].copy()

# NOTE(review): X_train keeps any extra `df_train` columns while X_test holds
# only the embedding columns — confirm the feature sets match before predicting.
X_train = X_train.drop(["client_num", "target"], axis=1)
X_test = X_test.drop("client_num", axis=1)

In [None]:
# Object-dtype columns are passed to CatBoost as categorical features.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()
EARLY_STOPPING = 50
EVAL_METRIC = "MAE"

models_list = []
scores_list = []

# 5-fold cross-validation; each fold trains a fresh regressor and records its
# MAE on the held-out part.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = splitter.split(X_train, y_train)

# `total` lets tqdm render real progress; without it the bar shows "0it"
# because the split generator has no length.
for train_index, test_index in tqdm(
    fold_splits, total=splitter.get_n_splits(), desc="CV folds"
):
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    # NOTE: `model` is rebound here from the CoLES module to a CatBoost
    # regressor; re-run the cells above before calling
    # `trainer.predict(model, ...)` again.
    model = CatBoostRegressor(
        iterations=5000,
        loss_function="MAE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        eval_metric=EVAL_METRIC,
        early_stopping_rounds=EARLY_STOPPING,
        task_type="GPU",
    )

    # The held-out fold doubles as the early-stopping set, so the MAE computed
    # below on the same fold is slightly optimistic.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and standard deviation of the per-fold MAE.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5809249	test: 1.5805636	best: 1.5805636 (0)	total: 9.6ms	remaining: 48s
1:	total: 16.5ms	remaining: 41.3s
2:	total: 23.8ms	remaining: 39.7s
3:	total: 33.2ms	remaining: 41.4s
4:	total: 40.1ms	remaining: 40s
5:	learn: 1.5760534	test: 1.5754888	best: 1.5754888 (5)	total: 46.9ms	remaining: 39s
6:	total: 56.1ms	remaining: 40s
7:	total: 63.3ms	remaining: 39.5s
8:	total: 70ms	remaining: 38.8s
9:	total: 70ms	remaining: 38.8s
10:	learn: 1.5714083	test: 1.5706890	best: 1.5706890 (10)	total: 79.6ms	remaining: 39.7s
11:	total: 79.6ms	remaining: 39.7s
12:	total: 86.6ms	remaining: 39.3s
13:	total: 96.1ms	remaining: 39.9s
14:	total: 104ms	remaining: 39.7s
15:	learn: 1.5668340	test: 1.5659118	best: 1.5659118 (15)	total: 111ms	remaining: 39.5s
16:	total: 121ms	remaining: 40.1s
17:	total: 128ms	remaining: 39.9s
18:	total: 138ms	remaining: 40.5s
19:	total: 146ms	remaining: 40.3s
20:	learn: 1.5623054	test: 1.5611987	best: 1.5611987 (20)	total: 155ms	remaining: 40.7s
21:	total: 162ms	remaining:

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5808034	test: 1.5809074	best: 1.5809074 (0)	total: 4.83ms	remaining: 24.2s
1:	total: 8.94ms	remaining: 22.3s
2:	total: 13.4ms	remaining: 22.3s
3:	total: 16.8ms	remaining: 21s
4:	total: 21.5ms	remaining: 21.5s
5:	learn: 1.5757758	test: 1.5761331	best: 1.5761331 (5)	total: 25.4ms	remaining: 21.2s
6:	total: 28.8ms	remaining: 20.5s
7:	total: 33.6ms	remaining: 21s
8:	total: 37.5ms	remaining: 20.8s
9:	total: 41.2ms	remaining: 20.6s
10:	learn: 1.5708885	test: 1.5714881	best: 1.5714881 (10)	total: 45.9ms	remaining: 20.8s
11:	total: 49.8ms	remaining: 20.7s
12:	total: 53.3ms	remaining: 20.4s
13:	total: 58.1ms	remaining: 20.7s
14:	total: 62.1ms	remaining: 20.6s
15:	learn: 1.5661705	test: 1.5669604	best: 1.5669604 (15)	total: 67ms	remaining: 20.9s
16:	total: 70.5ms	remaining: 20.7s
17:	total: 75.4ms	remaining: 20.9s
18:	total: 79.4ms	remaining: 20.8s
19:	total: 83ms	remaining: 20.7s
20:	learn: 1.5614346	test: 1.5624466	best: 1.5624466 (20)	total: 91.2ms	remaining: 21.6s
21:	total: 97.7

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5807727	test: 1.5810576	best: 1.5810576 (0)	total: 4.05ms	remaining: 20.3s
1:	total: 8.36ms	remaining: 20.9s
2:	total: 11.4ms	remaining: 19s
3:	total: 14.3ms	remaining: 17.9s
4:	total: 19.1ms	remaining: 19.1s
5:	learn: 1.5758122	test: 1.5761755	best: 1.5761755 (5)	total: 23.3ms	remaining: 19.4s
6:	total: 26.7ms	remaining: 19.1s
7:	total: 29.9ms	remaining: 18.7s
8:	total: 33ms	remaining: 18.3s
9:	total: 35.9ms	remaining: 17.9s
10:	learn: 1.5710566	test: 1.5715441	best: 1.5715441 (10)	total: 39.3ms	remaining: 17.8s
11:	total: 45.9ms	remaining: 19.1s
12:	total: 49.3ms	remaining: 18.9s
13:	total: 52.2ms	remaining: 18.6s
14:	total: 54.9ms	remaining: 18.3s
15:	learn: 1.5663610	test: 1.5669456	best: 1.5669456 (15)	total: 58.8ms	remaining: 18.3s
16:	total: 62.6ms	remaining: 18.4s
17:	total: 66.7ms	remaining: 18.4s
18:	total: 72.3ms	remaining: 19s
19:	total: 76.3ms	remaining: 19s
20:	learn: 1.5616789	test: 1.5623513	best: 1.5623513 (20)	total: 80ms	remaining: 19s
21:	total: 84.9ms	r

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5808202	test: 1.5808326	best: 1.5808326 (0)	total: 5.7ms	remaining: 28.5s
1:	total: 10.9ms	remaining: 27.3s
2:	total: 16.1ms	remaining: 26.8s
3:	total: 21.3ms	remaining: 26.5s
4:	total: 26.2ms	remaining: 26.1s
5:	learn: 1.5757944	test: 1.5759383	best: 1.5759383 (5)	total: 29.6ms	remaining: 24.6s
6:	total: 34.8ms	remaining: 24.8s
7:	total: 39.8ms	remaining: 24.8s
8:	total: 45.1ms	remaining: 25s
9:	total: 51.2ms	remaining: 25.6s
10:	learn: 1.5710402	test: 1.5714192	best: 1.5714192 (10)	total: 56.4ms	remaining: 25.6s
11:	total: 61.5ms	remaining: 25.5s
12:	total: 64.7ms	remaining: 24.8s
13:	total: 70.5ms	remaining: 25.1s
14:	total: 75.8ms	remaining: 25.2s
15:	learn: 1.5663237	test: 1.5667801	best: 1.5667801 (15)	total: 81.6ms	remaining: 25.4s
16:	total: 88.3ms	remaining: 25.9s
17:	total: 94.6ms	remaining: 26.2s
18:	total: 100ms	remaining: 26.3s
19:	total: 106ms	remaining: 26.4s
20:	learn: 1.5616427	test: 1.5621345	best: 1.5621345 (20)	total: 111ms	remaining: 26.4s
21:	total: 11

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5808474	test: 1.5809174	best: 1.5809174 (0)	total: 5.12ms	remaining: 25.6s
1:	total: 9.92ms	remaining: 24.8s
2:	total: 13ms	remaining: 21.6s
3:	total: 17.3ms	remaining: 21.6s
4:	total: 21.8ms	remaining: 21.8s
5:	learn: 1.5758198	test: 1.5760949	best: 1.5760949 (5)	total: 24.9ms	remaining: 20.7s
6:	total: 29.5ms	remaining: 21s
7:	total: 33.9ms	remaining: 21.1s
8:	total: 36.9ms	remaining: 20.4s
9:	total: 41.2ms	remaining: 20.6s
10:	learn: 1.5710031	test: 1.5714735	best: 1.5714735 (10)	total: 45.6ms	remaining: 20.7s
11:	total: 48.9ms	remaining: 20.3s
12:	total: 53.4ms	remaining: 20.5s
13:	total: 58.1ms	remaining: 20.7s
14:	total: 61ms	remaining: 20.3s
15:	learn: 1.5662621	test: 1.5669851	best: 1.5669851 (15)	total: 65.7ms	remaining: 20.5s
16:	total: 70.5ms	remaining: 20.7s
17:	total: 75.1ms	remaining: 20.8s
18:	total: 78.1ms	remaining: 20.5s
19:	total: 82.5ms	remaining: 20.5s
20:	learn: 1.5615077	test: 1.5624636	best: 1.5624636 (20)	total: 87.1ms	remaining: 20.6s
21:	total: 90