In [None]:
from functools import partial
from pathlib import Path

import pandas as pd
import pytorch_lightning as pl
import torch
from ptls.data_load.datasets import MemoryMapDataset, inference_data_loader
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesDataset, CoLESModule
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.nn import RnnSeqEncoder, TrxEncoder
from ptls.preprocessing import PandasDataPreprocessor

# Read the two parquet inputs from the local data directory:
# the transaction log and the table stored in data/train.pa.
df_transactions = pd.read_parquet(path="data/df_transaction.pa")
df_train = pd.read_parquet(path="data/train.pa")

In [None]:
# Remove the merchant name column; the preprocessor configured below only
# reads client_num, date_time, mcc_code and amount.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to the same name, a second run would otherwise raise KeyError on the
# already-removed column.
df_transactions = df_transactions.drop(columns="merchant_name", errors="ignore")

In [None]:
# Preview the transactions frame after the column drop (bare last expression
# so the notebook renders it with the rich DataFrame display).
df_transactions

Unnamed: 0,client_num,date_time,mcc_code,amount
0,0,2024-07-18 16:04:00,8099,2900
1,0,2024-07-22 16:31:00,5411,455
2,0,2024-07-24 16:23:00,5541,1003
3,0,2024-07-28 15:51:00,5691,1480
4,0,2024-07-28 18:00:00,5331,88
...,...,...,...,...
13508150,109142,2024-08-19 21:32:00,6011,14000
13508151,109142,2024-08-19 21:40:00,6011,24000
13508152,109142,2024-08-19 21:46:00,6011,23000
13508153,109142,2024-08-19 22:04:00,6011,32000


In [None]:
# Preprocessor settings collected in one mapping; the column names refer to
# the columns of the transactions frame displayed above.
preprocessor_settings = {
    "col_id": "client_num",
    "col_event_time": "date_time",
    "event_time_transformation": "dt_to_timestamp",
    "cols_category": ["mcc_code"],
    "cols_numerical": ["amount"],
    "return_records": True,
}

preprocessor = PandasDataPreprocessor(**preprocessor_settings)

In [None]:
# Fit the preprocessor on the transactions and transform them in a single call.
# NOTE(review): with return_records=True this presumably yields a list of
# per-client record dicts rather than a DataFrame — confirm against ptls docs.
dataset = preprocessor.fit_transform(df_transactions)

In [None]:
# Preview only the first couple of preprocessed records. Displaying the whole
# `dataset` object would dump every client's sequences into the cell output,
# which bloats the notebook file and hides the narrative.
dataset[:2]

In [None]:
# Fix every RNG (Python, NumPy, PyTorch and dataloader workers) before any
# weights are initialised so that a Restart & Run All reproduces the same
# training run as closely as the hardware allows.
SEED = 42
pl.seed_everything(SEED, workers=True)

# Transaction-level encoder configuration.
# NOTE(review): "in"/"out" presumably are the embedding table size and the
# embedding width — confirm against the TrxEncoder documentation.
# NOTE(review): event_time is produced by the "dt_to_timestamp" transformation
# in the preprocessor, so its raw values are likely far larger than 800;
# confirm how TrxEncoder treats indices outside the embedding table.
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount": "identity"},
    "embeddings": {
        "event_time": {"in": 800, "out": 16},
        "mcc_code": {"in": 250, "out": 16},
    },
}

# GRU sequence encoder stacked on top of the transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type="gru",
)

# CoLES module wrapping the encoder. The optimizer and LR scheduler are passed
# as partials, so the module constructs them itself.
# NOTE(review): if the scheduler is stepped once per epoch, step_size=30 is
# larger than the 12 epochs configured on the trainer, so the learning rate
# would never decay — confirm the stepping interval.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9
    ),
)

In [None]:
# Source records for training; sequences are passed through a length filter
# configured with min_seq_len=25.
coles_source = MemoryMapDataset(
    data=dataset,
    i_filters=[SeqLenFilter(min_seq_len=25)],
)

# Slice sampler used by the CoLES dataset: split_count=5 with slice lengths
# bounded by cnt_min=25 and cnt_max=200.
coles_splitter = SampleSlices(
    split_count=5,
    cnt_min=25,
    cnt_max=200,
)

coles_train_data = ColesDataset(coles_source, splitter=coles_splitter)

# Data module handed to the trainer for fitting (training data only).
train_dl = PtlsDataModule(
    train_data=coles_train_data,
    train_num_workers=16,
    train_batch_size=256,
)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    trainer_accelerator = "cuda"
else:
    trainer_accelerator = "cpu"

# Trainer for a fixed 12-epoch run with the progress bar enabled.
trainer = pl.Trainer(
    accelerator=trainer_accelerator,
    max_epochs=12,
    enable_progress_bar=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/seara/Desktop/Github/alfa-challenge/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [None]:
# Train the CoLES model on the training data module built above. Only training
# data is supplied, so the trainer skips the validation loop.
trainer.fit(model, train_dl)
# Print whatever metrics the trainer has logged once fitting has finished.
print(trainer.logged_metrics)

/home/seara/Desktop/Github/alfa-challenge/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params | Mode 
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0      | train
1 | _seq_encoder       | RnnSeqEncoder   | 240 K  | train
2 | _validation_metric | BatchRecallTopK | 0      | train
3 | _head              | Head            | 0      | train
---------------------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.962     Total estimated model params size (MB)
19        Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=12` reached.


{'loss': tensor(124.2874), 'seq_len': tensor(73.9533)}


In [None]:
# Make sure the output directory exists before writing: on a fresh checkout
# data/features/ may be missing, and the save would otherwise fail only after
# the whole training run has completed.
Path("data/features").mkdir(parents=True, exist_ok=True)

# Persist the sequence encoder's weights (state dict only, not the full module).
torch.save(seq_encoder.state_dict(), "data/features/coles-emb.pt")

In [None]:
# Build a dedicated inference loader over the full preprocessed dataset.
# It gets its own name instead of rebinding `train_dl`: that name holds the
# training PtlsDataModule, and overwriting it would make a re-run of the
# fit cell silently train on this inference loader.
inference_dl = inference_data_loader(dataset, num_workers=0, batch_size=256)

# trainer.predict returns one tensor per batch; stack them row-wise into a
# single (n_rows, embedding_dim) tensor.
train_embeds = torch.vstack(trainer.predict(model, inference_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/seara/Desktop/Github/alfa-challenge/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
# One column per embedding dimension, named embed_0, embed_1, ...
n_embed_dims = train_embeds.shape[1]
embed_columns = [f"embed_{i}" for i in range(n_embed_dims)]

# NOTE(review): no client identifier is attached to the rows; they presumably
# follow the order of `dataset`, so downstream joins must rely on position —
# confirm before merging with other tables.
embeds = pd.DataFrame(data=train_embeds, columns=embed_columns)

In [None]:
# Write the embedding table to a parquet file under data/features/.
embeds.to_parquet("data/features/coles.pa")