In [3]:
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [4]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [5]:
from pathlib import Path

import pandas as pd

from hydra import initialize, compose
from hydra.utils import instantiate


from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule
import ptls

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset, CustomCoLES

In [6]:
from typing import Callable, Dict

from ptls.nn.seq_encoder.containers import SeqEncoderContainer
from ptls.frames.coles import CoLESModule


class CustomCoLES(CoLESModule):
    """
    CoLES module with a constructor tailored to this project.

    Subclasses the ptls ``CoLESModule`` and keeps its own reference to the
    sequence encoder so that the encoder weights can be exported separately.
    """

    def __init__(
        self,
        optimizer_partial: Callable,
        lr_scheduler_partial: Callable,
        sequence_encoder: SeqEncoderContainer,
    ) -> None:
        """Build the module from an encoder and partially-initialized training pieces.

        Args:
            optimizer_partial (Callable): Partially initialized torch optimizer
                (hyper-parameters already bound).
            lr_scheduler_partial (Callable): Partially initialized torch lr scheduler
                (hyper-parameters already bound).
            sequence_encoder (SeqEncoderContainer): Ptls sequence encoder
                (sequence encoder together with the single-transaction encoder).
        """
        base_kwargs = dict(
            seq_encoder=sequence_encoder,
            optimizer_partial=optimizer_partial,
            lr_scheduler_partial=lr_scheduler_partial,
        )
        super().__init__(**base_kwargs)
        # Extra handle on the encoder, read back by get_seq_encoder_weights().
        self.sequence_encoder_model = sequence_encoder

    def get_seq_encoder_weights(self) -> Dict:
        """Return the state dict of the stored sequence encoder.

        Returns:
            dict: Encoder weights, as produced by ``state_dict()``.
        """
        encoder = self.sequence_encoder_model
        return encoder.state_dict()

In [35]:
from src.coles.datamodule import SampleAll
from ptls.frames.coles.split_strategy import AbsSplit, SampleSlices
import numpy as np
import torch
from torch import minimum, maximum
from ptls.data_load.padded_batch import PaddedBatch
from itertools import chain
from torch.nn.utils.rnn import pad_sequence
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.frames import ABSModule


class CoLESonCoLES(CoLESModule):
    """
    CoLES trained on sequences of embeddings produced by another encoder.

    Every input sequence is cut into sliding windows of ``step`` events, each
    window is embedded with ``frozen_encoder`` (no gradients are tracked), and the
    per-sample sequence of window embeddings is sub-sampled with
    ``training_splitter`` into several views that are fed to ``sequence_encoder``.

    The base class is ``CoLESModule`` because ``shared_step`` relies on the
    ``_head`` attribute, which this class does not define itself.
    """

    def __init__(
        self,
        optimizer_partial: Callable,
        lr_scheduler_partial: Callable,
        frozen_encoder: SeqEncoderContainer,
        sequence_encoder: torch.nn.Module,
        col_time: str = "event_time",
        encoding_splitter: AbsSplit = SampleAll(20, 1),
        training_splitter: AbsSplit = SampleSlices(split_count=5, cnt_max=150, cnt_min=15),
    ) -> None:
        """Initialize the module.

        Args:
            optimizer_partial (Callable): Partially initialized torch optimizer
                (with parameters)
            lr_scheduler_partial (Callable): Partially initialized torch lr scheduler
                (with parameters)
            frozen_encoder (SeqEncoderContainer): Encoder used to embed the raw event
                windows. Its outputs are computed under ``torch.no_grad()``.
                NOTE(review): its parameters are not frozen and it is not switched to
                eval mode here - confirm whether the caller is expected to do that.
            sequence_encoder (torch.nn.Module): Trainable encoder applied to the
                padded batch of window embeddings. It must accept a ``PaddedBatch``
                whose payload is a single tensor - TODO confirm which ptls encoder
                class is intended.
            col_time (str): Payload key of the event-time feature.
            encoding_splitter (AbsSplit): Stored on the instance; not used by the
                methods of this class.
            training_splitter (AbsSplit): Strategy producing the index sets (views)
                taken from each sample's sequence of window embeddings.
        """
        super().__init__(
            seq_encoder=sequence_encoder,
            optimizer_partial=optimizer_partial,
            lr_scheduler_partial=lr_scheduler_partial,
        )
        self.sequence_encoder_model = sequence_encoder
        self.frozen_encoder = frozen_encoder
        self.encoding_splitter = encoding_splitter
        self.training_splitter = training_splitter
        self.col_time = col_time

    def _encode_and_split(self, x: PaddedBatch, y: torch.Tensor, step: int = 20):
        """Embed sliding windows of ``x`` and split the embeddings into views.

        Args:
            x (PaddedBatch): Batch of raw events; every payload entry is sliced as
                ``v[:, s : s + step]``, i.e. is assumed to be (batch, time, ...).
            y (torch.Tensor): One label per sample of ``x``.
            step (int): Window length, in events.

        Returns:
            tuple: ``(PaddedBatch, torch.Tensor)`` - the padded views with their true
            lengths, and the label of the source sample repeated once per view.
        """
        dates = x.payload[self.col_time]  # indexed as (batch, time) below
        total_len = dates.shape[1]
        # Window starts are 0 .. total_len - step - 1, as before, but at least one
        # window is always produced so that short batches do not yield an empty stack.
        # NOTE(review): the last full window (start == total_len - step) is not
        # encoded - confirm that this is intended.
        n_windows = max(total_len - step, 1)

        def encode():
            for s in range(n_windows):
                payload = {k: v[:, s : s + step] for k, v in x.payload.items()}
                # Events of each sample that fall inside the window, in [0, step].
                seq_lens = torch.clamp(x.seq_lens - s, min=0, max=step)
                # Offload to CPU right away to keep GPU memory usage flat.
                # Assumes the encoder returns one tensor row per sample.
                yield self.frozen_encoder(PaddedBatch(payload, seq_lens)).cpu()

        # No autograd graph is needed for the frozen encoder.
        with torch.no_grad():
            emb_sequences = torch.stack(list(encode()), dim=1)  # (batch, n_windows, ...)

        # A sample with L events contributes L - step windows (at least one),
        # mirroring the batch-level rule above; later windows are mostly padding.
        window_counts = torch.clamp(x.seq_lens - step, min=1, max=n_windows).tolist()
        dates_cpu = dates.detach().cpu()

        views, labels = [], []
        for i, n_valid in enumerate(window_counts):
            sample_emb = emb_sequences[i, :n_valid]
            # Split on this sample's own window start times (window k starts at
            # event k), so the produced indexes always address existing rows.
            indexes = self.training_splitter.split(dates_cpu[i, :n_valid])
            views.extend(sample_emb[ix] for ix in indexes)
            labels.extend([y[i].item()] * len(indexes))

        # Lengths describe the generated views, not the original sequences.
        lengths = torch.tensor([len(v) for v in views], device=x.device)
        padded = pad_sequence(views, batch_first=True).to(x.device)
        extended_y = torch.tensor(labels, device=x.device)

        return PaddedBatch(payload=padded, length=lengths), extended_y

    def shared_step(self, x, y):
        """Encode and split the batch, then embed the views with the trainable encoder.

        Returns:
            tuple: ``(y_h, y)`` - view embeddings (passed through the head when one
            is set) and the per-view labels.
        """
        x, y = self._encode_and_split(x, y)
        y_h = self.sequence_encoder_model(x)
        if self._head is not None:
            y_h = self._head(y_h)
        return y_h, y


In [8]:
# Compose the Hydra configuration named "config_churn" from the "../config" directory.
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name="config_churn")
    
# Shortcuts to the dataset and model sections of the composed config.
cfg_preprop = cfg["dataset"]
cfg_model = cfg["model"]
# Override the `devices` entry of the CoLES trainer config with [1] for this session.
cfg_model["trainer_coles"]["trainer"]["devices"] = [1]

In [9]:
# Read the training parquet file whose directory and file name come from the
# dataset section of the config, then preview the first ten rows.
train_path = Path(cfg["dataset"]["dir_path"]) / cfg["dataset"]["train_file_name"]
df = pd.read_parquet(train_path)
df.head(10)

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
5,1,3,2017-10-16 00:00:00,380.0,0,0,0,0
6,1,3,2017-10-10 00:00:00,378.0,0,0,0,0
7,1,3,2017-10-16 00:00:00,199.0,0,0,0,0
8,1,3,2017-10-11 00:00:00,400.0,0,0,0,0
9,1,1,2017-07-26 00:00:00,598.0,0,0,0,0


In [10]:
# Preprocessor configuration: records are keyed by `user_id`, `timestamp` is the
# event-time column (converted with the "dt_to_timestamp" transformation),
# `mcc_code` is handled as a categorical feature and `amount` as a numerical one.
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp",
    cols_category=["mcc_code"],
    cols_numerical=["amount"],
    return_records=True
)

In [11]:
# Fit the preprocessor on the raw frame and transform it in one call.
dataset = preprocessor.fit_transform(df)

In [12]:
# Hold out 20% of the records for validation. The seed is fixed so that the
# split is identical after every kernel restart.
train, val = train_test_split(dataset, test_size=.2, random_state=42)

In [13]:
# Build the train and validation datasets from the same dataset config. Both get
# their own NoSplit *instance*: passing the class itself (as the validation
# dataset previously did) hands the dataset an object whose `split` is unbound.
train_data: CustomColesDataset = instantiate(
    cfg_model["dataset"],
    data=train,
    splitter=ptls.frames.coles.split_strategy.NoSplit(),
)
val_data: CustomColesDataset = instantiate(
    cfg_model["dataset"],
    data=val,
    splitter=ptls.frames.coles.split_strategy.NoSplit(),
)

In [14]:
# Instantiate the datamodule described in the model config, wiring in the
# train and validation datasets created above.
datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"],
    train_data=train_data,
    valid_data=val_data
)

In [15]:
# Training dataloader provided by the datamodule.
dl = datamodule.train_dataloader()

In [16]:
# Pull a single batch from the dataloader for interactive inspection.
b = next(iter(dl))

In [17]:
# Scratch check of the per-window length formula: the number of events each
# sample still has after offset `s`, limited to the range [0, step].
s = 30
step = 20
torch.clamp(b[0].seq_lens - s, min=0, max=step)

tensor([ 0, 20, 20, 20, 20,  0, 20, 20, 20, 20, 20, 20, 20, 14, 20, 20, 20, 20,
        20,  0, 20, 20, 20,  7, 20,  5, 20,  0, 20,  0, 20, 20,  0, 20, 20, 20,
         0,  3, 20,  0, 20, 20, 20,  5, 20, 20,  0, 20, 20, 17, 20, 20, 20, 20,
        12, 17, 20, 20, 20, 20, 20, 20, 20, 20, 20,  0, 20, 20, 20, 14, 20, 20,
        20,  0, 20, 20, 20, 20,  4, 20, 20, 20, 20, 20, 20, 20,  1, 20, 19, 20,
        20,  0,  0, 20, 20, 20, 20, 20, 20,  0, 20,  6, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 16, 20, 20, 20,
        20, 20])

In [18]:
# Per-sample sequence lengths of the inspected batch.
b[0].seq_lens

tensor([ 23, 134, 113, 184,  69,  18,  66,  50, 159,  95, 118, 247, 307,  44,
        101, 137,  67, 156, 117,  16, 159, 108, 104,  37, 107,  35,  64,  23,
         65,  26, 107, 217,  20, 113, 118,  85,  30,  33,  64,  19, 202,  79,
        225,  35, 118, 330,  18, 126, 178,  47, 121, 127, 106,  64,  42,  47,
        196, 145, 135, 135,  56, 144,  66,  69, 108,  19, 105, 131, 127,  44,
        190, 139,  98,  17,  81, 194,  74,  83,  34, 202, 173,  72, 109, 186,
        200, 140,  31,  91,  49,  85, 181,  23,  15,  86, 143,  75,  56,  93,
         52,  29,  65,  36, 201, 146, 202, 150, 107, 113, 272, 203, 128, 115,
        173,  70, 149,  95, 133, 319,  61,  62, 123, 157,  46, 173, 204,  68,
        124,  73])

In [19]:
# Shape of the `global_target` tensor stored in the batch payload.
b[0].payload['global_target'].shape

torch.Size([128, 330])

In [20]:
import numpy as np
import torch

def is_seq_feature(k: str, x):
    """Tell whether a feature value should be treated as sequential.

    Kept in sync with ptls.data_load.padded_batch.PaddedBatch.is_seq_feature.

    Rules, applied in order:
        1. the feature named 'event_time' is always sequential;
        2. a feature whose name starts with 'target' is never sequential;
        3. otherwise the value is sequential only when its exact type is
           np.ndarray or torch.Tensor (subclasses and lists do not qualify;
           lists do not support the required indexing).

    Parameters
    ----------
    k:
        feature name
    x:
        value to check

    Returns
    -------
        True if the value is considered a sequential feature, False otherwise
    """
    if k == 'event_time':
        return True

    name_marks_target = k.startswith('target')
    value_type = type(x)
    has_array_type = value_type is np.ndarray or value_type is torch.Tensor
    return (not name_marks_target) and has_array_type

In [21]:
# Build the model from the `model` entry of the model config.
model: CustomCoLES = instantiate(cfg_model["model"])

In [22]:
# Feature names carried in the batch payload.
b[0].payload.keys()

dict_keys(['global_target', 'holiday_target', 'weekend_target', 'churn_target', 'event_time', 'mcc_code', 'amount'])

In [23]:
# Report, for every payload feature, whether is_seq_feature treats it as sequential.
for k, v in b[0].payload.items():
    print(k, is_seq_feature(k, v))

global_target True
holiday_target True
weekend_target True
churn_target True
event_time True
mcc_code True
amount True


In [24]:
# Run the model twice on the same batch and stack the two outputs along a new dim 1.
t = torch.stack([model(b[0]) for _ in range(2)], dim = 1)

In [25]:
def a():
    """Scratch generator: yields the pairs (1, 2) and (3, 4), in that order."""
    yield from ((1, 2), (3, 4))

In [26]:
# Scratch check: unzip the generated pairs into first components (c) and second components (d).
c, d = zip(*a())

In [27]:
# Display the first components collected above.
c

(1, 3)

In [36]:
from functools import partial

# Take the model's sequence encoder and move it to the "cuda:0" device.
seq_encoder = model.seq_encoder.to("cuda:0")

# Optimizer and LR scheduler are passed as partials with their hyper-parameters bound.
optimizer = partial(torch.optim.Adam, lr=0.004)
lr_scheduler = partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=0.902, patience=2)
# The same encoder object is supplied as both the third and the fourth positional argument.
mdl = CoLESonCoLES(optimizer, lr_scheduler, model.seq_encoder, model.seq_encoder)

In [37]:
# Move both elements of the batch tuple to the "cuda:0" device.
b = b[0].to("cuda:0"), b[1].to("cuda:0")

In [38]:
# Run the encode-and-split step directly on the two elements of the batch tuple.
x, y = mdl._encode_and_split(*b)

cuda:0
Embedding...
Finished embeddings...


In [None]:
# Inspect the labels returned by the encode-and-split step.
y

tensor([  0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   2,   2,   2,   2,
          2,   3,   3,   3,   3,   3,   4,   4,   4,   4,   4,   5,   5,   5,
          5,   5,   6,   6,   6,   6,   6,   7,   7,   7,   7,   7,   8,   8,
          8,   8,   8,   9,   9,   9,   9,   9,  10,  10,  10,  10,  10,  11,
         11,  11,  11,  11,  12,  12,  12,  12,  12,  13,  13,  13,  13,  13,
         14,  14,  14,  14,  14,  15,  15,  15,  15,  15,  16,  16,  16,  16,
         16,  17,  17,  17,  17,  17,  18,  18,  18,  18,  18,  19,  19,  19,
         19,  19,  20,  20,  20,  20,  20,  21,  21,  21,  21,  21,  22,  22,
         22,  22,  22,  23,  23,  23,  23,  23,  24,  24,  24,  24,  24,  25,
         25,  25,  25,  25,  26,  26,  26,  26,  26,  27,  27,  27,  27,  27,
         28,  28,  28,  28,  28,  29,  29,  29,  29,  29,  30,  30,  30,  30,
         30,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  33,  33,
         33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  

In [None]:
# Ask PyTorch to release the unused GPU memory held by its caching allocator.
torch.cuda.empty_cache()

In [29]:
# Show the current utilisation and memory usage of the GPUs.
!nvidia-smi

Wed Oct  4 19:29:14 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:3B:00.0 Off |                  N/A |
| 73%   61C    P2    86W / 170W |   4593MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:5E:00.0 Off |                  N/A |
| 57%   56C    P2   137W / 170W |  11976MiB / 12288MiB |    100%      Default |
|       