In [1]:
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [3]:
from pathlib import Path

import pandas as pd

from hydra import initialize, compose
from hydra.utils import instantiate


from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule
import ptls

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset, CustomCoLES

In [4]:
from typing import Callable, Dict

from ptls.nn.seq_encoder.containers import SeqEncoderContainer
from ptls.frames.coles import CoLESModule


class CustomCoLES(CoLESModule):
    """
    Custom coles module inhereted from ptls coles module.
    """

    def __init__(
        self,
        optimizer_partial: Callable,
        lr_scheduler_partial: Callable,
        sequence_encoder: SeqEncoderContainer,
    ) -> None:
        """Overrided initialize method, which is suitable for our tasks

        Args:
            optimizer_partial (Callable): Partial initialized torch optimizer (with parameters)
            lr_scheduler_partial (Callable): Partial initialized torch lr scheduler
                (with parameters)
            sequence_encoder (SeqEncoderContainer): Ptls sequence encoder
                (including sequence encoder and single transaction encoder)
        """
        super().__init__(
            seq_encoder=sequence_encoder,
            optimizer_partial=optimizer_partial,
            lr_scheduler_partial=lr_scheduler_partial,
        )
        self.sequence_encoder_model = sequence_encoder

    def get_seq_encoder_weights(self) -> Dict:
        """Get weights of the sequnce encoder in torch format

        Returns:
            dict: Encoder weights
        """
        return self.sequence_encoder_model.state_dict()

In [5]:
from typing import Optional
from ptls.nn.head import Head
from src.coles.datamodule import SampleAll
from ptls.frames.coles.losses import ContrastiveLoss
from ptls.frames.coles.metric import BatchRecallTopK
from ptls.frames.coles.sampling_strategies import HardNegativePairSelector
from ptls.frames.coles.split_strategy import AbsSplit, SampleSlices
import numpy as np
import torch
from torch import minimum, maximum
from ptls.data_load.padded_batch import PaddedBatch
from itertools import chain
from torch.nn.utils.rnn import pad_sequence
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.frames.coles import CoLESModule
from torchmetrics import Metric

from torch import nn


def is_seq_feature(k: str, x):
    """Check is value sequential feature
    Synchronized with ptls.data_load.padded_batch.PaddedBatch.is_seq_feature

    Iterables are:
        np.array
        torch.Tensor

    Not iterable:
        list    - dont supports indexing

    Parameters
    ----------
    k:
        feature_name
    x:
        value for check

    Returns
    -------
        True if value is iterable
    """
    if k == 'event_time':
        return True
    if k.startswith('target'):
        return False
    if type(x) in (np.ndarray, torch.Tensor):
        return True
    return False


class CoLESonCoLES(CoLESModule):
    """
    Coles on coles embeddings model
    """

    def __init__(
        self,
        frozen_encoder: SeqEncoderContainer,
        learning_encoder: AbsSeqEncoder,
        optimizer_partial: Callable,
        lr_scheduler_partial: Callable,
        col_time: str = "event_time",
        encoding_splitter: AbsSplit = SampleAll(20, 1),
        training_splitter: AbsSplit = SampleSlices(split_count=5, cnt_max=150, cnt_min=15)
    ) -> None:
        """ 
        something should be placed here
        """
        super().__init__(seq_encoder=frozen_encoder,
                         optimizer_partial=optimizer_partial,
                         lr_scheduler_partial=lr_scheduler_partial)
    
        self.learning_encoder = learning_encoder

        self.encoding_splitter = encoding_splitter
        self.training_splitter = training_splitter
        self.col_time = col_time 

    @property
    def is_requires_reduced_sequence(self):
        return False
    
    def _encode_and_split(self, x: PaddedBatch, y: torch.Tensor, step: int = 20):
        dates = x.payload[self.col_time]   # (B, T)
        dates_len = dates.shape[1]
        start_pos = np.arange(0, dates_len - step, 1)

        def encode():
            for s in start_pos:
                torch.cuda.empty_cache()
                payload = {k: v[:, s : s + step] for k, v in x.payload.items() if is_seq_feature(k, v)}
                seq_lens = minimum(maximum(x.seq_lens - s, torch.zeros_like(x.seq_lens)), torch.full_like(x.seq_lens, step))
                pb = PaddedBatch(payload, seq_lens)

                yield self._seq_encoder(pb).payload.detach().cpu()

        emb_sequences = torch.stack([*encode()], dim=1)

        def split():
            for i, elem in enumerate(emb_sequences):
                indexes = self.training_splitter.split(dates)
                yield [elem[ix] for ix in indexes], [y[i].item()] * len(indexes)

        emb_sequences, extended_y = zip(*split())

        emb_sequences = list(chain(*emb_sequences)) # B
        emb_sequences = pad_sequence(emb_sequences, batch_first=True).to(x.device)

        extended_y = torch.tensor(list(chain(*extended_y))).to(x.device)
        
        return PaddedBatch(payload=emb_sequences, length=x.seq_lens), extended_y
    
    def forward(self, x):
        return self.learning_encoder(x)
    
    def shared_step(self, x, y):
        x, y = self._encode_and_split(x, y)
        y_h = self(x)
        y_h = self._head(y_h)
        return y_h, y


In [6]:
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name="config_churn")
    
cfg_preprop = cfg["dataset"]
cfg_model = cfg["model"]
cfg_model["trainer_coles"]["trainer"]["devices"] = [1]

In [7]:
df = pd.read_parquet(Path(cfg["dataset"]["dir_path"]).joinpath(cfg["dataset"]["train_file_name"]))
df.head(10)

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
5,1,3,2017-10-16 00:00:00,380.0,0,0,0,0
6,1,3,2017-10-10 00:00:00,378.0,0,0,0,0
7,1,3,2017-10-16 00:00:00,199.0,0,0,0,0
8,1,3,2017-10-11 00:00:00,400.0,0,0,0,0
9,1,1,2017-07-26 00:00:00,598.0,0,0,0,0


In [8]:
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp",
    cols_category=["mcc_code"],
    cols_numerical=["amount"],
    return_records=True
)

In [9]:
dataset = preprocessor.fit_transform(df)

In [10]:
train, val = train_test_split(dataset, test_size=.2)

In [11]:
train_data: CustomColesDataset = instantiate(cfg_model["dataset"], data=train, splitter = ptls.frames.coles.split_strategy.NoSplit())
val_data: CustomColesDataset = instantiate(cfg_model["dataset"], data=val, splitter=ptls.frames.coles.split_strategy.NoSplit)

In [12]:
datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"],
    train_data=train_data,
    valid_data=val_data
)

In [13]:
dl = datamodule.train_dataloader()

In [14]:
b = next(iter(dl))

In [16]:
b[0].payload["global_target"].shape

torch.Size([128, 350])

In [17]:
b = (b[0].to("cuda:0"), b[1].to("cuda:0"))

In [18]:
model: CustomCoLES = instantiate(cfg_model["model"])

In [19]:
from functools import partial

frozen_encoder = model.seq_encoder.to("cuda:0")
learning_encoder = model.seq_encoder.seq_encoder.to("cuda:0")

optimizer = partial(torch.optim.Adam, lr=0.004)
lr_scheduler = partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=0.902, patience=2)
mdl = CoLESonCoLES(frozen_encoder=frozen_encoder, learning_encoder=learning_encoder, optimizer_partial=optimizer, lr_scheduler_partial=lr_scheduler)

In [20]:
mdl._seq_encoder.is_reduce_sequence 

False

In [21]:
x, y = mdl.shared_step(*b)

AssertionError: LSTM: Expected input to be 2-D or 3-D but received 4-D tensor

In [None]:
torch.cuda.empty_cache()

In [29]:
!nvidia-smi

Wed Oct  4 19:29:14 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:3B:00.0 Off |                  N/A |
| 73%   61C    P2    86W / 170W |   4593MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:5E:00.0 Off |                  N/A |
| 57%   56C    P2   137W / 170W |  11976MiB / 12288MiB |    100%      Default |
|       