In [4]:
import os
import sys

# Make the parent of the notebook's working directory importable and switch
# the working directory to it (presumably the project root that contains the
# `src` package -- confirm against the repository layout).
#
# The directories are resolved only once per kernel session. Without this
# guard the cell is not idempotent: both `abspath('')` and `chdir('..')` are
# relative to the *current* working directory, so every re-run would climb one
# more level up and append yet another directory to `sys.path`.
if "dir1" not in globals():
    dir2 = os.path.abspath("")    # working directory at the first execution
    dir1 = os.path.dirname(dir2)  # its parent directory

if dir1 not in sys.path:
    sys.path.append(dir1)

# Use the absolute path instead of '..' so that re-running is a no-op.
os.chdir(dir1)

In [5]:
from pathlib import Path

import pandas as pd

from hydra import initialize, compose
from hydra.utils import instantiate


from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset, CustomCoLES

In [6]:
# Compose the Hydra configuration named "config_churn" from the "../config"
# directory; the initialisation context is only needed while composing.
with initialize(version_base=None, config_path="../config"):
    cfg = compose(config_name="config_churn")

# Shortcuts to the dataset and model sections of the composed config.
cfg_preprop, cfg_model = cfg["dataset"], cfg["model"]

In [7]:
# Read the training parquet file; its directory and file name both come from
# the "dataset" section of the config.
df = pd.read_parquet(
    Path(cfg["dataset"]["dir_path"]) / cfg["dataset"]["train_file_name"]
)

In [8]:
# Preview the first five rows of the raw transactions frame.
df.head(n=5)

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0


In [9]:
# Preprocessor configuration for the transactions frame: events are grouped by
# `user_id`, `timestamp` is the event time (converted with the
# "dt_to_timestamp" transformation), `mcc_code` is treated as categorical and
# `amount` as numerical.
preprocessor = PandasDataPreprocessor(
    # identifiers and time axis
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp",
    # feature columns
    cols_numerical=["amount"],
    cols_category=["mcc_code"],
    # output format
    return_records=True,
)

In [10]:
# Fit the preprocessor on the training frame and transform it in one call.
# NOTE(review): because the preprocessor was built with return_records=True,
# `dataset` is presumably a list of per-user feature records -- confirm
# against the ptls documentation.
dataset = preprocessor.fit_transform(df)

In [15]:
from ptls.frames.coles.split_strategy import AbsSplit
from typing import List
import numpy as np

class TimeCLSampler(AbsSplit):
    """TimeCL sampler implementation, ptls-style.

    Every event gets a local "spacing" score computed from the gaps to its
    neighbouring timestamps. The half of the events with the smallest spacing
    forms the dense pool, the remaining events form the sparse pool, and each
    generated view is a random mix of indices drawn from both pools.

    For details, see Algorithm 1 from the paper:
    http://mesl.ucsd.edu/pubs/Ranak_AAAI2023_PrimeNet.pdf
    """

    def __init__(
        self,
        min_len: int,
        max_len: int,
        llambda: float,
        rlambda: float,
        split_count: int,
    ) -> None:
        """
        Args:
            min_len (int): smallest view length that can be drawn (inclusive).
            max_len (int): upper bound of the drawn view length (exclusive);
                also the minimal sequence length accepted by ``split``.
            llambda (float): one bound of the interval from which the share of
                dense-pool indices in a view is drawn uniformly
                (expected to be the lower one).
            rlambda (float): the other bound of that interval
                (expected to be the upper one).
            split_count (int): number of views produced per sequence.

        Raises:
            ValueError: if the lengths do not satisfy ``0 <= min_len < max_len``,
                if a lambda lies outside ``[0, 1]`` or if ``split_count`` is
                negative. These settings would otherwise only fail later,
                inside ``split``.
        """
        # Fail fast: `np.random.randint(min_len, max_len)` requires low < high,
        # and negative lengths cannot be sampled.
        if not 0 <= min_len < max_len:
            raise ValueError(
                f"Expected 0 <= min_len < max_len, got min_len={min_len}, max_len={max_len}"
            )
        # A share outside [0, 1] would produce a negative sample size.
        if not (0.0 <= llambda <= 1.0 and 0.0 <= rlambda <= 1.0):
            raise ValueError(
                f"Expected llambda and rlambda in [0, 1], got llambda={llambda}, rlambda={rlambda}"
            )
        if split_count < 0:
            raise ValueError(f"Expected split_count >= 0, got {split_count}")

        self.min_len = min_len
        self.max_len = max_len
        self.llambda = llambda
        self.rlambda = rlambda
        self.split_count = split_count

    def split(self, dates: np.ndarray) -> List[np.ndarray]:
        """Draw ``split_count`` index sets (views) for one event sequence.

        Args:
            dates: 1-D sequence of event times; it must support ``.shape``,
                slicing and subtraction.
                NOTE(review): the spacing logic only makes sense for times
                sorted in ascending order -- confirm that callers guarantee it.

        Returns:
            A list with ``split_count`` integer index arrays, each sorted in
            ascending order. A view nominally holds ``length`` indices
            (``floor(length * lam)`` dense plus ``ceil(length * (1 - lam))``
            sparse ones, up to floating-point rounding).

            NOTE(review): indices are drawn with replacement (the
            ``np.random.choice`` default), so a view may contain the same
            event more than once -- confirm this is intended.

        Raises:
            AssertionError: if the sequence is shorter than ``max_len`` (or has
                fewer than two events, for which no gap can be computed).
        """
        date_len = dates.shape[0]

        # At least two events are needed to compute a neighbour gap below.
        assert date_len >= max(self.max_len, 2), "Timeseries length too small"

        # Local spacing of every event: the gap to the only neighbour at both
        # ends, half of the distance between the two neighbours elsewhere.
        time_deltas = np.concatenate(
            (
                [dates[1] - dates[0]],
                0.5 * (dates[2:] - dates[:-2]),
                [dates[-1] - dates[-2]],
            )
        )

        # Event indices ordered from the tightest to the widest spacing; the
        # stable sort keeps the original order among events with equal spacing.
        order = np.argsort(time_deltas, kind="stable")
        half = date_len // 2
        dense_timestamps, sparse_timestamps = order[:half], order[half:]

        # Per-view target length (upper bound exclusive) and dense share.
        lengths = np.random.randint(self.min_len, self.max_len, size=self.split_count)
        lambdas = np.random.uniform(self.llambda, self.rlambda, size=self.split_count)

        n_dense = np.floor(lengths * lambdas).astype(int)
        n_sparse = np.ceil(lengths * (1 - lambdas)).astype(int)

        # Dense indices are drawn before sparse ones for every view, then the
        # union is sorted so that each view keeps the chronological order.
        return [
            np.sort(
                np.concatenate(
                    (
                        np.random.choice(dense_timestamps, size=n_d),
                        np.random.choice(sparse_timestamps, size=n_s),
                    )
                )
            )
            for n_d, n_s in zip(n_dense, n_sparse)
        ]

In [16]:
from ptls.frames.coles import ColesDataset
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter

In [17]:
# CoLES dataset over the in-memory records: SeqLenFilter(60) is applied to the
# records first, and TimeCLSampler produces 5 views per remaining sequence.
ds = ColesDataset(
    MemoryMapDataset(dataset, [SeqLenFilter(60)]),
    splitter=TimeCLSampler(
        min_len=15,
        max_len=60,
        llambda=0.3,
        rlambda=0.5,
        split_count=5,
    ),
)

In [22]:
from tqdm import trange

In [23]:
# Smoke test: fetch every item once so that the sampler runs on all sequences
# kept in the dataset; the fetched views themselves are discarded.
n_items = len(ds)
for i in trange(n_items):
    ds[i]

100%|██████████| 3031/3031 [00:06<00:00, 470.77it/s]
