In [1]:
import os
import sys

# Make the parent of the current working directory importable and switch the
# working directory to it (presumably the project root that holds `src/`).
# The directories are resolved only once per kernel: re-running this cell must
# not climb one more level up, nor append another entry to sys.path.
if "dir1" not in globals():
    dir2 = os.path.abspath('')    # working directory when the cell first runs
    dir1 = os.path.dirname(dir2)  # its parent
    os.chdir(dir1)                # same target as a single os.chdir('..')

if dir1 not in sys.path:
    sys.path.append(dir1)

In [2]:
from pathlib import Path

import pandas as pd

from hydra import initialize, compose
from hydra.utils import instantiate


from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset, CustomCoLES

In [3]:
# Compose the "config_churn" experiment configuration with Hydra.
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name="config_churn")

# Shorthand handles for the model and dataset sections of the config.
cfg_model = cfg["model"]
cfg_preprop = cfg["dataset"]

In [4]:
# Load the training table; directory and file name both come from the dataset config.
train_file = Path(cfg["dataset"]["dir_path"]) / cfg["dataset"]["train_file_name"]
df = pd.read_parquet(train_file)
df.head(10)

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
5,1,3,2017-10-16 00:00:00,380.0,0,0,0,0
6,1,3,2017-10-10 00:00:00,378.0,0,0,0,0
7,1,3,2017-10-16 00:00:00,199.0,0,0,0,0
8,1,3,2017-10-11 00:00:00,400.0,0,0,0,0
9,1,1,2017-07-26 00:00:00,598.0,0,0,0,0


In [5]:
# Preprocessor for the raw event table loaded above.
preprocessor = PandasDataPreprocessor(
    # id column and event-time column (time is converted via "dt_to_timestamp")
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp",
    # per-event features: one numerical, one categorical
    cols_numerical=["amount"],
    cols_category=["mcc_code"],
    # presumably yields a list of per-user records instead of a DataFrame -- confirm against ptls docs
    return_records=True,
)

In [6]:
# Fit the preprocessor on the whole frame and transform it in one call.
# NOTE(review): fitting happens before the train/validation split below, so whatever the
# preprocessor learns is learned from validation users as well -- confirm this is acceptable.
dataset = preprocessor.fit_transform(df)

In [7]:
# Hold out 20% of the preprocessed records for validation.
# The split is seeded so that re-running the notebook reproduces the same
# train/validation partition (and therefore comparable validation metrics).
train, val = train_test_split(dataset, test_size=.2, random_state=42)

In [8]:
# Display the dataset section of the model config that the next cell instantiates.
cfg_model["dataset"]

{'_target_': 'src.coles.CustomColesDataset', 'min_len': 15, 'col_time': 'event_time', 'splitter': {'_target_': 'ptls.frames.coles.split_strategy.SampleSlices', 'split_count': 5, 'cnt_min': 15, 'cnt_max': 150}}

In [9]:
# Build the train and validation datasets from the same config node;
# only the records passed as `data` differ.
dataset_cfg = cfg_model["dataset"]
train_data: CustomColesDataset = instantiate(dataset_cfg, data=train)
val_data: CustomColesDataset = instantiate(dataset_cfg, data=val)

In [10]:
# from tqdm import trange
# times, cnt = 0, 0
# for idx in trange(len(train_data)):
#     samples = train_data[idx]
#     times += sum([elem['event_time'][-1] - elem['event_time'][0] for elem in samples])
#     cnt += len(samples)
# print(times / (cnt * 60 * 60))
# print(times / (cnt * 60 * 60 * 24))

In [10]:
# Wrap both datasets into the data module described by the config.
datamodule_cfg = cfg_model["datamodule"]
datamodule: PtlsDataModule = instantiate(
    datamodule_cfg, train_data=train_data, valid_data=val_data
)

In [16]:
# Instantiate the CoLES model from the `model` node of the model config.
model: CustomCoLES = instantiate(cfg_model["model"])

In [17]:
# Inspect the sequence encoder's state dict.
# NOTE(review): this renders every tensor in the cell output; a name -> shape summary
# would keep the saved notebook small and readable.
model.seq_encoder.state_dict()

OrderedDict([('trx_encoder.embeddings.mcc_code.weight',
              tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [-1.2887,  0.5496, -0.0436,  ...,  0.1698, -1.4161, -0.1803],
                      [-0.0719, -0.4414,  0.1291,  ..., -0.4333, -0.1972,  0.4534],
                      ...,
                      [-0.1141, -0.9562,  0.2312,  ...,  1.0724, -0.5990, -1.8831],
                      [-0.3615,  0.4621, -0.0129,  ...,  0.7438,  0.3810,  0.4972],
                      [-1.1478, -0.1528,  0.4683,  ...,  0.8271,  0.5681,  0.7161]])),
             ('trx_encoder.numerical_batch_norm.bn.weight', tensor([1.])),
             ('trx_encoder.numerical_batch_norm.bn.bias', tensor([0.])),
             ('trx_encoder.numerical_batch_norm.bn.running_mean',
              tensor([0.])),
             ('trx_encoder.numerical_batch_norm.bn.running_var', tensor([1.])),
             ('trx_encoder.numerical_batch_norm.bn.num_batches_tracked',
              

In [14]:
import torch

# Save the sequence encoder's current weights (presumably the untrained baseline,
# going by the file name).
untrained_path = Path("saved_models") / "coles_no_training.pth"
# torch.save does not create missing directories, so make sure the target exists.
untrained_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.seq_encoder.state_dict(), untrained_path)

In [5]:
import torch

# Restore full-model weights from "coles_default.pth".
# map_location="cpu" lets a checkpoint written from a GPU run load on a machine without
# CUDA; load_state_dict then copies the values into the model's existing parameters.
# NOTE(review): in top-to-bottom order this runs before training and needs a file that the
# post-training save cell writes -- on a fresh checkout it raises FileNotFoundError, and
# otherwise training warm-starts from an earlier run. Confirm this is intended.
model.load_state_dict(torch.load("coles_default.pth", map_location="cpu"))

<All keys matched successfully>

In [6]:
# Export only the sequence encoder's weights.
# NOTE(review): this writes to the same "coles_default.pth" that the preceding cell loads as a
# full-model state dict and that the post-training cell rewrites with `model.state_dict()`,
# so the file's key layout depends on which cell ran last -- consider a separate file name
# for the encoder-only export.
torch.save(model.seq_encoder.state_dict(), "coles_default.pth")

In [12]:
# Checkpoint callback from the config, tracking the model's own validation metric
# (mode="max": larger values count as better).
checkpoint_cfg = cfg_model["trainer_coles"]["checkpoint_callback"]
model_checkpoint: ModelCheckpoint = instantiate(
    checkpoint_cfg, monitor=model.metric_name, mode="max"
)

In [13]:
# Early-stopping callback from the config, watching the same metric as the
# checkpoint callback (mode="max": larger values count as better).
early_stopping_cfg = cfg_model["trainer_coles"]["early_stopping"]
early_stopping: EarlyStopping = instantiate(
    early_stopping_cfg, monitor=model.metric_name, mode="max"
)

In [14]:
# Instantiate the TensorBoard logger from the `logger` node of the trainer config.
logger: TensorBoardLogger = instantiate(cfg_model["trainer_coles"]["logger"])

In [15]:
# Assemble the Lightning trainer: settings come from the config, while the
# callbacks and the logger are the objects created in the cells above.
trainer_cfg = cfg_model["trainer_coles"]["trainer"]
trainer_callbacks = [model_checkpoint, early_stopping]
trainer: Trainer = instantiate(trainer_cfg, callbacks=trainer_callbacks, logger=logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [16]:
# Train the model on the data module; the checkpoint and early-stopping callbacks
# attached to the trainer decide what is saved and when training ends.
trainer.fit(model, datamodule)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 4.3 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
4.3 M     Trainable params
0         Non-trainable params
4.3 M     Total params
17.257    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved. New best score: 0.212


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.328 >= min_delta = 0.01. New best score: 0.540


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.071 >= min_delta = 0.01. New best score: 0.611


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.050 >= min_delta = 0.01. New best score: 0.662


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.026 >= min_delta = 0.01. New best score: 0.687


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.019 >= min_delta = 0.01. New best score: 0.706


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.026 >= min_delta = 0.01. New best score: 0.732


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.019 >= min_delta = 0.01. New best score: 0.751


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.014 >= min_delta = 0.01. New best score: 0.765


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.015 >= min_delta = 0.01. New best score: 0.780


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric recall_top_k did not improve in the last 5 records. Best score: 0.780. Signaling Trainer to stop.


In [17]:
import torch

# Persist the full model's weights after training.
# NOTE(review): `model` holds the weights from the last epoch that ran; the early-stopping
# log shows training ended several validations after the best score, so the best weights
# presumably live in the ModelCheckpoint output instead -- confirm which ones downstream
# code should load.
torch.save(model.state_dict(), "coles_default.pth")