In [74]:
import os
os.chdir("/app/")

from omegaconf import OmegaConf, DictConfig
from hydra.utils import instantiate
from torch.utils.data import DataLoader
from pytorch_lightning import LightningModule, Trainer
from src.preprocessing import preprocess
from ptls.data_load.utils import FeatureDict, PaddedBatch, collate_feature_dict
from ptls.data_load.datasets import MemoryMapDataset
from torch import nn
import statistics
import torch
import seaborn as sns
from matplotlib import pyplot as plt
import random
from functools import partial

from collections import defaultdict
import pandas as pd
import numpy as np

from src.datasets import SlidingWindowDataset

In [3]:
# Load the preprocessing config from YAML and run the project's `preprocess`,
# which returns three splits. Only `val` is used by the cells visible below.
train, val, test = preprocess(OmegaConf.load("config/preprocessing/churn_nodup_100.yaml"))

In [31]:
class SplitRandomizer(FeatureDict):
    """Base transform that reorders a feature dict's sequence chunk by chunk.

    The positions ``0 .. seq_len - 1`` are cut into consecutive chunks of
    ``part_size`` elements (the last chunk may be shorter), the chunks are
    reordered by :meth:`_shuffle`, and every sequence feature is re-indexed
    with the resulting order. Non-sequence entries are passed through as-is.

    Args:
        part_size: Number of consecutive sequence elements per chunk.
    """

    def __init__(self, part_size: int):
        self.part_size = part_size

    def _shuffle(self, parts):
        """Return the list of index chunks in their new order.

        Subclasses must override this; ``parts`` is a list of 1-D index
        tensors and may be modified in place.
        """
        raise NotImplementedError()

    def __call__(self, fd: dict):
        """Return a new dict with all sequence features reordered consistently."""
        seq_len = self.get_seq_len(fd)
        idx = torch.arange(seq_len)
        parts = list(torch.split(idx, self.part_size))

        # The reordered positions are the same for every feature, so build the
        # flat index once instead of concatenating per-chunk slices per feature.
        order = torch.cat(self._shuffle(parts))

        return {
            k: v[order] if self.is_seq_feature(k, v) else v
            for k, v in fd.items()
        }
    

class Shuffle(SplitRandomizer):
    """Chunk randomizer that shuffles all chunks with ``random.shuffle``."""
    def _shuffle(self, parts):
        # In-place shuffle driven by the global `random` module state.
        random.shuffle(parts)
        return parts


class NeighborSwap(SplitRandomizer):
    """Chunk randomizer that swaps adjacent chunks in one left-to-right pass.

    Args:
        part_size: Number of consecutive sequence elements per chunk.
        p: Probability of swapping each adjacent pair of chunks.
    """

    def __init__(self, part_size: int, p: float):
        super().__init__(part_size)
        self.p = p

    def _shuffle(self, parts):
        """Walk the chunk list once, swapping positions (i, i + 1) with probability ``p``.

        The list is modified in place and returned. Because swaps are applied
        sequentially, a chunk that was just moved right can be moved again on
        the next step.
        """
        pair_count = len(parts) - 1
        for left in range(pair_count):
            right = left + 1
            # Exactly one RNG draw per adjacent pair.
            do_swap = random.random() < self.p
            if do_swap:
                parts[left], parts[right] = parts[right], parts[left]

        return parts

In [76]:
class EncoderPredictor(LightningModule):
    """Lightning wrapper that runs a pretrained encoder through ``Trainer.predict``.

    Args:
        encoder_cfg: Hydra config of the encoder; it is instantiated with
            ``is_reduce_sequence=True``.
        weights_path: Path to a file holding the encoder's ``state_dict``.
    """

    def __init__(self, encoder_cfg: DictConfig, weights_path: str):
        super().__init__()
        self.encoder: nn.Module = instantiate(encoder_cfg, is_reduce_sequence=True)
        # Deserialize onto the CPU: the checkpoint may have been saved from a
        # GPU index that is not visible here, and `load_state_dict` copies the
        # values into the module's own parameters anyway.
        state_dict = torch.load(weights_path, map_location="cpu")
        self.encoder.load_state_dict(state_dict)

    def predict_step(self, batch: PaddedBatch, *args, **kwargs):
        """Return the encoder output for one collated batch."""
        return self.encoder(batch)


In [77]:
def predict(
    datas: list[list[dict]],
    batch_size: int,
    encoder_cfg_path: str,
    weights_path: str,
    accelerator: str = "gpu",
    devices: int = 1,
) -> list[torch.Tensor]:
    """Embed several datasets with one pretrained encoder.

    Args:
        datas: List of datasets; each dataset is a list of feature dicts and
            gets its own dataloader.
        batch_size: Batch size used for every dataloader.
        encoder_cfg_path: Path to the YAML config of the encoder.
        weights_path: Path to the encoder's saved ``state_dict``.
        accelerator: Forwarded to ``Trainer(accelerator=...)``.
        devices: Forwarded to ``Trainer(devices=...)``.

    Returns:
        One tensor per dataset (same order as ``datas``), obtained by
        concatenating that dataset's batch predictions along dim 0.
    """
    encoder = EncoderPredictor(OmegaConf.load(encoder_cfg_path), weights_path)

    dataloaders = [
        DataLoader(
            MemoryMapDataset(data),
            batch_size=batch_size,
            collate_fn=collate_feature_dict,
        )
        for data in datas
    ]

    trainer = Trainer(
        logger=False,
        accelerator=accelerator,
        devices=devices,
    )

    all_preds = trainer.predict(encoder, dataloaders)

    # With a single dataloader Lightning returns a flat list of batch outputs
    # rather than a list of per-dataloader lists; wrap it so the code below
    # always sees one list of batches per dataset.
    if all_preds and isinstance(all_preds[0], torch.Tensor):
        all_preds = [all_preds]

    return [torch.cat(dataloader_preds) for dataloader_preds in all_preds]


In [78]:
# Chunk lengths (in sequence elements) used to split each sequence before its
# chunks are reordered; one perturbed copy of the data is built per entry.
part_sizes = [
    1,
    7,
    14,
    31,
    90,
    180,
    365,
]

In [88]:
# One boolean mask over `val` per part size: True for rows whose sequence is
# longer than two chunks of that size. Used later to restrict the similarity
# statistics to those rows.
part_sizes_masks = [
    torch.tensor(
        [FeatureDict.get_seq_len(row) > 2 * part_size for row in val],
        dtype=torch.bool,
    )
    for part_size in part_sizes
]

In [79]:
# Shuffle / NeighborSwap draw from the global `random` module; seed it so the
# perturbed datasets, and every number computed from them, are reproducible.
PERTURBATION_SEED = 42
# Probability of swapping each adjacent pair of chunks in NeighborSwap.
SWAP_PROBABILITY = 0.2

random.seed(PERTURBATION_SEED)

# The transforms keep no per-row state, so build one per part size and reuse it
# for every row instead of re-creating it inside the inner loop.
swap_transforms = [NeighborSwap(part_size, SWAP_PROBABILITY) for part_size in part_sizes]
shuf_transforms = [Shuffle(part_size) for part_size in part_sizes]

data_swap = [[transform(row) for row in val] for transform in swap_transforms]
data_shuf = [[transform(row) for row in val] for transform in shuf_transforms]

# Each list starts with the unperturbed `val`, followed by one perturbed copy
# per entry of `part_sizes` (same order).
data_dict = {
    "swap": [val, *data_swap],
    "shuf": [val, *data_shuf]
}


In [80]:
# Keyword arguments for `predict`, keyed by the label used in the result tables.
# NOTE(review): for "coles_*" the "_new" entry points at the *_churn_100 weights,
# while for "nlp_*" it is the "_old" entry that does - confirm the labels are
# not swapped.
models = {
    "coles_new": {
        "encoder_cfg_path": "config/backbone/encoder/coles_churn_100.yaml",
        "weights_path": "saved_models/coles_churn_100.pth",
    },
    "coles_old": {
        "encoder_cfg_path": "config/backbone/encoder/coles_churn.yaml",
        "weights_path": "coles_best_state_dict.pth",
    },
    "nlp_new": {
        "encoder_cfg_path": "config/backbone/encoder/ae_nlp.yaml",
        "weights_path": "saved_models/ae_nlp.pth",
    },
    "nlp_old": {
        "encoder_cfg_path": "config/backbone/encoder/ae_nlp.yaml",
        "weights_path": "saved_models/ae_nlp_churn_100.pth",
    },
}

In [96]:
# result_dict[perturbation name][model name] -> list with one value per part
# size: the mean cosine similarity between the embeddings of the original and
# the perturbed sequences, restricted to the rows selected by that size's mask.
result_dict = {}

for data_name, datas in data_dict.items():
    result_dict[data_name] = {}
    for model_name, model_args in models.items():
        # `predict` expects the list of datasets as ONE argument. Unpacking it
        # with `*datas` spreads the datasets over the positional parameters
        # (batch_size, encoder_cfg_path, ...) and raises TypeError.
        preds = predict(
            datas,
            **model_args,
            batch_size=32
        )

        # Index 0 is the unperturbed validation set; the remaining entries
        # line up with `part_sizes` / `part_sizes_masks`.
        preds_orig = preds[0]
        results = []
        for part_sizes_mask, preds_rand in zip(part_sizes_masks, preds[1:]):
            orig_masked = preds_orig[part_sizes_mask]
            rand_masked = preds_rand[part_sizes_mask]

            # An all-False mask leaves nothing to average, so the mean is NaN.
            results.append(torch.cosine_similarity(orig_masked, rand_masked, dim=-1).mean().item())

        result_dict[data_name][model_name] = results

        # Release the embeddings before the next model is loaded.
        del preds

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 0it [00:00, ?it/s]

In [97]:
# NeighborSwap results: rows are part sizes, columns are model labels, values are
# the mean cosine similarity between original and perturbed embeddings.
# NOTE(review): NaN rows presumably mean no validation sequence is longer than
# 2 * part_size, so the mask selects nothing - confirm.
pd.DataFrame(result_dict["swap"], index=part_sizes)

Unnamed: 0,coles_new,coles_old,nlp_new,nlp_old
1,0.999099,0.999454,0.989854,0.987001
7,0.998969,0.999601,0.97942,0.974817
14,0.999061,0.999668,0.9844,0.980893
31,0.998146,0.999692,0.986323,0.984771
90,1.0,1.0,1.0,1.0
180,,,,
365,,,,


In [98]:
# Shuffle results: rows are part sizes, columns are model labels, values are
# the mean cosine similarity between original and perturbed embeddings.
# NOTE(review): NaN rows presumably mean no validation sequence is longer than
# 2 * part_size, so the mask selects nothing - confirm.
pd.DataFrame(result_dict["shuf"], index=part_sizes)

Unnamed: 0,coles_new,coles_old,nlp_new,nlp_old
1,0.988675,0.995116,0.909995,0.898983
7,0.991332,0.997718,0.918375,0.908441
14,0.99178,0.99808,0.927512,0.92064
31,0.992884,0.998776,0.946774,0.941986
90,0.988638,0.997104,0.961812,0.964229
180,,,,
365,,,,
