## **Загружаем данные, если работаете удалённо**

In [1]:
USING_COLAB = True

In [None]:
if USING_COLAB:
  %pip install -q pytorch-lifestream

  # loading data
  !gdown 1_RQxjVFlve12NMTSyvEWlL6m9eylHmxU
  !gdown 12NVI9hbSnFjfiT27d-FkaeLmkz3WgvwI
  !gdown 1jkpplWIKV6IS7AHBPEPcgmiQC8T6RuuJ
  !gdown 1V9X-iDGABK0njxTm6nVJPDH-cquHqJ8s
  !gdown 1fXx465_ICgmZ1-9_Sl45w7GHLHzndyRo

### Импорты

In [None]:
# Other tools
import pickle
import random
import os

from collections import Counter
from tqdm import tqdm
from copy import deepcopy

# Train
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Visual
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Preprocessing
import pandas as pd
import numpy as np

from datetime import datetime

# torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# PTLS
import pytorch_lightning as pl

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from sklearn.model_selection import train_test_split

def gini(y_true, y_pred):
    return 2 * roc_auc_score(y_true, y_pred) - 1

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Session settings
pd.set_option('display.max_rows', 15)

plt.rcParams['font.weight'] = 'semibold'
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['font.size'] = 18
plt.rcParams['savefig.format'] = 'pdf'

# make results reproducable
def set_seed(seed=42):
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(SEED)

SEED = 69
set_seed(SEED)

## **Стандратный препроцессинг **

In [None]:
def preprocess_data(
    train,
    test,
    scale=True,
    init_features2drop=None,
    cat_features=[]
  ):

    constant_features_names = ['agg_BoardOfDirectors__g_contractor__Name__count__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__IndexOfDueDiligence__mean__ALL_TIME', 'agg_spark_extended_report__g_contractor__CreditLimitSum__last__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__Overall__mean__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__PaymentIndex__mean__ALL_TIME', 'agg_spark_extended_report__g_contractor__CompanySizeRevenue__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgeeActiveCount__last__ALL_TIME', 'contract_date', 'contract_init_sum', 'agg_spark_extended_report__g_contractor__PledgerActiveCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgeeCeasedCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgerCeasedCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedNetLiabilitiesSum__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedClaimsSum__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__FailureScore__mean__ALL_TIME']


    # for now lets drop columns where num of nans is greater that  a half of df length
    isna_stat = train.isna().sum()[train.isna().sum() > 0]
    features2drop = [col for col in isna_stat.index if isna_stat[col] > (train.shape[0] * (3/4))]
    train.drop(columns=features2drop, inplace=True)
    test.drop(columns=features2drop, inplace=True)

    # other we`ll fill with median
    for col in isna_stat.index:
        if col not in features2drop:
            train_imputer = SimpleImputer(strategy="median").fit(train[col].values.reshape(-1, 1))
            train[col] = train_imputer.transform(train[col].values.reshape(-1, 1))
            test[col] = train_imputer.transform(test[col].values.reshape(-1, 1))

    #translate to timestamp
    dates_columns = train.select_dtypes('object').columns

    new_dates_cols = []

    for date_col in dates_columns:
        train[f"{date_col}_day"] = train[date_col].apply(lambda x: datetime.fromisoformat(x).day)
        train[f"{date_col}_month"] = train[date_col].apply(lambda x: datetime.fromisoformat(x).month)

        test[f"{date_col}_day"] = test[date_col].apply(lambda x: datetime.fromisoformat(x).day)
        test[f"{date_col}_month"] = test[date_col].apply(lambda x: datetime.fromisoformat(x).month)

        train[date_col] = train[date_col].apply(lambda x: datetime.fromisoformat(x).timestamp())
        test[date_col] = test[date_col].apply(lambda x: datetime.fromisoformat(x).timestamp())

        new_dates_cols.extend([f"{date_col}_day", f"{date_col}_month"])

    if init_features2drop is not None:
      train = train.drop(columns=init_features2drop)
      test = test.drop(columns=init_features2drop)

    return train, test, constant_features_names

## **Загружаем и препроцессим дату**

In [None]:
if not USING_COLAB:
  root_data_path = "data"
else:
  root_data_path = "/content"

X_train = pd.read_csv(f"{root_data_path}/train_X.csv")
y_train_all = pd.read_csv(f"{root_data_path}/train_y.csv")

train = X_train.merge(y_train_all, on=["contract_id", "report_date"], how="left")
test = pd.read_csv(f"{root_data_path}/test2_X.csv")

y_train_all = train["default6"]
train_orig = deepcopy(train)

In [None]:
for_drop = ["project_id", "building_id", "contractor_id", "specialization_id"]
cat_features = [ "report_date_day", "report_date_month", "contract_date_day", "contract_date_month"]

train, test, constant_features_names = preprocess_data(
    train.copy(),
    test.copy(),
    scale=False,
    init_features2drop=for_drop,
)

In [None]:
train = train.drop(columns=["contract_date", "default6", "contract_date_month", "contract_date_day", "report_date_month"])
test = test.drop(columns=["contract_date", "contract_date_month", "contract_date_day", "report_date_month"])

cat_features = ['report_date_day']

### Составляем датасет

In [None]:
preprocessor = PandasDataPreprocessor(
    col_id="contract_id",
    col_event_time="report_date",
    event_time_transformation="none",
    cols_category=cat_features,
    cols_numerical=list(train.columns[2:-2]),
    return_records=True,
)

dataset = preprocessor.fit_transform(train)
dataset = sorted(dataset, key=lambda x: x["contract_id"])

train_split, val_split = train_test_split(dataset, test_size=0.2, random_state=SEED)

dataset_test = preprocessor.fit_transform(test)
dataset_test = sorted(dataset_test, key=lambda x: x["contract_id"])

### Создаём модельку на основе кастомной модельки CoLES от PTLS

In [None]:
trx_encoder_params = dict(
    embeddings_noise=0.000,
    # numeric_values={"amount_rur": "identity"},
    embeddings={
        "report_date_day": {"in": 31, "out": 1},
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=64,
    type="gru",
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.0001),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9
    ),
)

## Даталоадер

In [None]:
train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=train_split,
            i_filters=[SeqLenFilter(min_seq_len=1)],
        ),
        splitter=SampleSlices(
            split_count=1,
            cnt_min=1,
            cnt_max=30,
        ),
    ),
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=val_split,
            i_filters=[SeqLenFilter(min_seq_len=1)],
        ),
        splitter=SampleSlices(
            split_count=1,
            cnt_min=1,
            cnt_max=30,
        ),
    ),
    train_num_workers=4,
    train_batch_size=32,

)

### Запускаем обучение

In [None]:
trainer = pl.Trainer(
    max_epochs=15,
    accelerator="cuda" if torch.cuda.is_available() else "cpu",
    enable_progress_bar=True,
    log_every_n_steps=3
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(
    model=model,
    train_dataloaders=train_dl,
    # val_dataloaders=val_dl
)
print(trainer.logged_metrics)
torch.save(seq_encoder.state_dict(), "coles-emb.pt")

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params | Mode 
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0      | train
1 | _seq_encoder       | RnnSeqEncoder   | 13.0 K | train
2 | _validation_metric | BatchRecallTopK | 0      | train
3 | _head              | Head            | 0      | train
---------------------------------------------------------------
13.0 K    Trainable params
0         Non-trainable params
13.0 K    Total params
0.052     Total estimated model params size (MB)
14        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=15` reached.


{'loss': tensor(0.), 'seq_len': tensor(4.5000), 'valid/recall_top_k': tensor(0.0346)}


### Получаем эмбеддинги

In [None]:
train_dl = inference_data_loader(dataset, num_workers=0, batch_size=64)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

test_dl = inference_data_loader(dataset_test, num_workers=0, batch_size=64)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
train_embeddings = pd.DataFrame(train_embeds.numpy(), columns=[f"emb_{i}" for i in range(train_embeds.shape[1])])
test_embeddings = pd.DataFrame(test_embeds.numpy(), columns=[f"emb_{i}" for i in range(test_embeds.shape[1])])
train_embeddings['contract_id'] = [x['contract_id'] for x in dataset]
test_embeddings['contract_id'] = [x['contract_id'] for x in dataset_test]

In [None]:
train_embeddings.to_parquet('train_embs.parquet', index=False)
test_embeddings.to_parquet('test_embs.parquet', index=False)