In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3, 4, 5, 6, 7"

import pandas as pd
import numpy as np
import torch
from functools import partial
import pytorch_lightning as pl
import warnings
warnings.filterwarnings("ignore")

from torch.utils.data import DataLoader

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

from tqdm.auto import tqdm
import lightgbm as ltb

from datetime import datetime

pd.set_option('display.expand_frame_repr', False)

  from .autonotebook import tqdm as notebook_tqdm


# Downstream with all modales

Использование эмбеддингов для даунстрим задачи. Для всех таргетов одни и те же параметры бустинга для простоты

In [31]:
train_geo_trx = pd.read_parquet("geo_trx_train.parquet")
not_only_trx_geo = pd.read_parquet("geo_trx_not_only_trx.parquet")

train_dial = pd.read_parquet("dial_features_train.parquet")
test_dial = pd.read_parquet("dial_features_test.parquet")

In [38]:
df = train_geo_trx.merge(train_dial, on=['client_id'], how='outer')
df = df.fillna(0)

In [42]:
df_test = not_only_trx_geo.merge(test_dial, on='client_id', how='left')
df_test = df_test.fillna(0)
df_test.shape

In [45]:
df_test.head()

Unnamed: 0,client_id,emb_0000_x,emb_0001_x,emb_0002_x,emb_0003_x,emb_0004_x,emb_0005_x,emb_0006_x,emb_0007_x,emb_0008_x,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,03478d5f75a2b651bfd3ae66836b0a54313d1cea05d75e...,0.987249,-0.958141,-0.964381,0.778118,0.615004,-0.996484,0.973766,0.999945,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4bf106c9764392df0850cd907daa93e97dad7df8b35cb9...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4c9f58011f50bef4ea99b4f22f5a3264ed1cfb60d23b9f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,51d17a1af833d5640f5402d450bdf16dea81329a73648d...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52c6fd670cfd93f9075fbdd580d3d4819afa2661a39253...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
class Downstream:
    def __init__(
        self,
        train_path,
        test_path,
        params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        self.params = params
        self.result_path = result_path
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):

#         train_embeddings = pd.read_parquet(self.train_path)
        train_embeddings = df.copy()
        X_train = train_embeddings.drop(columns=self.drop_feat)

        clfs = dict()
        for col_target in self.all_targets:
            clf = ltb.LGBMClassifier(**self.params)
            y_train = train_embeddings[col_target]
            clf.fit(X_train, y_train)
            print(f'Model fitted, target: {col_target}')
            clfs[col_target] = clf

        return clfs

    def get_scores(
        self,
        clfs
    ):
        scores = pd.DataFrame([])

#         test_embeddings_curr = pd.read_parquet(self.test_path).drop_duplicates('client_id')
        test_embeddings_curr = df_test.copy()
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        ids = test_embeddings_curr[self.col_id]
        scores[self.col_id] = ids

        for col_target in self.all_targets:
            clf = clfs[col_target]
            score = clf.predict_proba(X_test)[:, 1]
            scores[col_target] = score

        return scores

    def run(self):
        clfs = self.fit()
        scores = self.get_scores(clfs)

        scores.to_csv(self.result_path)

        return scores

In [48]:
params = {
      "n_estimators": 500,
      "boosting_type": "gbdt",
      "objective": "binary",
      "subsample": 0.5,
      "subsample_freq": 1,
      "learning_rate": 0.02,
      "feature_fraction": 0.75,
      "max_depth": 6,
      "lambda_l1": 1,
      "lambda_l2": 1,
      "min_data_in_leaf": 50,
      "random_state": 42,
      "n_jobs": 8,
#       "device": "gpu",
}

dw = Downstream(
    train_path="train.parquet",
    test_path="not_only_trx.parquet",
    params=params,
    result_path='submission_coles_trx_geo_dialogs.csv'
)

scores = dw.run()
scores

Model fitted, target: target_1
Model fitted, target: target_2
Model fitted, target: target_3
Model fitted, target: target_4


Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,03478d5f75a2b651bfd3ae66836b0a54313d1cea05d75e...,0.004371,0.001967,0.002457,0.003013
1,4bf106c9764392df0850cd907daa93e97dad7df8b35cb9...,0.000138,0.000020,0.000067,0.000057
2,4c9f58011f50bef4ea99b4f22f5a3264ed1cfb60d23b9f...,0.000138,0.000020,0.000067,0.000057
3,51d17a1af833d5640f5402d450bdf16dea81329a73648d...,0.000138,0.000020,0.000067,0.000057
4,52c6fd670cfd93f9075fbdd580d3d4819afa2661a39253...,0.000138,0.000020,0.000067,0.000057
...,...,...,...,...,...
140483,c6041ce381f3df521d1dae3350ccf9b7a5c295270aaa65...,0.004238,0.000418,0.002639,0.001482
140484,c4995db29ee447c347d7b92619350762b26c93500b90ce...,0.005018,0.000465,0.008929,0.008369
140485,c45249a15c44bde22eec62b6881983769cd86bc6958cac...,0.004689,0.000400,0.004132,0.002855
140486,cc92973ca2f42eab12d0af7bc64a5489691af4a135f42f...,0.001998,0.001347,0.002099,0.000998


In [49]:
sub = pd.read_csv('submission_coles_trx_geo_dialogs.csv', index_col='Unnamed: 0')

## Add target features (Попытка)

In [55]:
df[['client', 'month']] = df['client_id'].str.split('_month=', expand = True)
df['client_id'] = df['client'] + '_month=' + df['month'].astype(str)
df['month'] = pd.to_numeric(df['month'])

In [57]:
df.sort_values(by=['client', 'month'], inplace=True)

df['any_target'] = df[['target_1', 'target_2', 'target_3', 'target_4']].any(axis=1).astype(int)

for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    df[f'{target}_count'] = df.groupby('client')[target].cumsum() - df[target]

100%|██████████| 5/5 [00:01<00:00,  3.36it/s]


In [58]:
last_seen = df[['client', 'month']].copy()
for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    last_seen[target] = df.groupby('client')[target].transform(lambda x: x.where(x == 1).ffill().shift().fillna(0))

100%|██████████| 5/5 [06:35<00:00, 79.01s/it]


In [59]:
for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    df[f'last_{target}_month'] = last_seen.groupby('client')['month'].transform(lambda x: x.where(df[target] == 1).ffill().shift().fillna(0))

100%|██████████| 5/5 [12:41<00:00, 152.32s/it]


In [60]:
for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    df[f'{target}_months_ago'] = df['month'] - df[f'last_{target}_month']

df.drop(columns=['last_target_1_month', 'last_target_2_month', 'last_target_3_month', 'last_target_4_month', 'last_any_target_month', 'client', 'month'], inplace=True)

100%|██████████| 5/5 [00:00<00:00, 103.83it/s]


In [61]:
df.to_parquet("train_trx_geo_dial_target.parquet", index=False, engine="pyarrow", compression="snappy")

#### test

In [65]:
test_target_b = pd.read_parquet("test_target.parquet")

In [66]:
test_target_b

Unnamed: 0,mon,target_1,target_2,target_3,target_4,client_id
0,2022-05-31,0,0,0,0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...
1,2022-05-31,0,0,0,0,0433d23e224b7a520656da6181efadb8d556bb293158c9...
2,2022-04-30,0,0,0,0,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...
3,2022-10-31,0,0,0,0,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...
4,2022-10-31,0,0,0,0,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...
...,...,...,...,...,...,...
1439357,2022-10-31,0,0,0,0,10720d45fe5c441e85eb7ef5271e620be56edb2de87dc2...
1439358,2022-07-31,0,0,0,0,c879fae5376c00b5d56098cfe450c755330e44a351eaa4...
1439359,2022-11-30,0,0,0,0,771c72f26c0036a4fa6d5e965628e32efa94ed141033db...
1439360,2022-10-31,0,0,0,0,c3f8a8f4ff091e711ac102c77f46ec309b5ec27b3a0ed3...


In [67]:
test_target_b['month'] = pd.to_numeric(pd.to_datetime(test_target_b['mon']).dt.month)

In [68]:
test_target_b['month'].value_counts()

5     140488
4     140488
10    140488
8     140488
6     140488
2     140488
3     140488
9     140488
7     140488
11     94402
12     48877
Name: month, dtype: int64

In [69]:
test_target_b.sort_values(by=['client_id', 'month'], inplace=True)

test_target_b['any_target'] = test_target_b[['target_1', 'target_2', 'target_3', 'target_4']].any(axis=1).astype(int)

for target in ['target_1', 'target_2', 'target_3', 'target_4', 'any_target']:
    test_target_b[f'{target}_count'] = test_target_b.groupby('client_id')[target].cumsum() - test_target_b[target]

In [71]:
last_seen = test_target_b[['client_id', 'month']].copy()
for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    last_seen[target] = test_target_b.groupby('client_id')[target].transform(lambda x: x.where(x == 1).ffill().shift().fillna(0))

100%|██████████| 5/5 [07:02<00:00, 84.47s/it]


In [72]:
for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    test_target_b[f'last_{target}_month'] = last_seen.groupby('client_id')['month'].transform(lambda x: x.where(df[target] == 1).ffill().shift().fillna(0))

100%|██████████| 5/5 [11:45<00:00, 141.17s/it]


In [73]:
for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    test_target_b[f'{target}_months_ago'] = test_target_b['month'] - test_target_b[f'last_{target}_month']

test_target_b.drop(columns=['last_target_1_month', 'last_target_2_month', 'last_target_3_month', 'last_target_4_month', 'last_any_target_month'], inplace=True)

100%|██████████| 5/5 [00:00<00:00, 243.86it/s]


In [74]:
test_target_b = test_target_b.sort_values(by = 'month', ascending = False)

In [75]:
test_target_b = test_target_b.drop_duplicates(subset = 'client_id', keep = 'first')

In [83]:
df[['client_id', 'any_target', 'target_1_count',
       'target_2_count', 'target_3_count', 'target_4_count',
       'any_target_count', 'target_1_months_ago', 'target_2_months_ago',
       'target_3_months_ago', 'target_4_months_ago', 'any_target_months_ago']].to_parquet('train_target_feats.parquet', index=False)

In [84]:
test_target_b[['client_id', 'any_target', 'target_1_count',
       'target_2_count', 'target_3_count', 'target_4_count',
       'any_target_count', 'target_1_months_ago', 'target_2_months_ago',
       'target_3_months_ago', 'target_4_months_ago', 'any_target_months_ago']].to_parquet('test_target_feats.parquet', index=False)

## Downstream with target features

In [86]:
test_target_feats = pd.read_parquet('test_target_feats.parquet')
df_test_f = df_test.merge(test_target_feats, on='client_id')

In [93]:
class Downstream:
    def __init__(
        self,
        train_path,
        test_path,
        params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        self.params = params
        self.result_path = result_path
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):

#         train_embeddings = pd.read_parquet(self.train_path)
        train_embeddings = df.copy()
        X_train = train_embeddings.drop(columns=self.drop_feat)

        clfs = dict()
        for col_target in self.all_targets:
            clf = ltb.LGBMClassifier(**self.params)
            y_train = train_embeddings[col_target]
            clf.fit(X_train, y_train)
            print(f'Model fitted, target: {col_target}')
            clfs[col_target] = clf

        return clfs

    def get_scores(
        self,
        clfs
    ):
        scores = pd.DataFrame([])

#         test_embeddings_curr = pd.read_parquet(self.test_path).drop_duplicates('client_id')
        test_embeddings_curr = df_test_f.copy()
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        ids = test_embeddings_curr[self.col_id]
        scores[self.col_id] = ids

        for col_target in self.all_targets:
            clf = clfs[col_target]
            score = clf.predict_proba(X_test)[:, 1]
            scores[col_target] = score

        return scores

    def run(self):
        clfs = self.fit()
        scores = self.get_scores(clfs)

        scores.to_csv(self.result_path)

        return scores

In [94]:
params = {
      "n_estimators": 500,
      "boosting_type": "gbdt",
      "objective": "binary",
      "subsample": 0.5,
      "subsample_freq": 1,
      "learning_rate": 0.02,
      "feature_fraction": 0.75,
      "max_depth": 6,
      "lambda_l1": 1,
      "lambda_l2": 1,
      "min_data_in_leaf": 50,
      "random_state": 42,
      "n_jobs": 8,
#       "device": "gpu",
}

dw = Downstream(
    train_path="train.parquet",
    test_path="not_only_trx.parquet",
    params=params,
    result_path='submission_coles_trx_geo_dialogs_targets.csv'
)

scores = dw.run()
scores

Model fitted, target: target_1
Model fitted, target: target_2
Model fitted, target: target_3
Model fitted, target: target_4


Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,03478d5f75a2b651bfd3ae66836b0a54313d1cea05d75e...,0.000011,0.000011,0.000010,0.000012
1,4bf106c9764392df0850cd907daa93e97dad7df8b35cb9...,0.000007,0.000005,0.000007,0.000007
2,4c9f58011f50bef4ea99b4f22f5a3264ed1cfb60d23b9f...,0.000007,0.000005,0.000007,0.000007
3,51d17a1af833d5640f5402d450bdf16dea81329a73648d...,0.000007,0.000005,0.000006,0.000007
4,52c6fd670cfd93f9075fbdd580d3d4819afa2661a39253...,0.000007,0.000005,0.000007,0.000007
...,...,...,...,...,...
140483,c6041ce381f3df521d1dae3350ccf9b7a5c295270aaa65...,0.000012,0.000006,0.000011,0.000011
140484,c4995db29ee447c347d7b92619350762b26c93500b90ce...,0.000012,0.000005,0.000012,0.000017
140485,c45249a15c44bde22eec62b6881983769cd86bc6958cac...,0.000013,0.000007,0.000010,0.000011
140486,cc92973ca2f42eab12d0af7bc64a5489691af4a135f42f...,0.000010,0.000015,0.000012,0.000008


In [95]:
sub = pd.read_csv('submission_coles_trx_geo_dialogs_targets.csv', index_col='Unnamed: 0')

In [96]:
sub

Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,03478d5f75a2b651bfd3ae66836b0a54313d1cea05d75e...,0.000011,0.000011,0.000010,0.000012
1,4bf106c9764392df0850cd907daa93e97dad7df8b35cb9...,0.000007,0.000005,0.000007,0.000007
2,4c9f58011f50bef4ea99b4f22f5a3264ed1cfb60d23b9f...,0.000007,0.000005,0.000007,0.000007
3,51d17a1af833d5640f5402d450bdf16dea81329a73648d...,0.000007,0.000005,0.000006,0.000007
4,52c6fd670cfd93f9075fbdd580d3d4819afa2661a39253...,0.000007,0.000005,0.000007,0.000007
...,...,...,...,...,...
140483,c6041ce381f3df521d1dae3350ccf9b7a5c295270aaa65...,0.000012,0.000006,0.000011,0.000011
140484,c4995db29ee447c347d7b92619350762b26c93500b90ce...,0.000012,0.000005,0.000012,0.000017
140485,c45249a15c44bde22eec62b6881983769cd86bc6958cac...,0.000013,0.000007,0.000010,0.000011
140486,cc92973ca2f42eab12d0af7bc64a5489691af4a135f42f...,0.000010,0.000015,0.000012,0.000008


In [97]:
sub = sub.drop(columns='client_id')

In [98]:
sub.to_csv('submission_coles_trx_geo_dialogs_targets.csv', index=False)