In [1]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from sklearn.model_selection import train_test_split
from pathlib import Path

from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset
from sklearn.preprocessing import LabelEncoder
from pytorch_lightning import loggers as pl_loggers

import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head

from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule

from ptls.data_load.utils import collate_feature_dict
import random
from ptls.frames.inference_module import InferenceModule

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
base_path = Path('/home/COLES/poison/data')

In [5]:
class LSTMclassifier(torch.nn.Module):
    def __init__(self,
                input_size: int = None,
                hidden_size: int = None,
                num_layers: int = 1,
                bias: bool = True,
                batch_first: bool = True,
                dropout: float = 0,
                bidirectional: bool = False,
                # proj_size: int = 0,
                num_classes: int = 1,
                logsoftmax: bool = True):

        super().__init__()

        self.lstm = torch.nn.LSTM(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=num_layers,
                    bias=bias,
                    batch_first=batch_first,
                    dropout=dropout,
                    bidirectional=bidirectional,
        )

        d = 2 if bidirectional else 1

        if logsoftmax:
            activation = torch.nn.LogSoftmax(dim=-1)
        else:
            activation = torch.nn.Softmax(dim=-1)

        self.linear = torch.nn.Sequential(
            torch.nn.Linear(d * num_layers * hidden_size, num_classes),
            activation
        )


    def forward(self, input):
        output, (h_n, c_n) = self.lstm(input.payload)
        # output, h_n = self.lstm(input.payload)
        batch_size = h_n.shape[-2]
        h_n = h_n.view(batch_size, -1)
        return self.linear(h_n)

In [4]:
%reload_ext tensorboard

In [5]:
%tensorboard --logdir .

Launching TensorBoard...

# AGE dataset

### Load data

In [4]:
df_target = pd.read_csv(base_path / 'age' / 'train_target.csv')

df_target_train, df_target_test = train_test_split(
    df_target, test_size=7000, stratify=df_target['bins'], random_state=142)
df_target_train, df_target_valid = train_test_split(
    df_target_train, test_size=3000, stratify=df_target_train['bins'], random_state=142)
print('Split {} records to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_target, df_target_train, df_target_valid, df_target_test]]))

Split 30000 records to train: 20000, valid: 3000, test: 7000


In [5]:
df_trx = pd.read_csv(base_path / 'age' / 'transactions_train.csv')
df_trx.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [6]:
df_trx_train = pd.merge(df_trx, df_target_train['client_id'], on='client_id', how='inner')
df_trx_valid = pd.merge(df_trx, df_target_valid['client_id'], on='client_id', how='inner')
df_trx_test = pd.merge(df_trx, df_target_test['client_id'], on='client_id', how='inner')
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_trx, df_trx_train, df_trx_valid, df_trx_test]]))


Split 26450577 transactions to train: 17622321, valid: 2634248, test: 6194008


In [7]:
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    return_records=False,
)

In [8]:
%%time
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid = preprocessor.transform(df_trx_valid)
df_data_test = preprocessor.transform(df_trx_test)

CPU times: user 46 s, sys: 8.63 s, total: 54.6 s
Wall time: 54.9 s


In [9]:
df_data_train.head()

Unnamed: 0,client_id,trans_date,event_time,small_group,amount_rur
0,6,"[tensor(0), tensor(5), tensor(10), tensor(11),...","[tensor(0), tensor(5), tensor(10), tensor(11),...","[tensor(4), tensor(3), tensor(1), tensor(3), t...","[tensor(4.0540, dtype=torch.float64), tensor(1..."
1,7,"[tensor(1), tensor(2), tensor(12), tensor(13),...","[tensor(1), tensor(2), tensor(12), tensor(13),...","[tensor(3), tensor(53), tensor(1), tensor(5), ...","[tensor(18.3190, dtype=torch.float64), tensor(..."
2,12,"[tensor(3), tensor(6), tensor(6), tensor(6), t...","[tensor(3), tensor(6), tensor(6), tensor(6), t...","[tensor(1), tensor(19), tensor(13), tensor(6),...","[tensor(3.0220, dtype=torch.float64), tensor(2..."
3,13,"[tensor(0), tensor(3), tensor(8), tensor(8), t...","[tensor(0), tensor(3), tensor(8), tensor(8), t...","[tensor(5), tensor(3), tensor(11), tensor(10),...","[tensor(47.6730, dtype=torch.float64), tensor(..."
4,14,"[tensor(0), tensor(0), tensor(1), tensor(1), t...","[tensor(0), tensor(0), tensor(1), tensor(1), t...","[tensor(1), tensor(2), tensor(23), tensor(1), ...","[tensor(21.9360, dtype=torch.float64), tensor(..."


In [10]:
df_target = df_target.rename(columns={'bins': 'target_bin'})

In [11]:
df_data_train = pd.merge(df_data_train, df_target, on='client_id')
df_data_valid = pd.merge(df_data_valid, df_target, on='client_id')
df_data_test = pd.merge(df_data_test, df_target, on='client_id')

In [12]:
df_data_train.head()

Unnamed: 0,client_id,trans_date,event_time,small_group,amount_rur,target_bin
0,6,"[tensor(0), tensor(5), tensor(10), tensor(11),...","[tensor(0), tensor(5), tensor(10), tensor(11),...","[tensor(4), tensor(3), tensor(1), tensor(3), t...","[tensor(4.0540, dtype=torch.float64), tensor(1...",1
1,7,"[tensor(1), tensor(2), tensor(12), tensor(13),...","[tensor(1), tensor(2), tensor(12), tensor(13),...","[tensor(3), tensor(53), tensor(1), tensor(5), ...","[tensor(18.3190, dtype=torch.float64), tensor(...",0
2,12,"[tensor(3), tensor(6), tensor(6), tensor(6), t...","[tensor(3), tensor(6), tensor(6), tensor(6), t...","[tensor(1), tensor(19), tensor(13), tensor(6),...","[tensor(3.0220, dtype=torch.float64), tensor(2...",2
3,13,"[tensor(0), tensor(3), tensor(8), tensor(8), t...","[tensor(0), tensor(3), tensor(8), tensor(8), t...","[tensor(5), tensor(3), tensor(11), tensor(10),...","[tensor(47.6730, dtype=torch.float64), tensor(...",2
4,14,"[tensor(0), tensor(0), tensor(1), tensor(1), t...","[tensor(0), tensor(0), tensor(1), tensor(1), t...","[tensor(1), tensor(2), tensor(23), tensor(1), ...","[tensor(21.9360, dtype=torch.float64), tensor(...",0


In [13]:
df_data_train = df_data_train.to_dict(orient='records')
df_data_valid = df_data_valid.to_dict(orient='records')
df_data_test = df_data_test.to_dict(orient='records')

In [14]:
rec = df_data_train[0]
{k: v[:10] if type(v) is torch.Tensor else v for k, v in rec.items()}

{'client_id': 6,
 'trans_date': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'event_time': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 18,  2]),
 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
         12.9380, 28.1620], dtype=torch.float64),
 'target_bin': 1}

In [15]:
dataset_train = MemoryMapDataset(df_data_train)
dataset_valid = MemoryMapDataset(df_data_valid)
dataset_test = MemoryMapDataset(df_data_test)

In [16]:
len(dataset_train)

20000

### Model

In [149]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            'small_group': {'in': 150, 'out': 32},
        },
        numeric_values={
            'amount_rur': 'log',
        },
        embeddings_noise=0.001,
    ),
    hidden_size=64,
    is_reduce_sequence=False
)

In [150]:
sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='target_bin', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='target_bin', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='target_bin', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

In [151]:
lstm = LSTMclassifier(input_size=64, hidden_size=48, num_classes=4)

In [152]:
sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=lstm,
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(task='multiclass', num_classes=4),
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=20, gamma=0.5),
)

### Training

In [21]:
trainer = pl.Trainer(
    max_epochs=40,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(sup_module, sup_data)

logger.version = 17


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type           | Params
-------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder  | 23.9 K
1 | head          | LSTMclassifier | 22.1 K
2 | loss          | NLLLoss        | 0     
3 | train_metrics | ModuleDict     | 0     
4 | valid_metrics | ModuleDict     | 0     
5 | test_metrics  | ModuleDict     | 0     
-------------------------------------------------
46.0 K    Trainable params
0         Non-trainable params
46.0 K    Total params
0.184     Total estimated model params size (MB)


Epoch 39: 100%|██████████| 160/160 [00:36<00:00,  4.40it/s, loss=0.777, v_num=17, seq_len=868.0, y=1.660, val_Accuracy=0.624]
CPU times: user 4min 54s, sys: 1min 43s, total: 6min 37s
Wall time: 22min 56s


In [34]:
torch.save(sup_module.state_dict(), 'saves/age_lstm.pth')

In [34]:
sup_module.load_state_dict(torch.load('saves/age_lstm.pth'))

<All keys matched successfully>

#### Evaluation

In [23]:
inference_dl = torch.utils.data.DataLoader(
    dataset=dataset_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=1000,
    num_workers=4,
)

In [24]:
inf_module = InferenceModule(
    sup_module,
    model_out_name='log_prob',
)

In [25]:
df_predict = trainer.predict(inf_module, inference_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 157it [00:00, ?it/s]

Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, -245.40it/s]  


In [26]:
df_predict = pd.concat(df_predict, axis=0)

In [27]:
df_predict.head(5)

Unnamed: 0,client_id,target_bin,log_prob_0000,log_prob_0001,log_prob_0002,log_prob_0003
0,10,3,-1.23697,-1.41279,-1.835638,-1.18168
1,11,3,-1.692923,-1.008816,-2.782853,-0.942874
2,18,1,-1.551351,-0.829859,-3.89288,-1.104003
3,33,0,-0.679796,-5.210264,-0.854611,-2.774658
4,34,2,-1.169414,-6.041386,-0.39039,-4.577525


In [28]:
df_predict[['log_prob_0000', 'log_prob_0001', 'log_prob_0002', 'log_prob_0003']] = df_predict[['log_prob_0000', 'log_prob_0001', 'log_prob_0002', 'log_prob_0003']].apply(np.exp, axis=0)

In [29]:
df_predict

Unnamed: 0,client_id,target_bin,log_prob_0000,log_prob_0001,log_prob_0002,log_prob_0003
0,10,3,0.290262,0.243463,0.159512,0.306763
1,11,3,0.183981,0.364650,0.061862,0.389507
2,18,1,0.211961,0.436111,0.020387,0.331541
3,33,0,0.506720,0.005460,0.425449,0.062371
4,34,2,0.310549,0.002378,0.676793,0.010280
...,...,...,...,...,...,...
995,49974,3,0.172752,0.407057,0.009316,0.410875
996,49981,0,0.217746,0.227161,0.007462,0.547631
997,49986,1,0.120932,0.404314,0.002992,0.471762
998,49990,2,0.046789,0.000688,0.950808,0.001715


In [30]:
y_pred = df_predict[[f'log_prob_{i:04d}' for i in range(4)]].values.argmax(axis=1)
y_true = df_predict['target_bin'].values

In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve

In [32]:
accuracy_score(y_true, y_pred)

0.6131428571428571

In [33]:
f1_score(y_true, y_pred, average='macro')

0.6103077732319542

# CHURN dataset

### Data

In [4]:
df = pd.read_csv(base_path / 'churn' / 'train.csv')

In [5]:
df.head()

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.0,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.0,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.0,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.0,C2C_OUT,0,0.0


In [6]:
enc = LabelEncoder()
df['MCC'] = enc.fit_transform(df['MCC'])

In [7]:
df['MCC'].nunique()

344

In [8]:
df['TRDATETIME'] = pd.to_datetime(df['TRDATETIME'], format='%d%b%y:%H:%M:%S')

In [9]:
df = df[df['currency'] == 810]

In [10]:
df_target = df[['cl_id', 'target_flag']].drop_duplicates()
df_target.reset_index(inplace=True, drop=True)

In [11]:
df_target['target_flag'].value_counts()

1    2760
0    2201
Name: target_flag, dtype: int64

In [12]:
df_target_train, df_target_test = train_test_split(
    df_target, test_size=1000, stratify=df_target['target_flag'], random_state=142)
df_target_train, df_target_valid = train_test_split(
    df_target_train, test_size=500, stratify=df_target_train['target_flag'], random_state=142)
print('Split {} records to train: {}, valid: {}, test: {}'.format(
    *[len(df_) for df_ in [df_target, df_target_train, df_target_valid, df_target_test]]))

Split 4961 records to train: 3461, valid: 500, test: 1000


In [13]:
df_trx = df[['cl_id', 'MCC', 'TRDATETIME', 'amount']]

In [14]:
df_trx_train = pd.merge(df_trx, df_target_train['cl_id'], on='cl_id', how='inner')
df_trx_valid = pd.merge(df_trx, df_target_valid['cl_id'], on='cl_id', how='inner')
df_trx_test = pd.merge(df_trx, df_target_test['cl_id'], on='cl_id', how='inner')
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(
    *[len(df_) for df_ in [df_trx, df_trx_train, df_trx_valid, df_trx_test]]))

Split 477644 transactions to train: 336573, valid: 49767, test: 91304


In [15]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='cl_id',
    col_event_time='TRDATETIME',
    event_time_transformation='dt_to_timestamp',
    cols_category=['MCC'],
    cols_numerical=['amount'],
    return_records=False,
)

In [16]:
%%time
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid = preprocessor.transform(df_trx_valid)
df_data_test = preprocessor.transform(df_trx_test)

  return pd.to_datetime(x).astype('datetime64[s]').astype('int64') // 1000000000
  return pd.to_datetime(x).astype('datetime64[s]').astype('int64') // 1000000000
  return pd.to_datetime(x).astype('datetime64[s]').astype('int64') // 1000000000


CPU times: user 2.92 s, sys: 140 ms, total: 3.06 s
Wall time: 3.13 s


In [17]:
df_data_train = pd.merge(df_data_train, df_target, on='cl_id')
df_data_valid = pd.merge(df_data_valid, df_target, on='cl_id')
df_data_test = pd.merge(df_data_test, df_target, on='cl_id')

In [18]:
df_data_train.head()

Unnamed: 0,cl_id,event_time,MCC,amount,target_flag
0,0,"[tensor(1507811047), tensor(1508544000), tenso...","[tensor(2), tensor(19), tensor(1), tensor(9), ...","[tensor(20000., dtype=torch.float64), tensor(5...",0
1,1,"[tensor(1500422400), tensor(1500508800), tenso...","[tensor(3), tensor(1), tensor(3), tensor(3), t...","[tensor(265.5000, dtype=torch.float64), tensor...",0
2,5,"[tensor(1489795200), tensor(1489795200), tenso...","[tensor(28), tensor(39), tensor(28), tensor(15...","[tensor(13990., dtype=torch.float64), tensor(2...",1
3,10,"[tensor(1498867200), tensor(1498867200), tenso...","[tensor(1), tensor(1), tensor(1), tensor(5), t...","[tensor(466.2500, dtype=torch.float64), tensor...",0
4,11,"[tensor(1498176000), tensor(1498176000), tenso...","[tensor(117), tensor(1), tensor(8), tensor(117...","[tensor(1400., dtype=torch.float64), tensor(29...",0


In [19]:
df_data_train = df_data_train.to_dict(orient='records')
df_data_valid = df_data_valid.to_dict(orient='records')
df_data_test = df_data_test.to_dict(orient='records')

In [20]:
dataset_train = MemoryMapDataset(df_data_train)
dataset_valid = MemoryMapDataset(df_data_valid)
dataset_test = MemoryMapDataset(df_data_test)

### Build model

In [21]:
import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head

In [26]:
trx_encoder=TrxEncoder(
        embeddings={
            'MCC': {'in': 344, 'out': 8},
        },
        numeric_values={
            'amount': 'log',
        },
        embeddings_noise=0.001,
    )

In [27]:
trx_encoder.output_size

9

In [23]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            'MCC': {'in': 344, 'out': 8},
        },
        numeric_values={
            'amount': 'log',
        },
        embeddings_noise=0.001,
    ),
    hidden_size=8,
    is_reduce_sequence=False
)

In [49]:
sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='target_flag', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='target_flag', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='target_flag', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

In [50]:
lstm = LSTMclassifier(input_size=8, hidden_size=16, num_classes=2, num_layers=2, bidirectional=True)

In [64]:
sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=lstm,
    loss=torch.nn.NLLLoss(),
    metric_list=[torchmetrics.Accuracy(), torchmetrics.F1Score(), torchmetrics.AUROC()],
    optimizer_partial=partial(torch.optim.AdamW, lr=1e-3, weight_decay=1e-3),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingWarmRestarts, T_0=5),
)



In [65]:
tb_logger = pl_loggers.TensorBoardLogger(save_dir="lightning_logs/churn")
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=7, verbose=False, mode="max")
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    logger=tb_logger)
    #callbacks=[early_stop_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [66]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(sup_module, sup_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type           | Params
-------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder  | 786   
1 | head          | LSTMclassifier | 9.9 K 
2 | loss          | NLLLoss        | 0     
3 | train_metrics | ModuleDict     | 0     
4 | valid_metrics | ModuleDict     | 0     
5 | test_metrics  | ModuleDict     | 0     
-------------------------------------------------
10.6 K    Trainable params
0         Non-trainable params
10.6 K    Total params
0.043     Total estimated model params size (MB)


logger.version = 23
                                                                           



Epoch 13:  97%|█████████▋| 28/29 [00:13<00:00,  2.07it/s, loss=0.682, v_num=23, seq_len=111.0, y=1.000, val_Accuracy=0.556, val_F1Score=0.556, val_AUROC=0.658]



Epoch 21:  97%|█████████▋| 28/29 [00:12<00:00,  2.21it/s, loss=0.686, v_num=23, seq_len=97.40, y=0.800, val_Accuracy=0.556, val_F1Score=0.556, val_AUROC=0.350]

Exception ignored in: <function _releaseLock at 0x7f2fbe479050>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 221, in _releaseLock
    def _releaseLock():
KeyboardInterrupt


Epoch 22:   0%|          | 0/29 [00:00<?, ?it/s, loss=0.686, v_num=23, seq_len=97.40, y=0.800, val_Accuracy=0.556, val_F1Score=0.556, val_AUROC=0.535]         

Epoch 23:   0%|          | 0/29 [00:00<?, ?it/s, loss=0.688, v_num=23, seq_len=104.0, y=0.400, val_Accuracy=0.556, val_F1Score=0.556, val_AUROC=0.346]         

In [28]:
torch.save(sup_module.state_dict(), 'saves/churn_lstm.pth')

### Evaluation

In [29]:
inference_dl = torch.utils.data.DataLoader(
    dataset=dataset_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=1000,
    num_workers=4,
)

In [30]:
inf_module = InferenceModule(
    sup_module,
    model_out_name='log_prob',
)

In [31]:
sup_module

SequenceToTarget(
  (seq_encoder): RnnSeqEncoder(
    (trx_encoder): TrxEncoder(
      (embeddings): ModuleDict(
        (MCC): NoisyEmbedding(
          40, 8, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (numeric_values): ModuleDict(
        (amount): LogScaler()
      )
      (numerical_batch_norm): RBatchNorm(
        (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (seq_encoder): RnnEncoder(
      (rnn): GRU(9, 8, batch_first=True)
      (reducer): LastStepEncoder()
    )
  )
  (head): LSTMclassifier(
    (lstm): LSTM(8, 16, num_layers=2, batch_first=True, bidirectional=True)
    (linear): Sequential(
      (0): Linear(in_features=64, out_features=2, bias=True)
      (1): LogSoftmax(dim=-1)
    )
  )
  (loss): NLLLoss()
  (train_metrics): ModuleDict(
    (Accuracy): Accuracy()
  )
  (valid_metrics): ModuleDict(
    (Accuracy): Accuracy()
  )
  (test_metrics): ModuleDict(
    (Accuracy):

In [32]:
df_predict = trainer.predict(inf_module, inference_dl)
df_predict = pd.concat(df_predict, axis=0)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 28it [00:00, ?it/s]

Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, -191.57it/s] 


In [33]:
df_predict

Unnamed: 0,cl_id,target_flag,log_prob_0000,log_prob_0001
0,22,0,-0.848768,-0.558514
1,25,1,-0.848768,-0.558514
2,28,1,-0.848768,-0.558514
3,49,0,-0.848768,-0.558514
4,54,0,-0.848768,-0.558514
...,...,...,...,...
995,10180,0,-0.806717,-0.591170
996,10184,1,-0.795618,-0.600208
997,10190,0,-0.799779,-0.596798
998,10207,1,-0.803378,-0.593870


In [34]:
df_predict[['log_prob_0000', 'log_prob_0001']] = df_predict[['log_prob_0000', 'log_prob_0001']].apply(np.exp, axis=0)

In [35]:
df_predict

Unnamed: 0,cl_id,target_flag,log_prob_0000,log_prob_0001
0,22,0,0.427942,0.572058
1,25,1,0.427942,0.572058
2,28,1,0.427942,0.572058
3,49,0,0.427942,0.572058
4,54,0,0.427942,0.572058
...,...,...,...,...
995,10180,0,0.446321,0.553679
996,10184,1,0.451302,0.548698
997,10190,0,0.449428,0.550572
998,10207,1,0.447814,0.552186


In [41]:
from sklearn.metrics import f1_score, roc_auc_score

In [38]:
y_pred = df_predict[[f'log_prob_{i:04d}' for i in range(2)]].values.argmax(axis=1)
y_true = df_predict['target_flag'].values

In [39]:
f1_score(y_true, y_pred)

0.7146529562982006

In [46]:
roc_auc_score(y_true, df_predict[[f'log_prob_{i:04d}' for i in range(2)]].values.max(axis=1))

0.6769840884049517

# Default dataset

In [6]:
df_target = pd.read_csv(base_path / 'default' / 'target_finetune.csv')

In [7]:
df_target['target'].value_counts()

0    6818
1     262
Name: target, dtype: int64

In [8]:
len(df_target)

7080

In [9]:
df_target_train, df_target_test = train_test_split(
    df_target, test_size=1200, stratify=df_target['target'], random_state=142)
df_target_train, df_target_valid = train_test_split(
    df_target_train, test_size=600, stratify=df_target_train['target'], random_state=142)
print('Split {} records to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_target, df_target_train, df_target_valid, df_target_test]]))

Split 7080 records to train: 5280, valid: 600, test: 1200


In [10]:
df_target_train['target'].value_counts()

0    5084
1     196
Name: target, dtype: int64

In [11]:
df_trx = pd.read_csv(base_path / 'default' / 'transactions_finetune.csv')
df_trx.head()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,69,5541,48,-342.89792,2021-03-05 02:52:36
1,69,5533,48,-1251.8812,2021-03-05 09:43:28
2,69,5331,48,-87.30924,2021-03-05 11:17:23
3,69,5921,48,-1822.177,2021-03-05 13:41:03
4,69,5311,48,-427.12363,2021-03-05 19:14:23


In [12]:
df_trx['mcc_code'].nunique()

309

In [13]:
df_trx_train = pd.merge(df_trx, df_target_train['user_id'], on='user_id', how='inner')
df_trx_valid = pd.merge(df_trx, df_target_valid['user_id'], on='user_id', how='inner')
df_trx_test = pd.merge(df_trx, df_target_test['user_id'], on='user_id', how='inner')
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_trx, df_trx_train, df_trx_valid, df_trx_test]]))


Split 2124000 transactions to train: 1584000, valid: 180000, test: 360000


In [14]:
preprocessor = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='transaction_dttm',
    event_time_transformation='dt_to_timestamp',
    cols_category=['mcc_code'],
    cols_numerical=['transaction_amt'],
    return_records=False,
)

In [15]:
%%time
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid = preprocessor.transform(df_trx_valid)
df_data_test = preprocessor.transform(df_trx_test)

  return pd.to_datetime(x).astype('datetime64[s]').astype('int64') // 1000000000


CPU times: user 7.41 s, sys: 658 ms, total: 8.07 s
Wall time: 8.08 s


In [16]:
df_data_train = pd.merge(df_data_train, df_target, on='user_id')
df_data_valid = pd.merge(df_data_valid, df_target, on='user_id')
df_data_test = pd.merge(df_data_test, df_target, on='user_id')

In [17]:
df_data_valid['target'].value_counts()

0    578
1     22
Name: target, dtype: int64

In [18]:
df_data_train['target'].value_counts()

0    5084
1     196
Name: target, dtype: int64

In [19]:
df_data_train.columns

Index(['user_id', 'currency_rk', 'event_time', 'mcc_code', 'transaction_amt',
       'target'],
      dtype='object')

In [20]:
df_data_train_ = pd.DataFrame(columns=df_data_train.columns)

In [21]:
pos_samples = list(df_data_train['user_id'][df_data_train['target'] == 1])
neg_samples = list(df_data_train['user_id'][df_data_train['target'] == 0])

In [22]:
for _ in range(2640):
    pos_uid = np.random.choice(pos_samples)
    neg_uid = np.random.choice(neg_samples)

    df_data_train_ = df_data_train_.append(df_data_train[df_data_train['user_id'] == pos_uid])
    df_data_train_ = df_data_train_.append(df_data_train[df_data_train['user_id'] == neg_uid])

In [23]:
df_data_train = df_data_train_.to_dict(orient='records')
df_data_valid = df_data_valid.to_dict(orient='records')
df_data_test = df_data_test.to_dict(orient='records')

In [24]:
dataset_train = MemoryMapDataset(df_data_train)
dataset_valid = MemoryMapDataset(df_data_valid)
dataset_test = MemoryMapDataset(df_data_test)

### Model

In [25]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            'mcc_code': {'in': 150, 'out': 32},
        },
        numeric_values={
            'transaction_amt': 'log',
        },
        embeddings_noise=0.001,
    ),
    hidden_size=8,
    is_reduce_sequence=False
)

In [26]:
sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='target', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='target', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='target', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

In [27]:
lstm = LSTMclassifier(input_size=8, hidden_size=4, num_classes=2, logsoftmax=False)

In [28]:
sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=lstm,
    loss=torch.nn.CrossEntropyLoss(), #TRY BCE WITH WEIGHT
    metric_list=[torchmetrics.Accuracy(task='binary', num_classes=2, average='macro'), 
                    torchmetrics.F1Score(task='binary', num_classes=2, average='macro'),
                    torchmetrics.Precision(task='binary', num_classes=2, average='macro'),
                    torchmetrics.Recall(task='binary', num_classes=2, average='macro')],
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=1e-2),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=20, gamma=0.5),
)

### Training

In [29]:
callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=3, verbose=False, mode="min")
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    callbacks=[callback]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [30]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(sup_module, sup_data)

logger.version = 20


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params
---------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder    | 5.8 K 
1 | head          | LSTMclassifier   | 234   
2 | loss          | CrossEntropyLoss | 0     
3 | train_metrics | ModuleDict       | 0     
4 | valid_metrics | ModuleDict       | 0     
5 | test_metrics  | ModuleDict       | 0     
---------------------------------------------------
6.1 K     Trainable params
0         Non-trainable params
6.1 K     Total params
0.024     Total estimated model params size (MB)


                                                                           



Epoch 29: 100%|██████████| 43/43 [00:26<00:00,  1.63it/s, loss=0.686, v_num=20, seq_len=300.0, y=0.500, val_Accuracy=0.566, val_F1Score=0.278, val_Precision=0.511, val_Recall=0.566] 
CPU times: user 48.4 s, sys: 1min, total: 1min 49s
Wall time: 13min 18s


### Evaluation

In [31]:
inference_dl = torch.utils.data.DataLoader(
    dataset=dataset_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=1000,
    num_workers=4,
)

In [32]:
inf_module = InferenceModule(
    sup_module,
    model_out_name='prob',
)

In [33]:
df_predict = trainer.predict(inf_module, inference_dl)
df_predict = pd.concat(df_predict, axis=0)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 42it [00:00, ?it/s]

Predicting DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, -181.45it/s] 


In [34]:
y_pred = df_predict[[f'prob_{i:04d}' for i in range(2)]].values.argmax(axis=1)
y_true = df_predict['target'].values

In [38]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, roc_curve

In [36]:
roc_auc_score(y_true, df_predict['prob_0001'].values)

0.6093111041207927

In [37]:
f1_score(y_true, df_predict['prob_0001'].values > 0.5)

0.0839002267573696

# Raif

In [6]:
df_target = pd.read_csv(base_path / 'raif' / 'clients_last_2_fixed.csv', delimiter=';')

In [7]:
df_target.head(5)

Unnamed: 0,cnum_,categorycode,gender,age,married_,residenttype
0,0CCCDO,81,M,32,not_married,R
1,0CCCFO,70,F,42,not_married,R
2,0CCCGC,50,F,33,married,R
3,0CCCGG,70,M,58,not_married,R
4,0CCCGO,60,M,38,married,R


In [8]:
married_cnums = list(df_target['cnum_'][df_target['married_'] == 'married'])
not_married_cnums = list(df_target['cnum_'][df_target['married_'] == 'not_married'])

In [9]:
rand_clients = random.sample(married_cnums, 432254) + random.sample(not_married_cnums, 823671)

In [10]:
df_target = df_target.rename(columns={'married_' : 'target', 'cnum_' : 'cnum'})

In [11]:
df_target['target'] = df_target['target'].map({'not_married': 0, 'married': 1})

In [12]:
df_target.drop(df_target[df_target['cnum'].isin(rand_clients)].index, inplace=True)

In [13]:
df_target_train, df_target_test = train_test_split(
    df_target, test_size=20000, stratify=df_target['target'], random_state=142)
df_target_train, df_target_valid = train_test_split(
    df_target_train, test_size=10000, stratify=df_target_train['target'], random_state=142)
print('Split {} records to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_target, df_target_train, df_target_valid, df_target_test]]))

Split 100000 records to train: 70000, valid: 10000, test: 20000


In [14]:
df_trx = pd.read_csv(base_path / 'raif' / 'transactions_last_2.csv', delimiter=';')
df_trx.head()

Unnamed: 0,purchdate,amount,mcc,mrchcity,mrchname,cnum
0,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
1,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
2,2019-10-25 00:00:00,7716900.0,5691,MOSCOW,HXL4K TL4EEXD,ELETCO
3,2019-11-29 00:00:00,5100000.0,6513,SANKT-PETERBU,7X 1E7HXD H1DKW1IKCL,EL0KSF
4,2019-12-03 00:00:00,5023956.6,5944,MOSKVA,73 7HJXK7 (W7X),MPKIJJ


In [15]:
df_trx = df_trx.drop(columns=['mrchcity', 'mrchname'])

In [16]:
df_trx_train = pd.merge(df_trx, df_target_train['cnum'], on='cnum', how='inner')
df_trx_valid = pd.merge(df_trx, df_target_valid['cnum'], on='cnum', how='inner')
df_trx_test = pd.merge(df_trx, df_target_test['cnum'], on='cnum', how='inner')
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_trx, df_trx_train, df_trx_valid, df_trx_test]]))

Split 104161408 transactions to train: 5494581, valid: 795646, test: 1569799


In [17]:
preprocessor = PandasDataPreprocessor(
    col_id='cnum',
    col_event_time='purchdate',
    event_time_transformation='dt_to_timestamp',
    cols_category=['mcc'],
    cols_numerical=['amount'],
    return_records=False,
)

In [18]:
%%time
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid = preprocessor.transform(df_trx_valid)
df_data_test = preprocessor.transform(df_trx_test)

  return pd.to_datetime(x).astype('datetime64[s]').astype('int64') // 1000000000


CPU times: user 1min 3s, sys: 3.67 s, total: 1min 6s
Wall time: 1min 8s


In [19]:
df_data_train = pd.merge(df_data_train, df_target, on='cnum')
df_data_valid = pd.merge(df_data_valid, df_target, on='cnum')
df_data_test = pd.merge(df_data_test, df_target, on='cnum')

In [20]:
df_data_train = df_data_train.to_dict(orient='records')
df_data_valid = df_data_valid.to_dict(orient='records')
df_data_test = df_data_test.to_dict(orient='records')

In [21]:
dataset_train = MemoryMapDataset(df_data_train)
dataset_valid = MemoryMapDataset(df_data_valid)
dataset_test = MemoryMapDataset(df_data_test)

In [22]:
sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='target', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='target', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='target', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

### Model

In [23]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            'mcc': {'in': 150, 'out': 32},
        },
        numeric_values={
            'amount': 'log',
        },
        embeddings_noise=0.001,
    ),
    hidden_size=128,
    is_reduce_sequence=False
)

In [24]:
lstm = LSTMclassifier(input_size=128, hidden_size=128, num_classes=2, logsoftmax=False)

In [25]:
sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=lstm,
    loss=torch.nn.CrossEntropyLoss(),
    metric_list=torchmetrics.Accuracy(task='multiclass', num_classes=2),
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=20, gamma=0.5),
)

### Training

In [32]:
tb_logger = pl_loggers.TensorBoardLogger(save_dir="lightning_logs/raif")
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=2, verbose=False, mode="min")
trainer = pl.Trainer(
    max_epochs=40,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    logger=tb_logger,
    callbacks=[early_stop_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [67]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(sup_module, sup_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params
---------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder    | 67.5 K
1 | head          | LSTMclassifier   | 132 K 
2 | loss          | CrossEntropyLoss | 0     
3 | train_metrics | ModuleDict       | 0     
4 | valid_metrics | ModuleDict       | 0     
5 | test_metrics  | ModuleDict       | 0     
---------------------------------------------------
199 K     Trainable params
0         Non-trainable params
199 K     Total params
0.800     Total estimated model params size (MB)


logger.version = 7
Sanity Checking: 0it [00:00, ?it/s]

Epoch 5: 100%|██████████| 557/557 [01:02<00:00,  8.86it/s, loss=0.641, v_num=7, seq_len=79.90, y=0.482, val_Accuracy=0.606]
CPU times: user 2min 39s, sys: 57.7 s, total: 3min 37s
Wall time: 6min 33s


In [29]:
inference_dl = torch.utils.data.DataLoader(
    dataset=dataset_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=1000,
    num_workers=4,
)

In [42]:
inf_module = InferenceModule(
    sup_module,
    model_out_name='prob',
)

In [43]:
df_predict = trainer.predict(inf_module, inference_dl)
df_predict = pd.concat(df_predict, axis=0)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

Predicting DataLoader 0: 100%|██████████| 20/20 [00:04<00:00,  4.23it/s]


In [44]:
df_predict

Unnamed: 0,cnum,categorycode,gender,age,target,residenttype,prob_0000,prob_0001
0,0CCCNN,70,F,32,0,R,0.658944,0.341056
1,0CCCSC,60,M,64,1,R,0.334053,0.665947
2,0CCSFY,50,F,33,1,R,0.994009,0.005991
3,0CDCNO,70,M,57,1,R,0.653496,0.346504
4,0CDXNH,70,M,39,1,R,0.906400,0.093600
...,...,...,...,...,...,...,...,...
995,ZYYDHX,81,M,54,1,R,0.217439,0.782561
996,ZYYDNS,70,M,34,0,R,0.731909,0.268091
997,ZYYFFO,50,F,42,1,R,0.663436,0.336564
998,ZYYFSO,70,M,45,1,R,0.279702,0.720298


In [46]:
y_pred = df_predict[[f'prob_{i:04d}' for i in range(2)]].values.argmax(axis=1)
y_true = df_predict['target'].values

In [26]:
sup_module.load_state_dict(torch.load('saves/raif_lstm.pth'))

<All keys matched successfully>

In [40]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

In [48]:
roc_auc_score(y_true, df_predict['prob_0001'].values)

0.62920024

In [49]:
accuracy_score(y_true, y_pred)

0.589

In [78]:
torch.save(sup_module.state_dict(), 'saves/raif_lstm.pth')

In [50]:
f1_score(y_true, y_pred)

0.5454042694392214