# Colab setup

In [None]:
# Print the version of the `python` executable on the shell PATH for this run.
!python --version

Python 3.8.16


In [None]:
!pip install pip==22.3.1


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
!pip install pytorch-lifestream -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
!pip install pytorch-lightning==1.6.5 -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
!pip install torch==1.12.1


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Setup

In [None]:
%load_ext autoreload
%autoreload 2

# import logging
import torch
import pytorch_lightning as pl
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

## Data preprocessing

In [None]:
import os
import pandas as pd

data_path = '../data/'

# Provenance: this dataset was prepared by samriddh using the process described
# in the notebook below (hidden as of now):
# https://colab.research.google.com/drive/1SMhyQWijcdAPVG_dr4YSJo0icc3t4hLY
#
# The note is kept as comments rather than a bare triple-quoted string: a string
# expression on the last line of a cell is evaluated as the cell's result
# instead of being ignored like a comment.
source_data = pd.read_csv(os.path.join(data_path, 'for_sequential_encoding_basis_graph_edge.csv'))

  source_data = pd.read_csv(os.path.join(data_path, 'for_sequential_encoding_basis_graph_edge.csv'))


In [None]:
# Inspect the column dtypes that pandas assigned when reading the CSV.
source_data.dtypes

Unnamed: 0                           int64
user_id                              int64
product_type                        object
destination_bank_account_number     object
customer_id                         object
amount                             float64
transaction_date                    object
reciver_user_id                     object
dtype: object

In [None]:
# user_id is read as int64; cast it to str so it can be string-concatenated
# with the receiver id when the edge key is built below.
source_data['user_id'] = source_data['user_id'].astype(str)

In [None]:
# Parse the transaction_date column (read as object) into datetimes.
source_data['transaction_date'] = pd.to_datetime(source_data['transaction_date'])

In [None]:
# source_data.apply(lambda x : str(x.user_id)+"_"+source_data['reciver_user_id'],axis=1)

##### Creating edge pairs

In [None]:
# Build the edge key "<user_id>_<reciver_user_id>" for every transaction row.
# The "_" separator is what later cells split on to recover the two ids.
source_data['edge_pair'] = source_data['user_id'].add("_").add(source_data['reciver_user_id'])

In [None]:
# Create an integer identifier (pair_id) for each unique edge_pair.

In [None]:
# One row per distinct edge_pair value; this becomes the pair_id lookup table.
key1 = pd.DataFrame({'edge_pair': source_data['edge_pair'].unique()})

In [None]:
# Order the lookup table by edge_pair so ids are assigned in sorted order.
key1 = key1.sort_values(by="edge_pair")

In [None]:
# Two resets on purpose: the first moves the pre-sort index into a column named
# "index" and renumbers rows 0..n-1 in sorted order; the second moves that new
# positional index into a column too. The next cells drop "index" and rename
# the remaining columns to ['pair_id', 'edge_pair'].
# NOTE(review): not idempotent - re-running this cell after the rename cell
# adds extra columns and breaks the two-name column assignment that follows.
key1 = key1.reset_index().reset_index()

In [None]:
# Discard the pre-sort index column; only the positional id and edge_pair remain.
key1 = key1.drop(columns=['index'])

In [None]:
# Name the two remaining columns: positional id -> pair_id, key -> edge_pair.
key1 = key1.set_axis(['pair_id', 'edge_pair'], axis=1)

In [None]:
# Attach pair_id to every transaction; with no `on=` given, pandas joins on the
# columns the two frames share.
source_data = pd.merge(source_data, key1)

##### Creating Time Reference

In [None]:
# Earliest transaction_date per user_id, returned as a flat two-column frame.
key2 = source_data.groupby('user_id')[['transaction_date']].agg('min').reset_index()

In [None]:
# Rename the aggregated column so it does not clash with transaction_date on merge.
key2 = key2.rename({'transaction_date': 'first_txn_date'}, axis='columns')

In [None]:
# Left-join each transaction with its sender's first transaction date.
source_data = pd.merge(source_data, key2, how='left', on='user_id')

In [None]:
# Whole days elapsed between this transaction and the sender's first transaction.
source_data['time_delta'] = source_data['transaction_date'].sub(source_data['first_txn_date']).dt.days

##### Creating Transaction Category

In [None]:
# Lookup table mapping each distinct product_type to an integer txn_type_id,
# numbered in order of first appearance.
key3 = (
    pd.DataFrame({'product_type': source_data['product_type'].unique()})
    .reset_index()
    .rename(columns={'index': 'txn_type_id'})
)

In [None]:
# Display the product_type -> txn_type_id lookup table.
key3

Unnamed: 0,txn_type_id,product_type
0,0,PPOB
1,1,Payment Out
2,2,Payment In
3,3,QRIS
4,4,SALDO


In [None]:
# Left-join the integer txn_type_id onto every transaction by product_type.
source_data = pd.merge(source_data, key3, how='left', on='product_type')

#### Processing

In [None]:
# Keep an independent copy of the fully-joined frame before columns are dropped.
checkpoint = source_data.copy(deep=True)

In [None]:
# Keep only the columns handed to the preprocessor: sequence id, event time,
# categorical feature and numeric feature.
source_data = source_data.loc[:, ['pair_id', 'time_delta', 'txn_type_id', 'amount']]

In [None]:
# Peek at the first two rows of the reduced frame.
source_data.head(2)

Unnamed: 0,pair_id,time_delta,txn_type_id,amount
0,726,174,0,29050.0
1,726,174,0,17500.0


In [None]:
from ptls.preprocessing import PandasDataPreprocessor

# Column roles for the preprocessor: pair_id identifies a sequence, time_delta
# is the event time (passed through with transformation 'none'), txn_type_id is
# treated as categorical and amount as numerical. return_records=True is
# requested so the result is a collection of per-sequence records.
preprocessor_kwargs = dict(
    col_id='pair_id',
    col_event_time='time_delta',
    event_time_transformation='none',
    cols_category=['txn_type_id'],
    cols_numerical=['amount'],
    return_records=True,
)
preprocessor = PandasDataPreprocessor(**preprocessor_kwargs)

In [None]:
%%time

# Fit the preprocessor on the transaction frame and transform it in one pass;
# `dataset` holds the resulting records (one per pair_id, see later cells).
dataset = preprocessor.fit_transform(source_data)

CPU times: user 4.25 s, sys: 195 ms, total: 4.44 s
Wall time: 4.45 s


In [None]:
import pickle

# Persist the fitted preprocessor next to the input data. The path is built
# from `data_path` (defined in the data-loading cell) instead of repeating the
# '../data/' literal, so relocating the data directory needs a single change.
preprocessor_path = os.path.join(data_path, 'preprocessor_edge_based_256Rd.p')
with open(preprocessor_path, 'wb') as f:
    pickle.dump(preprocessor, f)

In [None]:
# Order the records by pair_id so the split below starts from a deterministic order.
dataset = list(dataset)
dataset.sort(key=lambda record: record['pair_id'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sequences; the fixed random_state keeps the split repeatable.
train, test = train_test_split(
    dataset,
    random_state=42,
    test_size=0.2,
)

(len(train), len(test))

(20112, 5029)

In [None]:
# Show which fields a single preprocessed record carries.
train[0].keys()

dict_keys(['pair_id', 'time_delta', 'event_time', 'txn_type_id', 'amount'])

## Embedding training

Model training in our framework is organised via the pytorch-lightning (pl) framework.
The key parts of neural network training in pl are:

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For further details check https://pytorchlightning.ai/

### Model definition

In [None]:
# Reminder of the input columns before the encoder is configured.
source_data.head(2)

Unnamed: 0,pair_id,time_delta,txn_type_id,amount
0,726,174,0,29050.0
1,726,174,0,17500.0


In [None]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

# Embedding-table size for txn_type_id.
# The preprocessor re-encodes categorical columns, so the ids reaching the
# encoder are NOT the 0..4 values from key3. Per the pytorch-lifestream
# frequency-encoder documentation, id 0 is reserved for padding, known
# categories get ids 1..N, and one further id is used for unseen values, so the
# table needs N + 2 rows. With the previous 'in': 5 the highest id would be
# clipped onto another category's row.
# NOTE(review): confirm against the installed ptls version.
TXN_TYPE_DICT_SIZE = len(key3) + 2

trx_encoder_params = dict(
    embeddings_noise=0.003,
    # amount is fed through unchanged ('identity').
    numeric_values={'amount': 'identity'},
    embeddings={
        # time_delta is the raw day offset (it is not in cols_category).
        # NOTE(review): offsets >= 800 presumably get clipped to the last
        # embedding row - confirm this cap suits the data's time span.
        'time_delta': {'in': 800, 'out': 16},
        'txn_type_id': {'in': TXN_TYPE_DICT_SIZE, 'out': 16},
    },
)

# GRU sequence encoder on top of the per-transaction encoder; hidden_size is
# the width of the final embedding (256, matching the inference output shape).
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

# CoLES module wrapping the encoder, with Adam and a StepLR schedule
# (learning rate multiplied by 0.9 every 30 scheduler steps).
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

### Data loader

In [None]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

# In-memory dataset over the training records, passed through a sequence-length
# filter (min_seq_len=25; presumably drops shorter sequences - confirm).
train_memory_ds = MemoryMapDataset(
    data=train,
    i_filters=[SeqLenFilter(min_seq_len=25)],
)

# Splitter configured with split_count=5 and slice lengths between
# cnt_min=25 and cnt_max=200.
slice_splitter = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)

train_dl = PtlsDataModule(
    train_data=ColesDataset(train_memory_ds, splitter=slice_splitter),
    train_num_workers=16,
    train_batch_size=256,
)

### Trainer

In [None]:
# Display a torch device object of type "mps" (the value is not stored).
torch.device("mps")

device(type='mps')

In [None]:
# Ask torch whether the MPS backend is available on this machine.
torch.backends.mps.is_available()

True

In [None]:
# Ask torch whether this build was compiled with MPS support.
torch.backends.mps.is_built()

True

In [None]:
import torch
import pytorch_lightning as pl

import logging

# Let Lightning choose the accelerator instead of hard-coding "mps".
# With the pinned pytorch-lightning 1.6.5 the "mps" flag did not take effect:
# the captured log for this cell reported no accelerator in use, so training
# ran on CPU. "auto" uses a supported accelerator when one is present and
# falls back to CPU otherwise, which also keeps the notebook runnable on Colab.
# devices=1 keeps training on a single device.
# NOTE(review): native MPS support needs a newer pytorch-lightning release -
# confirm the version if Apple-GPU training is required.
trainer = pl.Trainer(
    max_epochs=5,
    accelerator="auto",
    devices=1,
    enable_progress_bar=True,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Training

In [None]:
%%time
# Print the logger version for this run, train the model on the training data
# module, then print the metrics the trainer logged.
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 236 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
236 K     Trainable params
0         Non-trainable params
236 K     Total params
0.947     Total estimated model params size (MB)


logger.version = 6


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

{'loss': tensor(479.4875), 'seq_len': tensor(72.8551)}
CPU times: user 3min 57s, sys: 3min 35s, total: 7min 33s
Wall time: 4min 19s


### Save sequence encoder for other experiments

In [None]:
# Save only the sequence encoder's weights for reuse in other experiments.
# torch.save does not create parent directories, so make sure the target
# directory exists first.
# NOTE(review): the original cell carried a bare "##missing" marker, presumably
# because "data_research/" did not exist - confirm this is the intended location
# (the other artifacts in this notebook are written under ../data/).
encoder_path = "data_research/coles-emb-2.pt"
os.makedirs(os.path.dirname(encoder_path), exist_ok=True)
torch.save(seq_encoder.state_dict(), encoder_path)

In [None]:
# torch.vstack(trainer.predict(model, train_dl, ))

##

In [None]:
# Display the logger objects attached to the trainer.
trainer.loggers

[<pytorch_lightning.loggers.tensorboard.TensorBoardLogger at 0x304815fd0>]

## Inference

In [None]:
%%timeit
# embedding inference

from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

train_embeds.shape, test_embeds.shape

  rank_zero_warn(


Predicting: 12it [00:00, ?it/s]



Predicting: 12it [00:00, ?it/s]

(torch.Size([20112, 256]), torch.Size([5029, 256]))

In [None]:
def _records_with_embeddings(records, embeds):
    """Return one frame holding the record fields followed by embedding columns.

    Parameters
    ----------
    records : sequence of dict
        Preprocessed records, in the same order the embeddings were computed.
    embeds : torch.Tensor
        2-D tensor with one row per record.

    Returns
    -------
    pandas.DataFrame
        Record columns followed by embedding columns named "E_0", "E_1", ...
    """
    # Convert explicitly to a NumPy array on the host rather than relying on
    # pandas' implicit handling of torch tensors.
    embeds_df = pd.DataFrame(embeds.detach().cpu().numpy()).add_prefix("E_")
    # Both frames carry a default 0..n-1 index, so axis=1 pairs row i with row i.
    return pd.concat([pd.DataFrame(records), embeds_df], axis=1)


out1 = _records_with_embeddings(train, train_embeds)
out2 = _records_with_embeddings(test, test_embeds)

In [None]:
# Stack the train and test frames row-wise into a single table.
out_concat = pd.concat((out1, out2), axis='index')

In [None]:
# Bring the edge_pair string back via the key1 lookup; with no `on=` given,
# pandas joins on the columns the two frames share.
out_concat = pd.merge(out_concat, key1)

In [None]:
# Sender id = text before the first "_" of edge_pair. The vectorised string
# accessor yields a missing value for a missing edge_pair, where the row-wise
# lambda would raise on a non-string.
out_concat['source'] = out_concat['edge_pair'].str.split("_", n=1).str[0]

In [None]:
# Receiver id = everything after the first "_" of edge_pair. The sender id is
# an integer rendered as text, so the first "_" is always the separator;
# splitting once (n=1) keeps receiver ids that themselves contain "_" intact,
# where split("_")[1] would truncate them. Missing edge_pair values stay
# missing instead of raising.
out_concat['destination'] = out_concat['edge_pair'].str.split("_", n=1).str[1]

In [None]:
# Write the combined table next to the input data. The path is built from
# `data_path` (defined in the data-loading cell) rather than repeating the
# '../data/' literal.
out_concat.to_csv(os.path.join(data_path, "rnn_out_edge_based_256Rd.csv"))

In [None]:
# Sanity check: number of distinct edge pairs in the exported table.
out_concat.edge_pair.nunique()

25141

In [None]:
# # join target and embeddings

# df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))
# df_target = df_target.set_index('client_id')
# df_target.rename(columns={"bins": "target"}, inplace=True)

# train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
# train_df['client_id'] = [x['client_id'] for x in train]
# train_df = train_df.merge(df_target, how='left', on='client_id')

# test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
# test_df['client_id'] = [x['client_id'] for x in test]
# test_df = test_df.merge(df_target, how='left', on='client_id')

# print(train_df.shape, test_df.shape)

Obtained embeddings can be used as features for model training

For example:

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# embed_columns = [x for x in train_df.columns if x.startswith('embed')]
# x_train, y_train = train_df[embed_columns], train_df['target']
# x_test, y_test = test_df[embed_columns], test_df['target']

# clf = RandomForestClassifier()
# clf.fit(x_train, y_train)
# clf.score(x_test, y_test)