# Prepare model package for deployment

In [16]:
from collections import defaultdict
import pickle
import re
import pandas as pd
from cyvers_ai_ds.features import AssetCentricFEPipeline
import boto3

## 1 Create preprocessor object with required features for the model

In [2]:
# Base name of the trained model artifact; the load cell further down reads
# 'artifacts/<model_name>.pkl'.
model_name = 'resampled_no_phishing'

In [3]:
# Feature columns passed to model.predict(), in this order.
# Each name has the form '<column>_tx_<agg>_log_to_median_ratio'; later cells
# parse out <column>/<agg> and strip the '_log_to_median_ratio' suffix to
# configure the preprocessor, so new entries must follow the same pattern.
selected = [
    "snd_rcv_amt_usd_sum_tx_mean_log_to_median_ratio",
    "snd_rcv_mean_amt_usd_tx_median_log_to_median_ratio",
    "amount_usd_tx_sum_log_to_median_ratio",
    "snd_rcv_life_time_sec_tx_min_log_to_median_ratio",
    "snd_rcv_mean_time_diff_sec_tx_sum_log_to_median_ratio",
    "snd_rcv_tx_cnt_tx_sum_log_to_median_ratio",
    "snd_rcv_time_diff_sec_tx_mean_log_to_median_ratio",
    "snd_rcv_tkn_type_cnt_tx_mean_log_to_median_ratio",
]

In [4]:
# Build the transaction-level aggregation spec passed to the preprocessor:
# a list of (column, [aggregation, ...]) pairs, in insertion order.
tx_aggs_by_column = defaultdict(list)

# Columns that are always aggregated, independent of `selected`.
tx_aggs_by_column['block_time'] = ['first']
tx_aggs_by_column['sender_id'] = ['first']

for feature_name in selected:
    # '<column>_tx_<agg>_log...' -> (column, agg). The groups are greedy, so a
    # column that itself contains '_tx_' (e.g. 'snd_rcv_tx_cnt') is split on
    # the last '_tx_' occurrence.
    parsed = re.search(r'(.+)_tx_(.+)_log', feature_name)
    if parsed is None:
        # Fail with the offending name instead of an opaque IndexError.
        raise ValueError(
            f"Feature name {feature_name!r} does not match "
            "'<column>_tx_<agg>_log...'"
        )
    column, agg = parsed.groups()
    tx_aggs_by_column[column].append(agg)

# Keep the mapping and the final list under separate names so `tx_agg_list`
# is only ever a list of pairs.
tx_agg_list = list(tx_aggs_by_column.items())

In [5]:
# Display the resulting aggregation spec for a visual check.
tx_agg_list

[('block_time', ['first']),
 ('sender_id', ['first']),
 ('snd_rcv_amt_usd_sum', ['mean']),
 ('snd_rcv_mean_amt_usd', ['median']),
 ('amount_usd', ['sum']),
 ('snd_rcv_life_time_sec', ['min']),
 ('snd_rcv_mean_time_diff_sec', ['sum']),
 ('snd_rcv_tx_cnt', ['sum']),
 ('snd_rcv_time_diff_sec', ['mean']),
 ('snd_rcv_tkn_type_cnt', ['mean'])]

In [6]:
# Names handed to the preprocessor as `rolling_features`: every selected
# feature name with the '_log_to_median_ratio' marker removed.
rolling_features = []
for feature_name in selected:
    rolling_features.append(feature_name.replace('_log_to_median_ratio', ''))

In [7]:
# Preprocessor configured with the aggregation spec and the feature names
# derived above from `selected`.
preprocessor = AssetCentricFEPipeline(
    tx_agg_list=tx_agg_list,
    rolling_features=rolling_features
)

Test preprocessing and inference

In [8]:
# Read only the first 1000 rows as a small sample for the checks below;
# 'block_time' is parsed as datetimes.
raw_data = pd.read_csv('data/raw_data.csv', parse_dates=['block_time'], nrows=1000)

In [9]:
# execute() returns two objects; only `tx_data` is used by the later cells
# shown in this notebook.
# NOTE(review): presumably transaction-level and sender-level feature tables — confirm.
tx_data, sender_data = preprocessor.execute(raw_data)

In [10]:
# Load the trained model from the artifacts directory.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load artifacts from a trusted source.
model_path = 'artifacts/' + model_name + '.pkl'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [11]:
# Check that the bare model runs on the preprocessed sample: take the selected
# feature columns, replace missing values with 0, and predict.
model_inputs = tx_data.loc[:, selected].fillna(0)
pred_labels = model.predict(model_inputs)

## 2 Test and package model instance

In [12]:
from cyvers_ai_ds.models.v1 import TimeAmountClassifier

In [13]:
# Bundle the trained model and its preprocessor into a single pipeline object.
pipeline_components = {'model': model, 'preprocessor': preprocessor}
model_pipeline = TimeAmountClassifier(pipeline_components)

In [14]:
# End-to-end check: run the packaged pipeline directly on the raw sample and
# preview the first rows of its output.
result = model_pipeline.infer(raw_data)
result.head()

Unnamed: 0,transaction_id,pred_prob,pred_label
0,0x12efb3a947d2c34bc20cceb797386f9f0c211f340374...,0.024746,0
1,0x2162f4e376fd15cfb91951d415adfabe6c9c5b5a431c...,0.024746,0
2,0xa16a83a95d62ed8fe73f1b8dd8daa1ffa0851f2d46c5...,0.024746,0
3,0x57a5f87de3d4825ed3dd37d3cdb333564fbbf1807141...,0.022359,0
4,0x5a2c8cb7b78d89e846300dce97cdf97c57d8163a4489...,0.024746,0


Pickle final version for production

In [15]:
# Serialize the full pipeline (model + preprocessor) to disk.
# The file name is spelled 'model_pipline.pkl'; the S3 upload cell uses the
# same spelling, so the two must be changed together if it is ever renamed.
pipeline_path = 'artifacts/model_pipline.pkl'
with open(pipeline_path, 'wb') as pipeline_file:
    pickle.dump(model_pipeline, pipeline_file)

Back up to S3

In [17]:
from utils.s3 import MODELS_BUCKET
from utils import MODEL_NAME

# Upload the pickled pipeline to the models bucket. The object key is
# MODEL_NAME immediately followed by the local relative path.
# NOTE(review): no '/' is inserted between MODEL_NAME and 'artifacts/...';
# confirm MODEL_NAME already ends with a separator, otherwise the key prefix
# and 'artifacts' run together.
local_pipeline_path = 'artifacts/model_pipline.pkl'
s3_key = MODEL_NAME + local_pipeline_path
bucket = boto3.resource('s3').Bucket(MODELS_BUCKET)
bucket.upload_file(local_pipeline_path, s3_key)