[![CyVers](https://i.imgur.com/yyhmZET.png)](https://www.cyvers.ai/)

# BlockChain Attack Data Set - Exploratory Data Analysis (EDA)

> Notebook by:
> - Royi Avital Royi@cyvers.ai

## Revision History

| Version | Date       | Content / Changes                      |
|---------|------------|----------------------------------------|
| 0.1.000 | 30/06/2022 | First version                          |
|         |            |                                        |

In [None]:
# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Misc
import datetime
import os
from platform import python_version
import random
import warnings

# EDA Tools
import ppscore as pps #<! See https://github.com/8080labs/ppscore -> pip install git+https://github.com/8080labs/ppscore.git

# Machine Learning
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
# from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree
from sklearn.svm import SVC
from catboost import CatBoostClassifier, Pool
import catboost as cb

# Metrics
from sklearn.metrics import confusion_matrix, fbeta_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Ensemble Engines
import lightgbm
from xgboost import XGBClassifier

# Visualization
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from bokeh.plotting import figure, show

# Jupyter
from ipywidgets import interact, Dropdown, Layout

In [None]:
# Configuration
# Render Matplotlib figures inside the notebook output cells.
%matplotlib inline

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Single seed shared by NumPy's and Python's random generators (also passed as `random_state` to the models below).
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

sns.set_theme() #>! Apply SeaBorn theme

In [None]:
# Constants

DATA_FOLDER_NAME    = 'BlockChainAttacksDataSet'
DATA_FOLDER_PATTERN = 'DataSet0'
DATA_FILE_EXT       = 'csv'

PROJECT_DIR_NAME = 'CyVers' #<! Royi: Anton, don't change it, it should be a team constant

def FindProjectDir(startPath, projectDirName):
    """
    Locate the project root inside `startPath`.

    Input:
      - startPath      - Path to search (Usually the current working directory).
      - projectDirName - Name of the project folder.
    Output:
      - The path up to (and including) the first folder named exactly
        `projectDirName`, or `None` if there is no such folder in `startPath`.
    Remarks:
      - The match is per path component, so folders such as `CyVersMe` do not match `CyVers`.
    """
    lPathPart = os.path.normpath(startPath).split(os.sep)
    if projectDirName not in lPathPart:
        return None

    return os.sep.join(lPathPart[:lPathPart.index(projectDirName) + 1])

PROJECT_DIR_PATH = FindProjectDir(os.getcwd(), PROJECT_DIR_NAME)
if PROJECT_DIR_PATH is None:
    # `str.find()` used to return -1 here, which silently produced a truncated, meaningless path.
    # `print()` is used (not `warnings.warn()`) as the notebook silences all warnings.
    print(f'The folder `{PROJECT_DIR_NAME}` is not part of the working directory path, using the working directory as the project folder')
    PROJECT_DIR_PATH = os.getcwd()

# Feature extractors constants
# Names of the engineered feature columns which are added to `dfData` in the feature engineering cells.

# Assets
# (Columns filled from the `AggBySender()` aggregations)
# By amount:
SUM_ASSET       = 'SUM (Asset)'
MEAN_ASSET      = 'MEAN (Asset)'
STD_ASSET       = 'STD (Asset)'
VAR_ASSET       = 'VAR (Asset)'
MEDIAN_ASSET    = 'MEDIAN (Asset)'
COUNT_ASSET     = 'COUNT (Asset)'
MIN_ASSET       = 'MIN (Asset)'
MAX_ASSET       = 'MAX (Asset)'
# By time:
TD_MEAN_ASSET   = 'TD_MEAN (Asset)'
TD_STD_ASSET    = 'TD_STD (Asset)'
TD_MEDIAN_ASSET = 'TD_MEDIAN (Asset)'
TD_MIN_ASSET    = 'TD_MIN (Asset)'
TD_MAX_ASSET    = 'TD_MAX (Asset)'

# User
# (Columns filled from the `AggByReceiver()` aggregations)
# By amount:
SUM_USR         = 'SUM (User)'
MEAN_USR        = 'MEAN (User)'
STD_USR         = 'STD (User)'
VAR_USR         = 'VAR (User)'
MEDIAN_USR      = 'MEDIAN (User)'
COUNT_USR       = 'COUNT (User)'
MIN_USR         = 'MIN (User)'
MAX_USR         = 'MAX (User)'
# By time:
TD_MEAN_USR     = 'TD_MEAN (User)'
TD_STD_USR      = 'TD_STD (User)'
TD_MEDIAN_USR   = 'TD_MEDIAN (User)'
TD_MIN_USR      = 'TD_MIN (User)'
TD_MAX_USR      = 'TD_MAX (User)' 
# Calendar / time span features
HOUR            = 'Hour'
WEEKDAY         = 'Weekday'
TIME_INTRVL     = 'Time Interval'

# Train share of the split: the fraction of IDs drawn for training in `file_selection`,
# and `1 - value` is the `test_size` in `simple_selection`.
test_train_selection_proportion_ = 0.7
# List of numeric columns (The columns passed to `StandardScaler` in `ml_input_create()`)
num_cols = ['Amount','Amount [USD]',  SUM_ASSET, MEAN_ASSET, STD_ASSET, VAR_ASSET, MEDIAN_ASSET, COUNT_ASSET, MIN_ASSET, MAX_ASSET, TD_MEAN_ASSET, TD_STD_ASSET, TD_MEDIAN_ASSET , TD_MIN_ASSET, TD_MAX_ASSET, 
                                      SUM_USR, MEAN_USR, STD_USR, VAR_USR, MEDIAN_USR, COUNT_USR, MIN_USR, MAX_USR, TD_MEAN_USR, TD_STD_USR, TD_MEDIAN_USR, TD_MIN_USR, TD_MAX_USR, HOUR, WEEKDAY, TIME_INTRVL]
# List of categorical columns (Converted to the `category` dtype in `ml_input_create()`)
categor_cols = ['Currency', 'Currency Type' , 'Receiver Type']

# Column names related to the attacks.
# NOTE(review): Not referenced by any of the cells visible in this notebook - confirm they are still needed.
numAttacksColName = 'Number of Attacks'
attackTypeColName = 'Attack Type'

In [None]:
# CyVers Packages
from DataSetsAuxFun import *

In [None]:
# Parameters
# Root folder of the data set files (`<Project Folder>/<Data Folder>`).
dataSetRotoDir = os.path.join(PROJECT_DIR_PATH, DATA_FOLDER_NAME)

# NOTE(review): `runTsne` is not referenced by any of the visible cells - confirm it is still needed.
runTsne = False

# Amount USD Outlier threshold
# Transactions with `Amount [USD]` at or above this value are removed in the pre processing.
amountUsdOutlierThr = 1e9

# NOTE(review): `testSetRatio` is not referenced by any of the visible cells, the split uses `test_train_selection_proportion_`.
testSetRatio = 1.0 / 3.0

In [None]:
# Auxiliary Functions

def print_scores(preds, y_test):
    """
    Print the recall, precision, F1 score and confusion matrix of a prediction.

    Input:
      - preds  - The predicted labels.
      - y_test - The ground truth labels.
    Remarks:
      - The ground truth is always the first argument of the metric functions.
    """
    # Each score is evaluated right before it is printed, in the listed order.
    tReport = (
        ('recall : ', lambda: recall_score(y_test, preds)),
        ('Test Precision Score: ', lambda: precision_score(y_test, preds)),
        ('Test F1 Score: ', lambda: fbeta_score(y_test, preds, beta = 1)),
        ('Confusion Matrix: \n', lambda: confusion_matrix(y_test, preds)),
    )
    for scoreTitle, hScoreFun in tReport:
        print(scoreTitle, hScoreFun())

def model_scores(preds, y_test):
    """
    Calculate the scores of a prediction.

    Input:
      - preds  - The predicted labels.
      - y_test - The ground truth labels.
    Output:
      - The tuple `(recall, precision, F1)`.
    """
    recallVal    = recall_score(y_test, preds)
    precisionVal = precision_score(y_test, preds)
    f1Val        = fbeta_score(y_test, preds, beta = 1)

    return recallVal, precisionVal, f1Val


In [None]:
# Loading / Generating Data
# Collect the CSV files found under the data set root folder, restricted to folders matching `DATA_FOLDER_PATTERN`.
lCsvFile = ExtractCsvFiles(dataSetRotoDir, folderNamePattern = DATA_FOLDER_PATTERN)
print(f'The number of file found: {len(lCsvFile)}')

# dfData = pd.read_csv(os.path.join(DATA_FOLDER_NAME, csvFileName))
# Load the files into a single data frame. `dAssetFile` is a dictionary, its keys are used later as the IDs to split by.
# NOTE(review): Presumably it maps each asset (`Sender ID`) to its file - confirm against `LoadCsvFilesDf()`.
dfData, dAssetFile = LoadCsvFilesDf(lCsvFile, baseFoldePath = '')
numRows, numCols = dfData.shape

# The last field prints the shape tuple of the unique values array, e.g. `(N,)`.
print(f"The number of rows (Samples): {numRows}, The number of columns: {numCols}, number of unique sender id's: {dfData['Sender ID'].unique().shape}")

In [None]:
# Convert time data into Pandas format
# `infer_datetime_format` is a boolean flag. The string 'True' only worked as any non empty string is truthy.
dfData['Transaction Time'] = pd.to_datetime(dfData['Transaction Time'], infer_datetime_format = True) #<! Stable time format

In [None]:
# Sort data by transaction date
# In place sort (Ascending). The index labels are kept, only the order of the rows changes.
dfData.sort_values('Transaction Time', inplace = True)

# Information about the Data Before Pre Processing

1. See the labeled cases.
2. Count the Labels data.
3. Number of unique assets.
4. Pandas' `info()` and `describe()`.

After this phase, the data is _read only_.

In [None]:
# Look at attack cases
# Display all the rows with `Label` equals 1.
dfData.loc[dfData['Label'] == 1, :]

In [None]:
# Number of samples per label value
dfData['Label'].value_counts()

In [None]:
# Number of unique values in `Sender ID`
len(dfData['Sender ID'].unique())

# Pre Processing

1. Remove invalid data.
2. Remove outliers.

In [None]:
# Detecting invalid `Amount USD`
# A transaction is invalid if its `Amount [USD]` is zero, missing (NaN) or an empty string.

dsInValidTrnsUsd = ((dfData['Amount [USD]'] == 0) | (dfData['Amount [USD]'].isna()) | (dfData['Amount [USD]'] == ''))

print(f'Number of invalid `Amount [USD]`: {dsInValidTrnsUsd.sum()}')

In [None]:
# Remove invalid data
# The rows are dropped in place, by their index labels.
# NOTE(review): `drop()` works by label. If the index holds duplicated labels (e.g. files concatenated without
# resetting the index) any valid row sharing a label with an invalid row is dropped as well - confirm the index is unique.
dfData.drop(dfData.index[dsInValidTrnsUsd], inplace = True) #<! Royi: Should we do a reset index?

In [None]:
# Detecting Outliers in the `Amount [USD]`
# An outlier is a transaction at or above `amountUsdOutlierThr`, or with a non positive amount.
# Zero amounts were already removed as invalid, so the lower bound only catches negative values here.

dsOutlierTrnsUsd = ((dfData['Amount [USD]'] >= amountUsdOutlierThr) | (dfData['Amount [USD]'] <= 0))

print(f'Number of outliers `Amount [USD]`: {dsOutlierTrnsUsd.sum()}')

In [None]:
# Remove outliers
# The rows are dropped in place, by their index labels.
# NOTE(review): `drop()` works by label, rows sharing a duplicated index label are dropped together - confirm the index is unique.
dfData.drop(dfData.index[dsOutlierTrnsUsd], inplace = True) #<! Royi: Should we do a reset index?

In [None]:
# From now on this is the data to work with
# Refresh the dimensions after the removal of the invalid rows and the outliers.
numRows, numCols = dfData.shape

print(f'The number of rows (Samples): {numRows}, The number of columns: {numCols}')

# Meet the Data

Basic information about the data.

## Feature Engineering

This section adds features and engineers them.  
Most features work on the `Sender ID` group.

#### Amount Based Features:

1. The STD of the user vs the average STD of all other users of the asset.
2. The Median of the user vs the average STD of all other users of the asset.
3. 

#### Date Based Features

1. The day of the week.
2. Weekend.
3. Hour of the day.
4. STD of the time difference of the user vs. the average of all other users.
5. Median of the time difference of the user vs. the average of all other users.

**Remark**: For wallets with a lot of activity we need to analyze the "activity hours" and profile it.


The features are:

 1. Day of the Week.

Remarks:

 *  Features x-y are time / frequency related.
 *  Features z-t are transaction related.


In [None]:
# Pre Process
# Build the grouping object which the feature cells below aggregate through (`AggBySender()` / `AggByReceiver()`).
# NOTE(review): `GrpBySender` is not defined in this notebook, presumably it comes from `DataSetsAuxFun` - confirm.

dfGbs = GrpBySender(dfData)

In [None]:
# Features - Amount Based
# The USD amount aggregated by sender, one aggregation per calculation type.
# The aggregations are evaluated in the listed order, all of them before any column is added to `dfData`.
lAmountCalcType = [CalcType.TYPE_SUM, CalcType.TYPE_MEAN, CalcType.TYPE_STD, CalcType.TYPE_VAR,
                   CalcType.TYPE_MEDIAN, CalcType.TYPE_COUNT, CalcType.TYPE_MIN, CalcType.TYPE_MAX]

sum_s, mean_s, std_s, var_s, median_s, count_s, min_s, max_s = [
    dfGbs.AggBySender(colName = dfGbs.amountUSDColLabel, grpLabel = None, calcType = amountCalcType)
    for amountCalcType in lAmountCalcType
]

# Add the features as columns, keeping the original order of insertion.
lAmountAssetFeature = [
    (SUM_ASSET, sum_s), (MEAN_ASSET, mean_s), (STD_ASSET, std_s), (VAR_ASSET, var_s),
    (MEDIAN_ASSET, median_s), (COUNT_ASSET, count_s), (MIN_ASSET, min_s), (MAX_ASSET, max_s),
]
for featureColName, dsFeature in lAmountAssetFeature:
    dfData[featureColName] = dsFeature


In [None]:
# Features - Time Based
# The time difference column aggregated by sender, one aggregation per calculation type.
# The aggregations are evaluated in the listed order, all of them before any column is added to `dfData`.
lTimeDiffCalcType = [CalcType.TYPE_TIME_DIFF_MEAN, CalcType.TYPE_TIME_DIFF_STD, CalcType.TYPE_TIME_DIFF_MEDIAN,
                     CalcType.TYPE_TIME_DIFF_MIN, CalcType.TYPE_TIME_DIFF_MAX]

td_mean_s, td_std_s, td_median_s, td_min_s, td_max_s = [
    dfGbs.AggBySender(colName = dfGbs.timeDiffAssetColLabel, grpLabel = None, calcType = timeDiffCalcType)
    for timeDiffCalcType in lTimeDiffCalcType
]

# Add the features as columns, keeping the original order of insertion.
lTimeAssetFeature = [
    (TD_MEAN_ASSET, td_mean_s), (TD_STD_ASSET, td_std_s), (TD_MEDIAN_ASSET, td_median_s),
    (TD_MIN_ASSET, td_min_s), (TD_MAX_ASSET, td_max_s),
]
for featureColName, dsFeature in lTimeAssetFeature:
    dfData[featureColName] = dsFeature

In [None]:
# Features - Time Based
# Calendar features taken from the time stamp of the transaction.

dsTrnsTime = dfData['Transaction Time']

dfData['Hour']    = dsTrnsTime.dt.hour      #<! Hour of the day
dfData['Weekday'] = dsTrnsTime.dt.dayofweek #<! Day of the week

In [None]:
# Features - Amount Based (User)
# The USD amount aggregated by receiver, one aggregation per calculation type.
# The aggregations are evaluated in the listed order, all of them before any column is added to `dfData`.
lAmountCalcType = [CalcType.TYPE_SUM, CalcType.TYPE_MEAN, CalcType.TYPE_STD, CalcType.TYPE_VAR,
                   CalcType.TYPE_MEDIAN, CalcType.TYPE_COUNT, CalcType.TYPE_MIN, CalcType.TYPE_MAX]

sum_s, mean_s, std_s, var_s, median_s, count_s, min_s, max_s = [
    dfGbs.AggByReceiver(colName = dfGbs.amountUSDColLabel, grpLabel = None, subGrpLabel = None, calcType = amountCalcType)
    for amountCalcType in lAmountCalcType
]

# Add the features as columns, keeping the original order of insertion.
lAmountUserFeature = [
    (SUM_USR, sum_s), (MEAN_USR, mean_s), (STD_USR, std_s), (VAR_USR, var_s),
    (MEDIAN_USR, median_s), (COUNT_USR, count_s), (MIN_USR, min_s), (MAX_USR, max_s),
]
for featureColName, dsFeature in lAmountUserFeature:
    dfData[featureColName] = dsFeature

In [None]:
# Features - Time Based (User)
# The time difference column aggregated by receiver, one aggregation per calculation type.
# The aggregations are evaluated in the listed order, all of them before any column is added to `dfData`.
lTimeDiffCalcType = [CalcType.TYPE_TIME_DIFF_MEAN, CalcType.TYPE_TIME_DIFF_STD, CalcType.TYPE_TIME_DIFF_MEDIAN,
                     CalcType.TYPE_TIME_DIFF_MIN, CalcType.TYPE_TIME_DIFF_MAX]

td_mean_s, td_std_s, td_median_s, td_min_s, td_max_s = [
    dfGbs.AggByReceiver(colName = dfGbs.timeDiffUserColLabel, grpLabel = None, subGrpLabel = None, calcType = timeDiffCalcType)
    for timeDiffCalcType in lTimeDiffCalcType
]

# Add the features as columns, keeping the original order of insertion.
lTimeUserFeature = [
    (TD_MEAN_USR, td_mean_s), (TD_STD_USR, td_std_s), (TD_MEDIAN_USR, td_median_s),
    (TD_MIN_USR, td_min_s), (TD_MAX_USR, td_max_s),
]
for featureColName, dsFeature in lTimeUserFeature:
    dfData[featureColName] = dsFeature

In [None]:
#### Time interval calculations for groups-subgroups
# For each sub group the feature is the time span (Last transaction - first transaction), in seconds.
# Approximately equivalent to:
## dfData.groupby(['Sender ID','Receiver ID'])['Transaction Time'].apply(lambda x: (x.max() - x.min())/ np.timedelta64(1, 's')).shape
# TODO: Probably should be added to the methods file.

# Explicit `dtype`: Without it recent Pandas versions create an `object` series.
# Rows which are not part of any sub group stay NaN.
ds_SentValue = pd.Series(index = dfGbs.dfData.index, dtype = float)
dsTrnsTime   = dfGbs.dfData['Transaction Time'] #<! The column is not modified in the loop, look it up once

# Assumes `lSubGrpUsrLabelIdx` is a list (Per group) of lists (Per sub group) of row selectors - TODO confirm.
for lSubGrpIdx in dfGbs.lSubGrpUsrLabelIdx:
    for vSubGrpIdx in lSubGrpIdx:
        ival = dsTrnsTime[vSubGrpIdx]
        ds_SentValue[vSubGrpIdx] = (ival.max() - ival.min()) / np.timedelta64(1, 's')

dfData['Time Interval'] = ds_SentValue


In [None]:
# Deep copy of the data with all the engineered features, this is the frame passed to the training cells below.
dfData_ = dfData.copy(deep=True)

In [None]:
def _SplitFeaturesLabel(dfSplit, lDropCols):
    """Return the features (Without `lDropCols`, `Amount [USD]` renamed to `Amount USD`) and the `Label` column of a split."""
    lFeatureCols = [colName for colName in dfSplit.columns if colName not in lDropCols]
    # `rename()` returns a new frame, so no in place modification of a slice.
    dfX = dfSplit[lFeatureCols].rename(columns = {'Amount [USD]': 'Amount USD'})

    return dfX, dfSplit['Label']


def ml_input_create(dfData, dAssetFile, categor_cols, num_cols , test_train_selection_proportion_ , selection_option = 'file_selection', use_categorical = True):
    """
    Create the train / test inputs of the ML models.

    Input:
      - dfData                           - The data frame with the engineered features. It is not modified.
      - dAssetFile                       - Dictionary, its keys are the IDs matched against `Sender ID` in `file_selection`.
      - categor_cols                     - The columns to convert into the `category` dtype (`file_selection` only).
      - num_cols                         - The columns to standardize (`file_selection` only).
      - test_train_selection_proportion_ - The train share: fraction of the IDs (`file_selection`) or of the rows (`simple_selection`).
      - selection_option                 - 'file_selection': Split by IDs, all rows of an ID are in the same split.
                                           'simple_selection': Stratified (By `Label`) random split of the rows.
      - use_categorical                  - Keep the categorical columns in the features (`file_selection` only,
                                           `simple_selection` always removes them).
    Output:
      - X_train, Y_train, X_test, Y_test.
    Raises:
      - ValueError - Unknown `selection_option`, or no row matched the IDs drawn for the train set.
    Remarks:
      - The missing values of the non ID / non categorical columns are set to 0.
      - The train IDs are sampled without replacement, so the train set holds the requested share of distinct IDs.
      - The scaler is fitted on the train rows only and then applied to the test rows.
    """

    lIdCols  = ['Transaction ID', 'Transaction Time', 'Sender ID', 'Receiver ID', 'Currency Hash', 'tx_hash', 'Label']
    lCatCols = ['Currency', 'Currency Type', 'Receiver Type']

    if selection_option not in ('file_selection', 'simple_selection'):
        raise ValueError(f'Unknown `selection_option`: {selection_option}')

    dfData = dfData.copy() #<! Keep the data frame of the caller intact

    # The original `fillna(inplace = True)` was applied to a temporary copy, hence had no effect.
    lFillCols = [colName for colName in dfData.columns if colName not in (lIdCols + lCatCols)]
    if lFillCols:
        dfData[lFillCols] = dfData[lFillCols].fillna(0)

    if selection_option == 'file_selection':
        # Randomly select the IDs of the train set, the rest of the IDs are the test set
        all_ids  = list(dAssetFile.keys())
        numTrain = max(0, min(len(all_ids), int(test_train_selection_proportion_ * len(all_ids))))
        sTrainId = set(random.sample(all_ids, k = numTrain)) #<! Without replacement (No duplicated IDs)
        lTestId  = [currId for currId in all_ids if currId not in sTrainId]

        # Converted before the split so both splits share the same categories
        for cat_col in categor_cols:
            dfData[cat_col] = dfData[cat_col].astype('category')

        dfData_train = dfData[dfData['Sender ID'].isin(sTrainId)].copy()
        dfData_test  = dfData[dfData['Sender ID'].isin(lTestId)].copy()

        if len(dfData_train) == 0:
            raise ValueError('No row of the data matches the IDs selected for the train set')

        # Fit on the train set only to prevent leakage of the test set statistics
        scaler = StandardScaler()
        dfData_train[num_cols] = scaler.fit_transform(dfData_train[num_cols])
        if len(dfData_test) > 0:
            dfData_test[num_cols] = scaler.transform(dfData_test[num_cols])

        lDropCols = lIdCols if use_categorical else (lIdCols + lCatCols)

    else:
        # Stratified split as the data is imbalanced
        dfData_train, dfData_test = train_test_split(dfData, test_size = (1 - test_train_selection_proportion_), random_state = seedNum, stratify = dfData[['Label']])

        lDropCols = lIdCols + lCatCols

    X_train, Y_train = _SplitFeaturesLabel(dfData_train, lDropCols)
    X_test, Y_test   = _SplitFeaturesLabel(dfData_test, lDropCols)

    return X_train, Y_train, X_test, Y_test


    

In [None]:
@timer
def model_train(x,y,eval = None, model_ = 'xgboost'):
    """
    Train a classifier.

    Input:
      - x      - The features of the train set.
      - y      - The labels of the train set.
      - eval   - Evaluation set passed to CatBoost as `eval_set` (Used by 'catboost' only).
      - model_ - The model type: 'xgboost', 'svm', 'nn' or 'catboost'.
    Output:
      - The fitted model.
    Raises:
      - ValueError - Unknown `model_`.
    """
    if model_ == 'xgboost':
        # NOTE(review): `gpu_hist` presumably requires a GPU enabled XGBoost build - confirm for CPU only machines.
        model = XGBClassifier(tree_method="gpu_hist", random_state=seedNum, enable_categorical=True)
        model.fit(x, y)

    elif model_ == 'svm':
        model = SVC(kernel='linear')
        model.fit(x, y)

    elif model_ == 'nn':
        # Imported locally as the notebook does not import it at the top (It used to raise `NameError`)
        from sklearn.neural_network import MLPClassifier

        n_vars = x.shape[1]
        model = MLPClassifier(solver='lbfgs', alpha=1e-5,  hidden_layer_sizes=(n_vars, 2*n_vars), random_state=1)
        model.fit(x, y)

    elif model_ == 'catboost':
        train_data = cb.Pool(x, y, cat_features=['Currency', 'Currency Type', 'Receiver Type'])
        model = CatBoostClassifier(iterations=10)
        model.fit(train_data, eval_set=eval)

    else:
        # Without it an unknown model ended with `UnboundLocalError` on the return
        raise ValueError(f'Unknown `model_`: {model_}')

    return model



@timer
def model_inference(x, model):
    """
    Run the inference of a trained model.

    Input:
      - x     - The input of the `predict()` method of the model.
      - model - A fitted model.
    Output:
      - The output of `model.predict(x)`.
    """
    yPred = model.predict(x)

    return yPred

In [None]:
def k_fold_training(k_fold, dfData, dAssetFile, categor_cols, num_cols , test_train_selection_proportion_ ,selection_option = 'file_selection', model_ = 'xgboost'):
    """
    Train a model on several random train / test splits and keep the best one.

    Input:
      - k_fold                           - The number of random splits to train on (Repeated hold out, not a partition into folds).
      - dfData                           - The data frame with the engineered features.
      - dAssetFile                       - Dictionary of the IDs, see `ml_input_create()`.
      - categor_cols                     - The categorical columns, see `ml_input_create()`.
      - num_cols                         - The numeric columns, see `ml_input_create()`.
      - test_train_selection_proportion_ - The train share of the split, see `ml_input_create()`.
      - selection_option                 - The split method, see `ml_input_create()`.
      - model_                           - The model type: 'xgboost', 'svm', 'nn' or 'catboost'.
    Output:
      - The list `[F1 score, model, X_train, Y_train, X_test, Y_test]` of the split with the
        highest F1 score on its test set, or an empty list for a non positive `k_fold`.
    Raises:
      - ValueError - Unknown `model_`.
    Remarks:
      - All models are trained on the train set and scored on the test set. The 'xgboost', 'svm' and
        'nn' models used to be trained on the test set itself, which made their score meaningless.
      - The splits are returned for all models (Used to be 'xgboost' only), so the best model
        can be evaluated on its matching test set. The first 2 items are the same as before.
      - The first split is always kept, hence a model is returned even when all F1 scores are 0.
    """

    # Tree based models work with the categorical columns, the others use the numeric columns only
    dUseCategorical = {'xgboost': True, 'catboost': True, 'svm': False, 'nn': False}
    if model_ not in dUseCategorical:
        raise ValueError(f'Unknown `model_`: {model_}')

    bestScore = None
    r = []
    for _ in range(k_fold):
        X_train, Y_train, X_test, Y_test = ml_input_create(dfData, dAssetFile, categor_cols, num_cols, test_train_selection_proportion_, selection_option = selection_option, use_categorical = dUseCategorical[model_])

        if model_ == 'catboost':
            # CatBoost gets the test set as a `Pool` for both the evaluation set and the inference
            eval_data = cb.Pool(X_test, Y_test, cat_features=['Currency', 'Currency Type', 'Receiver Type'])
            model  = model_train(X_train, Y_train, eval = eval_data, model_ = model_)
            y_pred = model_inference(eval_data, model)
        else:
            model  = model_train(X_train, Y_train, eval = None, model_ = model_)
            y_pred = model_inference(X_test, model)

        _, _, f1Score = model_scores(y_pred, Y_test)
        if (bestScore is None) or (f1Score > bestScore):
            bestScore = f1Score
            r = [bestScore, model, X_train, Y_train, X_test, Y_test]

    return r



In [None]:
# Train CatBoost on 10 random splits, keep the best model and score it on the `sasa` / `sama` test sets.
# NOTE(review): This cell can not run as is:
#  - `k_fold_training()` has 6 positional parameters before `selection_option`. Here `dfSasaSama` is passed as a
#    7th positional argument, so it binds to `selection_option` which is also given by keyword -> `TypeError`.
#  - `dfSasaSama`, `X_test_sasa`, `Y_test_sasa`, `X_test_sama` and `Y_test_sama` are not defined in any visible cell.
#r, X_test_sasa, Y_test_sasa, X_test_sama, Y_test_sama = k_fold_training(20, dfData_, dAssetFile, categor_cols, num_cols , test_train_selection_proportion_ ,dfSasaSama ,selection_option = 'file_selection', model_ = 'xgboost')
#r, X_test_sasa, Y_test_sasa, X_test_sama, Y_test_sama = k_fold_training(10, dfData, dAssetFile, categor_cols, num_cols , test_train_selection_proportion_ ,dfSasaSama ,selection_option = 'file_selection', model_ = 'svm') ### <<-- forever
r  = k_fold_training(10, dfData_, dAssetFile, categor_cols, num_cols , test_train_selection_proportion_ ,dfSasaSama ,selection_option = 'file_selection', model_ = 'catboost')
print(r)
# The 2nd item of the result is the best model.
best_model_ = r[1]
y_pred_sasa = model_inference(X_test_sasa, best_model_)#best_model_.predict(X_test_sasa)
y_pred_sama = model_inference(X_test_sama, best_model_)#best_model_.predict(X_test_sama)

#print_scores(y_pred_xgboost, Y_test)
print('sasa scores : ')
print_scores(y_pred_sasa, Y_test_sasa)
print('sama scores : ')
print_scores(y_pred_sama, Y_test_sama)

In [None]:
from DataSetsAuxFun_ import *

In [None]:
# Display the data frame used for the training
dfData_

In [None]:
# Score the best model on the test set.
# NOTE(review): `X_test` / `Y_test` are not defined at the notebook level in any visible cell
# (They are local to the functions above) - confirm where they come from.
y_ = model_inference(X_test, best_model_)
print_scores(y_, Y_test)


In [None]:
# Summary of the classification results (Ground truth labels first, then the predictions).
# NOTE(review): `GenClassifierSummaryResults` is not defined in this notebook, presumably it comes from `DataSetsAuxFun_` - confirm.
GenClassifierSummaryResults(Y_test, y_ )

In [None]:
# Train XGBoost on 10 random splits (Split by IDs) and keep the best model.
r = k_fold_training(10, dfData_, dAssetFile, categor_cols, num_cols , test_train_selection_proportion_ ,selection_option = 'file_selection', model_ = 'xgboost')
# The first 2 items of the result are the best F1 score and the matching model.
print(r[0:2])
best_model_ = r[1]


In [None]:
# Save the best model to a file in the working directory.
from joblib import dump, load
dump(best_model_, 'model_old_878.joblib')

In [None]:
# Load a saved model and score it on the test set of the XGBoost run.
# NOTE(review): The file loaded here (`model_old.joblib`) is not the file saved in the previous cell
# (`model_old_878.joblib`) - confirm it is intentional.
loaded = load('model_old.joblib') 

# `r[-2]` / `r[-1]` are the `X_test` / `Y_test` items at the end of the list returned by the XGBoost run.
y_pred = model_inference(r[-2], loaded)#best_model_.predict(X_test_sasa)

print('scores : ')
print_scores(y_pred, r[-1])
