[![CyVers](https://i.imgur.com/yyhmZET.png)](https://www.cyvers.ai/)

# BlockChain Attack Data Set - Exploratory Data Analysis (EDA)

> Notebook by:
> - Royi Avital Royi@cyvers.ai

## Revision History

| Version | Date       | Content / Changes                                   |
|---------|------------|-----------------------------------------------------|
| 0.1.000 | 30/06/2022 | First version                                       |
| 0.2.000 | 25/07/2022 | Added model selection and visualization of features |
|         |            |                                                     |

In [None]:
# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Misc
import datetime
import os
from platform import python_version
import random
import warnings

# EDA Tools
import ppscore as pps #<! See https://github.com/8080labs/ppscore -> pip install git+https://github.com/8080labs/ppscore.git

# Machine Learning
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
# from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import confusion_matrix, fbeta_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedGroupKFold, train_test_split

# Ensemble Engines
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Visualization
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from bokeh.plotting import figure, show

# Jupyter
from ipywidgets import interact, Dropdown, Layout

In [None]:
# Configuration
%matplotlib inline

warnings.filterwarnings("ignore")

seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

sns.set_theme() #>! Apply SeaBorn theme

In [None]:
# Constants

DATA_FOLDER_NAME    = 'BlockChainAttacksDataSet'
DATA_FOLDER_PATTERN = 'DataSet001'
DATA_FILE_EXT       = 'csv'

PROJECT_DIR_NAME = 'CyVers' #<! Royi: Anton, don't change it, it should be a team constant

# The project root is everything in the current working directory up to (and including) the first occurrence of `PROJECT_DIR_NAME`
cwdPath         = os.getcwd()
projectDirIdx   = cwdPath.find(PROJECT_DIR_NAME)
if projectDirIdx < 0:
    #<! `str.find()` returns -1 on a miss; slicing with it would silently drop the last character of the path and yield a bogus folder
    raise FileNotFoundError(f'The notebook must be run from within the `{PROJECT_DIR_NAME}` folder, the current folder is: {cwdPath}')
PROJECT_DIR_PATH = os.path.join(cwdPath[:projectDirIdx], PROJECT_DIR_NAME) #>! Pay attention, it will create issues in cases you name the folder `CyVersMe` or anything after / before `CyVers`

# Feature extractors constants
# Each constant is the name of a feature column in the data frame

# Asset - Statistics of the amount
SUM_ASSET       = 'SUM (Asset)'
MEAN_ASSET      = 'MEAN (Asset)'
STD_ASSET       = 'STD (Asset)'
VAR_ASSET       = 'VAR (Asset)'
MEDIAN_ASSET    = 'MEDIAN (Asset)'
COUNT_ASSET     = 'COUNT (Asset)'
MIN_ASSET       = 'MIN (Asset)'
MAX_ASSET       = 'MAX (Asset)'
# Asset - Statistics of the time difference
TD_MEAN_ASSET   = 'TD_MEAN (Asset)'
TD_STD_ASSET    = 'TD_STD (Asset)'
TD_MEDIAN_ASSET = 'TD_MEDIAN (Asset)'
TD_MIN_ASSET    = 'TD_MIN (Asset)'
TD_MAX_ASSET    = 'TD_MAX (Asset)'

# User - Statistics of the amount
SUM_USR         = 'SUM (User)'
MEAN_USR        = 'MEAN (User)'
STD_USR         = 'STD (User)'
VAR_USR         = 'VAR (User)'
MEDIAN_USR      = 'MEDIAN (User)'
COUNT_USR       = 'COUNT (User)'
MIN_USR         = 'MIN (User)'
MAX_USR         = 'MAX (User)'
# User - Statistics of the time difference
TD_MEAN_USR     = 'TD_MEAN (User)'
TD_STD_USR      = 'TD_STD (User)'
TD_MEDIAN_USR   = 'TD_MEDIAN (User)'
TD_MIN_USR      = 'TD_MIN (User)'
TD_MAX_USR      = 'TD_MAX (User)'

# Time stamp related
HOUR            = 'Hour'
WEEKDAY         = 'Weekday'
TIME_INTRVL     = 'Time Interval'

# Training modes (Compared against `trainMode`)
TRAIN_BY_TSX    = 1
TRAIN_BY_FILES  = 2

test_train_selection_proportion_ = 0.7

# List of the numeric columns
num_cols = [
    # Raw amounts
    'Amount', 'Amount [USD]',
    # Asset - Amount
    SUM_ASSET, MEAN_ASSET, STD_ASSET, VAR_ASSET, MEDIAN_ASSET, COUNT_ASSET, MIN_ASSET, MAX_ASSET,
    # Asset - Time difference
    TD_MEAN_ASSET, TD_STD_ASSET, TD_MEDIAN_ASSET, TD_MIN_ASSET, TD_MAX_ASSET,
    # User - Amount
    SUM_USR, MEAN_USR, STD_USR, VAR_USR, MEDIAN_USR, COUNT_USR, MIN_USR, MAX_USR,
    # User - Time difference
    TD_MEAN_USR, TD_STD_USR, TD_MEDIAN_USR, TD_MIN_USR, TD_MAX_USR,
    # Time stamp
    HOUR, WEEKDAY, TIME_INTRVL,
]
# List of the categorical columns
categor_cols = ['Currency', 'Currency Type', 'Receiver Type']

# Names of the attack related columns
numAttacksColName = 'Number of Attacks'
attackTypeColName = 'Attack Type'

In [None]:
# CyVers Packages
from DataSetsAuxFun import *

In [None]:
# Parameters

# Reproducibility (Handed to the splitters and to the PPS analysis)
randomState = 42

# Data location: the data set folder lives directly under the project folder
dataSetRotoDir = os.path.join(PROJECT_DIR_PATH, DATA_FOLDER_NAME)

# Pre Processing: `Amount [USD]` values at or above this threshold are treated as outliers
amountUsdOutlierThr = 1e9

# Features Analysis: number of cross validation folds of the PPS calculation
numCrossValPps = 4

# Training
trainMode       = TRAIN_BY_FILES
numKFolds       = 3
gridSearchScore = 'f1' #<! Use strings from `sklearn.metrics.get_scorer_names()`

In [None]:
# Loading / Generating Data
# Collect the CSV files of the data set and merge them into a single data frame
lCsvFile = ExtractCsvFiles(dataSetRotoDir, folderNamePattern = DATA_FOLDER_PATTERN)
print(f'The number of file found: {len(lCsvFile)}')

# dfData = pd.read_csv(os.path.join(DATA_FOLDER_NAME, csvFileName))
dfData, dAssetFile = LoadCsvFilesDf(lCsvFile, baseFoldePath = '')
numRows, numCols = dfData.shape
numSenders = dfData['Sender ID'].nunique(dropna = False) #<! A scalar count, `.unique().shape` would be printed as a tuple, e.g. `(123,)`

print(f"The number of rows (Samples): {numRows}, The number of columns: {numCols}, number of unique sender id's: {numSenders}")
print(f'The data list of columns is: {dfData.columns} with {len(dfData.columns)} columns')

In [None]:
# Convert time data into Pandas format
# The flag is a boolean; the string `'True'` only worked because any non empty string is truthy.
# NOTE(review): `infer_datetime_format` is deprecated as of Pandas 2.0 - consider dropping it once the team upgrades.
dfData['Transaction Time'] = pd.to_datetime(dfData['Transaction Time'], infer_datetime_format = True) #<! Stable time format

In [None]:
# Order the transactions chronologically (In place, the index labels are kept as is)
dfData.sort_values(by = 'Transaction Time', inplace = True)
# dfData.reset_index(drop = True, inplace = True)

# Pre Processing

1. Remove invalid data.
2. Remove outliers.

In [None]:
# Detecting invalid `Amount USD`
# A transaction is invalid when its USD amount is zero, missing or an empty string

dsAmountUsd      = dfData['Amount [USD]']
dsInValidTrnsUsd = (dsAmountUsd == 0) | dsAmountUsd.isna() | (dsAmountUsd == '')

print(f'Number of invalid `Amount [USD]`: {dsInValidTrnsUsd.sum()}')

In [None]:
# Remove invalid data (In place, by the index labels of the flagged rows)
# NOTE(review): dropping by label removes every row which shares a label - confirm the index built by `LoadCsvFilesDf()` is unique
dfData.drop(index = dfData.index[dsInValidTrnsUsd], inplace = True) #<! Royi: Should we do a reset index?

In [None]:
# Detecting Outliers in the `Amount [USD]`
# An outlier is an amount at or above the threshold, or a non positive amount

dsAmountUsd      = dfData['Amount [USD]']
dsOutlierTrnsUsd = (dsAmountUsd >= amountUsdOutlierThr) | (dsAmountUsd <= 0)

print(f'Number of outliers `Amount [USD]`: {dsOutlierTrnsUsd.sum()}')

In [None]:
# Remove outliers (In place, by the index labels of the flagged rows)
# NOTE(review): dropping by label removes every row which shares a label - confirm the index is unique at this point
dfData.drop(index = dfData.index[dsOutlierTrnsUsd], inplace = True) #<! Royi: Should we do a reset index?

In [None]:
# From now on this is the data to work with
# Refresh the dimensions after the invalid samples and the outliers were removed
numRows = dfData.shape[0]
numCols = dfData.shape[1]

print(f'The number of rows (Samples): {numRows}, The number of columns: {numCols}')

## Feature Engineering

This section adds features and engineers them.  
Most features work on the `Sender ID` group.

#### Amount Based Features:

1. The STD of the user vs the average STD of all other users of the asset.
2. The Median of the user vs the average STD of all other users of the asset.
3. 

#### Date Based Features

1. The day of the week.
2. Weekend.
3. Hour of the day.
4. STD of the time difference of the user vs. the average of all other users.
5. Median of the time difference of the user vs. the average of all other users.

**Remark**: For wallets with a lot of activity we need to analyze the "activity hours" and profile it.


The features are:

 1. Day of the Week.

Remarks:

 *  Features x-y are time / frequency related.
 *  Features z-t are transaction related.


In [None]:
# Pre Process
# Wrap the cleaned data frame with the project's `GrpBySender` helper; the feature cells below query this object.
# NOTE(review): presumably it groups the transactions by `Sender ID` - confirm in `DataSetsAuxFun`.

dfGbs = GrpBySender(dfData)

In [None]:
# Sanity check (1 / 3): the SUM of `Amount [USD]` as calculated by `_AnalyseRecieverId()`
sum_s       = dfGbs._AnalyseRecieverId(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, subGrpLabel = None, calcType = CalcType.TYPE_SUM)

In [None]:
# Sanity check (2 / 3): the same calculation, with the same arguments, by the `_AnalyseRecieverIdRoyi()` variant
sum_s_       = dfGbs._AnalyseRecieverIdRoyi(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, subGrpLabel = None, calcType = CalcType.TYPE_SUM)

In [None]:
# Sanity check (3 / 3): `True` when both implementations agree up to NumPy's default tolerances
np.allclose(sum_s, sum_s_)

In [None]:
# Features - Amount Based
# The string below is a no op reminder of the available calculation types
'''    TYPE_SUM     TYPE_MEAN            TYPE_STD              TYPE_VAR                    TYPE_MEDIAN           TYPE_COUNT                  TYPE_MIN              TYPE_MAX                    '''

# Arguments shared by all the `_SentValue()` calls of this cell
dSentValueArgs = {'amountCol': AmountType.AMOUNT_USD, 'tokenId': None, 'grpLabel': None}

# Calculate all the enabled statistics first, attach them to the data frame afterwards
sum_s       = dfGbs._SentValue(calcType = CalcType.TYPE_SUM, **dSentValueArgs)
mean_s      = dfGbs._SentValue(calcType = CalcType.TYPE_MEAN, **dSentValueArgs)
# Disabled statistics:
# std_s       = dfGbs._SentValue(calcType = CalcType.TYPE_STD, **dSentValueArgs)
# var_s       = dfGbs._SentValue(calcType = CalcType.TYPE_VAR, **dSentValueArgs)
# median_s    = dfGbs._SentValue(calcType = CalcType.TYPE_MEDIAN, **dSentValueArgs)
# count_s     = dfGbs._SentValue(calcType = CalcType.TYPE_COUNT, **dSentValueArgs)
# min_s       = dfGbs._SentValue(calcType = CalcType.TYPE_MIN, **dSentValueArgs)
# max_s       = dfGbs._SentValue(calcType = CalcType.TYPE_MAX, **dSentValueArgs)

dfData[SUM_ASSET]     = sum_s
dfData[MEAN_ASSET]    = mean_s
# dfData[STD_ASSET]     = std_s
# dfData[VAR_ASSET]     = var_s
# dfData[MEDIAN_ASSET]  = median_s
# dfData[COUNT_ASSET]   = count_s
# dfData[MIN_ASSET]     = min_s
# dfData[MAX_ASSET]     = max_s

In [None]:
# Features - Time Based
# All the calculations of this cell are currently disabled (Commented out).
# The bare string below lists the available time difference calculation types.
# NOTE(review): being the only expression of the cell, the notebook is expected to echo it as the cell's output.
'TYPE_TIME_DIFF_MEAN      TYPE_TIME_DIFF_STD TYPE_TIME_DIFF_MEDIAN  TYPE_TIME_DIFF_MIN      TYPE_TIME_DIFF_MAX'      
# td_mean_s   = dfGbs._SentValue(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MEAN)
# td_std_s    = dfGbs._SentValue(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_STD)
# td_median_s = dfGbs._SentValue(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MEDIAN)
# td_min_s    = dfGbs._SentValue(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MIN)
# td_max_s    = dfGbs._SentValue(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MAX)

# dfData[TD_MEAN_ASSET]   = td_mean_s
# dfData[TD_STD_ASSET]    = td_std_s
# dfData[TD_MEDIAN_ASSET] = td_median_s
# dfData[TD_MIN_ASSET]    = td_min_s
# dfData[TD_MAX_ASSET]    = td_max_s

In [None]:
# Features - Time Based
# Both features are derived from `Transaction Time`.
# The column names come from the shared constants so they stay in sync with `num_cols`.

dfData[HOUR]    = dfData['Transaction Time'].dt.hour      #<! Hour of the day
dfData[WEEKDAY] = dfData['Transaction Time'].dt.dayofweek #<! Day of the week (Monday is 0)

In [None]:
# Features - Amount Based (User)
# The string below is a no op reminder of the available calculation types
'''    TYPE_SUM     TYPE_MEAN            TYPE_STD              TYPE_VAR                    TYPE_MEDIAN           TYPE_COUNT                  TYPE_MIN              TYPE_MAX                    '''

# Arguments shared by all the `_AnalyseRecieverId()` calls of this cell
dAnalyseRcvArgs = {'amountCol': AmountType.AMOUNT_USD, 'tokenId': None, 'grpLabel': None, 'subGrpLabel': None}

# Calculate all the enabled statistics first, attach them to the data frame afterwards
sum_s       = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_SUM, **dAnalyseRcvArgs)
mean_s      = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_MEAN, **dAnalyseRcvArgs)
# Disabled statistics:
# std_s       = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_STD, **dAnalyseRcvArgs)
# var_s       = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_VAR, **dAnalyseRcvArgs)
# median_s    = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_MEDIAN, **dAnalyseRcvArgs)
# count_s     = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_COUNT, **dAnalyseRcvArgs)
# min_s       = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_MIN, **dAnalyseRcvArgs)
# max_s       = dfGbs._AnalyseRecieverId(calcType = CalcType.TYPE_MAX, **dAnalyseRcvArgs)

dfData[SUM_USR]     = sum_s
dfData[MEAN_USR]    = mean_s
# dfData[STD_USR]     = std_s
# dfData[VAR_USR]     = var_s
# dfData[MEDIAN_USR]  = median_s
# dfData[COUNT_USR]   = count_s
# dfData[MIN_USR]     = min_s
# dfData[MAX_USR]     = max_s

In [None]:
# Features - Time Based (User)
# All the calculations of this cell are currently disabled (Commented out).
# The bare string below lists the available time difference calculation types.
# NOTE(review): being the only expression of the cell, the notebook is expected to echo it as the cell's output.
'TYPE_TIME_DIFF_MEAN      TYPE_TIME_DIFF_STD TYPE_TIME_DIFF_MEDIAN  TYPE_TIME_DIFF_MIN      TYPE_TIME_DIFF_MAX'      
# td_mean_s   = dfGbs._AnalyseRecieverId(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, subGrpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MEAN)
# td_std_s    = dfGbs._AnalyseRecieverId(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, subGrpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_STD)
# td_median_s = dfGbs._AnalyseRecieverId(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, subGrpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MEDIAN)
# td_min_s    = dfGbs._AnalyseRecieverId(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, subGrpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MIN)
# td_max_s    = dfGbs._AnalyseRecieverId(amountCol = AmountType.AMOUNT_USD, tokenId = None, grpLabel = None, subGrpLabel = None, calcType = CalcType.TYPE_TIME_DIFF_MAX)

# dfData[TD_MEAN_USR]   = td_mean_s
# dfData[TD_STD_USR]    = td_std_s
# dfData[TD_MEDIAN_USR] = td_median_s
# dfData[TD_MIN_USR]    = td_min_s
# dfData[TD_MAX_USR]    = td_max_s

## Display Features

In [None]:
# Selected features for analysis
# The full list of candidates (Kept for reference):
# lSlctdFeatures  = ['Amount [USD]', 'Receiver Type', 'Label', 'SUM (Asset)', 'MEAN (Asset)',
#        'STD (Asset)', 'VAR (Asset)', 'MEDIAN (Asset)', 'COUNT (Asset)',
#        'MIN (Asset)', 'MAX (Asset)', 'TD_MEAN (Asset)', 'TD_STD (Asset)',
#        'TD_MEDIAN (Asset)', 'TD_MIN (Asset)', 'TD_MAX (Asset)', 'Hour',
#        'Weekday', 'SUM (User)', 'MEAN (User)', 'STD (User)', 'VAR (User)',
#        'MEDIAN (User)', 'COUNT (User)', 'MIN (User)', 'MAX (User)',
#        'TD_MEAN (User)', 'TD_STD (User)', 'TD_MEDIAN (User)', 'TD_MIN (User)',
#        'TD_MAX (User)', 'Time Interval', 'TrnsFrequency [Hz] (User)', 'Amount Ratio', 'Time Ratio']

# The subset which is actually in use (Only the features calculated above)
lSlctdFeatures = [
    'Amount [USD]',
    'Hour',
    'Weekday',
    'SUM (Asset)',
    'MEAN (Asset)',
    'SUM (User)',
    'MEAN (User)',
]
numFeatures = len(lSlctdFeatures)

In [None]:
# Display the Scatter of the Features
# Interactive view: the dropdown picks one of `lSlctdFeatures`, which is handed to `DisplayScatterFeature()` as `yColName`.

oDropdwon = Dropdown(
    options     = lSlctdFeatures,
    value       = 'Amount [USD]', #<! Initial selection
    description = 'Select Feature:',
    style       = {'description_width' : 'initial'}
)

# The plot is redrawn on each change of the dropdown selection
interact(lambda yColName: DisplayScatterFeature(dfData, 'Label', yColName, 'Suspicious'), yColName = oDropdwon)

In [None]:
# Display the Density of the Features
# Interactive view: the dropdown picks one of `lSlctdFeatures`, which is handed to `DisplayKdeFeature()` as `yColName`.

oDropdwon = Dropdown(
    options     = lSlctdFeatures,
    value       = 'Amount [USD]', #<! Initial selection
    description = 'Select Feature:',
    style       = {'description_width' : 'initial'}
)

# The plot is redrawn on each change of the dropdown selection
interact(lambda yColName: DisplayKdeFeature(dfData, yColName, 'Label', 'Suspicious'), yColName = oDropdwon)

In [None]:
# Pre Processing Data
# Non finite values: +/- Inf -> NaN -> 0 (In place, over the whole data frame)
dfData.replace(to_replace = [np.inf, -np.inf], value = np.nan, inplace = True)
dfData.fillna(value = 0, inplace = True)
# The features matrix is an independent copy of the selected columns
dfX = dfData.loc[:, lSlctdFeatures].copy()

In [None]:
# Input Data for Classifier

vY = dfData['Label'].to_numpy() #<! Labels vector
mX = dfX.to_numpy()             #<! Features matrix (Raw)

# Scaling the data: standardize the features with the statistics of the whole data set
# NOTE(review): the scaler is fitted before the cross validation split, so the folds share its statistics - confirm this is intended
hStdScaler = StandardScaler()
hStdScaler.fit(mX)
mX = hStdScaler.transform(mX)

### Display the PPS Matrix

The idea is to see the relationship between the features not by a linear correlation but by the ability to predict them.

In [None]:
# Creating the DF for the PPS analysis
# The selected features and the label, with the label flagged as a categorical variable
dfTmp = dfData[[*lSlctdFeatures, 'Label']].copy()
dfTmp['Label'] = pd.Categorical(dfTmp['Label'])

In [None]:
# Feature PPS - Which features are important?

# Pay attention, cross validation is K-Fold -> Don't over split the data
dfPps = pps.matrix(dfTmp, cross_validation = numCrossValPps, random_seed = randomState) #<! `Label` should be a categorical variable
# Long format (x, y, ppscore) -> Matrix with the predictor (x) on the columns and the target (y) on the rows
mPPS = dfPps[['x', 'y', 'ppscore']].pivot(index = 'y', columns = 'x', values = 'ppscore')

# Visualization of PPS
hF, hA = plt.subplots(figsize = (30, 30))
sns.heatmap(mPPS, ax = hA, annot = True, fmt = '.2f', cmap = plt.get_cmap('coolwarm'), vmin = 0, vmax = 1, cbar = False)

plt.setp(hA.get_xticklabels(), ha = "center", rotation = 45)
plt.setp(hA.get_yticklabels(), rotation = 'horizontal')
hA.set_title('Predictive Power Score (PPS)')

In [None]:
# Validate the columns are legit: show the names of the columns of the features data frame
dfX.columns

## Model Training and Evaluation

### Data Split

In [None]:

# Build the K-Fold splitter according to the training mode
if trainMode != TRAIN_BY_FILES:
    # Plain stratified folds, any sample may land in any fold
    hKFoldSplt  = StratifiedKFold(n_splits = numKFolds, shuffle = True, random_state = randomState)
    gKFoldSplit = hKFoldSplt.split(mX, vY)
else:
    # Stratified folds grouped by `Sender ID`
    hKFoldSplt  = StratifiedGroupKFold(n_splits = numKFolds, shuffle = True, random_state = randomState)
    gKFoldSplit = hKFoldSplt.split(mX, vY, groups = dfData['Sender ID'])


### Model Training

In [None]:
# A single step (`clf`) pipeline; the grid search swaps the estimator of this step between the candidates
skPipeline      = Pipeline(steps = [('clf', XGBClassifier())])
lClassifiers    = [XGBClassifier(), LGBMClassifier()] #<! Candidate models, default hyper parameters
dPipelineParams = {'clf': lClassifiers}

In [None]:
# Model selection by a cross validated grid search over the candidate classifiers
gridSearchCv = GridSearchCV(estimator = skPipeline, param_grid = dPipelineParams, scoring = gridSearchScore, cv = hKFoldSplt)

# The `groups` vector is handed to `fit()` only in the `TRAIN_BY_FILES` mode (Group aware splitter)
dFitArgs      = {'groups': dfData['Sender ID']} if trainMode == TRAIN_BY_FILES else {}
gridSearchCvF = gridSearchCv.fit(mX, vY, **dFitArgs)
