[![CyVers](https://i.imgur.com/yyhmZET.png)](https://www.cyvers.ai/)

# SASA and SAMA Data Set - Exploratory Data Analysis (EDA)

> Notebook by:
> - Royi Avital Royi@cyvers.ai

## Revision History

| Version | Date       | Name            | Content / Changes     |
|---------|------------|-----------------|-----------------------|
| 1.0.000 | 14/07/2022 | Royi Avital     | First version         |
|         |            |                 |                       |

In [None]:
# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Miscellaneous
import datetime
import os
from platform import python_version
import random
import warnings

# EDA Tools
import ppscore as pps #<! See https://github.com/8080labs/ppscore -> pip install git+https://github.com/8080labs/ppscore.git

# Machine Learning
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
# from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree

# Metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, fbeta_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Ensemble Engines
import lightgbm
from xgboost import XGBClassifier

# Visualization
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from bokeh.plotting import figure, show

# Jupyter
from ipywidgets import interact, Dropdown, Layout

In [None]:
# Configuration
# IPython magic: render Matplotlib figures inline in the notebook output
%matplotlib inline

# Silences every warning for the rest of the session (Including deprecation warnings of the imported packages)
warnings.filterwarnings("ignore")

# The same seed feeds both NumPy's global RNG and Python's `random` module
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

sns.set_theme() #<! Apply SeaBorn theme

In [None]:
# Constants

# DATA_FOLDER_NAME    = 'BlockChainAttacksDataSet'
DATA_FOLDER_NAME    = 'DataSet'
DATA_FOLDER_PATTERN = 'DataSet'
DATA_FILE_EXT       = 'csv'

PROJECT_DIR_NAME = 'CyVers' #<! Royi: Anton, don't change it, it should be a team constant

# Project root: The working directory cut right before the first occurrence of `PROJECT_DIR_NAME`, with the name appended back.
# `str.find()` returns -1 when the name is not part of the path. Slicing with -1 would silently drop the
# last character of the working directory and yield a non existing path, hence fall back to the working directory itself.
# Pay attention, it will create issues in cases you name the folder `CyVersMe` or anything after / before `CyVers`.
_workDirPath   = os.getcwd()
_projectDirIdx = _workDirPath.find(PROJECT_DIR_NAME)
if _projectDirIdx >= 0:
    PROJECT_DIR_PATH = os.path.join(_workDirPath[:_projectDirIdx], PROJECT_DIR_NAME)
else:
    PROJECT_DIR_PATH = _workDirPath

# Feature extractors constants
# Labels of the aggregated feature columns, once per grouping key (Asset / User).
# The strings are identical to the column labels listed in the selected features cell further down.
# NOTE(review): The visible cells index `dfData` with the string literals directly and not with these constants.

# Assets
# By amount:
SUM_ASSET       = 'SUM (Asset)'
MEAN_ASSET      = 'MEAN (Asset)'
STD_ASSET       = 'STD (Asset)'
VAR_ASSET       = 'VAR (Asset)'
MEDIAN_ASSET    = 'MEDIAN (Asset)'
COUNT_ASSET     = 'COUNT (Asset)'
MIN_ASSET       = 'MIN (Asset)'
MAX_ASSET       = 'MAX (Asset)'
# By time:
TD_MEAN_ASSET   = 'TD_MEAN (Asset)'
TD_STD_ASSET    = 'TD_STD (Asset)'
TD_MEDIAN_ASSET = 'TD_MEDIAN (Asset)'
TD_MIN_ASSET    = 'TD_MIN (Asset)'
TD_MAX_ASSET    = 'TD_MAX (Asset)'

# User
# By amount:
SUM_USR         = 'SUM (User)'
MEAN_USR        = 'MEAN (User)'
STD_USR         = 'STD (User)'
VAR_USR         = 'VAR (User)'
MEDIAN_USR      = 'MEDIAN (User)'
COUNT_USR       = 'COUNT (User)'
MIN_USR         = 'MIN (User)'
MAX_USR         = 'MAX (User)'
# By time:
TD_MEAN_USR     = 'TD_MEAN (User)'
TD_STD_USR      = 'TD_STD (User)'
TD_MEDIAN_USR   = 'TD_MEDIAN (User)'
TD_MIN_USR      = 'TD_MIN (User)'
TD_MAX_USR      = 'TD_MAX (User)' 

In [None]:
# CyVers Packages
from DataSetsAuxFun import *

In [None]:
# Parameters

# Name of the data set file to analyze
csvFileName = 'AttacksDataSet_2022_07_13.csv'

# Path of the data set folder under the project root (`Roto` is presumably a typo of `Root`)
dataSetRotoDir = os.path.join(PROJECT_DIR_PATH, DATA_FOLDER_NAME)

# NOTE(review): Not referenced by any of the visible cells, presumably gates a t-SNE section - confirm
runTsne = False

# Amount USD Outlier threshold
# NOTE(review): Not referenced by any of the visible cells
amountUsdOutlierThr = 1e9

# Portion of the samples held out as the test set
testSetRatio = 1.0 / 3.0
# Seed shared by the PPS cross validation and the train / test split
randomState = 42

In [None]:
# Loading / Generating Data

# The file is first searched relative to the working directory (The original behavior).
# When it is not there (e.g. the notebook is launched from a sub folder of the project),
# fall back to the data set folder under the project root (`dataSetRotoDir`).
dataFilePath = os.path.join(DATA_FOLDER_NAME, csvFileName)
if not os.path.isfile(dataFilePath):
    dataFilePath = os.path.join(dataSetRotoDir, csvFileName)

dfData = pd.read_csv(dataFilePath) #<! Raises `FileNotFoundError` when the file is in neither location
numRows, numCols = dfData.shape

print(f'The number of rows (Samples): {numRows}, The number of columns: {numCols}')

In [None]:
# Peek at the first 20 rows of the loaded data
dfData.head(20)

In [None]:
# Column types and non null counts of the loaded data
dfData.info()

In [None]:
# Summary statistics of the loaded data
dfData.describe()

## Feature Engineering

This section adds features and engineers them.  
It is assumed the files have a single unique `Sender`. Hence all analysis is done on the receivers.


The features are:

 1. 

Remarks:

 *  Features x-y are time / frequency related.
 *  Features z-t are transaction related.


In [None]:
# Column labels of the data, evaluated before the engineered columns of the next cells are added
vFeatures = dfData.columns
vFeatures

In [None]:
# User transactions rate: The count of the user transactions per unit of `Time Interval`
# NOTE(review): The `[Hz]` label assumes `Time Interval` is given in seconds - confirm against the feature extractor
# A zero `Time Interval` yields a non finite value here, those are zeroed later in the pre processing cell
dfData['TrnsFrequency [Hz] (User)'] = dfData['COUNT (User)'].div(dfData['Time Interval'])

In [None]:
# Spread of the user relative to the spread of the asset (User STD over Asset STD)

# By amount
dfData['Amount Ratio'] = dfData['STD (User)'].div(dfData['STD (Asset)'])
# By time difference
dfData['Time Ratio'] = dfData['TD_STD (User)'].div(dfData['TD_STD (Asset)'])

## Display Features

In [None]:
# Features offered by the interactive displays below (The order sets the order of the drop down)
lSlctdFeatures = [
    'Amount [USD]', 'Receiver Type', 'Label',
    # (Asset) aggregates: By amount
    'SUM (Asset)', 'MEAN (Asset)', 'STD (Asset)', 'VAR (Asset)',
    'MEDIAN (Asset)', 'COUNT (Asset)', 'MIN (Asset)', 'MAX (Asset)',
    # (Asset) aggregates: By time
    'TD_MEAN (Asset)', 'TD_STD (Asset)', 'TD_MEDIAN (Asset)', 'TD_MIN (Asset)', 'TD_MAX (Asset)',
    'Hour', 'Weekday',
    # (User) aggregates: By amount
    'SUM (User)', 'MEAN (User)', 'STD (User)', 'VAR (User)',
    'MEDIAN (User)', 'COUNT (User)', 'MIN (User)', 'MAX (User)',
    # (User) aggregates: By time
    'TD_MEAN (User)', 'TD_STD (User)', 'TD_MEDIAN (User)', 'TD_MIN (User)', 'TD_MAX (User)',
    'Time Interval',
    # Columns added by the feature engineering cells above
    'TrnsFrequency [Hz] (User)', 'Amount Ratio', 'Time Ratio',
]
# Previous selection, kept for reference:
# lSlctdFeatures  = ['Amount', 'Num Trns User', 'Sum Value User', 'STD Value User', 'Max Value User', 'Min Value User', 'Active Duration User', 'Frequency Trns. / Days', 'STD Time Diff', 'Max Time Diff', 'Min Time Diff']
numFeatures = len(lSlctdFeatures)

In [None]:
# Display the Scatter of the Features

# Drop down over the selected features, starts on `Amount [USD]`
oDropdwon = Dropdown(
    options     = lSlctdFeatures,
    value       = 'Amount [USD]',
    description = 'Select Feature:',
    style       = {'description_width' : 'initial'}
)

# The selected feature is forwarded to the display function as `yColName` (The keyword binds the widget to the lambda parameter)
# NOTE(review): `DisplayScatterFeature()` is not defined in this notebook (Presumably from `DataSetsAuxFun`), the roles of 'Label' / 'Suspicious' should be confirmed against its signature
interact(lambda yColName: DisplayScatterFeature(dfData, 'Label', yColName, 'Suspicious'), yColName = oDropdwon)

In [None]:
# Display the Density of the Features

# A new drop down over the selected features (Re-binds `oDropdwon`), starts on `Amount [USD]`
oDropdwon = Dropdown(
    options     = lSlctdFeatures,
    value       = 'Amount [USD]',
    description = 'Select Feature:',
    style       = {'description_width' : 'initial'}
)

# The selected feature is forwarded to the display function as `yColName` (The keyword binds the widget to the lambda parameter)
# NOTE(review): `DisplayKdeFeature()` is not defined in this notebook (Presumably from `DataSetsAuxFun`), pay attention its arguments order differs from the scatter call - confirm against its signature
interact(lambda yColName: DisplayKdeFeature(dfData, yColName, 'Label', 'Suspicious'), yColName = oDropdwon)

In [None]:
# Pre Processing Data

# Non finite values (Which may be produced by the divisions of the engineered features) and missing values are set to 0, in place
dfData.replace([np.inf, -np.inf], np.nan, inplace = True)
dfData.fillna(0, inplace = True)

# Identifiers, raw transaction description and the label are not inputs of the model
lNonFeatureCols = [
    'Unnamed: 0', 'Transaction ID', 'Transaction Time', 'Sender ID', 'Receiver ID',
    'Amount', 'Currency', 'Currency Hash', 'Currency Type', 'Receiver Type', 'Label',
]
vIsFeatureCol = ~dfData.columns.isin(lNonFeatureCols) #<! Boolean mask over the columns
dfX = dfData.loc[:, vIsFeatureCol].copy()

In [None]:
# Input Data for Classifier

# Labels vector
vY = dfData['Label'].to_numpy()

# Features matrix, standardized per column
# NOTE(review): The scaler is fitted on all samples, before the train / test split of the later cell. Hence the test samples take part in the scaling statistics
hStdScaler = StandardScaler()
mX = hStdScaler.fit_transform(dfX.to_numpy())

In [None]:
# dfTmp = dfData[dfData.columns[~dfData.columns.isin(['Unnamed: 0', 'Transaction ID', 'Transaction Time', 'Sender ID', 'Receiver ID', 'Amount', 'Currency', 'Currency Hash', 'Currency Type', 'Receiver Type', 'Label'])]]

In [None]:
# From here on `lSlctdFeatures` holds the columns of `dfX` (Overrides the list used by the displays above)
# lSlctdFeatures = dfTmp.columns.to_list()
lSlctdFeatures = dfX.columns.to_list()
type(lSlctdFeatures)

In [None]:
# Working copy for the PPS analysis: The selected features and the label
dfTmp = dfData[lSlctdFeatures + ['Label']].copy()

In [None]:
# The label is set as a categorical column (See the remark at the PPS cell)
dfTmp['Label'] = dfTmp['Label'].astype('category')

In [None]:
# Verify the column types (`Label` should be reported as `category`)
dfTmp.info()

In [None]:
# Feature PPS - Which features are important?

# Pay attention, cross validation is K-Fold -> Don't over split the data
dfPpsLong = pps.matrix(dfTmp, cross_validation = 5, random_seed = randomState) #<! We should set `Label` as a categorical variable
# Long format (One row per pair) -> Matrix with the target (`y`) on the rows and the feature (`x`) on the columns
mPPS = dfPpsLong[['x', 'y', 'ppscore']].pivot(index = 'y', columns = 'x', values = 'ppscore')

# Visualization of PPS
hF, hA = plt.subplots(figsize = (30, 30))
sns.heatmap(
    mPPS,
    annot = True, fmt = '.2f',
    cmap = plt.get_cmap('coolwarm'), cbar = False,
    vmin = 0, vmax = 1,
    ax = hA,
)

# Readable ticks: Tilted feature names on the x axis, horizontal on the y axis
plt.setp(hA.get_xticklabels(), ha = "center", rotation = 45)
plt.setp(hA.get_yticklabels(), rotation = 'horizontal')
hA.set_title('Predictive Power Score (PPS)')

In [None]:
# Validating the columns are legit (Only model features, no identifiers and no label)
dfX.columns

In [None]:
# Train Test Split
# TODO: Split by Files
# Stratified by the label, so both sets keep the class proportions of `vY`. Reproducible by `randomState`
mXTrain, mXTest, vYTrain, vYTest = train_test_split(mX, vY, test_size = testSetRatio, random_state = randomState, stratify = vY)

In [None]:
# XGBoost classifier with the default hyper parameters, trained on the train set and evaluated on the test set.
# The model is seeded by `randomState`, like the PPS and the train / test split, so the results stay
# reproducible once a stochastic hyper parameter (e.g. `subsample`, `colsample_bytree`) is set.
xgbModel = XGBClassifier(use_label_encoder = False, random_state = randomState)
xgbModel.fit(mXTrain, vYTrain)
vYPred = xgbModel.predict(mXTest) #<! Predicted label per test sample

In [None]:
# Confusion matrix of the test set, rows / columns ordered as the classes of the trained model
vClassLabels = xgbModel.classes_
mConfMat = confusion_matrix(vYTest, vYPred, labels = vClassLabels)

cmDisp = ConfusionMatrixDisplay(confusion_matrix = mConfMat, display_labels = vClassLabels)
cmPlot = cmDisp.plot()

# The grid of the SeaBorn theme is turned off for the matrix axes
hA = cmPlot.ax_
hA.grid(False)


In [None]:
# Scores Summary 

# NOTE(review): `GenClassifierSummaryResults()` is not defined in this notebook (Presumably from `DataSetsAuxFun`), its output is presumably a mapping / series of score name -> value - confirm
dsScoreSumm     = GenClassifierSummaryResults(vYTest, vYPred)
# Single column (`Score`) table for display
dfScoreSummary  = pd.DataFrame(dsScoreSumm, columns = ['Score'])
dfScoreSummary