[![CyVers](https://i.imgur.com/yyhmZET.png)](https://www.cyvers.ai/)

# Validate Production Features and Rules Calculations  
This notebook validates the calculations done online, as described in https://github.com/CyVers-AI/SolidusBlindTest/issues/10.

> Notebook by:
> - Royi Avital Royi@cyvers.ai

## Revision History

| Version | Date       | Name            | Content / Changes     |
|---------|------------|-----------------|-----------------------|
| 1.0.000 | 27/06/2022 | Royi Avital     | First version         |
|         |            |                 |                       |

In [None]:
# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Misc
import datetime
import os
from platform import python_version
import random

from numba import njit

# Machine Learning
from sklearn.preprocessing import LabelEncoder

# EDA Tools
import ppscore as pps #<! See https://github.com/8080labs/ppscore -> pip install git+https://github.com/8080labs/ppscore.git

# Ensemble Engines
import lightgbm
import xgboost

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.plotting import figure, show

# Jupyter
from ipywidgets import interact, Dropdown, Layout

In [None]:
# Configuration
%matplotlib inline

seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

sns.set_theme() #>! Apply SeaBorn theme

In [None]:
# Constants (Fixed for the whole notebook)

DATA_FOLDER_NAME = 'DataSet' #<! Relative folder the CSV files are read from / written to
DATA_FILE_EXT = 'csv' #<! Data file extension (Not referenced by the cells visible here)

In [None]:
# Parameters (Tunable per run)

csvFileName = 'Bit2CSoftware001.csv' #<! Input file, located in the data folder

# Thresholds on the relative deviation `(value - rollingAverage) / rollingAverage`
tsxAmountThr = 8 #<! The amount rule holds when the deviation is at least this value
tsxTimeDiffThr = -0.99 #<! The time difference rule holds when the deviation is at most this value

In [None]:
# Loading / Generating Data
# Reads the CSV from the data folder and reports the size of the table

dataFilePath = os.path.join(DATA_FOLDER_NAME, csvFileName)
dfData = pd.read_csv(dataFilePath)

numRows = dfData.shape[0]
numCols = dfData.shape[1]

print(f'The number of rows (Samples): {numRows}, The number of columns: {numCols}')

In [None]:
# Convert time data into Pandas format
# `infer_datetime_format` is a boolean flag: pass `True`, not the string 'True' (Which only worked by being truthy).
# NOTE: pandas >= 2.0 deprecates this argument; it is kept here for older versions.
dfData['Transaction Time'] = pd.to_datetime(dfData['Transaction Time'], infer_datetime_format = True) #<! Stable time format

In [None]:
# Use this if needed to filter by the Sender
# dfData = dfData.loc[dfData['Sender ID'] == '0x7c49e1c0e33f3efb57d64b7690fa287c8d15b90a', :]
# dfData.reset_index(inplace = True)
# dfData.to_csv(os.path.join(DATA_FOLDER_NAME, 'Bit2CSoftware001.csv'))

In [None]:
# Preview the first 20 rows of the loaded data
dfData.head(20)

In [None]:
# Print the summary of the data frame (Columns, types, non null counts)
dfData.info()

In [None]:
# Number of distinct values in the 'Receiver ID' column
dfData['Receiver ID'].unique().shape[0]

In [None]:
# Descriptive statistics of the data as reported by `describe()`
dfData.describe()

In [None]:
# The features use current and past data only, hence the rows must go from oldest to newest

dfData.sort_values(by = 'Transaction Time', ascending = True, inplace = True)

In [None]:
# Renumber the rows according to the time order; the previous index is kept as a column
dfData.reset_index(drop = False, inplace = True)
dfData.head(n = 100)

## Feature Calculation

Features are calculated by time (Using current and past data only).


In [None]:
# Allocate the feature columns (NaN until calculated by the cells below):
# - 'Rolling Average Amount [USD]': Rolling average over time of all data.
# - 'Time Difference Group [Sec]': The time difference from the previous transaction of the same user.
# - 'Rolling Average Time Difference [Sec]': The average of the average time difference per group
#   (Until the i-th transaction, where i is the row index sorted by transaction time).
lFeatureCol = ['Rolling Average Amount [USD]', 'Time Difference Group [Sec]', 'Rolling Average Time Difference [Sec]']
for colName in lFeatureCol:
    dfData[colName] = np.nan

dfData.head()

In [None]:
# Running mean of the amount: the cumulative sum divided by the number of rows so far (1, 2, ..., numRows)
vNumSamples = np.arange(1, dfData.shape[0] + 1, dtype = float)
vCumAmount = dfData['Amount [USD]'].cumsum()
dfData['Rolling Average Amount [USD]'] = vCumAmount / vNumSamples
dfData.head()

In [None]:
# Most of the operations below work per user: group the time ordered rows by the receiver
dfDataByTime = dfData.sort_values(by = 'Transaction Time')
dfGrpUser = dfDataByTime.groupby(by = 'Receiver ID')

In [None]:
# Per receiver: seconds elapsed since its previous transaction (0 for its first transaction)
timeDiffCol = 'Time Difference Group [Sec]'
for grpName, dfGroup in dfGrpUser:
    vIndx = dfGroup.index
    vTimeDiffSec = dfGroup['Transaction Time'].diff().dt.total_seconds()
    vTimeDiffSec.iloc[0] = 0 #<! The first transaction of the group has no predecessor
    dfData.loc[vIndx, timeDiffCol] = vTimeDiffSec #<! Written back by the row labels of the group

dfData

In [None]:
# Per row we need the mean, over the receivers, of the running average of 'Time Difference Group [Sec]'
# of each receiver, using only the rows up to (And including) the current one.
# This is done using the following data structure:

# Row Index, IdGrp001, IdGrp002, IdGrp003, ..., IdGrp100, ...
# 1          value     NaN       NaN

# Namely a column per ID, where each row holds the value of the row in the column of its own ID.

# Remark: This approach only works for a feasible number of rows (~50K) as the output size is numRows x numUniqueId

dfTimeDiff = dfData[['Receiver ID', 'Time Difference Group [Sec]']]
dfDataPivot = dfTimeDiff.set_index('Receiver ID', append = True).unstack(level = -1)

In [None]:
# Size of the pivot table (Rows x receiver columns)
dfDataPivot.shape

In [None]:
# Preview the first 20 rows of the pivot table
dfDataPivot.head(20)

In [None]:
# Running mean of each column: cumulative sum of the values over cumulative count of the non missing values
dfCumSum = dfDataPivot.fillna(0).cumsum()
dfCumCount = dfDataPivot.notnull().astype(int).cumsum()
dfDataPivot = dfCumSum / dfCumCount

In [None]:
# Display the running mean per receiver column
dfDataPivot

In [None]:
# The feature is the mean over the receiver columns of each row
# The result is assigned by the row index, which keeps the order intact
vRowMean = dfDataPivot.mean(axis = 'columns')
dfData['Rolling Average Time Difference [Sec]'] = vRowMean

In [None]:
# Rules - Amount
# Holds when the relative deviation of the amount from its rolling average is at least `tsxAmountThr`

vAmount = dfData['Amount [USD]']
vAmountAvg = dfData['Rolling Average Amount [USD]']
dfData['Amount Rule'] = ((vAmount - vAmountAvg) / vAmountAvg) >= tsxAmountThr

In [None]:
# Rules - Time Difference
# Holds when the relative deviation of the time difference from its rolling average is at most `tsxTimeDiffThr`

dfData['Time Diff Rule'] = ((dfData['Time Difference Group [Sec]'] - dfData['Rolling Average Time Difference [Sec]']) / dfData['Rolling Average Time Difference [Sec]']) <= tsxTimeDiffThr
# Rows with a zero rolling average are forced to `True` using a single `.loc` call on the data frame itself.
# The chained form (`dfData[col].iloc[mask] = ...`) assigns into an intermediate object: it raises
# `SettingWithCopyWarning` and is not guaranteed to update `dfData` (It does not under Copy-on-Write).
vZeroAvgMask = dfData['Rolling Average Time Difference [Sec]'] == 0
dfData.loc[vZeroAvgMask, 'Time Diff Rule'] = True #<! To match the code which for a single Tx group only use the amount rule (See https://github.com/CyVers-AI/features-creator/blob/7b0acbb4b6554e1d700ae5a7801403a3a6a19241/app/services/rule_impl.py#L90)

In [None]:
# An alert is raised only when both the amount rule and the time difference rule hold
vAmountRule = dfData['Amount Rule']
vTimeDiffRule = dfData['Time Diff Rule']
dfData['Alert Rule'] = vAmountRule & vTimeDiffRule

In [None]:
# Preview the first 40 rows including the calculated features and rules
dfData.head(40)

In [None]:
# Number of rows flagged by the alert rule
dfData['Alert Rule'].sum()

In [None]:
# Export the data, including the calculated features and rules, into the data folder
validateFilePath = os.path.join(DATA_FOLDER_NAME, 'Validate.csv')
dfData.to_csv(validateFilePath)