[![CyVers](https://i.imgur.com/yyhmZET.png)](https://www.cyvers.ai/)

# Validate Production Features and Rules Calculations  
This notebook validates the calculations done online as described in https://github.com/CyVers-AI/SolidusBlindTest/issues/10.

> Notebook by:
> - Royi Avital Royi@cyvers.ai

## Revision History

| Version | Date       | Name            | Content / Changes     |
|---------|------------|-----------------|-----------------------|
| 1.0.000 | 27/06/2022 | Royi Avital     | First version         |
|         |            |                 |                       |

In [None]:
# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Misc
import datetime
import os
from platform import python_version
import random

from numba import njit

# Machine Learning
from sklearn.preprocessing import LabelEncoder

# EDA Tools
import ppscore as pps #<! See https://github.com/8080labs/ppscore -> pip install git+https://github.com/8080labs/ppscore.git

# Ensemble Engines
import lightgbm
import xgboost

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.plotting import figure, show

# Jupyter
from ipywidgets import interact, Dropdown, Layout

In [None]:
# Configuration
%matplotlib inline

seedNum = 512 #<! Fixed seed shared by both random number generators below
np.random.seed(seedNum) #<! Seed NumPy's global RNG
random.seed(seedNum) #<! Seed Python's built-in `random` module

sns.set_theme() #<! Apply SeaBorn theme

In [None]:
# Constants

DATA_FOLDER_NAME    = 'DataSet' #<! Folder used for both the input CSV and the exported validation CSV
DATA_FILE_EXT       = 'csv' #<! Data file extension (Not referenced by the cells visible in this notebook)

In [None]:
# Parameters

csvFileName = 'Dataset Bit2C Test Case.csv' #<! Input file name, read from `DATA_FOLDER_NAME`

tsxAmountThr    = 8 #<! Amount rule fires when (Amount - Rolling Average) / Rolling Average >= this value
tsxTimeDiffThr  = -0.99 #<! Time rule fires when (Time Difference - Rolling Average) / Rolling Average <= this value

In [None]:
# Read the raw transactions table from the data folder and report its size

dfData = pd.read_csv(os.path.join(DATA_FOLDER_NAME, csvFileName))
numRows = dfData.shape[0] #<! Samples
numCols = dfData.shape[1] #<! Columns

print(f'The number of rows (Samples): {numRows}, The number of columns: {numCols}')

In [None]:
# Convert time data into Pandas format
# `infer_datetime_format` expects a boolean. The string 'True' only worked because any non empty string is truthy.
# NOTE: The argument is deprecated in recent Pandas versions; it is kept here to preserve the behavior on older versions.
dfData['Transaction Time'] = pd.to_datetime(dfData['Transaction Time'], infer_datetime_format = True) #<! Stable time format

In [None]:
# Preview the first 20 rows of the loaded data
dfData.head(20)

In [None]:
# Print the summary of the data frame (Columns, types, non null counts)
dfData.info()

In [None]:
# Number of distinct values in the 'Receiver ID' column
len(dfData['Receiver ID'].unique())

In [None]:
# Descriptive statistics of the data
dfData.describe()

In [None]:
# We need data sorted by ascending time
# (The features below are calculated using the current and past rows only)

dfData.sort_values('Transaction Time', inplace = True)

In [None]:
# Renumber the rows 0..N-1 in the sorted (time) order.
# With the default `drop = False` the previous index is kept as a new `index` column.
dfData.reset_index(inplace = True)
dfData.head(100)

## Feature Calculation

Features are calculated by time (Using current and past data only).

In [None]:
# Pre allocate the feature columns with NaN (Filled by the cells below)
dfData['Rolling Average Amount [USD]'] = np.nan #<! Rolling average over time of all data
dfData['Time Difference Group [Sec]'] = np.nan #<! The time difference from the previous transaction of the same user
dfData['Rolling Average Time Difference [Sec]'] = np.nan #<! The average of the average time difference per group (Until the i-th transaction where i is the row index sorted by transaction time)

dfData.head()

In [None]:
# Running mean of the amount: The cumulative sum divided by the 1 based row count
dfData['Rolling Average Amount [USD]'] = dfData['Amount [USD]'].cumsum().div(np.arange(1.0, len(dfData) + 1))
dfData.head()

In [None]:
# Distinct receiver IDs (Each one is treated as a user / group by the cells below)
dsUniqueUser = dfData['Receiver ID'].unique()
dsUniqueUser

In [None]:
numUniqUsr = len(dsUniqueUser) #<! Number of distinct users (Groups), sets the length of the per user buffers

In [None]:
# Map each receiver ID to an integer code so it can be used as an index into the per user arrays.
# The bare `labelEncoder.classes_` expression was removed: It was not the last expression of the cell,
# hence it was neither displayed nor used.
labelEncoder = LabelEncoder()
labelEncoder.fit(dsUniqueUser)

dfData['UserEnc'] = labelEncoder.transform(dfData['Receiver ID'])

In [None]:
# Calculating the relative time in [Sec]
# The reference is the first row by position (`iloc[0]`), which is the earliest transaction of the time sorted data.
# Using the position instead of the label `0` keeps the result correct regardless of the state of the index labels.
dfData['TransactionTime [Sec]'] = (dfData['Transaction Time'] - dfData['Transaction Time'].iloc[0]).dt.total_seconds()

In [None]:
# NumPy vector of the user encoding (Integer codes used as indices into the per user arrays)
vUserEnc = dfData['UserEnc'].to_numpy()
vUserEnc

In [None]:
# NumPy vector of the relative transaction time in [Sec]
vTimeDelta = dfData['TransactionTime [Sec]'].to_numpy()
vTimeDelta

In [None]:
# Per user (Group) state, indexed by the user encoding
vGrpNumTrns         = np.zeros(numUniqUsr) #<! Number of transactions seen so far
vGrpLastTime        = np.zeros(numUniqUsr) #<! Time of the latest transaction
vGrpTimeDiffAvg     = np.full(numUniqUsr, np.nan) #<! Running mean of the time difference (NaN until the user is seen)

# Per transaction outputs
vGrpTimeDiff        = np.zeros(len(dfData)) #<! Time difference from the previous transaction of the same user
vAvgGrpTimeDiffAvg  = np.zeros(len(dfData)) #<! Mean over the users of their running mean time difference

In [None]:
# NOTE(review): The `fastmath` variant below is left disabled - presumably as it may break the NaN handling; confirm.
# @njit(fastmath = True)
@njit
def CalcluateAvgGrpTimeDiffAvg(vUserEnc, vTimeDelta, vGrpNumTrns, vGrpLastTime, vGrpTimeDiffAvg, vGrpTimeDiff, vAvgGrpTimeDiffAvg):
    """
    Single pass over the transactions (In the given order) which updates all buffers in place.
    Nothing is returned.
    Input:
      - vUserEnc            - Per transaction index of the user (Group), used to index the per user buffers.
      - vTimeDelta          - Per transaction time.
      - vGrpNumTrns         - Per user counter of transactions seen so far (Updated in place).
      - vGrpLastTime        - Per user time of the latest transaction (Updated in place).
      - vGrpTimeDiffAvg     - Per user running mean of the time difference, NaN marks a user not seen yet (Updated in place).
      - vGrpTimeDiff        - Per transaction time difference from the previous transaction of the same user (Output).
      - vAvgGrpTimeDiffAvg  - Per transaction mean, ignoring NaN, of `vGrpTimeDiffAvg` (Output). Its size sets the number of iterations.
    """
    for ii in range(vAvgGrpTimeDiffAvg.size):
        usrIdx      = vUserEnc[ii]
        currTime    = vTimeDelta[ii]

        vGrpNumTrns[usrIdx] += 1
        if vGrpNumTrns[usrIdx] == 1:
            vGrpLastTime[usrIdx] = currTime #<! First transaction of the user -> Its time difference is zero
        if np.isnan(vGrpTimeDiffAvg[usrIdx]):
            vGrpTimeDiffAvg[usrIdx] = 0 #<! Initialize the running mean of a user seen for the first time

        currDiff = currTime - vGrpLastTime[usrIdx]
        vGrpTimeDiffAvg[usrIdx] += (currDiff - vGrpTimeDiffAvg[usrIdx]) / vGrpNumTrns[usrIdx] #<! Incremental mean update
        vGrpLastTime[usrIdx]     = currTime

        vGrpTimeDiff[ii]         = currDiff
        vAvgGrpTimeDiffAvg[ii]   = np.nanmean(vGrpTimeDiffAvg) #<! Users not seen yet (NaN) are ignored

In [None]:
# Fills `vGrpTimeDiff` and `vAvgGrpTimeDiffAvg` in place (The per user buffers are updated as well)
CalcluateAvgGrpTimeDiffAvg(vUserEnc, vTimeDelta, vGrpNumTrns, vGrpLastTime, vGrpTimeDiffAvg, vGrpTimeDiff, vAvgGrpTimeDiffAvg)

In [None]:
# Store the calculated vectors as the feature columns
dfData['Time Difference Group [Sec]']           = vGrpTimeDiff
dfData['Rolling Average Time Difference [Sec]'] = vAvgGrpTimeDiffAvg

In [None]:
# Rules - Amount
# True where the relative deviation of the amount from the rolling average amount is at least `tsxAmountThr`

dfData['Amount Rule'] = ((dfData['Amount [USD]'] - dfData['Rolling Average Amount [USD]']) / dfData['Rolling Average Amount [USD]']) >= tsxAmountThr

In [None]:
# Rules - Time Difference
# True where the relative deviation of the time difference from the rolling average time difference is at most `tsxTimeDiffThr`

dfData['Time Diff Rule'] = ((dfData['Time Difference Group [Sec]'] - dfData['Rolling Average Time Difference [Sec]']) / dfData['Rolling Average Time Difference [Sec]']) <= tsxTimeDiffThr
# Override the rows with a zero rolling average (The ratio above is not defined there).
# A single `loc` assignment with a boolean mask is used: The chained form (`dfData[col].iloc[mask] = ...`) may write
# into a temporary copy and `iloc` does not accept a boolean Series as a mask.
dfData.loc[dfData['Rolling Average Time Difference [Sec]'] == 0, 'Time Diff Rule'] = True #<! To match the code which for a single Tx group only use the amount rule (See https://github.com/CyVers-AI/features-creator/blob/7b0acbb4b6554e1d700ae5a7801403a3a6a19241/app/services/rule_impl.py#L90)

In [None]:
dfData['Alert Rule'] = dfData['Amount Rule'] & dfData['Time Diff Rule'] #<! Alert only when both rules are True

In [None]:
# Preview the first 40 rows including the calculated features and rules
dfData.head(40)

In [None]:
# Number of transactions which raised an alert
dfData['Alert Rule'].sum()

In [None]:
# Export the data with the calculated features and rules (By default the data frame index is written as the first column)
dfData.to_csv(os.path.join(DATA_FOLDER_NAME, 'Validate.csv'))