[![CyVers](https://i.imgur.com/yyhmZET.png)](https://www.cyvers.ai/)

# Validate Production Features and Rules Calculations  
This notebook validates the calculation done online, as described in https://github.com/CyVers-AI/SolidusBlindTest/issues/10.

> Notebook by:
> - Royi Avital Royi@cyvers.ai
> - Anton Rudenko Anton@cyvers.ai

## Revision History

| Version | Date       | Name            | Content / Changes     |
|---------|------------|-----------------|-----------------------|
| 1.0.000 | 27/06/2022 | Royi Avital     | First version         |
|         |            |                 |                       |

In [None]:
# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Misc
import random

import os
import datetime
from platform import python_version

# EDA Tools
import ppscore as pps #<! See https://github.com/8080labs/ppscore -> pip install git+https://github.com/8080labs/ppscore.git

# Ensemble Engines
import lightgbm
import xgboost

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.plotting import figure, show

# Jupyter
from ipywidgets import interact, Dropdown, Layout

In [None]:
# Configuration
%matplotlib inline

seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

sns.set_theme() #>! Apply SeaBorn theme

In [None]:
# Constants

DATA_FOLDER_NAME    = 'DataSet' #<! Folder (Relative path) the CSV files are read from and written to
DATA_FILE_EXT       = 'csv' #<! Presumably the extension of the data files (Not referenced by the visible cells)

In [None]:
# Parameters

# Input file name (Alternative: 'Dataset Bit2C Test Case.csv')
csvFileName = 'Dataset Bit2C.csv'

tsxAmountThr   = 8     #<! Amount rule: Relative deviation from the rolling average amount must be >= this value
tsxTimeDiffThr = -0.99 #<! Time rule: Relative deviation from the rolling average time difference must be <= this value

In [None]:
# Loading / Generating Data

dfData = pd.read_csv(os.path.join(DATA_FOLDER_NAME, csvFileName)) #<! The CSV file is read from the data folder
numRows = dfData.shape[0]
numCols = dfData.shape[1]

print(f'The number of rows (Samples): {numRows}, The number of columns: {numCols}')

In [None]:
# Convert time data into Pandas format
# `infer_datetime_format` is a boolean flag: Pass `True`, not the string 'True' (Any non empty string, even 'False', is truthy)
# NOTE(review): The flag is deprecated in recent Pandas versions - confirm against the installed version
dfData['Transaction Time'] = pd.to_datetime(dfData['Transaction Time'], infer_datetime_format = True) #<! Stable time format

In [None]:
# Display the first 20 rows of the loaded data
dfData.head(n = 20)

In [None]:
# Print the summary of the data frame (Columns, non null counts and data types)
dfData.info()

In [None]:
# Number of distinct values in the `Receiver ID` column (A missing value is counted as a value as well)
dfData['Receiver ID'].nunique(dropna = False)

In [None]:
# Display the descriptive statistics of the data
dfData.describe()

In [None]:
# The rolling features below assume the rows are ordered from the earliest to the latest transaction

dfData.sort_values(by = 'Transaction Time', ascending = True, inplace = True)

In [None]:
# Renumber the rows to match the time sorted order; With `drop = False` the previous index is kept as a column
dfData.reset_index(drop = False, inplace = True)
dfData.head(n = 100)

## Feature Engineering

This section adds features and engineers them.  
It is assumed the files have a single unique `Sender`. Hence all analysis is done on the receivers.


The features are:

 1. 

Remarks:

 *  Features x-y are time / frequency related.
 *  Features z-t are transaction related.


In [None]:
# Placeholders (NaN) for the engineered features; The values are filled in by the following cells
dfData['Rolling Average Amount [USD]'] = np.nan #<! Rolling average over time of all data
dfData['Time Difference Group [Sec]'] = np.nan #<! The time difference from the previous transaction of the same user
dfData['Rolling Average Time Difference [Sec]'] = np.nan #<! The average of the average time difference per group (Until the i-th transaction where i is the row index sorted by transaction time)

dfData.head()

In [None]:
# Running mean of the amount: The cumulative sum up to each row divided by the 1 based position of the row
dfData['Rolling Average Amount [USD]'] = dfData['Amount [USD]'].cumsum() / np.arange(1.0, len(dfData) + 1)
dfData.head()

In [None]:

def f(df, amount_thr=None, time_diff_thr=None):
    """Add the per receiver time features and the alert rule columns to `df`.

    The frame is sorted in place by 'Transaction Time' and walked once in that
    order. Two feature columns are written:

    * 'Time Difference Group [Sec]': Seconds elapsed since the previous
      transaction of the same 'Receiver ID' (0 for the first transaction of a
      receiver).
    * 'Rolling Average Time Difference [Sec]': The mean, over all receivers
      seen up to and including the current row, of each receiver's running
      average time difference.

    Three boolean columns are then derived:

    * 'Amount Rule': The relative deviation of 'Amount [USD]' from
      'Rolling Average Amount [USD]' is `>= amount_thr`.
    * 'Time Diff Rule': The relative deviation of the time difference from its
      rolling average is `<= time_diff_thr`.
    * 'Alert Rule': Both rules hold.

    Args:
        df: Data frame with the columns 'Transaction Time' (Values which can be
            subtracted to yield an object with `total_seconds()`),
            'Receiver ID', 'Amount [USD]' and 'Rolling Average Amount [USD]'.
            It is modified in place.
        amount_thr: Threshold of the amount rule. When `None` the notebook
            level `tsxAmountThr` is used.
        time_diff_thr: Threshold of the time difference rule. When `None` the
            notebook level `tsxTimeDiffThr` is used.

    Returns:
        The same `df` object, sorted by time, with the columns above set.
    """
    # Fall back to the notebook parameters so that `f(df)` behaves as before
    if amount_thr is None:
        amount_thr = tsxAmountThr
    if time_diff_thr is None:
        time_diff_thr = tsxTimeDiffThr

    time_str = 'Transaction Time'
    id_str = 'Receiver ID'
    rol_av_am_str = 'Rolling Average Amount [USD]'
    am_str = 'Amount [USD]'
    time_diff_str = 'Time Difference Group [Sec]'
    roll_avgr_time_diff = 'Rolling Average Time Difference [Sec]'
    rule_alrt_str = 'Alert Rule'
    time_rule_str = 'Time Diff Rule'
    amount_rule_str = 'Amount Rule'

    # A stable sort keeps the incoming order of rows sharing a timestamp, so an
    # already sorted frame is never reshuffled
    df.sort_values(time_str, inplace=True, kind='mergesort')

    last_seen = {}  # Receiver -> time of its latest transaction
    tsx_count = {}  # Receiver -> number of transactions seen so far
    group_avg = {}  # Receiver -> running average of its time differences

    time_diffs = []
    rolling_avgs = []

    for receiver, current in zip(df[id_str], df[time_str]):
        # A receiver seen for the first time is compared with itself, which
        # yields a zero time difference
        previous = last_seen.get(receiver, current)
        time_diff = (current - previous).total_seconds()

        counter = tsx_count.get(receiver, 0) + 1
        # Incremental update of the receiver's mean time difference
        average = (group_avg.get(receiver, 0.0) * (counter - 1) + time_diff) / counter

        last_seen[receiver] = current
        tsx_count[receiver] = counter
        group_avg[receiver] = average

        time_diffs.append(time_diff)
        # Kept as a full mean (Not a running sum) so the floating point
        # results are unchanged; The cost is O(#receivers) per row
        rolling_avgs.append(np.mean(list(group_avg.values())))

    # Positional assignment: The lists follow the sorted row order of `df`
    df[time_diff_str] = np.asarray(time_diffs, dtype=float)
    df[roll_avgr_time_diff] = np.asarray(rolling_avgs, dtype=float)

    # NOTE(review): The first transaction of a receiver has a zero time
    # difference, hence a relative deviation of -1 whenever the rolling average
    # is positive - it satisfies the time rule for any threshold >= -1.
    # Confirm this matches the intended production behaviour.
    df[amount_rule_str] = ((df[am_str] - df[rol_av_am_str]) / df[rol_av_am_str]) >= amount_thr
    df[time_rule_str] = ((df[time_diff_str] - df[roll_avgr_time_diff]) / df[roll_avgr_time_diff]) <= time_diff_thr
    df[rule_alrt_str] = df[amount_rule_str] & df[time_rule_str]
    return df
            

        
    

In [None]:
# `f` sorts `dfData` in place and returns that same object, so `output` and `dfData` refer to one data frame
output = f(dfData)

In [None]:
# Number of rows where the combined alert rule holds (The sum of a boolean column counts its `True` values)
output['Alert Rule'].sum()
#output['Alert Rule'].value_counts()

In [None]:
# Write the data with the calculated features and rules into the data folder
output.to_csv(os.path.join(DATA_FOLDER_NAME, 'Validate_my.csv'))