In [None]:
import datetime
from platform import python_version
import random
import warnings

# Ensemble Engines
from xgboost import XGBClassifier

# Visualization
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from DataSetsAuxFun import *
#from PredictAssetData import *

from joblib import load
import pickle
import ruamel.yaml#import yaml

In [None]:
# Load the run configuration (seeds, data-set layout, feature lists, training settings) from YAML.
# `ruamel.yaml.safe_load()` was deprecated in ruamel.yaml 0.17 and raises in 0.18+.
# A pure-Python `YAML(typ = 'safe')` instance is the supported equivalent and also works on older releases.
yamlLoader = ruamel.yaml.YAML(typ = 'safe', pure = True)
with open(r'params_no_gas_3.yml') as file: #<! Alternative configuration file: 'params_no_gas.yml'
    params = yamlLoader.load(file)


In [None]:
# Silence every warning category for the remainder of the session
warnings.filterwarnings("ignore")

# Seed Python's and NumPy's global random generators with the configured seed (e.g. 512)
seedNum = params['seedNum']
random.seed(seedNum)
np.random.seed(seedNum)

sns.set_theme() #>! Apply SeaBorn theme

# %% Constants

# Data-set location. The folder name and pattern are hard-coded here instead of being read from
# params['DATA_FOLDER_NAME'] / params['DATA_FOLDER_PATTERN'] (e.g. 'BlockChainAttacksDataSet' / 'DataSet001').
DATA_FOLDER_NAME    = 'Validated_Hack_Cases_V1'
DATA_FOLDER_PATTERN = 'DataSet0'
DATA_FILE_EXT       = params['DATA_FILE_EXT'] #<! e.g. 'csv'

PROJECT_DIR_NAME    = params['PROJECT_DIR_NAME'] #<! e.g. 'CyVers'. Royi: Anton, don't change it, it should be a team constant

# Resolve the project root as: (the part of the working directory before the first occurrence
# of PROJECT_DIR_NAME) + PROJECT_DIR_NAME.
currDirPath   = os.getcwd()
projectDirIdx = currDirPath.find(PROJECT_DIR_NAME)
if projectDirIdx < 0:
    # str.find() returns -1 on a miss; slicing with [:-1] would silently drop the last character
    # of the working directory and produce a bogus project path.
    raise FileNotFoundError(f'The project folder `{PROJECT_DIR_NAME}` is not part of the working directory `{currDirPath}`')
PROJECT_DIR_PATH = os.path.join(currDirPath[:projectDirIdx], PROJECT_DIR_NAME) #>! Pay attention, it will create issues in cases you name the folder `CyVersMe` or anything after / before `CyVers`


# The CSV schema follows version 0.8 of the API.
# See https://github.com/CyVers-AI/CyVersManagement/blob/main/AiTeamOnBoarding.md.
lCsvColName     = params['lCsvColName']
lCsvColNameFlag = params['lCsvColNameFlag']

# Feature configuration (all lists come from the YAML parameters)
lSlctedFeaturesRaw  = params['lSlctedFeaturesRaw']  #<! e.g. ['Amount', 'Currency', 'Currency Type', 'Amount [USD]', 'Receiver Type', 'Gas Price', 'Gas Limit', 'Gas Used']
lSlctedFeaturesCalc = params['lSlctedFeaturesCalc'] #<! e.g. the `FeatureName` enum names except TIME_MAX / TIME_MIN

# Selected features = raw features followed by calculated features (a new list object)
lSlctdFeatures = lSlctedFeaturesRaw + lSlctedFeaturesCalc

lCatFeatures = params['lCatFeatures'] #<! e.g. ['Currency', 'Currency Type', 'Receiver Type']

# Numerical features = every selected feature which is not categorical, in the same order
lNumericalFeatures = list(filter(lambda featureName: featureName not in lCatFeatures, lSlctdFeatures))

# Root folder of the data set, located under the project folder
dataSetRotoDir = os.path.join(PROJECT_DIR_PATH, DATA_FOLDER_NAME)

# Training: test set ratio and number of cross validation folds (e.g. 1 / 3 and 3)
testSetRatio, numKFolds = params['testSetRatio'], params['numKFolds']

# Outlier threshold on the USD amount (e.g. 1e9) and the random state (e.g. 42)
amountUsdOutlierThr, randomState = params['amountUsdOutlierThr'], params['randomState']

In [None]:
# %% Loading / Generating Data

# List the CSV files under the data-set root, restricted by the folder name pattern
lCsvFile = ExtractCsvFiles(dataSetRotoDir, folderNamePattern = DATA_FOLDER_PATTERN)
print(f'The number of file found: {len(lCsvFile)}')

# Load the listed files using the configured column names / flags.
# The loader returns two values: the data (`dfData`) and `dAssetFile`.
dfData, dAssetFile = LoadCsvFilesDf(lCsvFile, baseFoldePath = '', lColName = lCsvColName, lColFlag = lCsvColNameFlag)
numRows, numCols = dfData.shape

# Report the count itself; `.unique().shape` would print a tuple such as `(123,)`
numSenders = len(dfData['Sender ID'].unique())

print(f"The number of rows (Samples): {numRows}, The number of columns: {numCols}, number of unique sender id's: {numSenders}")
print(f'The data list of columns is: {dfData.columns} with {len(dfData.columns)} columns')

In [None]:
# %% Pre Process Data
# The result is bound back to `dfData`, using the configured USD amount outlier threshold.
# NOTE(review): with `updateInplace = True` the input is presumably modified in place,
# so re-running this cell would not start from the raw data — confirm.
dfData = PreProcessData(
    dfData,
    updateInplace = True,
    amountUsdOutlierThr = amountUsdOutlierThr,
)

In [None]:
# Validate the pre-processed data against the list of selected raw features.
# Being the last expression of the cell, any returned value is displayed as the cell output.
ValidateData(
    dfData,
    lSlctedFeaturesRaw,
)

In [None]:
# %% Instantiate the Pandas Extension
# `dfData.GrpBySender` is a custom accessor on the data frame; this is its first use in the visible cells.
# NOTE(review): the accessor is presumably registered by `DataSetsAuxFun` — confirm.
print('Instantiate the Pandas Extension')
print(f'The number of assets in the data: {dfData.GrpBySender.numGrps}') #<! Number of groups reported by the accessor

In [None]:
# %% Calculate Features
# Apply the configured list of calculated features to the data.
# The result (`dfFeatures`) is the input of the prediction data generation step.
dfFeatures = ApplyListOfFeatures(
    dfData,
    lSlctedFeaturesCalc,
)

In [None]:
# Generate the model input (`dfX`) from the features.
# `scaler_dct` is the second returned value; it is pickled next to the model when saving.
dfX, scaler_dct = GenDataPredict(
    dfFeatures,
    lSlctdFeatures,
    lNumericalFeatures,
    lCatFeatures,
)

In [None]:
# Rename the first 'Amount [USD]' entry to 'Amount USD' (change of the column string for categorical data in XGBoost).
# Pay attention: `lSelectedFeatures_` is the very same list object as `lSlctdFeatures` (no copy is made),
# hence the rename is visible through both names.
lSelectedFeatures_ = lSlctdFeatures
try:
    lSelectedFeatures_[lSelectedFeatures_.index('Amount [USD]')] = 'Amount USD'
except ValueError:
    pass #<! 'Amount [USD]' is not in the list, nothing to rename

In [None]:
# Train the models using the selected features, the number of folds, the random state and the seed.
# The result is indexed as `models_by_files[-1][2]` when choosing the model to save.
models_by_files = TrainModelByFiles(
    dfX,
    lSelectedFeatures_,
    numKFolds,
    randomState,
    seedNum,
)

In [None]:
#models_by_transacts = TrainModelByTransact(dfX ,lSelectedFeatures_  , lCatFeatures ,lSelectedFeatures_ , numKFolds, randomState )

In [None]:
##################### saving :
# Persist the chosen model and the objects required to reproduce its input into a time stamped folder.
xgbModel = models_by_files[-1][2] ### <<--- choose best one by your specific criteria
lRawFeatures = lSlctedFeaturesRaw
lProcessedFeatures = lSlctedFeaturesCalc
# Pay attention: `lSlctdFeatures` is the list which was renamed in place ('Amount [USD]' -> 'Amount USD'),
# while `lSlctedFeaturesRaw` is a different list object and keeps its original names.
lSelectedFeatures = lSlctdFeatures

TIME_STAMP_FORMAT = '%Y_%m_%d_%H_%M_%S' #<! For the strftime() formatter
MODEL_FILE_NAME = 'Model'
MODEL_FILE_EXT  = 'pkl' #<! Used to be JSON for XGBoost, Needs to figure it out

folderPostfix   = datetime.datetime.now().strftime(TIME_STAMP_FORMAT)
folderName      = MODEL_FILE_NAME + '_' + folderPostfix

modelFileName   = MODEL_FILE_NAME + '.' + MODEL_FILE_EXT

# Creates the folder if needed, without the check-then-create race of `os.path.exists()` + `os.mkdir()`
os.makedirs(folderName, exist_ok = True)

# File name -> object to pickle (written in this order)
dArtifacts = {
    modelFileName: xgbModel,
    'lRawFeatures.pkl': lRawFeatures,
    'lProcessedFeatures.pkl': lProcessedFeatures,
    'lCatFeatures.pkl': lCatFeatures,
    'lSelectedFeatures.pkl': lSelectedFeatures,
    'scaler_dct.pkl': scaler_dct,
}

for artifactFileName, artifactObj in dArtifacts.items():
    # The context manager flushes and closes each file, so the model file is complete before it is hashed
    with open(os.path.join(folderName, artifactFileName), "wb") as hFile:
        pickle.dump(artifactObj, hFile)

In [None]:
# Display the name of the time stamped folder the model files were written to
folderName

In [None]:
# Hash the saved model file; the hash is stored in the parameters under 'MODEL_FILE_HASH'
modelFilePath   = os.path.join(folderName, modelFileName)
model_file_hash = hashfile(modelFilePath)

In [None]:
# Display the hash of the saved model file
model_file_hash

In [None]:
# Attach the hash of the saved model file to the parameters, so it is written with them to the YAML file
params['MODEL_FILE_HASH'] = model_file_hash

In [None]:
# Write the parameters (including 'MODEL_FILE_HASH') back to a YAML file.
# `ruamel.yaml.safe_dump()` was deprecated in ruamel.yaml 0.17 and raises in 0.18+.
# A pure-Python `YAML(typ = 'safe')` instance is the supported equivalent and also works on older releases.
# NOTE(review): the output file is 'params_no_gas_.yml' while the parameters were read from 'params_no_gas_3.yml' — confirm this is intended.
yamlDumper = ruamel.yaml.YAML(typ = 'safe', pure = True)
yamlDumper.allow_unicode = False #<! Keep the escaping of non ASCII characters as `safe_dump()` did by default
with open('params_no_gas_.yml', 'w') as yamlfile:
    yamlDumper.dump(params, yamlfile)
 