**Enhanced Phishing Transactions Detection on Ethereum Network with Tree-based Ensembles: An Empirical Study**

* **Researchers:**
  * **Shikah Alsunaidi** (Information and Computer Science Department, KFUPM)
  * **Dr. Hamoud Aljamaan** (Information and Computer Science Department, KFUPM)
---

# ▶ **1. Imports**

---



In [None]:
# import libraries section
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
%matplotlib inline

In [None]:
# Suppress FutureWarning messages so they do not clutter the cell outputs
import warnings as w
w.simplefilter(action='ignore',category=FutureWarning)

# ▶ **2. Global Functions**

---

## **2.1. Upload File Function**

---


In [None]:
# Prompt for a file upload in Google Colab and load the named CSV as a dataframe
def upload_file(file_name):
  """Upload a CSV through the Colab file picker and return it as a DataFrame.

  Parameters
  ----------
  file_name : str
      Name the uploaded file must have (e.g. 'tuning_DF.csv'); it is the key
      used to look the file up among the uploaded files.

  Returns
  -------
  pandas.DataFrame
      Parsed contents of the uploaded CSV.

  Raises
  ------
  KeyError
      If none of the uploaded files is named `file_name`.
  """
  # Colab-only dependency, kept local to the function that needs it
  import io
  from google.colab import files

  # Mapping of uploaded file name -> raw file bytes
  uploaded = files.upload()

  # Fail with an actionable message instead of a bare KeyError when the
  # uploaded file does not carry the expected name.
  if file_name not in uploaded:
    raise KeyError(
        f"Expected an uploaded file named {file_name!r}, got: {sorted(uploaded)}")

  # Convert the uploaded bytes into a dataframe
  return pd.read_csv(io.BytesIO(uploaded[file_name]))

## **2.2. Write to File Function**

---


In [None]:
# This function writes one model's scores as a column of a CSV results file
def write_to_excel (mcc_DF, model, fileName):
  """Store `mcc_DF` under the column `model` in the CSV file `fileName`.

  Despite its name, the function reads and writes CSV (not Excel) files.

  Parameters
  ----------
  mcc_DF : sequence or array-like
      Values to store; passed to `pd.DataFrame` / assigned as a column.
  model : str
      Column name. "RF" always starts a fresh file, overwriting any
      existing one.
  fileName : str
      Path of the CSV file to create or extend.
  """
  import os

  # "RF" resets the file. Any other model is appended as a new column, but
  # if the file does not exist yet it is created instead of crashing in
  # pd.read_csv with FileNotFoundError.
  if model == "RF" or not os.path.exists(fileName):
    df = pd.DataFrame(mcc_DF, columns=[model])
  else:
    df = pd.read_csv(fileName)
    df[model] = mcc_DF
  df.to_csv(fileName, index=False)

# ▶ **3. Hyperparameter Optimization**

---



## **3.1. Upload Tuning Data**


---



In [None]:
# Upload the tuning dataset (the uploaded file must be named 'tuning_DF.csv') and preview it
df = upload_file('tuning_DF.csv')
df

Saving tuning_DF.csv to tuning_DF.csv


Unnamed: 0,block_timestamp,block_number,gas,receipt_gas_used,gas_price,value,class
0,1514924516,4843488,121000,21000,5.100000e+10,5.880000e+18,0
1,1518851388,5105285,21000,21000,1.280000e+11,7.970000e+17,0
2,1518234307,5062740,100000,21000,9.100000e+10,5.540000e+17,0
3,1509353693,4456892,25200,21000,5.000000e+09,2.000000e+18,0
4,1521321343,5273526,21000,21000,9.000000e+10,9.980000e+17,0
...,...,...,...,...,...,...,...
7119,1515950413,4908151,21000,21000,6.000000e+10,1.350000e+17,1
7120,1517271052,4996727,37297,22297,9.900000e+10,0.000000e+00,1
7121,1525374746,5550718,55176,21784,8.000000e+09,0.000000e+00,1
7122,1520779620,5236675,25200,21000,9.593750e+09,2.000000e+15,0


In [None]:
# Divide the dataframe into the features (every column except the last one)
# and the label (the 'class' column); assumes 'class' is the last column.
x=df.iloc[:,:-1]
y=df['class']
x.shape

(7124, 6)

## **3.2. Install and Import Optuna Lib**

---

* **Resources:**
  * https://optuna.org/
  * https://towardsdatascience.com/exploring-optuna-a-hyper-parameter-framework-using-logistic-regression-84bd622cd3a5


---



In [None]:
%pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 5.1 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 9.8 MB/s 
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 59.0 MB/s 
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.3-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 8.0 MB/s 
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.11.0-py2.py3-none-any.whl (112 kB)
[K     |████████████████████████████████| 112 kB 50.5 MB/s 
[?25hCollecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-an

In [None]:
import optuna

## **3.3. Objective Functions**

---



### **3.3.1. RF**

---



In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):
  """Optuna objective for Random Forest: mean cross-validated MCC.

  Parameters
  ----------
  trial : optuna trial object
      Used to sample `rf_n_estimators` (50-100) and `rf_random_state`
      (22, 32 or 42).

  Returns
  -------
  float
      Mean Matthews correlation coefficient over 10 repeats of stratified
      10-fold cross-validation on the notebook-level `x` and `y`.
  """
  from sklearn.ensemble import RandomForestClassifier

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Fixed seed: every trial is scored on the same folds, so score differences
  # come from the hyperparameters rather than from a different random split.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

  # Step 2. Setup values for the hyperparameters:

  rf_n_estimators = trial.suggest_int("rf_n_estimators", 50, 100)
  rf_random_state = trial.suggest_categorical("rf_random_state", [22,32,42])
  classifier_obj = RandomForestClassifier(n_estimators = rf_n_estimators, random_state = rf_random_state)

  # The scaler is part of the pipeline, so it is re-fitted on each training fold
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-03 12:26:07,623][0m A new study created in memory with name: no-name-cb983a24-5a86-4f94-8e3a-8d781fae4539[0m
[32m[I 2022-11-03 12:27:39,140][0m Trial 0 finished with value: 0.9272603954708489 and parameters: {'rf_n_estimators': 100, 'rf_random_state': 32}. Best is trial 0 with value: 0.9272603954708489.[0m
[32m[I 2022-11-03 12:28:51,426][0m Trial 1 finished with value: 0.9272669139171645 and parameters: {'rf_n_estimators': 79, 'rf_random_state': 32}. Best is trial 1 with value: 0.9272669139171645.[0m
[32m[I 2022-11-03 12:29:52,286][0m Trial 2 finished with value: 0.9281008001613964 and parameters: {'rf_n_estimators': 68, 'rf_random_state': 42}. Best is trial 2 with value: 0.9281008001613964.[0m
[32m[I 2022-11-03 12:31:15,365][0m Trial 3 finished with value: 0.9281988135634636 and parameters: {'rf_n_estimators': 94, 'rf_random_state': 32}. Best is trial 3 with value: 0.9281988135634636.[0m
[32m[I 2022-11-03 12:32:14,332][0m Trial 4 finished with value: 0.

In [None]:
def get_Optimized_Parmaters():
  """Print the best trial, score and parameters of the global `study`.

  Returns
  -------
  The figure returned by `optuna.visualization.plot_slice` for the
  `rf_n_estimators` and `rf_random_state` parameters.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  # Visualize the slice plot. The figure is returned because a bare expression
  # inside a function is discarded; as the cell's last value it gets rendered.
  return optuna.visualization.plot_slice(study, params=['rf_n_estimators', 'rf_random_state'])

In [None]:
# RF: print the best trial, score and parameters of the current study
get_Optimized_Parmaters()

The best trial is : 
FrozenTrial(number=21, values=[0.9295314707119204], datetime_start=datetime.datetime(2022, 11, 3, 12, 50, 10, 373562), datetime_complete=datetime.datetime(2022, 11, 3, 12, 51, 30, 13983), params={'rf_n_estimators': 88, 'rf_random_state': 42}, distributions={'rf_n_estimators': IntDistribution(high=100, log=False, low=50, step=1), 'rf_random_state': CategoricalDistribution(choices=(22, 32, 42))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=21, state=TrialState.COMPLETE, value=None)
The best value is : 
0.9295314707119204
The best parameters are : 
{'rf_n_estimators': 88, 'rf_random_state': 42}


In [None]:
# Print the full record of the best trial in the current global `study` (RF when run in order)
print(f"The best trial is : \n{study.best_trial}")

The best trial is : 
FrozenTrial(number=21, values=[0.9295314707119204], datetime_start=datetime.datetime(2022, 11, 3, 12, 50, 10, 373562), datetime_complete=datetime.datetime(2022, 11, 3, 12, 51, 30, 13983), params={'rf_n_estimators': 88, 'rf_random_state': 42}, distributions={'rf_n_estimators': IntDistribution(high=100, log=False, low=50, step=1), 'rf_random_state': CategoricalDistribution(choices=(22, 32, 42))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=21, state=TrialState.COMPLETE, value=None)


In [None]:
# Print the best objective value (mean cross-validated MCC) reached by the current `study`
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.9295314707119204


In [None]:
# Print the hyperparameter values of the best trial in the current `study`
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'rf_n_estimators': 88, 'rf_random_state': 42}


In [None]:
# Visualize the slice plot for the two RF hyperparameters sampled in the study
optuna.visualization.plot_slice(study, params=['rf_n_estimators', 'rf_random_state'])

### **3.3.2. Ada**

---



In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):
  """Optuna objective for AdaBoost: mean cross-validated MCC.

  Parameters
  ----------
  trial : optuna trial object
      Used to sample `Ada_n_estimators` (100, 300, 500, 700 or 900) and
      `Ada_learning_rate` (0.1-1).

  Returns
  -------
  float
      Mean Matthews correlation coefficient over 10 repeats of stratified
      10-fold cross-validation on the notebook-level `x` and `y`.
  """
  from sklearn.ensemble import AdaBoostClassifier

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef


  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Fixed seed: every trial is scored on the same folds, so score differences
  # come from the hyperparameters rather than from a different random split.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

  # Step 2. Setup values for the hyperparameters:

  Ada_n_estimators = trial.suggest_categorical("Ada_n_estimators", [100, 300,500,700,900])
  Ada_learning_rate = trial.suggest_float("Ada_learning_rate", 0.1,1)
  classifier_obj = AdaBoostClassifier(n_estimators = Ada_n_estimators, learning_rate = Ada_learning_rate)

  # The scaler is part of the pipeline, so it is re-fitted on each training fold
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# NOTE(review): the saved log under this cell reports 'rf_n_estimators', which
# appears to predate the 'Ada_n_estimators' name used by the objective; re-run to refresh.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-03 18:08:27,761][0m A new study created in memory with name: no-name-aa01545e-e56f-4f1a-90f8-78bbf51793c1[0m
[32m[I 2022-11-03 18:11:49,324][0m Trial 0 finished with value: 0.8755076453933007 and parameters: {'rf_n_estimators': 300, 'Ada_learning_rate': 0.5071955099740973}. Best is trial 0 with value: 0.8755076453933007.[0m
[32m[I 2022-11-03 18:21:44,620][0m Trial 1 finished with value: 0.8785102974299552 and parameters: {'rf_n_estimators': 900, 'Ada_learning_rate': 0.21833990147056848}. Best is trial 1 with value: 0.8785102974299552.[0m
[32m[I 2022-11-03 18:31:41,831][0m Trial 2 finished with value: 0.903093782858042 and parameters: {'rf_n_estimators': 900, 'Ada_learning_rate': 0.6676309191343832}. Best is trial 2 with value: 0.903093782858042.[0m
[32m[I 2022-11-03 18:37:09,924][0m Trial 3 finished with value: 0.8958681738719951 and parameters: {'rf_n_estimators': 500, 'Ada_learning_rate': 0.7796120025504635}. Best is trial 2 with value: 0.903093782858042.

In [None]:
def get_Optimized_Parmaters():
  """Print the best trial, score and parameters of the global `study`.

  Returns
  -------
  The figure returned by `optuna.visualization.plot_slice` for the
  `Ada_n_estimators` and `Ada_learning_rate` parameters.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  # Visualize the slice plot. The AdaBoost objective samples 'Ada_n_estimators'
  # (not 'rf_n_estimators'), so that is the name the plot must ask for.
  # The figure is returned because a bare expression inside a function is
  # discarded; as the cell's last value it gets rendered.
  return optuna.visualization.plot_slice(study, params=['Ada_n_estimators', 'Ada_learning_rate'])

In [None]:
# Visualize the slice plot for the two AdaBoost hyperparameters sampled in the study
optuna.visualization.plot_slice(study, params=['Ada_n_estimators', 'Ada_learning_rate'])

In [None]:
# Ada: print the best trial, score and parameters of the current study
get_Optimized_Parmaters()

In [None]:
# Print the full record of the best trial in the current global `study` (Ada when run in order)
print(f"The best trial is : \n{study.best_trial}")

In [None]:
# Print the best objective value (mean cross-validated MCC) reached by the current `study`
print(f"The best value is : \n{study.best_value}")

In [None]:
# Print the hyperparameter values of the best trial in the current `study`
print(f"The best parameters are : \n{study.best_params}")

In [None]:
#Visualize the slice plot
# The AdaBoost objective samples 'Ada_n_estimators'; 'rf_n_estimators' is not a parameter of this study.
optuna.visualization.plot_slice(study, params=['Ada_n_estimators', 'Ada_learning_rate'])

In [None]:
# Slice plot of the AdaBoost study using the parameter names defined in its objective
optuna.visualization.plot_slice(study, params=['Ada_n_estimators', 'Ada_learning_rate'])

### **3.3.3. ET**

---



In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):
  """Optuna objective for Extra Trees: mean cross-validated MCC.

  Parameters
  ----------
  trial : optuna trial object
      Used to sample `ET_n_estimators` (50-100) and `ET_random_state`
      (22, 32 or 42).

  Returns
  -------
  float
      Mean Matthews correlation coefficient over 10 repeats of stratified
      10-fold cross-validation on the notebook-level `x` and `y`.
  """
  from sklearn.ensemble import ExtraTreesClassifier

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Fixed seed: every trial is scored on the same folds, so score differences
  # come from the hyperparameters rather than from a different random split.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

  # Step 2. Setup values for the hyperparameters:
  ET_n_estimators = trial.suggest_int("ET_n_estimators", 50, 100)
  ET_random_state = trial.suggest_categorical("ET_random_state", [22,32,42])
  classifier_obj = ExtraTreesClassifier(n_estimators = ET_n_estimators, random_state = ET_random_state)

  # The scaler is part of the pipeline, so it is re-fitted on each training fold
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-04 14:34:38,362][0m A new study created in memory with name: no-name-14a92f38-88dc-4703-a0e3-d2eee5ea05d3[0m
[32m[I 2022-11-04 14:35:17,926][0m Trial 0 finished with value: 0.9253140437313823 and parameters: {'ET_n_estimators': 58, 'ET_random_state': 42}. Best is trial 0 with value: 0.9253140437313823.[0m
[32m[I 2022-11-04 14:35:53,174][0m Trial 1 finished with value: 0.9249928075913946 and parameters: {'ET_n_estimators': 61, 'ET_random_state': 22}. Best is trial 0 with value: 0.9253140437313823.[0m
[32m[I 2022-11-04 14:36:38,726][0m Trial 2 finished with value: 0.9264092914137296 and parameters: {'ET_n_estimators': 98, 'ET_random_state': 22}. Best is trial 2 with value: 0.9264092914137296.[0m
[32m[I 2022-11-04 14:37:13,861][0m Trial 3 finished with value: 0.9254318561821482 and parameters: {'ET_n_estimators': 87, 'ET_random_state': 42}. Best is trial 2 with value: 0.9264092914137296.[0m
[32m[I 2022-11-04 14:37:49,294][0m Trial 4 finished with value: 0.9

In [None]:
def get_Optimized_Parmaters():
  """Print the best trial, score and parameters of the global `study`.

  Returns
  -------
  The figure returned by `optuna.visualization.plot_slice` for the
  `ET_n_estimators` and `ET_random_state` parameters.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  # Visualize the slice plot. The figure is returned because a bare expression
  # inside a function is discarded; as the cell's last value it gets rendered.
  return optuna.visualization.plot_slice(study, params=['ET_n_estimators', 'ET_random_state'])

In [None]:
# ET: print the best trial, score and parameters of the current study
get_Optimized_Parmaters()

The best trial is : 
FrozenTrial(number=15, values=[0.9272998077818962], datetime_start=datetime.datetime(2022, 11, 4, 14, 43, 54, 981761), datetime_complete=datetime.datetime(2022, 11, 4, 14, 44, 38, 236895), params={'ET_n_estimators': 100, 'ET_random_state': 22}, distributions={'ET_n_estimators': IntDistribution(high=100, log=False, low=50, step=1), 'ET_random_state': CategoricalDistribution(choices=(22, 32, 42))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=15, state=TrialState.COMPLETE, value=None)
The best value is : 
0.9272998077818962
The best parameters are : 
{'ET_n_estimators': 100, 'ET_random_state': 22}


In [None]:
# Print the full record of the best trial in the current global `study` (ET when run in order)
print(f"The best trial is : \n{study.best_trial}")

The best trial is : 
FrozenTrial(number=15, values=[0.9272998077818962], datetime_start=datetime.datetime(2022, 11, 4, 14, 43, 54, 981761), datetime_complete=datetime.datetime(2022, 11, 4, 14, 44, 38, 236895), params={'ET_n_estimators': 100, 'ET_random_state': 22}, distributions={'ET_n_estimators': IntDistribution(high=100, log=False, low=50, step=1), 'ET_random_state': CategoricalDistribution(choices=(22, 32, 42))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=15, state=TrialState.COMPLETE, value=None)


In [None]:
# Print the best objective value (mean cross-validated MCC) reached by the current `study`
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.9272998077818962


In [None]:
# Print the hyperparameter values of the best trial in the current `study`
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'ET_n_estimators': 100, 'ET_random_state': 22}


In [None]:
# Visualize the slice plot for the two Extra Trees hyperparameters sampled in the study
optuna.visualization.plot_slice(study, params=['ET_n_estimators', 'ET_random_state'])

### **3.3.4. DT**

---



In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):
  """Optuna objective for a Decision Tree: mean cross-validated MCC.

  Parameters
  ----------
  trial : optuna trial object
      Used to sample `DT_criterion` ('gini' or 'entropy'), `DT_random_state`
      (22, 32 or 42) and `DT_max_depth` (1-8).

  Returns
  -------
  float
      Mean Matthews correlation coefficient over 10 repeats of stratified
      10-fold cross-validation on the notebook-level `x` and `y`.
  """
  from sklearn import tree

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Fixed seed: every trial is scored on the same folds, so score differences
  # come from the hyperparameters rather than from a different random split.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

  # Step 2. Setup values for the hyperparameters:

  DT_criterion = trial.suggest_categorical("DT_criterion", ['gini', 'entropy'])
  DT_random_state = trial.suggest_categorical("DT_random_state", [22,32,42])
  DT_max_depth = trial.suggest_int("DT_max_depth", 1,8)

  classifier_obj =tree.DecisionTreeClassifier(random_state = DT_random_state, criterion = DT_criterion, max_depth = DT_max_depth)

  # The scaler is part of the pipeline, so it is re-fitted on each training fold
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-04 15:46:38,151][0m A new study created in memory with name: no-name-1ca3b0a9-0e7a-469d-8362-cf2cb49fa08f[0m
[32m[I 2022-11-04 15:46:40,119][0m Trial 0 finished with value: 0.8275747083091304 and parameters: {'DT_criterion': 'gini', 'DT_random_state': 32, 'DT_max_depth': 3}. Best is trial 0 with value: 0.8275747083091304.[0m
[32m[I 2022-11-04 15:46:42,814][0m Trial 1 finished with value: 0.8619208915500305 and parameters: {'DT_criterion': 'gini', 'DT_random_state': 42, 'DT_max_depth': 6}. Best is trial 1 with value: 0.8619208915500305.[0m
[32m[I 2022-11-04 15:46:45,297][0m Trial 2 finished with value: 0.8497273461574908 and parameters: {'DT_criterion': 'gini', 'DT_random_state': 32, 'DT_max_depth': 5}. Best is trial 1 with value: 0.8619208915500305.[0m
[32m[I 2022-11-04 15:46:49,972][0m Trial 3 finished with value: 0.8877456797752766 and parameters: {'DT_criterion': 'entropy', 'DT_random_state': 32, 'DT_max_depth': 8}. Best is trial 3 with value: 0.88774567

In [None]:
def get_Optimized_Parmaters():
  """Print the best trial, score and parameters of the global `study`.

  Returns
  -------
  The figure returned by `optuna.visualization.plot_slice` for the
  `DT_random_state`, `DT_criterion` and `DT_max_depth` parameters.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  # Visualize the slice plot. The figure is returned because a bare expression
  # inside a function is discarded; as the cell's last value it gets rendered.
  return optuna.visualization.plot_slice(study, params=['DT_random_state', 'DT_criterion','DT_max_depth'])

In [None]:
# DT: print the best trial, score and parameters of the current study
get_Optimized_Parmaters()

The best trial is : 
FrozenTrial(number=44, values=[0.8911208940474882], datetime_start=datetime.datetime(2022, 11, 4, 15, 49, 23, 667691), datetime_complete=datetime.datetime(2022, 11, 4, 15, 49, 27, 692896), params={'DT_criterion': 'entropy', 'DT_random_state': 42, 'DT_max_depth': 8}, distributions={'DT_criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'DT_random_state': CategoricalDistribution(choices=(22, 32, 42)), 'DT_max_depth': IntDistribution(high=8, log=False, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=44, state=TrialState.COMPLETE, value=None)
The best value is : 
0.8911208940474882
The best parameters are : 
{'DT_criterion': 'entropy', 'DT_random_state': 42, 'DT_max_depth': 8}


In [None]:
# Print the full record of the best trial in the current global `study` (DT when run in order)
print(f"The best trial is : \n{study.best_trial}")

The best trial is : 
FrozenTrial(number=44, values=[0.8911208940474882], datetime_start=datetime.datetime(2022, 11, 4, 15, 49, 23, 667691), datetime_complete=datetime.datetime(2022, 11, 4, 15, 49, 27, 692896), params={'DT_criterion': 'entropy', 'DT_random_state': 42, 'DT_max_depth': 8}, distributions={'DT_criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'DT_random_state': CategoricalDistribution(choices=(22, 32, 42)), 'DT_max_depth': IntDistribution(high=8, log=False, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=44, state=TrialState.COMPLETE, value=None)


In [None]:
# Print the best objective value (mean cross-validated MCC) reached by the current `study`
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.8911208940474882


In [None]:
# Print the hyperparameter values of the best trial in the current `study`
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'DT_criterion': 'entropy', 'DT_random_state': 42, 'DT_max_depth': 8}


In [None]:
# Visualize the slice plot for the three Decision Tree hyperparameters sampled in the study
optuna.visualization.plot_slice(study, params=['DT_random_state', 'DT_criterion','DT_max_depth'])

### **3.3.5. GBM**

---



In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):
  """Optuna objective for Gradient Boosting: mean cross-validated MCC.

  Parameters
  ----------
  trial : optuna trial object
      Used to sample `GB_n_estimators`, `GB_learning_rate`,
      `GB_min_samples_leaf`, `GB_max_depth` and `GB_loss` from fixed lists.

  Returns
  -------
  float
      Mean Matthews correlation coefficient over 10 repeats of stratified
      10-fold cross-validation on the notebook-level `x` and `y`.
  """
  import sklearn
  from sklearn.ensemble import GradientBoostingClassifier

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Fixed seed: every trial is scored on the same folds, so score differences
  # come from the hyperparameters rather than from a different random split.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

  # Step 2. Setup values for the hyperparameters:

  GB_n_estimators = trial.suggest_categorical("GB_n_estimators", [50,100,500,1000])
  GB_learning_rate = trial.suggest_categorical("GB_learning_rate", [0.001,0.01,0.1])
  GB_min_samples_leaf = trial.suggest_categorical("GB_min_samples_leaf", [1,5,10])
  GB_max_depth=trial.suggest_categorical("GB_max_depth", [3,7,9])
  GB_loss=trial.suggest_categorical("GB_loss", ['deviance','exponential'])

  # scikit-learn renamed the 'deviance' loss to 'log_loss' in 1.1 and removed
  # the old name in 1.3. The study keeps the 'deviance' label so recorded
  # parameters stay comparable, while the estimator receives the name that the
  # installed scikit-learn version accepts.
  sk_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
  if GB_loss == 'deviance' and sk_version >= (1, 1):
    loss_name = 'log_loss'
  else:
    loss_name = GB_loss

  classifier_obj =GradientBoostingClassifier(n_estimators=GB_n_estimators,learning_rate=GB_learning_rate,min_samples_leaf=GB_min_samples_leaf,max_depth=GB_max_depth,loss=loss_name)

  # The scaler is part of the pipeline, so it is re-fitted on each training fold
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-04 19:59:37,903][0m A new study created in memory with name: no-name-9e5a9bfb-8630-4bd9-bc5c-9f7026f2fd37[0m
[32m[I 2022-11-04 20:15:58,374][0m Trial 0 finished with value: 0.0 and parameters: {'GB_n_estimators': 500, 'GB_learning_rate': 0.001, 'GB_min_samples_leaf': 10, 'GB_max_depth': 9, 'GB_loss': 'exponential'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-04 20:18:38,125][0m Trial 1 finished with value: 0.0 and parameters: {'GB_n_estimators': 100, 'GB_learning_rate': 0.001, 'GB_min_samples_leaf': 5, 'GB_max_depth': 7, 'GB_loss': 'exponential'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-04 20:21:52,018][0m Trial 2 finished with value: 0.0 and parameters: {'GB_n_estimators': 100, 'GB_learning_rate': 0.001, 'GB_min_samples_leaf': 5, 'GB_max_depth': 9, 'GB_loss': 'deviance'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-04 20:35:47,680][0m Trial 3 finished with value: 0.9033259758894373 and parameters: {'GB_n_estimators': 500, 'GB_lear

In [None]:
def get_Optimized_Parmaters():
  """Print the best trial, score and parameters of the global `study`.

  Returns
  -------
  The figure returned by `optuna.visualization.plot_slice` for the five
  `GB_*` parameters sampled by the Gradient Boosting objective.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  # Visualize the slice plot. The figure is returned because a bare expression
  # inside a function is discarded; as the cell's last value it gets rendered.
  return optuna.visualization.plot_slice(study, params=['GB_n_estimators','GB_learning_rate','GB_min_samples_leaf','GB_max_depth','GB_loss'])

In [None]:
# GBM: print the best trial, score and parameters of the current study
get_Optimized_Parmaters()

In [None]:
# Print the full record of the best trial in the current global `study` (GBM when run in order)
print(f"The best trial is : \n{study.best_trial}")

In [None]:
# Print the best objective value (mean cross-validated MCC) reached by the current `study`
print(f"The best value is : \n{study.best_value}")

In [None]:
# Print the hyperparameter values of the best trial in the current `study`
print(f"The best parameters are : \n{study.best_params}")

In [None]:
# Visualize the slice plot for the five Gradient Boosting hyperparameters sampled in the study
optuna.visualization.plot_slice(study, params=['GB_n_estimators','GB_learning_rate','GB_min_samples_leaf','GB_max_depth','GB_loss'])

### **3.3.6. XGB**
 * https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn
---



In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):
  """Optuna objective for XGBoost: mean cross-validated MCC.

  Parameters
  ----------
  trial : optuna trial object
      Used to sample `XGB_n_estimators` (100, 300, 500, 700 or 900),
      `XGB_learning_rate` (0.1-0.7), `XGB_max_depth` (3-7) and
      `XGB_random_state` (0-42).

  Returns
  -------
  float
      Mean Matthews correlation coefficient over 10 repeats of stratified
      10-fold cross-validation on the notebook-level `x` and `y`.
  """
  from xgboost import XGBClassifier

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Fixed seed: every trial is scored on the same folds, so score differences
  # come from the hyperparameters rather than from a different random split.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

  # Step 2. Setup values for the hyperparameters:

  XGB_n_estimators = trial.suggest_categorical("XGB_n_estimators", [100,300,500,700,900])
  XGB_learning_rate = trial.suggest_float("XGB_learning_rate", 0.1,0.7)
  XGB_max_depth=trial.suggest_int("XGB_max_depth", 3,7)
  XGB_random_state = trial.suggest_int("XGB_random_state", 0,42)

  classifier_obj =XGBClassifier(n_estimators=XGB_n_estimators,learning_rate=XGB_learning_rate,max_depth=XGB_max_depth, random_state=XGB_random_state)

  # The scaler is part of the pipeline, so it is re-fitted on each training fold
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-05 18:07:16,880][0m A new study created in memory with name: no-name-4ea31841-78f8-4518-ab00-2aa0f12386ae[0m
[32m[I 2022-11-05 18:07:48,240][0m Trial 0 finished with value: 0.881730137578108 and parameters: {'XGB_n_estimators': 100, 'XGB_learning_rate': 0.33116162101885166, 'XGB_max_depth': 3, 'XGB_random_state': 25}. Best is trial 0 with value: 0.881730137578108.[0m
[32m[I 2022-11-05 18:08:20,393][0m Trial 1 finished with value: 0.8983542850033086 and parameters: {'XGB_n_estimators': 100, 'XGB_learning_rate': 0.34194878246945604, 'XGB_max_depth': 3, 'XGB_random_state': 36}. Best is trial 1 with value: 0.8983542850033086.[0m
[32m[I 2022-11-05 18:13:30,625][0m Trial 2 finished with value: 0.9082840872953666 and parameters: {'XGB_n_estimators': 700, 'XGB_learning_rate': 0.6857989009271866, 'XGB_max_depth': 6, 'XGB_random_state': 9}. Best is trial 2 with value: 0.9082840872953666.[0m
[32m[I 2022-11-05 18:16:00,394][0m Trial 3 finished with value: 0.87716120959

In [None]:
def get_Optimized_Parmaters():
  """Print the best trial, score and parameters of the global `study`.

  Returns
  -------
  The figure returned by `optuna.visualization.plot_slice` for the four
  `XGB_*` parameters sampled by the XGBoost objective.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  # Visualize the slice plot. The figure is returned because a bare expression
  # inside a function is discarded; as the cell's last value it gets rendered.
  return optuna.visualization.plot_slice(study, params=['XGB_n_estimators','XGB_learning_rate','XGB_max_depth','XGB_random_state'])

In [None]:
# XGB: print the best trial, score and parameters of the current study
get_Optimized_Parmaters()

The best trial is : 
FrozenTrial(number=16, values=[0.9219747578450265], datetime_start=datetime.datetime(2022, 11, 5, 18, 47, 17, 312500), datetime_complete=datetime.datetime(2022, 11, 5, 18, 48, 16, 817930), params={'XGB_n_estimators': 100, 'XGB_learning_rate': 0.513713483627989, 'XGB_max_depth': 7, 'XGB_random_state': 19}, distributions={'XGB_n_estimators': CategoricalDistribution(choices=(100, 300, 500, 700, 900)), 'XGB_learning_rate': FloatDistribution(high=0.7, log=False, low=0.1, step=None), 'XGB_max_depth': IntDistribution(high=7, log=False, low=3, step=1), 'XGB_random_state': IntDistribution(high=42, log=False, low=0, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=16, state=TrialState.COMPLETE, value=None)
The best value is : 
0.9219747578450265
The best parameters are : 
{'XGB_n_estimators': 100, 'XGB_learning_rate': 0.513713483627989, 'XGB_max_depth': 7, 'XGB_random_state': 19}


In [None]:
# Print the full record of the best trial in the current global `study` (XGB when run in order).
# NOTE(review): the saved output under this cell (trial number=1, no 'XGB_random_state')
# does not match the objective defined in this section; it appears to come from an
# earlier run - re-run to refresh.
print(f"The best trial is : \n{study.best_trial}")

The best trial is : 
FrozenTrial(number=1, values=[0.9213258757657282], datetime_start=datetime.datetime(2022, 11, 5, 15, 4, 25, 514656), datetime_complete=datetime.datetime(2022, 11, 5, 15, 6, 28, 732260), params={'XGB_n_estimators': 326, 'XGB_learning_rate': 0.45172512178720314, 'XGB_max_depth': 4}, distributions={'XGB_n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'XGB_learning_rate': FloatDistribution(high=0.5, log=False, low=0.1, step=None), 'XGB_max_depth': IntDistribution(high=6, log=False, low=3, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE, value=None)


In [None]:
# Print the best objective value (mean cross-validated MCC) reached by the current `study`.
# NOTE(review): the saved output under this cell differs from the value printed by
# get_Optimized_Parmaters() in this section; it appears stale - re-run to refresh.
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.9213258757657282


In [None]:
# Print the hyperparameter values of the best trial in the current `study`.
# NOTE(review): the saved output under this cell lacks 'XGB_random_state', which the
# objective in this section samples; it appears stale - re-run to refresh.
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'XGB_n_estimators': 326, 'XGB_learning_rate': 0.45172512178720314, 'XGB_max_depth': 4}


In [None]:
# Visualize the slice plot for the four XGBoost hyperparameters sampled in the study
optuna.visualization.plot_slice(study, params=['XGB_n_estimators','XGB_learning_rate','XGB_max_depth','XGB_random_state'])

### **3.3.7. HGB**
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html
---



In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):
  """Optuna objective for Histogram Gradient Boosting: mean cross-validated MCC.

  Parameters
  ----------
  trial : optuna trial object
      Used to sample `HGB_max_iter` (100, 300, 500, 700 or 900),
      `HGB_learning_rate` (0.1-0.7), `HGB_min_samples_leaf` (5-25) and
      `HGB_max_depth` (None, 1, 3, 5 or 7).

  Returns
  -------
  float
      Mean Matthews correlation coefficient over 10 repeats of stratified
      10-fold cross-validation on the notebook-level `x` and `y`.
  """
  from sklearn.ensemble import HistGradientBoostingClassifier

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Fixed seed: every trial is scored on the same folds, so score differences
  # come from the hyperparameters rather than from a different random split.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

  # Step 2. Setup values for the hyperparameters:

  HGB_max_iter = trial.suggest_categorical("HGB_max_iter", [100,300,500,700,900])
  HGB_learning_rate = trial.suggest_float("HGB_learning_rate", 0.1,0.7)
  HGB_min_samples_leaf = trial.suggest_int('HGB_min_samples_leaf', 5,25)
  HGB_max_depth = trial.suggest_categorical("HGB_max_depth", [None,1,3,5,7])

  classifier_obj =HistGradientBoostingClassifier(max_iter = HGB_max_iter,learning_rate = HGB_learning_rate,
                                min_samples_leaf = HGB_min_samples_leaf, max_depth = HGB_max_depth)

  # The scaler is part of the pipeline, so it is re-fitted on each training fold
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-09 19:24:53,485][0m A new study created in memory with name: no-name-b12d1e7b-3710-4260-b8de-deaef9c4e525[0m
[32m[I 2022-11-09 19:25:24,606][0m Trial 0 finished with value: 0.9157198868033453 and parameters: {'HGB_max_iter': 100, 'HGB_learning_rate': 0.21549812589555356, 'HGB_min_samples_leaf': 15, 'HGB_max_depth': 3}. Best is trial 0 with value: 0.9157198868033453.[0m
[32m[I 2022-11-09 19:27:27,765][0m Trial 1 finished with value: 0.9399904169357814 and parameters: {'HGB_max_iter': 900, 'HGB_learning_rate': 0.21266216212884387, 'HGB_min_samples_leaf': 15, 'HGB_max_depth': 3}. Best is trial 1 with value: 0.9399904169357814.[0m
[32m[I 2022-11-09 19:30:25,188][0m Trial 2 finished with value: 0.939535858261341 and parameters: {'HGB_max_iter': 700, 'HGB_learning_rate': 0.3556307890904531, 'HGB_min_samples_leaf': 17, 'HGB_max_depth': None}. Best is trial 1 with value: 0.9399904169357814.[0m
[32m[I 2022-11-09 19:32:59,812][0m Trial 3 finished with value: 0.939276

In [None]:
def get_Optimized_Parmaters():
  """Report the outcome of the current HGB Optuna study.

  Reads the module-level `study`, prints its best trial, best objective
  value and best hyperparameters, then builds the slice plot for the four
  tuned HGB hyperparameters.

  Returns:
    The figure produced by `optuna.visualization.plot_slice`. It is returned
    (instead of being dropped) so that calling this function as the last
    expression of a notebook cell renders the plot.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  #Visualize the slice plot
  # A bare expression inside a function is never displayed by Jupyter, so the
  # figure is handed back to the caller rather than discarded.
  slice_fig = optuna.visualization.plot_slice(
      study,
      params=['HGB_max_iter','HGB_learning_rate', 'HGB_min_samples_leaf', 'HGB_max_depth'])
  return slice_fig


In [None]:
# Report the best trial, best score and best parameters of the HGB study.
get_Optimized_Parmaters()

The best trial is : 
FrozenTrial(number=16, values=[0.9429795405527905], datetime_start=datetime.datetime(2022, 11, 9, 19, 54, 23, 303322), datetime_complete=datetime.datetime(2022, 11, 9, 19, 58, 23, 440108), params={'HGB_max_iter': 700, 'HGB_learning_rate': 0.15798616895216405, 'HGB_min_samples_leaf': 7, 'HGB_max_depth': 7}, distributions={'HGB_max_iter': CategoricalDistribution(choices=(100, 300, 500, 700, 900)), 'HGB_learning_rate': FloatDistribution(high=0.7, log=False, low=0.1, step=None), 'HGB_min_samples_leaf': IntDistribution(high=25, log=False, low=5, step=1), 'HGB_max_depth': CategoricalDistribution(choices=(None, 1, 3, 5, 7))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=16, state=TrialState.COMPLETE, value=None)
The best value is : 
0.9429795405527905
The best parameters are : 
{'HGB_max_iter': 700, 'HGB_learning_rate': 0.15798616895216405, 'HGB_min_samples_leaf': 7, 'HGB_max_depth': 7}


In [None]:
# Getting the best trial:
# Prints the full FrozenTrial record of the best trial in `study`
# (the same text get_Optimized_Parmaters() prints; kept as a standalone cell).
print(f"The best trial is : \n{study.best_trial}")

The best trial is : 
FrozenTrial(number=16, values=[0.9429795405527905], datetime_start=datetime.datetime(2022, 11, 9, 19, 54, 23, 303322), datetime_complete=datetime.datetime(2022, 11, 9, 19, 58, 23, 440108), params={'HGB_max_iter': 700, 'HGB_learning_rate': 0.15798616895216405, 'HGB_min_samples_leaf': 7, 'HGB_max_depth': 7}, distributions={'HGB_max_iter': CategoricalDistribution(choices=(100, 300, 500, 700, 900)), 'HGB_learning_rate': FloatDistribution(high=0.7, log=False, low=0.1, step=None), 'HGB_min_samples_leaf': IntDistribution(high=25, log=False, low=5, step=1), 'HGB_max_depth': CategoricalDistribution(choices=(None, 1, 3, 5, 7))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=16, state=TrialState.COMPLETE, value=None)


In [None]:
# Getting the best score:
# Prints the highest objective value reached by `study`; the objective
# returns the mean cross-validated MCC, so this is the best mean MCC.
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.9429795405527905


In [None]:
# Getting the best parameters:
# Prints the hyperparameter dictionary of the best trial in `study`.
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'HGB_max_iter': 700, 'HGB_learning_rate': 0.15798616895216405, 'HGB_min_samples_leaf': 7, 'HGB_max_depth': 7}


In [None]:
# Optuna slice plot of the study, restricted to the four tuned HGB
# hyperparameters. Left as the cell's final expression so it is rendered.
optuna.visualization.plot_slice(
    study,
    params=[
        'HGB_max_iter',
        'HGB_learning_rate',
        'HGB_min_samples_leaf',
        'HGB_max_depth',
    ],
)

### **3.3.8. CAT**
* https://catboost.ai/en/docs/concepts/parameter-tuning
* https://catboost.ai/en/docs/concepts/loss-functions-classification
---



In [None]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial, cv_seed=42):
  """Optuna objective for CatBoost: mean cross-validated MCC.

  Builds a StandardScaler -> CatBoostClassifier pipeline from the
  hyperparameters suggested by `trial` and scores it with 10 x 10 repeated
  stratified k-fold cross-validation on the module-level `x` and `y`
  (defined elsewhere in the notebook; presumably the tuning features and
  class labels -- confirm).

  Args:
    trial: Optuna trial used to sample the hyperparameters.
    cv_seed: Seed for the cross-validation splitter. With a fixed seed every
      trial is scored on the same folds, so differences between trials come
      from the hyperparameters rather than from the random splits, and the
      folds can be reproduced on a re-run.

  Returns:
    The mean Matthews correlation coefficient over the 100 test folds
    (10 splits x 10 repeats).
  """
  from catboost import CatBoostClassifier

  from sklearn.model_selection import RepeatedStratifiedKFold
  from sklearn.model_selection import cross_validate

  from sklearn.metrics import make_scorer
  from sklearn.metrics import matthews_corrcoef

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import  StandardScaler


  scaler = StandardScaler()

  scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef)}
  # Seeded splitter: identical folds for every trial and every re-run.
  rfold= RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=cv_seed)

  # Step 2. Setup values for the hyperparameters:

  CAT_n_estimators = trial.suggest_categorical("CAT_n_estimators", [100,300,500,700,900])
  CAT_learning_rate = trial.suggest_float("CAT_learning_rate", 0.1,0.7)
  CAT_depth = trial.suggest_int('CAT_depth', 4,10)
  # NOTE(review): the CatBoost docs describe min_data_in_leaf as applying only
  # to the Lossguide / Depthwise growing policies; no grow_policy is set here,
  # so this hyperparameter may have no effect on the model -- confirm.
  CAT_min_data_in_leaf = trial.suggest_int("CAT_min_data_in_leaf", 1,10)
  CAT_loss_function = trial.suggest_categorical("CAT_loss_function", ['Logloss','CrossEntropy'])

  classifier_obj =CatBoostClassifier(n_estimators = CAT_n_estimators, learning_rate = CAT_learning_rate,
                                     depth = CAT_depth, min_data_in_leaf = CAT_min_data_in_leaf,
                                     loss_function = CAT_loss_function, logging_level='Silent')

  # The scaler is part of the pipeline, so it is re-fitted on the training
  # portion of each fold by cross_validate.
  pipeline = make_pipeline(scaler, classifier_obj)

  # Step 3: Scoring method:
  score=cross_validate(pipeline, x , y, cv=rfold,scoring=scoring, return_train_score=False)
  mcc = score['test_matthews_corrcoef'].mean()
  return mcc

In [None]:
# Step 4: Running it
# Create a maximising study (the objective returns mean MCC) and run 50 trials.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable across kernel restarts. TPE is the sampler Optuna already uses by
# default for single-objective studies, so only the seeding changes.
# This seeds only the sampler; randomness inside `objective` (e.g. CV
# shuffling) is controlled separately.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2022-11-10 11:36:18,056][0m A new study created in memory with name: no-name-d22fe03e-d90e-4e51-8698-859e75b7503a[0m
[32m[I 2022-11-10 11:38:08,536][0m Trial 0 finished with value: 0.9410769466099532 and parameters: {'CAT_n_estimators': 300, 'CAT_learning_rate': 0.4673814345131694, 'CAT_depth': 5, 'CAT_min_data_in_leaf': 6, 'CAT_loss_function': 'Logloss'}. Best is trial 0 with value: 0.9410769466099532.[0m
[32m[I 2022-11-10 11:46:20,655][0m Trial 1 finished with value: 0.9396746803568771 and parameters: {'CAT_n_estimators': 500, 'CAT_learning_rate': 0.2813810081420888, 'CAT_depth': 9, 'CAT_min_data_in_leaf': 6, 'CAT_loss_function': 'CrossEntropy'}. Best is trial 0 with value: 0.9410769466099532.[0m
[32m[I 2022-11-10 11:50:13,048][0m Trial 2 finished with value: 0.9399197272451372 and parameters: {'CAT_n_estimators': 700, 'CAT_learning_rate': 0.6834138122443394, 'CAT_depth': 6, 'CAT_min_data_in_leaf': 9, 'CAT_loss_function': 'CrossEntropy'}. Best is trial 0 with value:

In [None]:
def get_Optimized_Parmaters():
  """Report the outcome of the current CatBoost Optuna study.

  Reads the module-level `study`, prints its best trial, best objective
  value and best hyperparameters, then builds the slice plot for the five
  tuned CatBoost hyperparameters.

  Returns:
    The figure produced by `optuna.visualization.plot_slice`. It is returned
    (instead of being dropped) so that calling this function as the last
    expression of a notebook cell renders the plot.
  """

  # Getting the best trial:
  print(f"The best trial is : \n{study.best_trial}")

  # Getting the best score:
  print(f"The best value is : \n{study.best_value}")

  # Getting the best parameters:
  print(f"The best parameters are : \n{study.best_params}")

  #Visualize the slice plot
  # A bare expression inside a function is never displayed by Jupyter, so the
  # figure is handed back to the caller rather than discarded.
  slice_fig = optuna.visualization.plot_slice(
      study,
      params=['CAT_n_estimators','CAT_learning_rate', 'CAT_depth', 'CAT_min_data_in_leaf', 'CAT_loss_function'])
  return slice_fig


In [None]:
# Report the best trial, best score and best parameters of the CatBoost study.
get_Optimized_Parmaters()

The best trial is : 
FrozenTrial(number=28, values=[0.9452421986737543], datetime_start=datetime.datetime(2022, 11, 10, 13, 41, 31, 954446), datetime_complete=datetime.datetime(2022, 11, 10, 13, 45, 19, 544393), params={'CAT_n_estimators': 900, 'CAT_learning_rate': 0.16072676121148252, 'CAT_depth': 4, 'CAT_min_data_in_leaf': 7, 'CAT_loss_function': 'Logloss'}, distributions={'CAT_n_estimators': CategoricalDistribution(choices=(100, 300, 500, 700, 900)), 'CAT_learning_rate': FloatDistribution(high=0.7, log=False, low=0.1, step=None), 'CAT_depth': IntDistribution(high=10, log=False, low=4, step=1), 'CAT_min_data_in_leaf': IntDistribution(high=10, log=False, low=1, step=1), 'CAT_loss_function': CategoricalDistribution(choices=('Logloss', 'CrossEntropy'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=28, state=TrialState.COMPLETE, value=None)
The best value is : 
0.9452421986737543
The best parameters are : 
{'CAT_n_estimators': 900, 'CAT_learning_rate': 0.160726761211

In [None]:
# Getting the best trial:
# Prints the full FrozenTrial record of the best trial in `study`
# (the same text get_Optimized_Parmaters() prints; kept as a standalone cell).
print(f"The best trial is : \n{study.best_trial}")

The best trial is : 
FrozenTrial(number=28, values=[0.9452421986737543], datetime_start=datetime.datetime(2022, 11, 10, 13, 41, 31, 954446), datetime_complete=datetime.datetime(2022, 11, 10, 13, 45, 19, 544393), params={'CAT_n_estimators': 900, 'CAT_learning_rate': 0.16072676121148252, 'CAT_depth': 4, 'CAT_min_data_in_leaf': 7, 'CAT_loss_function': 'Logloss'}, distributions={'CAT_n_estimators': CategoricalDistribution(choices=(100, 300, 500, 700, 900)), 'CAT_learning_rate': FloatDistribution(high=0.7, log=False, low=0.1, step=None), 'CAT_depth': IntDistribution(high=10, log=False, low=4, step=1), 'CAT_min_data_in_leaf': IntDistribution(high=10, log=False, low=1, step=1), 'CAT_loss_function': CategoricalDistribution(choices=('Logloss', 'CrossEntropy'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=28, state=TrialState.COMPLETE, value=None)


In [None]:
# Getting the best score:
# Prints the highest objective value reached by `study`; the objective
# returns the mean cross-validated MCC, so this is the best mean MCC.
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.9452421986737543


In [None]:
# Getting the best parameters:
# Prints the hyperparameter dictionary of the best trial in `study`.
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'CAT_n_estimators': 900, 'CAT_learning_rate': 0.16072676121148252, 'CAT_depth': 4, 'CAT_min_data_in_leaf': 7, 'CAT_loss_function': 'Logloss'}


In [None]:
# Optuna slice plot of the study, restricted to the five tuned CatBoost
# hyperparameters. Left as the cell's final expression so it is rendered.
optuna.visualization.plot_slice(
    study,
    params=[
        'CAT_n_estimators',
        'CAT_learning_rate',
        'CAT_depth',
        'CAT_min_data_in_leaf',
        'CAT_loss_function',
    ],
)