In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imbtools.evaluation import BinaryExperiment

In [3]:
# Grid of keyword arguments from which the synthetic datasets are generated.
# Each key maps to the list of values to try; single-element lists are
# settings that stay fixed across all datasets.
parameters_range = {
    "n_classes": [2],
    "class_sep": [2],
    "n_clusters_per_class": [2, 3, 4, 5],
    "weights": [[0.95, 0.05], [0.98, 0.02], [0.992, 0.008]],
    "n_samples": [10000],
    "n_features": [5, 10, 50, 100],
    "n_redundant": [0],
    "n_informative": [5],
    "random_state": [5],
}

# Cartesian product of the grid: one kwargs dict per combination
# (4 cluster settings x 3 weightings x 4 feature counts = 48 dicts).
# Iterating a dict yields its keys, so zipping it against a combination
# pairs every parameter name with its chosen value.
datasets_parameters = [
    dict(zip(parameters_range, combination))
    for combination in product(*parameters_range.values())
]

In [4]:
# Build one synthetic dataset per parameter combination, in the same order
# as `datasets_parameters`; each element is whatever `make_classification`
# returns for that set of keyword arguments.
imbalanced_datasets = [
    make_classification(**dataset_kwargs)
    for dataset_kwargs in datasets_parameters
]

In [5]:
# Oversamplers under comparison. The leading `None` entry carries no
# oversampler (presumably the no-oversampling baseline -- confirm in imbtools).
# NOTE(review): `SMOTE(kind=...)` is only accepted by older imbalanced-learn
# releases; newer ones expose the borderline variants as `BorderlineSMOTE`.
# Confirm the pinned version before upgrading.
oversampling_methods = [
    None,
    RandomOverSampler(),
    SMOTE(),
    SMOTE(kind='borderline1'),
    SMOTE(kind='borderline2'),
    ADASYN(),
]

# Classifiers and their hyper-parameter grids. Both lists have two entries;
# presumably BinaryExperiment pairs them by position, so `None` would mean
# "no grid" for LogisticRegression and the dict would apply to
# GradientBoostingClassifier -- TODO confirm against imbtools.
classifiers = [
    LogisticRegression(),
    GradientBoostingClassifier(),
]
param_grids = [
    None,
    {
        'max_depth': [2, 3, 5, 8],
        'n_estimators': [10, 50, 80, 100],
    },
]

In [6]:
# Assemble the experiment from the datasets, classifiers and oversamplers
# defined in the earlier cells. The first three arguments stay positional
# because their parameter names are not visible from this notebook.
# `n_jobs=-1` presumably requests all available cores -- confirm in imbtools.
experiment = BinaryExperiment(
    imbalanced_datasets,
    classifiers,
    oversampling_methods,
    param_grids=param_grids,
    n_jobs=-1,
)

In [7]:
# Execute the whole experiment. The progress bar recorded for this cell shows
# 8640 steps and an elapsed time of about 1 day 19 h, so re-running it is
# expensive; consider persisting the results before restarting the kernel.
experiment.run(logging_results=False)

100% (8640 of 8640) |#############| Elapsed Time: 1 day, 19:42:05 ETA:  0:00:00

In [8]:
# Per-dataset overview table: number of features, number of instances,
# minority/majority counts and the resulting imbalance ratio.
experiment.datasets_summary_

Unnamed: 0,Dataset name,# of features,# of instances,# of minority instances,# of majority instances,Imbalance Ratio
0,dataset_1,5,10000,556,9444,16.99
1,dataset_2,10,10000,553,9447,17.08
2,dataset_3,50,10000,541,9459,17.48
3,dataset_4,100,10000,539,9461,17.55
4,dataset_5,5,10000,260,9740,37.46
5,dataset_6,10,10000,257,9743,37.91
6,dataset_7,50,10000,242,9758,40.32
7,dataset_8,100,10000,240,9760,40.67
8,dataset_9,5,10000,140,9860,70.43
9,dataset_10,10,10000,139,9861,70.94


In [9]:
# Mean cross-validation score for every dataset / classifier / oversampling
# method / metric combination. Rows with a blank oversampling method
# presumably correspond to the `None` (no oversampling) entry -- confirm.
experiment.mean_cv_results_

Unnamed: 0,Dataset,Classifier,Oversampling method,Metric,Mean CV score
0,dataset_1,GradientBoostingClassifier,ADASYN,f1 score,0.360643
1,dataset_1,GradientBoostingClassifier,ADASYN,geometric mean score,0.862144
2,dataset_1,GradientBoostingClassifier,ADASYN,roc auc score,0.944713
3,dataset_1,GradientBoostingClassifier,,f1 score,0.890233
4,dataset_1,GradientBoostingClassifier,,geometric mean score,0.914537
5,dataset_1,GradientBoostingClassifier,,roc auc score,0.950307
6,dataset_1,GradientBoostingClassifier,RandomOverSampler,f1 score,0.821979
7,dataset_1,GradientBoostingClassifier,RandomOverSampler,geometric mean score,0.934970
8,dataset_1,GradientBoostingClassifier,RandomOverSampler,roc auc score,0.949817
9,dataset_1,GradientBoostingClassifier,SMOTE,f1 score,0.830943


In [10]:
# Standard deviation of the cross-validation score, with the same row layout
# (dataset / classifier / oversampling method / metric) as the mean table.
experiment.std_cv_results_

Unnamed: 0,Dataset,Classifier,Oversampling method,Metric,Std CV score
0,dataset_1,GradientBoostingClassifier,ADASYN,f1 score,0.035686
1,dataset_1,GradientBoostingClassifier,ADASYN,geometric mean score,0.013836
2,dataset_1,GradientBoostingClassifier,ADASYN,roc auc score,0.004755
3,dataset_1,GradientBoostingClassifier,,f1 score,0.004363
4,dataset_1,GradientBoostingClassifier,,geometric mean score,0.007154
5,dataset_1,GradientBoostingClassifier,,roc auc score,0.002814
6,dataset_1,GradientBoostingClassifier,RandomOverSampler,f1 score,0.033506
7,dataset_1,GradientBoostingClassifier,RandomOverSampler,geometric mean score,0.001813
8,dataset_1,GradientBoostingClassifier,RandomOverSampler,roc auc score,0.004028
9,dataset_1,GradientBoostingClassifier,SMOTE,f1 score,0.029258


In [11]:
# Mean ranking of each oversampling method, one row per classifier and metric.
# The SMOTE2 / SMOTE3 columns presumably denote the borderline1 / borderline2
# SMOTE variants configured earlier -- confirm against imbtools' naming.
experiment.mean_ranking_results_

Unnamed: 0_level_0,Unnamed: 1_level_0,ADASYN,None,RandomOverSampler,SMOTE,SMOTE2,SMOTE3
Classifier,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GradientBoostingClassifier,f1 score,4.04,2.23,3.1,4.25,2.35,5.02
GradientBoostingClassifier,geometric mean score,3.21,5.77,3.98,3.23,2.9,1.92
GradientBoostingClassifier,roc auc score,3.98,3.88,3.48,4.0,2.77,2.9
LogisticRegression,f1 score,5.58,1.35,4.4,3.56,2.38,3.73
LogisticRegression,geometric mean score,4.9,5.58,2.48,2.9,2.6,2.54
LogisticRegression,roc auc score,5.54,2.12,3.31,3.71,3.06,3.25


In [None]:
# NOTE(review): this cell was never executed (`In [None]`) and repeats an
# expression already displayed in an earlier cell -- candidate for removal.
experiment.mean_ranking_results_

In [None]:
# NOTE(review): unexecuted duplicate cell (`In [None]`) showing the same
# ranking table as an earlier cell -- candidate for removal.
experiment.mean_ranking_results_

In [12]:
# Friedman test p-value for each classifier / metric pair, as computed by the
# experiment object across the compared oversampling methods.
experiment.friedman_test_results_

Unnamed: 0_level_0,Unnamed: 1_level_0,p-value
Classifier,Metric,Unnamed: 2_level_1
GradientBoostingClassifier,f1 score,5.2107570000000004e-17
GradientBoostingClassifier,geometric mean score,2.9142760000000004e-23
GradientBoostingClassifier,roc auc score,0.0008799962
LogisticRegression,f1 score,5.4992290000000005e-31
LogisticRegression,geometric mean score,3.617915e-26
LogisticRegression,roc auc score,2.088188e-17
