In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Basic Imports
import numpy as np
import pandas as pd
from collections import Counter

#Visualization
import matplotlib.pyplot as plt

# Preprocessing Imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler

# Resampling Imports
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn import FunctionSampler

# Feature Selection Imports
from sklearn.feature_selection import RFE

# Model Imports
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.pipeline import Pipeline
from sklearn.utils.parallel import delayed

# Metrics
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer, confusion_matrix, roc_auc_score, balanced_accuracy_score, f1_score, precision_score, recall_score, accuracy_score

# Enable experimental hist gradient boosting import
from sklearn.experimental import enable_hist_gradient_boosting


In [3]:
%%time
# Load the feature table; the first CSV column is used as the row index.
pandas_df = pd.read_csv(
    'RFE_10_features.csv',
    index_col=0,
)

CPU times: user 7.8 s, sys: 838 ms, total: 8.64 s
Wall time: 14.8 s


In [4]:
# Preview the first five rows of the loaded table.
pandas_df.head(n=5)

Unnamed: 0,failure,smart_5_raw,smart_9_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_193_raw,smart_197_raw,smart_198_raw,smart_241_raw,smart_242_raw
0,0,0,56776,0,0,0,9374,0,0,69477598000,572668460964
1,0,0,58746,0,0,0,49760,0,0,68899470440,441746036126
2,0,0,53468,0,0,0,53576,0,0,56870347352,328670471836
3,0,0,53804,0,0,0,7493,0,0,61124976952,510085438777
4,0,0,60198,0,0,0,38087,0,0,72258413320,371504760918


In [5]:
# Count how many rows fall into each class of the 'failure' column
num_failures = pandas_df['failure'].value_counts()

# Print the results. `.get(label, 0)` is used instead of `[label]` so that a
# class which never occurs is reported as 0 rather than raising KeyError.
print('Number of 0s:', num_failures.get(0, 0))
print('Number of 1s:', num_failures.get(1, 0))

Number of 0s: 6678738
Number of 1s: 632


In [6]:
# Feature matrix: every column except the 'failure' target, as a NumPy array.
X = pandas_df.drop(columns='failure').values
# Target vector: the 'failure' column as a NumPy array.
y = pandas_df.failure.values

In [7]:
def fdr_score(y_true, y_pred):
    """Return the FDR score, ``tp / (tp + fn)``, for binary 0/1 labels.

    This is the fraction of actual positives that were predicted positive
    (the same quantity as recall of class 1; "FDR" presumably stands for
    failure detection rate here).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels, using the same encoding as ``y_true``.

    Returns
    -------
    float
        ``tp / (tp + fn)``, or ``0.0`` when there are no actual positives.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # fold in which only one class appears in y_true/y_pred yields a 1x1
    # matrix and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    actual_positives = tp + fn
    if actual_positives == 0:
        # Nothing to detect: report 0.0 instead of a NaN from 0 / 0.
        return 0.0
    fdr = tp / actual_positives
    return fdr

def far_score(y_true, y_pred):
    """Return the FAR score, ``fp / (fp + tn)``, for binary 0/1 labels.

    This is the fraction of actual negatives that were predicted positive
    (the false positive rate; "FAR" presumably stands for false alarm rate).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels, using the same encoding as ``y_true``.

    Returns
    -------
    float
        ``fp / (fp + tn)``, or ``0.0`` when there are no actual negatives.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # fold in which only one class appears in y_true/y_pred yields a 1x1
    # matrix and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    actual_negatives = fp + tn
    if actual_negatives == 0:
        # No negatives, so no false alarms are possible: avoid a NaN from 0 / 0.
        return 0.0
    far = fp / actual_negatives
    return far
    


# Metrics computed on every cross-validation fold. The keys become the
# 'test_<key>' entries of the dict returned by cross_validate.
scoring = {
    'precision': 'precision',
    'gmean': make_scorer(geometric_mean_score),
    'f1': 'f1',
    'recall': 'recall',
    'fdr': make_scorer(fdr_score),
    'far': make_scorer(far_score),
}

In [8]:
# Empty results table; one row per evaluated model is added later via .loc.
metrics_df = pd.DataFrame(
    columns=['G-mean', 'F1', 'Precision', 'FDR', 'FAR', 'Recall'],
)

In [9]:
%%time
# Balanced Random Forest with sampling_strategy=0.0010 (the "1000:1" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=0.0010,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 1000:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.2min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.2min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.2min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.2min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218


In [10]:
%%time
# Balanced Random Forest with sampling_strategy=0.010 (the "100:1" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=0.010,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 100:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  42.5s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  42.8s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  42.9s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  43.1s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826


In [11]:
%%time
# Balanced Random Forest with sampling_strategy=0.10 (the "10:1" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=0.10,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 10:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  41.1s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.9s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  41.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.7s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799


In [12]:
%%time
# Balanced Random Forest with sampling_strategy=0.25 (the "4:1" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=0.25,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 4:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df


[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.6s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.8s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.5s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.6s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181


In [13]:
%%time
# Balanced Random Forest with sampling_strategy=0.33 (the "3:1" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=0.33,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 3:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.4s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.5s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.5s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.6s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769


In [14]:
%%time
# Balanced Random Forest with sampling_strategy=0.50 (the "2:1" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=0.50,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 2:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.7s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.5s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.6s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668


In [15]:
%%time
# Balanced Random Forest with sampling_strategy=0.75 (the "4:3" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=0.75,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 4:3 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.5s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s




Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304


In [16]:
%%time
# Balanced Random Forest with sampling_strategy=1 (the "1:1" row).
# Missing values are filled with 0 and features are max-abs scaled first.
pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            BalancedRandomForestClassifier(
                sampling_strategy=1,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Row order follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Balanced Random Forest with 1:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.7s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  40.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628


In [17]:
%%time

# Class-weighted Random Forest trained after random under-sampling with
# sampling_strategy=0.0010 (the "1000:1" row), evaluated with 5-fold CV.
RF_r_pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(sampling_strategy=0.0010, random_state=42)),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
                n_jobs=8,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
RF_r_scores = cross_validate(RF_r_pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    RF_r_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Add the row for Random Forest with random under-sampling; the order
# follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Random Forest with 1000:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=  19.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=  20.6s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=  19.5s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482


In [18]:
%%time

# Class-weighted Random Forest trained after random under-sampling with
# sampling_strategy=0.010 (the "100:1" row), evaluated with 5-fold CV.
RF_r_pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(sampling_strategy=0.010, random_state=42)),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
                n_jobs=8,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
RF_r_scores = cross_validate(RF_r_pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    RF_r_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Add the row for Random Forest with random under-sampling; the order
# follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Random Forest with 100:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   1.6s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   1.6s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   1.5s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [19]:
%%time

# Class-weighted Random Forest trained after random under-sampling with
# sampling_strategy=0.10 (the "10:1" row), evaluated with 5-fold CV.
RF_r_pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(sampling_strategy=0.10, random_state=42)),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
                n_jobs=8,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
RF_r_scores = cross_validate(RF_r_pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    RF_r_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Add the row for Random Forest with random under-sampling; the order
# follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Random Forest with 10:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.1s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [20]:
%%time

# Class-weighted Random Forest trained after random under-sampling with
# sampling_strategy=0.25 (the "4:1" row), evaluated with 5-fold CV.
RF_r_pipeline = imPipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(sampling_strategy=0.25, random_state=42)),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
                n_jobs=8,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation; every metric in `scoring` is evaluated per fold.
RF_r_scores = cross_validate(RF_r_pipeline, X, y, scoring=scoring, cv=5)

# Average each metric over the folds.
Gmean, F1, FDR, FAR, Precision, Recall = (
    RF_r_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_fdr', 'test_far', 'test_precision', 'test_recall')
)

# Add the row for Random Forest with random under-sampling; the order
# follows the metrics_df columns: G-mean, F1, Precision, FDR, FAR, Recall.
metrics_df.loc['Random Forest with 4:1 sampling'] = [
    Gmean, F1, Precision, FDR, FAR, Recall,
]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.1s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [21]:
%%time

#cv = 5 and all features
RF_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value= 0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.33)),
    ('classifier', RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=42, class_weight='balanced'))
], verbose=True)

# evaluate the pipeline using cross-validation with G-mean
RF_r_scores = cross_validate(RF_r_pipeline, X, y, cv=5, scoring=scoring)

Gmean = RF_r_scores['test_gmean'].mean()
F1 = RF_r_scores['test_f1'].mean()
FDR = RF_r_scores['test_fdr'].mean()
FAR = RF_r_scores['test_far'].mean()
Precision = RF_r_scores['test_precision'].mean()
Recall = RF_r_scores['test_recall'].mean()

# append the results for Random Forest with Random Forest with Random Under Sampling
metrics_df.loc['Random Forest with 3:1 sampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.1s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [22]:
%%time

#cv = 5 and all features
RF_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value= 0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.50)),
    ('classifier', RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=42, class_weight='balanced'))
], verbose=True)

# evaluate the pipeline using cross-validation with G-mean
RF_r_scores = cross_validate(RF_r_pipeline, X, y, cv=5, scoring=scoring)

Gmean = RF_r_scores['test_gmean'].mean()
F1 = RF_r_scores['test_f1'].mean()
FDR = RF_r_scores['test_fdr'].mean()
FAR = RF_r_scores['test_far'].mean()
Precision = RF_r_scores['test_precision'].mean()
Recall = RF_r_scores['test_recall'].mean()

# append the results for Random Forest with Random Forest with Random Under Sampling
metrics_df.loc['Random Forest with 2:1 sampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [23]:
%%time

#cv = 5 and all features
RF_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value= 0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.75)),
    ('classifier', RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=42, class_weight='balanced'))
], verbose=True)

# evaluate the pipeline using cross-validation with G-mean
RF_r_scores = cross_validate(RF_r_pipeline, X, y, cv=5, scoring=scoring)

Gmean = RF_r_scores['test_gmean'].mean()
F1 = RF_r_scores['test_f1'].mean()
FDR = RF_r_scores['test_fdr'].mean()
FAR = RF_r_scores['test_far'].mean()
Precision = RF_r_scores['test_precision'].mean()
Recall = RF_r_scores['test_recall'].mean()

# append the results for Random Forest with Random Forest with Random Under Sampling
metrics_df.loc['Random Forest with 4:3 sampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [24]:
%%time

#cv = 5 and all features
RF_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value= 0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=1)),
    ('classifier', RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=42, class_weight='balanced'))
], verbose=True)

# evaluate the pipeline using cross-validation with G-mean
RF_r_scores = cross_validate(RF_r_pipeline, X, y, cv=5, scoring=scoring)

Gmean = RF_r_scores['test_gmean'].mean()
F1 = RF_r_scores['test_f1'].mean()
FDR = RF_r_scores['test_fdr'].mean()
FAR = RF_r_scores['test_far'].mean()
Precision = RF_r_scores['test_precision'].mean()
Recall = RF_r_scores['test_recall'].mean()

# append the results for Random Forest with Random Forest with Random Under Sampling
metrics_df.loc['Random Forest with 1:1 sampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [25]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.0010)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 1000:1'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   5.7s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   3.6s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   3.5s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [26]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.010)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 100:1'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.3s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.3s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [27]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.10)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 10:1'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [28]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.25)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 4:1'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [29]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.33)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 3:1'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [30]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.50)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 2:1'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.1s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [31]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.75)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 4:3'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [32]:
%%time
# Create the logistic regression pipeline with under-sampling
LR_r_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('under_sampler', RandomUnderSampler(random_state=42, sampling_strategy=1)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000000, class_weight='balanced'))
], verbose=True)

# Evaluate the pipeline using cross-validation with G-mean
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring)

# Print the mean G-mean score and standard deviation
Gmean = LR_r_scores['test_gmean'].mean()
F1 = LR_r_scores['test_f1'].mean()
FDR = LR_r_scores['test_fdr'].mean()
FAR = LR_r_scores['test_far'].mean()
Precision = LR_r_scores['test_precision'].mean()
Recall = LR_r_scores['test_recall'].mean()


metrics_df.loc['Weighted Logistic Regression with 1:1'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest with 1000:1 sampling,0.226836,0.042129,0.035354,0.052218,0.000134,0.052218
Balanced Random Forest with 100:1 sampling,0.623556,0.038005,0.019975,0.390826,0.001815,0.390826
Balanced Random Forest with 10:1 sampling,0.894951,0.01224,0.006166,0.811799,0.012369,0.811799
Balanced Random Forest with 4:1 sampling,0.912568,0.008994,0.004521,0.848181,0.017717,0.848181
Balanced Random Forest with 3:1 sampling,0.912124,0.007799,0.003918,0.849769,0.020526,0.849769
Balanced Random Forest with 2:1 sampling,0.913334,0.005989,0.003005,0.857668,0.027009,0.857668
Balanced Random Forest with 4:3 sampling,0.915478,0.004503,0.002258,0.870304,0.036693,0.870304
Balanced Random Forest with 1:1 sampling,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest with 1000:1 sampling,0.217563,0.031538,0.02367,0.047482,0.000185,0.047482
Random Forest with 100:1 sampling,0.558447,0.031178,0.016402,0.314786,0.001792,0.314786


In [33]:
# Write the accumulated per-model metrics table to a CSV file in the working directory.
metrics_df.to_csv(path_or_buf='exp3_results_10features.csv')

#### Temporary scratch cells

In [3]:
import os

import matplotlib.pyplot as plt

# Make sure the figure output directory exists. exist_ok=True turns the call
# into a no-op when the directory is already there, replacing the separate
# exists()-then-mkdir() check that could fail if the directory was created
# between the two calls.
os.makedirs('./img/', exist_ok=True)

# Last expression of the cell: shows the path assembled for the first figure file.
os.path.join('./img/','plot1.svg')


In [34]:
from azureml.core import Workspace
from azureml.core.compute import AmlCompute, ComputeTarget

# Name of the compute target this cell stops.
compute_target_name = 's53308011'

# Workspace handle obtained through Workspace.from_config().
ws = Workspace.from_config()

# Look the target up by name in that workspace, then ask it to stop.
compute_target = ComputeTarget(name=compute_target_name, workspace=ws)
compute_target.stop()