In [2]:
import warnings
# Blanket filter: with no category/module arguments this suppresses every
# warning raised for the rest of the kernel session, including any emitted by
# the modelling libraries used in later cells.
warnings.filterwarnings("ignore")

In [3]:
# Basic Imports
import numpy as np
import pandas as pd
from collections import Counter

#Visualization
import matplotlib.pyplot as plt

# Preprocessing Imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler

# Resampling Imports
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn import FunctionSampler

# Feature Selection Imports
from sklearn.feature_selection import RFE

# Model Imports
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.pipeline import Pipeline
from sklearn.utils.parallel import delayed

# Metrics
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer, confusion_matrix, roc_auc_score, balanced_accuracy_score, f1_score, precision_score, recall_score, accuracy_score

# Enable experimental hist gradient boosting import (only required on scikit-learn < 1.0; HistGradientBoostingClassifier is stable in later releases)
from sklearn.experimental import enable_hist_gradient_boosting


In [4]:
%%time
# Load the feature table; the first CSV column is used as the row index.
pandas_df = pd.read_csv(filepath_or_buffer='RFE_10_features.csv', index_col=0)

CPU times: user 8.15 s, sys: 1.54 s, total: 9.69 s
Wall time: 14.8 s


In [5]:
# Preview the first five rows of the loaded table.
pandas_df.head(n=5)

Unnamed: 0,failure,smart_5_raw,smart_9_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_193_raw,smart_197_raw,smart_198_raw,smart_241_raw,smart_242_raw
0,0,0,56776,0,0,0,9374,0,0,69477598000,572668460964
1,0,0,58746,0,0,0,49760,0,0,68899470440,441746036126
2,0,0,53468,0,0,0,53576,0,0,56870347352,328670471836
3,0,0,53804,0,0,0,7493,0,0,61124976952,510085438777
4,0,0,60198,0,0,0,38087,0,0,72258413320,371504760918


In [6]:
# Class balance of the target: how many rows carry each `failure` label.
num_failures = pandas_df['failure'].value_counts()

# Report the healthy (0) and failed (1) counts by label.
print('Number of 0s:', num_failures.loc[0])
print('Number of 1s:', num_failures.loc[1])

Number of 0s: 6678738
Number of 1s: 632


In [7]:
# Target vector: the `failure` column as a NumPy array.
y = pandas_df['failure'].values
# Feature matrix: every remaining column as a NumPy array.
X = pandas_df.drop(columns='failure').values

In [8]:
def fdr_score(y_true, y_pred):
    """Failure detection rate: share of actual failures that were predicted.

    Computed as ``tp / (tp + fn)`` from the binary confusion matrix, which is
    the same quantity as recall on the positive (``1``) class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (healthy) / 1 (failure).
    y_pred : array-like
        Predicted labels, same encoding.

    Returns
    -------
    float
        ``tp / (tp + fn)``, or ``0.0`` when ``y_true`` holds no positives.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # split in which only one class shows up produces a smaller matrix and
    # the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    positives = tp + fn
    if positives == 0:
        # No actual failures in this split: the rate is undefined, so report
        # 0.0 instead of letting a 0/0 division propagate NaN into the means.
        return 0.0
    return float(tp / positives)

def far_score(y_true, y_pred):
    """False alarm rate: share of healthy samples that were flagged as failures.

    Computed as ``fp / (fp + tn)`` from the binary confusion matrix (the
    false-positive rate). Lower is better.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (healthy) / 1 (failure).
    y_pred : array-like
        Predicted labels, same encoding.

    Returns
    -------
    float
        ``fp / (fp + tn)``, or ``0.0`` when ``y_true`` holds no negatives.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # split in which only one class shows up produces a smaller matrix and
    # the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    if negatives == 0:
        # No healthy samples in this split: the rate is undefined, so report
        # 0.0 instead of letting a 0/0 division propagate NaN into the means.
        return 0.0
    return float(fp / negatives)
    


# Metrics evaluated on every cross-validation fold. String values are
# scikit-learn's built-in scorer names; the rest wrap metric functions with
# `make_scorer`. Each key K is read back later as `test_K`.
scoring = {
    'precision': 'precision',
    'gmean': make_scorer(geometric_mean_score),
    'f1': 'f1',
    'recall': 'recall',
    'fdr': make_scorer(fdr_score),
    'far': make_scorer(far_score),
}

In [9]:
# Empty results table: one row per model is added by the cells below,
# with one column per reported metric.
metrics_df = pd.DataFrame(
    columns=[
        'G-mean',
        'F1',
        'Precision',
        'FDR',
        'FAR',
        'Recall',
    ]
)

In [10]:
%%time
# Balanced Random Forest: zero-fill missing values, scale by max-abs,
# then classify.
pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('classifier', BalancedRandomForestClassifier(n_jobs=-1, random_state=42)),
    ],
    verbose=True,
)

# 5-fold cross-validation, scored on every metric in `scoring`.
BRF_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5)

# Fold-averaged value of each metric, in the column order of `metrics_df`.
Gmean, F1, Precision, FDR, FAR, Recall = (
    BRF_scores[key].mean()
    for key in (
        'test_gmean',
        'test_f1',
        'test_precision',
        'test_fdr',
        'test_far',
        'test_recall',
    )
)

# Record this model's row, then display the table so far.
metrics_df.loc['Balanced Random Forest'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  38.2s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  37.2s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  39.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  37.2s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628


In [11]:
%%time

# Class-weighted Random Forest with no resampling step, so the plain
# scikit-learn Pipeline is used: zero-fill, max-abs scale, classify.
RF_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation, scored on every metric in `scoring`.
RF_scores = cross_validate(RF_pipeline, X, y, scoring=scoring, cv=5)

# Fold-averaged value of each metric, in the column order of `metrics_df`.
Gmean, F1, Precision, FDR, FAR, Recall = (
    RF_scores[key].mean()
    for key in (
        'test_gmean',
        'test_f1',
        'test_precision',
        'test_fdr',
        'test_far',
        'test_recall',
    )
)

# Record this model's row, then display the table so far.
metrics_df.loc['Random Forest'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 3.9min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 3.9min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 3.8min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 3.9min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest,0.0,0.0,0.0,0.0,1.4e-05,0.0


In [12]:
%%time

# Random Forest trained on a randomly under-sampled training split
# (5-fold CV over the feature columns in `X`). The imblearn pipeline is
# needed because it contains a sampler step.
RF_r_pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
                n_jobs=8,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation, scored on every metric in `scoring`.
RF_r_scores = cross_validate(RF_r_pipeline, X, y, scoring=scoring, cv=5)

# Fold-averaged value of each metric, in the column order of `metrics_df`.
Gmean, F1, Precision, FDR, FAR, Recall = (
    RF_r_scores[key].mean()
    for key in (
        'test_gmean',
        'test_f1',
        'test_precision',
        'test_fdr',
        'test_far',
        'test_recall',
    )
)

# Record the row for Random Forest with random under-sampling.
metrics_df.loc['Random Forest with Random under sampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest,0.0,0.0,0.0,0.0,1.4e-05,0.0
Random Forest with Random under sampling,0.915038,0.003613,0.00181,0.878215,0.046236,0.878215


In [13]:
%%time
# Class-weighted logistic regression: zero-fill, max-abs scale, classify.
# The very large `max_iter` is an iteration cap for the solver.
LR_pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            LogisticRegression(
                class_weight='balanced',
                max_iter=1000000,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation, scored on every metric in `scoring`.
LR_scores = cross_validate(LR_pipeline, X, y, scoring=scoring, cv=5)

# Fold-averaged value of each metric, in the column order of `metrics_df`.
Gmean, F1, Precision, FDR, FAR, Recall = (
    LR_scores[key].mean()
    for key in (
        'test_gmean',
        'test_f1',
        'test_precision',
        'test_fdr',
        'test_far',
        'test_recall',
    )
)

# Record this model's row, then display the table so far.
metrics_df.loc['Weighted Logistic Regression'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.5min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.9min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.1min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.4min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel



Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest,0.0,0.0,0.0,0.0,1.4e-05,0.0
Random Forest with Random under sampling,0.915038,0.003613,0.00181,0.878215,0.046236,0.878215
Weighted Logistic Regression,0.901823,0.006883,0.003456,0.832321,0.02271,0.832321


In [14]:
%%time
# Class-weighted logistic regression trained on a randomly under-sampled
# training split: zero-fill, max-abs scale, under-sample, classify.
LR_r_pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        (
            'classifier',
            LogisticRegression(
                class_weight='balanced',
                max_iter=1000000,
                random_state=42,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation, scored on every metric in `scoring`.
LR_r_scores = cross_validate(LR_r_pipeline, X, y, scoring=scoring, cv=5)

# Fold-averaged value of each metric, in the column order of `metrics_df`
# (only means are kept; nothing is printed here).
Gmean, F1, Precision, FDR, FAR, Recall = (
    LR_r_scores[key].mean()
    for key in (
        'test_gmean',
        'test_f1',
        'test_precision',
        'test_fdr',
        'test_far',
        'test_recall',
    )
)

# Record this model's row, then display the table so far.
metrics_df.loc['Weighted Logistic Regression with random Undersampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.0s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest,0.0,0.0,0.0,0.0,1.4e-05,0.0
Random Forest with Random under sampling,0.915038,0.003613,0.00181,0.878215,0.046236,0.878215
Weighted Logistic Regression,0.901823,0.006883,0.003456,0.832321,0.02271,0.832321
Weighted Logistic Regression with random Undersampling,0.747562,0.014172,0.007177,0.563317,0.007444,0.563317


In [15]:
%%time

# Balanced bagging over histogram gradient-boosting base learners:
# zero-fill, max-abs scale, classify.
GR_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('classifier', BalancedBaggingClassifier(
        estimator=HistGradientBoostingClassifier(random_state=42),
        # Seed the bagging ensemble itself as well as the base learner; with
        # only the base learner seeded, repeated runs give different scores.
        random_state=42,
        n_jobs=8,
    ))
], verbose=True)

# 5-fold cross-validation, scored on every metric in `scoring`.
GR_scores = cross_validate(GR_pipeline, X, y, cv=5, scoring=scoring)

# Fold-averaged value of each metric.
Gmean = GR_scores['test_gmean'].mean()
F1 = GR_scores['test_f1'].mean()
FDR = GR_scores['test_fdr'].mean()
FAR = GR_scores['test_far'].mean()
Precision = GR_scores['test_precision'].mean()
Recall = GR_scores['test_recall'].mean()

# Record this model's row (column order of `metrics_df`), then display it.
metrics_df.loc['Balanced Bagging with Hist Gradient'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   7.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   7.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   7.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   7.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest,0.0,0.0,0.0,0.0,1.4e-05,0.0
Random Forest with Random under sampling,0.915038,0.003613,0.00181,0.878215,0.046236,0.878215
Weighted Logistic Regression,0.901823,0.006883,0.003456,0.832321,0.02271,0.832321
Weighted Logistic Regression with random Undersampling,0.747562,0.014172,0.007177,0.563317,0.007444,0.563317
Balanced Bagging with Hist Gradient,0.914767,0.005266,0.002641,0.863992,0.031159,0.863992


In [16]:
%%time

# Bagging in which each bag is rebalanced with SMOTE over-sampling:
# zero-fill, max-abs scale, classify.
# NOTE(review): the row label below says "RF", but no `estimator` is passed,
# so per the imbalanced-learn docs the bagging base learner should default to
# a decision tree - confirm which base learner was intended.
SB_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('classifier', BalancedBaggingClassifier(
        # Seed both the sampler and the ensemble so that repeated runs give
        # the same scores, as with the other models in this notebook.
        sampler=SMOTE(random_state=42),
        random_state=42,
        n_jobs=-1,
    ))
], verbose=True)

# 5-fold cross-validation, scored on every metric in `scoring`.
SB_scores = cross_validate(SB_pipeline, X, y, cv=5, scoring=scoring)

# Fold-averaged value of each metric.
Gmean = SB_scores['test_gmean'].mean()
F1 = SB_scores['test_f1'].mean()
FDR = SB_scores['test_fdr'].mean()
FAR = SB_scores['test_far'].mean()
Precision = SB_scores['test_precision'].mean()
Recall = SB_scores['test_recall'].mean()

# Record this model's row (column order of `metrics_df`), then display it.
metrics_df.loc['Smote bagging with RF'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.6s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 9.9min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.6s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 9.8min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.6s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 9.7min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.6s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 9.9min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.6s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.8s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.913083,0.003422,0.001714,0.876628,0.048611,0.876628
Random Forest,0.0,0.0,0.0,0.0,1.4e-05,0.0
Random Forest with Random under sampling,0.915038,0.003613,0.00181,0.878215,0.046236,0.878215
Weighted Logistic Regression,0.901823,0.006883,0.003456,0.832321,0.02271,0.832321
Weighted Logistic Regression with random Undersampling,0.747562,0.014172,0.007177,0.563317,0.007444,0.563317
Balanced Bagging with Hist Gradient,0.914767,0.005266,0.002641,0.863992,0.031159,0.863992
Smote bagging with RF,0.152879,0.034998,0.057117,0.025297,4.2e-05,0.025297


In [17]:
# Persist the comparison table (model names as the index column).
metrics_df.to_csv(path_or_buf='exp2_results_10features.csv')

In [18]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core import Workspace

# Shut down the Azure ML compute once the experiment has finished.

# Workspace handle built via `Workspace.from_config()`.
ws = Workspace.from_config()

# Name of the compute target to stop (hard-coded for this environment).
compute_target_name = 's53308011'

# Look the target up in the workspace and request that it stop.
compute_target = ComputeTarget(ws, compute_target_name)
compute_target.stop()