In [1]:
import warnings

# Silence every warning category for the rest of the session
# (catch-all "ignore" filter with no message/module restriction).
warnings.simplefilter("ignore")

In [2]:
# Basic Imports
import numpy as np
import pandas as pd
from collections import Counter

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler

# Resampling Imports
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn import FunctionSampler

# Feature Selection Imports
from sklearn.feature_selection import RFE

# Model Imports
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.pipeline import Pipeline
from sklearn.utils.parallel import delayed

# Metrics
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer, confusion_matrix, roc_auc_score, balanced_accuracy_score, f1_score, precision_score, recall_score, accuracy_score

# Legacy opt-in for HistGradientBoostingClassifier; no longer needed since scikit-learn 1.0, where the estimator is stable
from sklearn.experimental import enable_hist_gradient_boosting


In [3]:
%%time
# Read the feature table (label column plus SMART attributes) into memory
pandas_df = pd.read_csv(filepath_or_buffer='2022_5_features_STM.csv')

CPU times: user 1.05 s, sys: 366 ms, total: 1.42 s
Wall time: 2.25 s


In [4]:
pandas_df.head()

Unnamed: 0,failure,smart_5_raw,smart_187_raw,smart_188_raw,smart_197_raw,smart_198_raw
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [5]:
# Tally how many rows carry each value of the 'failure' label
num_failures = pandas_df['failure'].value_counts()

# Report the size of class 0 and class 1 (label-based lookup on the counts)
print('Number of 0s:', num_failures.loc[0])
print('Number of 1s:', num_failures.loc[1])

Number of 0s: 6680436
Number of 1s: 633


In [6]:
# Target vector: the 'failure' label column as a NumPy array
y = pandas_df['failure'].to_numpy()
# Feature matrix: every remaining column as a NumPy array
X = pandas_df.drop(columns='failure').to_numpy()

In [7]:
def fdr_score(y_true, y_pred):
    """Return ``tp / (tp + fn)`` for binary 0/1 labels.

    This is the fraction of actual positives (class 1) that were predicted
    as positive, taken from the binary confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, expected to be 0 or 1.
    y_pred : array-like
        Predicted labels, expected to be 0 or 1.

    Returns
    -------
    float
        ``tp / (tp + fn)``, or ``0.0`` when ``y_true`` contains no positives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, a split in
    # which only one class occurs produces a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    _, _, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    positives = tp + fn
    if positives == 0:
        # The rate is undefined without positive samples; return 0.0 rather
        # than letting a silent 0/0 put NaN into the cross-validation mean.
        return 0.0
    return tp / positives

def far_score(y_true, y_pred):
    """Return ``fp / (fp + tn)`` for binary 0/1 labels.

    This is the fraction of actual negatives (class 0) that were predicted
    as positive, taken from the binary confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, expected to be 0 or 1.
    y_pred : array-like
        Predicted labels, expected to be 0 or 1.

    Returns
    -------
    float
        ``fp / (fp + tn)``, or ``0.0`` when ``y_true`` contains no negatives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, a split in
    # which only one class occurs produces a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    if negatives == 0:
        # The rate is undefined without negative samples; return 0.0 rather
        # than letting a silent 0/0 put NaN into the cross-validation mean.
        return 0.0
    return fp / negatives
    


# Metrics requested from cross_validate: built-in scorers are referenced by
# name, custom metric functions are wrapped with make_scorer.
scoring = dict(
    precision='precision',
    gmean=make_scorer(geometric_mean_score),
    f1='f1',
    recall='recall',
    fdr=make_scorer(fdr_score),
    far=make_scorer(far_score),
)

In [8]:
# Empty results table; each experiment below adds one row keyed by model name
metrics_df = pd.DataFrame(
    columns=[
        'G-mean',
        'F1',
        'Precision',
        'FDR',
        'FAR',
        'Recall',
    ],
)

In [9]:
%%time
# Balanced Random Forest: zero-fill missing values, scale by max-abs, classify
pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('classifier', BalancedRandomForestClassifier(random_state=42, n_jobs=-1)),
    ],
    verbose=True,
)

# 5-fold cross-validation collecting every metric defined in `scoring`
BRF_scores = cross_validate(pipeline, X, y, cv=5, scoring=scoring)

# Average each metric over the folds
Gmean, F1, Precision, FDR, FAR, Recall = (
    BRF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_precision', 'test_fdr', 'test_far', 'test_recall')
)

# Store the averaged metrics as this model's row in the results table
metrics_df.loc['Balanced Random Forest'] = [Gmean, F1, Precision, FDR, FAR, Recall]



[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  38.3s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  37.2s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  36.8s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  36.9s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipel



In [10]:
%%time

# Class-weighted Random Forest without any resampling step
RF_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                n_jobs=-1,
                random_state=42,
                class_weight='balanced',
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation collecting every metric defined in `scoring`
RF_scores = cross_validate(RF_pipeline, X, y, cv=5, scoring=scoring, n_jobs=-1)

# Average each metric over the folds
Gmean, F1, Precision, FDR, FAR, Recall = (
    RF_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_precision', 'test_fdr', 'test_far', 'test_recall')
)

# Store the averaged metrics as this model's row in the results table
metrics_df.loc['Random Forest'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.4min




Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.866596,0.002935,0.00147,0.791426,0.051045,0.791426
Random Forest,0.212673,0.000563,0.000283,0.047432,0.016835,0.047432


In [11]:
%%time

# Random Forest trained on a randomly under-sampled set (cv=5, all features)
RF_r_pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                n_jobs=8,
                random_state=42,
                class_weight='balanced',
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation collecting every metric defined in `scoring`
RF_r_scores = cross_validate(RF_r_pipeline, X, y, cv=5, scoring=scoring)

# Average each metric over the folds
Gmean, F1, Precision, FDR, FAR, Recall = (
    RF_r_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_precision', 'test_fdr', 'test_far', 'test_recall')
)

# Store the averaged metrics for Random Forest with random under-sampling
metrics_df.loc['Random Forest with Random under sampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.5s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.2s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.5s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.3s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.5s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.5s
[Pipeline] ..... (step 3 of 4) Processing under_sampler, total=   1.3s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   0.1s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 4) Processing scaler, total=   0.5s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.866596,0.002935,0.00147,0.791426,0.051045,0.791426
Random Forest,0.212673,0.000563,0.000283,0.047432,0.016835,0.047432
Random Forest with Random under sampling,0.862081,0.002826,0.001415,0.785089,0.053182,0.785089


In [12]:
%%time
# Class-weighted Logistic Regression without any resampling step
LR_pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        (
            'classifier',
            LogisticRegression(
                random_state=42,
                max_iter=1000000,
                class_weight='balanced',
                n_jobs=-1,
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation collecting every metric defined in `scoring`
LR_scores = cross_validate(LR_pipeline, X, y, cv=5, scoring=scoring, n_jobs=-1)

# Average each metric over the folds
Gmean, F1, Precision, FDR, FAR, Recall = (
    LR_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_precision', 'test_fdr', 'test_far', 'test_recall')
)

# Store the averaged metrics as this model's row in the results table
metrics_df.loc['Weighted Logistic Regression'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

CPU times: user 795 ms, sys: 507 ms, total: 1.3 s
Wall time: 27 s


Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.866596,0.002935,0.00147,0.791426,0.051045,0.791426
Random Forest,0.212673,0.000563,0.000283,0.047432,0.016835,0.047432
Random Forest with Random under sampling,0.862081,0.002826,0.001415,0.785089,0.053182,0.785089
Weighted Logistic Regression,0.851757,0.005663,0.002843,0.744044,0.024732,0.744044


In [13]:
%%time
# Class-weighted Logistic Regression trained on a randomly under-sampled set
LR_r_pipeline = imPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MaxAbsScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        (
            'classifier',
            LogisticRegression(
                random_state=42,
                max_iter=1000000,
                class_weight='balanced',
            ),
        ),
    ],
    verbose=True,
)

# 5-fold cross-validation collecting every metric defined in `scoring`
LR_r_scores = cross_validate(LR_r_pipeline, X, y, cv=5, scoring=scoring, n_jobs=8)

# Average each metric over the folds (nothing is printed here)
Gmean, F1, Precision, FDR, FAR, Recall = (
    LR_r_scores[key].mean()
    for key in ('test_gmean', 'test_f1', 'test_precision', 'test_fdr', 'test_far', 'test_recall')
)

# Store the averaged metrics as this model's row in the results table
metrics_df.loc['Weighted Logistic Regression with random Undersampling'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


CPU times: user 809 ms, sys: 515 ms, total: 1.32 s
Wall time: 7.14 s


Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.866596,0.002935,0.00147,0.791426,0.051045,0.791426
Random Forest,0.212673,0.000563,0.000283,0.047432,0.016835,0.047432
Random Forest with Random under sampling,0.862081,0.002826,0.001415,0.785089,0.053182,0.785089
Weighted Logistic Regression,0.851757,0.005663,0.002843,0.744044,0.024732,0.744044
Weighted Logistic Regression with random Undersampling,0.744097,0.014762,0.00748,0.557718,0.007022,0.557718


In [14]:
%%time

# Balanced bagging over histogram-based gradient boosting base learners.
# The ensemble itself is now seeded: previously only the base estimator had a
# random_state, so the per-bag resampling differed between runs and the
# reported metrics were not reproducible (every other model here uses seed 42).
GR_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('classifier', BalancedBaggingClassifier(
        estimator=HistGradientBoostingClassifier(random_state=42),
        n_jobs=8,
        random_state=42,
    ))
], verbose=True)

# 5-fold cross-validation collecting every metric defined in `scoring`
GR_scores = cross_validate(GR_pipeline, X, y, cv=5, scoring=scoring)

# Average each metric over the folds
Gmean = GR_scores['test_gmean'].mean()
F1 = GR_scores['test_f1'].mean()
FDR = GR_scores['test_fdr'].mean()
FAR = GR_scores['test_far'].mean()
Precision = GR_scores['test_precision'].mean()
Recall = GR_scores['test_recall'].mean()

# Store the averaged metrics as this model's row in the results table
metrics_df.loc['Balanced Bagging with Hist Gradient'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   6.7s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   6.8s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   6.6s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   6.6s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.866596,0.002935,0.00147,0.791426,0.051045,0.791426
Random Forest,0.212673,0.000563,0.000283,0.047432,0.016835,0.047432
Random Forest with Random under sampling,0.862081,0.002826,0.001415,0.785089,0.053182,0.785089
Weighted Logistic Regression,0.851757,0.005663,0.002843,0.744044,0.024732,0.744044
Weighted Logistic Regression with random Undersampling,0.744097,0.014762,0.00748,0.557718,0.007022,0.557718
Balanced Bagging with Hist Gradient,0.85842,0.003557,0.001783,0.769329,0.041867,0.769329


In [15]:
%%time

# Bagging ensemble whose bags are rebalanced with SMOTE over-sampling.
# Both the sampler and the ensemble are now seeded: previously neither had a
# random_state, so synthetic samples and bootstrap draws differed between runs
# and the reported metrics were not reproducible (every other model here uses
# seed 42).
SB_pipeline = imPipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
    ('classifier', BalancedBaggingClassifier(
        sampler=SMOTE(random_state=42),
        n_jobs=-1,
        random_state=42,
    ))
], verbose=True)

# 5-fold cross-validation collecting every metric defined in `scoring`
SB_scores = cross_validate(SB_pipeline, X, y, cv=5, scoring=scoring)

# Average each metric over the folds
Gmean = SB_scores['test_gmean'].mean()
F1 = SB_scores['test_f1'].mean()
FDR = SB_scores['test_fdr'].mean()
FAR = SB_scores['test_far'].mean()
Precision = SB_scores['test_precision'].mean()
Recall = SB_scores['test_recall'].mean()

# Store the averaged metrics as this model's row in the results table
metrics_df.loc['Smote bagging'] = [Gmean, F1, Precision, FDR, FAR, Recall]

metrics_df


[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.3min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.3min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.2min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.3min
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.5s
[Pipel

Unnamed: 0,G-mean,F1,Precision,FDR,FAR,Recall
Balanced Random Forest,0.866596,0.002935,0.00147,0.791426,0.051045,0.791426
Random Forest,0.212673,0.000563,0.000283,0.047432,0.016835,0.047432
Random Forest with Random under sampling,0.862081,0.002826,0.001415,0.785089,0.053182,0.785089
Weighted Logistic Regression,0.851757,0.005663,0.002843,0.744044,0.024732,0.744044
Weighted Logistic Regression with random Undersampling,0.744097,0.014762,0.00748,0.557718,0.007022,0.557718
Balanced Bagging with Hist Gradient,0.85842,0.003557,0.001783,0.769329,0.041867,0.769329
Smote bagging,0.21915,0.00568,0.003145,0.048969,0.004223,0.048969


In [16]:
# Write the comparison table to disk as CSV (the target name has no file extension)
metrics_df.to_csv(path_or_buf='exp2_results_5features')

In [17]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core import Workspace

# Name of the compute target that should be stopped
compute_target_name = 's53308011'

# Load the workspace from its configuration
ws = Workspace.from_config()

# Look the compute target up by name in that workspace
compute_target = ComputeTarget(name=compute_target_name, workspace=ws)

# Ask the compute target to stop
compute_target.stop()