# Multiple classifiers

We will attempt to create methods to train and evaluate multiple multi-output classifier and rank them based on F2 score

Given we were not able to create a good balanced dataset, we will test using original dataset with reduced features 

In [29]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xb

In [31]:
X_train=pd.read_csv('X_train.csv')
y_train=pd.read_csv('y_train.csv')
X_test=pd.read_csv('X_test.csv')
y_test=pd.read_csv('y_test.csv')
y_train_omf = y_train['Machine failure']
y_test_omf = y_test['Machine failure']
X_train_reduced = X_train.drop(columns=['Torque _Nm_', 'Process temperature _K_', 'Air temperature _K_'])
X_test_reduced = X_test.drop(columns=['Torque _Nm_', 'Process temperature _K_', 'Air temperature _K_'])


In [32]:
from sklearn.metrics import fbeta_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputClassifier

def get_ml_perf_machine_failure(name, clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    y_train_omf = y_train['Machine failure']
    y_test_omf = y_test['Machine failure']
    clf = MultiOutputClassifier(clf)
    clf.fit(X_train, y_train)
    y_train_pred= clf.predict(X_train)
    y_train_pred = pd.DataFrame(y_train_pred, columns=y_test.columns)
    y_test_pred = clf.predict(X_test)
    y_test_pred = pd.DataFrame(y_test_pred, columns=y_test.columns)
    y_test_pred_omf = y_test_pred['Machine failure']
    y_train_pred_omf = y_train_pred['Machine failure']
    f2_train=fbeta_score(y_train_omf, y_train_pred_omf, beta=2)
    f2_test=fbeta_score(y_test_omf, y_test_pred_omf, beta=2)
    train_recall=recall_score(y_train_omf, y_train_pred_omf)
    test_recall=recall_score(y_test_omf, y_test_pred_omf)
    train_precision=precision_score(y_train_omf, y_train_pred_omf, zero_division=0)
    test_precision=precision_score(y_test_omf, y_test_pred_omf, zero_division=0)
    train_accuracy=accuracy_score(y_train_omf, y_train_pred_omf)
    test_accuracy=accuracy_score(y_test_omf, y_test_pred_omf)
    predictor_name = clf.__class__.__name__
    cm = confusion_matrix(y_train_omf, y_train_pred_omf)
    train_tn, train_fp, train_fn, train_tp = cm.ravel()
    cm = confusion_matrix(y_test_omf, y_test_pred_omf)
    test_tn, test_fp, test_fn, test_tp = cm.ravel()
    result = {
        "multi-output-classifier": clf,
        "f2_train": f2_train,
        "f2_test": f2_test,
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
        "train_recall":train_recall,
        "test_recall": test_recall,
        "test_precision": test_precision,
        "train_precision":train_precision,
        "predictor_name": predictor_name,
        "train_fp": train_fp,
        "train_fn": train_fn,
        "test_fp": test_fp,
        "test_fn": test_fn,
        "train_tp": train_tp,
        "train_tn": train_tn,
        "test_tp": test_tp,
        "test_tn": test_tn,
        "test_pred": y_test_pred,
        "train_pred": y_train_pred,
        "name": name        
    }
    return result

def evaluate_models(models:dict,  X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    results = []
    for name, model in models.items():
        results.append(get_ml_perf_machine_failure(name, model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test))
    results.sort(reverse=True, key=lambda result: result["f2_test"])
    print(" -- Sorted result --")
    for result in results:
        print(f"{result["name"]:20} : f2-test {result["f2_test"]:20} | accuracy {result["test_accuracy"]:10}  | test-fp {result["test_fp"]:10}  | test-fn {result["test_fn"]:10}")
    return results

In [37]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

names = [
    "Logistic Regression",
    "Nearest Neighbors",
    "Decision Tree md 10",
    "Decision Tree md 5",
    "Random Forest",
    "AdaBoost",
    "Naive Bayes",
    "XGBoost"
]

classifiers = [
    KNeighborsClassifier(5),
    DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=10, min_samples_leaf=5),
    DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=10, min_samples_leaf=5),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    XGBClassifier(
        n_estimators=100,
    objective='binary:logistic',
    random_state=42)
]

models = {name: clf for name, clf in zip(names, classifiers)}

print("on full features")
results = evaluate_models(models) # use reduced features

print("on reduced features")
results_reduced = evaluate_models(models, X_train=X_train_reduced, X_test=X_test_reduced) # use reduced features


on full features
 -- Sorted result --
Decision Tree md 5   : f2-test   0.8840579710144928 | accuracy     0.9935  | test-fp          4  | test-fn          9
Nearest Neighbors    : f2-test   0.8670520231213873 | accuracy      0.992  | test-fp          6  | test-fn         10
Decision Tree md 10  : f2-test   0.8670520231213873 | accuracy      0.992  | test-fp          6  | test-fn         10
Naive Bayes          : f2-test    0.855072463768116 | accuracy     0.9915  | test-fp          6  | test-fn         11
AdaBoost             : f2-test   0.5376344086021505 | accuracy      0.959  | test-fp         52  | test-fn         30
Random Forest        : f2-test   0.4716981132075472 | accuracy      0.976  | test-fp          8  | test-fn         40
Logistic Regression  : f2-test  0.40192926045016075 | accuracy     0.9745  | test-fp          6  | test-fn         45
on reduced features
 -- Sorted result --
Decision Tree md 5   : f2-test   0.8918128654970761 | accuracy      0.995  | test-fp          1

Decision Tree with max depth=5, Reduced feature performed the best among all of our machine learning models

The hyper parameter max depth makes quite a difference, we should optimize our decision tree hyperparameters

## Hyperparameters Optimization For Decision Tree

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import make_scorer, fbeta_score
f2_scorer = make_scorer(fbeta_score, beta=2)
# Set rule to ignore warnings
param_grid = {
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4, 6, 8, 10, 12, None],
    'n_estimators': [1, 8, 16, 32, 64, 100, 200],
}
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(over_X_train, over_y_train)  # fit the classifier on the training data
clf = GridSearchCV(rf_clf,param_grid = param_grid, cv = 3, verbose=False, scoring=f2_scorer)
best_rf_clf = clf.fit(over_X_train, over_y_train)
best_rf_clf.best_estimator_
print("Best parameters for Random Forest Regression:")
print(best_rf_clf.best_params_)
print("Best score for Random Forest Regression:")
print(best_rf_clf.best_score_)

NameError: name 'over_X_train' is not defined