# Multiple classifiers

We will create helper functions to train and evaluate multiple multi-output classifiers and rank them by F2 score.

Since we were not able to create a well-balanced dataset, we will test using the original dataset with reduced features.

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [4]:
# Load the train/test features and labels from CSV files in the working directory.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

# The 'Machine failure' label column on its own, for each split.
y_train_omf = y_train['Machine failure']
y_test_omf = y_test['Machine failure']

# Reduced feature sets: the same three columns are removed from both splits.
REDUCED_DROP_COLUMNS = ['Torque [Nm]', 'Process temperature [K]', 'Air temperature [K]']
X_train_reduced = X_train.drop(columns=REDUCED_DROP_COLUMNS)
X_test_reduced = X_test.drop(columns=REDUCED_DROP_COLUMNS)


In [19]:
from sklearn.metrics import fbeta_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputClassifier

def _score_machine_failure_split(y_true, y_pred):
    """Compute the 'Machine failure' metrics for a single split (train or test).

    Returns a dict with keys ``f2``, ``recall``, ``precision``, ``accuracy``,
    ``tn``, ``fp``, ``fn`` and ``tp``.
    """
    scores = {
        "f2": fbeta_score(y_true, y_pred, beta=2, average='macro'),
        "recall": recall_score(y_true, y_pred, average='macro'),
        # zero_division=0 scores a class with no predicted samples as 0
        # instead of emitting a warning.
        "precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "accuracy": accuracy_score(y_true, y_pred),
    }
    # Unpacking four values requires a 2x2 matrix, i.e. a binary target with
    # both classes present in this split.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    scores.update(tn=tn, fp=fp, fn=fn, tp=tp)
    return scores


def get_ml_perf_machine_failure(name, clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Fit ``clf`` as a multi-output classifier and score its 'Machine failure' output.

    Parameters
    ----------
    name
        Label for this model; stored unchanged under ``result["name"]``.
    clf
        Estimator to wrap in ``MultiOutputClassifier`` and fit on every
        column of ``y_train``.
    X_train, y_train, X_test, y_test
        Feature and label frames. The label frames must contain a
        'Machine failure' column. The defaults are the module-level frames
        as they existed when this function was defined, so re-run this cell
        after reloading the data.

    Returns
    -------
    dict
        The fitted wrapper ("multi-output-classifier"), macro-averaged F2,
        recall and precision plus accuracy for both splits, the confusion
        matrix counts (tn/fp/fn/tp) for both splits, the full prediction
        frames ("train_pred", "test_pred"), "predictor_name" and "name".
    """
    # Capture the base estimator's class name BEFORE wrapping it; reading it
    # from the wrapper would always yield "MultiOutputClassifier".
    predictor_name = clf.__class__.__name__

    multi_clf = MultiOutputClassifier(clf)
    multi_clf.fit(X_train, y_train)

    # Label each prediction frame with the columns of its own split.
    y_train_pred = pd.DataFrame(multi_clf.predict(X_train), columns=y_train.columns)
    y_test_pred = pd.DataFrame(multi_clf.predict(X_test), columns=y_test.columns)

    # Only the 'Machine failure' output is scored; the other outputs are
    # returned in the prediction frames but not evaluated here.
    train = _score_machine_failure_split(y_train['Machine failure'], y_train_pred['Machine failure'])
    test = _score_machine_failure_split(y_test['Machine failure'], y_test_pred['Machine failure'])

    result = {
        "multi-output-classifier": multi_clf,
        "f2_train": train["f2"],
        "f2_test": test["f2"],
        "train_accuracy": train["accuracy"],
        "test_accuracy": test["accuracy"],
        "train_recall": train["recall"],
        "test_recall": test["recall"],
        "test_precision": test["precision"],
        "train_precision": train["precision"],
        "predictor_name": predictor_name,
        "train_fp": train["fp"],
        "train_fn": train["fn"],
        "test_fp": test["fp"],
        "test_fn": test["fn"],
        "train_tp": train["tp"],
        "train_tn": train["tn"],
        "test_tp": test["tp"],
        "test_tn": test["tn"],
        "test_pred": y_test_pred,
        "train_pred": y_train_pred,
        "name": name
    }
    return result

def evaluate_models(models:dict,  X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Evaluate every model in ``models`` and return the results ranked by test F2.

    ``models`` maps a display name to an estimator. Each pair is passed to
    ``get_ml_perf_machine_failure`` together with the given data splits. The
    returned list of result dicts is ordered by ``"f2_test"``, highest first.
    """
    scored = [
        get_ml_perf_machine_failure(
            label,
            estimator,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
        )
        for label, estimator in models.items()
    ]
    return sorted(scored, key=lambda entry: entry["f2_test"], reverse=True)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate estimators paired with the display name used in the results.
model_specs = [
    ("Nearest Neighbors", KNeighborsClassifier(5)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
]

# Parallel views of the same specs, in the same order.
names = [label for label, _ in model_specs]
classifiers = [estimator for _, estimator in model_specs]

models = dict(model_specs)

# NOTE(review): this call uses evaluate_models' default data arguments. The
# introduction says the reduced feature set is tested, but X_train_reduced /
# X_test_reduced are not passed here -- confirm which feature set is intended.
results = evaluate_models(models)


Random Forest : f2-test 0.9407343195310682 f2-train  1.0 test-fp 4 test-fn 9
Decision Tree : f2-test 0.9332290095242384 f2-train  1.0 test-fp 5 test-fn 10
Naive Bayes : f2-test 0.7564574203376353 f2-train  0.7337608297863116 test-fp 52 test-fn 30
AdaBoost : f2-test 0.7321308165707225 f2-train  0.7338676001425131 test-fp 8 test-fn 40
Nearest Neighbors : f2-test 0.6974038912427293 f2-train  0.7460419909855751 test-fp 6 test-fn 45
{'estimator__bootstrap': True, 'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__max_features': 'sqrt', 'estimator__max_leaf_nodes': None, 'estimator__max_samples': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__monotonic_cst': None, 'estimator__n_estimators': 100, 'estimator__n_jobs': None, 'estimator__oob_score': False, 'estimator__random_state': 42, 'e

In [None]:
# Print one summary line per model, in the order held by ``results``.
print(" -- Sorted result --")
for result in results:
    # Keys are single-quoted so the f-string also parses on Python < 3.12,
    # where reusing the enclosing quote inside a replacement field is a
    # SyntaxError.
    print(f"{result['name']} : f2-test {result['f2_test']} accuracy {result['test_accuracy']} test-fp {result['test_fp']} test-fn {result['test_fn']}")

# Index the results by model name for direct lookup.
result_dict = {result["name"]: result for result in results}

# Show the hyper-parameters of the base estimator behind selected models.
for model_name in ("Random Forest", "Decision Tree"):
    print(f"--- {model_name}")
    moclr = result_dict[model_name]["multi-output-classifier"]
    # The wrapper exposes the wrapped estimator under the "estimator" param.
    clr = moclr.get_params()["estimator"]
    print(clr.get_params())

 -- Sorted result --
Random Forest : f2-test 0.9407343195310682 f2-train  1.0 test-fp 4 test-fn 9
Decision Tree : f2-test 0.9332290095242384 f2-train  1.0 test-fp 5 test-fn 10
Naive Bayes : f2-test 0.7564574203376353 f2-train  0.7337608297863116 test-fp 52 test-fn 30
AdaBoost : f2-test 0.7321308165707225 f2-train  0.7338676001425131 test-fp 8 test-fn 40
Nearest Neighbors : f2-test 0.6974038912427293 f2-train  0.7460419909855751 test-fp 6 test-fn 45
--- Random Forest
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
--- Decision Tree
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 

For our threshold predictor:
- F2 Score on Test Data: 0.8053615831757778
- Recall on Test Data: 0.9280162842339008
- Precision on Test Data: 0.6571438277056866
- Accuracy on Test Data: 0.9275

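Random Forest and Decision Tree performed much better, with test F2 scores of about 0.94 and 0.93 respectively, compared with 0.81 for the threshold predictor.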
