In [1]:
import dalex as dx

import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('ai4i2020.csv', names = ["UDI", "Product ID", "Type",  
            "Air temperature [K]", "Process temperature [K]", "Rotational speed [rpm]", "Torque [Nm]", 
                                        "Tool wear [min]", "TWF", "HDF", "PWF", "OSF", "RNF", "Machine failure"])

In [3]:
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF,Machine failure
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
X = data.drop(['UDI','Product ID','Machine failure'], axis=1)
y = data["Machine failure"]

In [5]:
X.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,M,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,L,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,L,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,L,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,L,298.2,308.7,1408,40.0,9,0,0,0,0,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=50)

In [7]:
numerical_features = ["Air temperature [K]", "Process temperature [K]", "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]", "TWF", "HDF", "PWF", "OSF", "RNF"]
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

categorical_features = ['Type']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [35]:
# classifier = DecisionTreeClassifier()
classifier = DecisionTreeClassifier(max_depth=3, criterion='entropy', max_features=0.6, splitter='best')

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])

In [36]:
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


In [46]:
exp = dx.Explainer(clf, X_train, y_train)

Preparation of a new explainer is initiated

  -> data              : 9000 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 9000 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x0000022EFEB90AF0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.00167, max = 0.5
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.5, mean = -3.21e-19, max = 0.999
  -> model_info        : package sklearn

A new explainer has been created!


In [47]:
exp.predict(X_test)[:10]

array([0.00085911, 0.00085911, 0.00331895, 0.00085911, 0.00331895,
       0.        , 0.00085911, 0.00331895, 0.00085911, 0.        ])

In [48]:
print(y_test[1:10])

7868    1
4176    0
4161    0
8770    0
8919    0
3884    0
1590    0
617     0
1562    0
Name: Machine failure, dtype: int64


In [49]:
cp = exp.predict_profile(X_test.iloc[2], label='testing')

Calculating ceteris paribus: 100%|████████████████████████████████████████████████████| 11/11 [00:00<00:00, 132.44it/s]


In [50]:
cp.plot()

In [51]:
print(accuracy_score(y_test, predictions))

0.996


In [52]:
mp = exp.model_performance(model_type = 'classification')
mp.result

Unnamed: 0,recall,precision,f1,accuracy,auc
DecisionTreeClassifier,0.066667,0.5,0.117647,0.998333,0.730325


In [53]:
mp.plot(geom="roc")