[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saschaschworm/big-data-and-data-science/blob/master/notebooks/demos/outage-decision-tree-learning.ipynb)

In [1]:
from functools import partial
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the outage demo dataset directly from the course repository.
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/demos/outages.csv')

# The first three columns serve as features; the 'outage' column is the target.
X, y = data.iloc[:, 0:3], data['outage']

# Decision tree hyperparameters: entropy as the split criterion and a fixed
# random_state so repeated runs build the same tree.
# 'min_impurity_split' is deliberately not listed: the parameter was deprecated
# and later removed from scikit-learn, and passing it to a current
# DecisionTreeClassifier raises a TypeError. It was previously passed as None,
# so leaving it out does not change the fitted tree.
hyperparams = {
    'criterion': 'entropy',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'random_state': 1909,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
}

model = DecisionTreeClassifier(**hyperparams)

In [3]:
# Columns that are one-hot encoded before they reach the tree.
# (The variable's spelling is kept unchanged so existing references keep working.)
categorical_featues = ['vibration', 'error', 'temperature']

# Encoding stage: one-hot encode each feature, dropping its first category.
categorical_transformer = Pipeline(
    steps=[('onehotencoder', OneHotEncoder(drop='first'))],
)

# Route the categorical columns through the encoding stage.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_transformer, categorical_featues),
    ],
)

# Chain preprocessing and the classifier, then fit on the full dataset.
# fit() returns the pipeline itself, so `pipeline` ends up bound to the fitted object.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ],
).fit(X, y)

In [4]:
# A single hand-crafted observation, given as one record, to classify with the fitted pipeline.
instance = pd.DataFrame([
    {'vibration': 'medium', 'error': 'yes', 'temperature': 'low'},
])

# Predicted label and the per-class probabilities for that observation.
prediction, prediction_proba = (
    pipeline.predict(instance),
    pipeline.predict_proba(instance),
)
f'Prediction result: {prediction} ({prediction_proba})'

"Prediction result: ['no'] ([[1. 0.]])"

In [5]:
# F1 scorer that treats 'yes' as the positive class. make_scorer forwards extra
# keyword arguments to the metric, so no functools.partial wrapper is needed;
# passing the plain function also keeps the scorer displayable in a cell.
custom_scorer = make_scorer(f1_score, pos_label='yes')

# 4-fold cross-validation of the whole pipeline, so the encoder is re-fitted on
# each training split. Training scores are kept for comparison with test scores.
res_cv = cross_validate(pipeline, X, y, scoring=custom_scorer, cv=4, return_train_score=True)

# Mean F1 across folds, as percentages (the names say "acc" but these are F1 scores).
res_acc_tr = np.mean(res_cv['train_score']) * 100
res_acc_te = np.mean(res_cv['test_score']) * 100
f'Average F1 on Training and Test Sets: {res_acc_tr:.2f}%/{res_acc_te:.2f}%'

'Average F1 on Training and Test Sets: 100.00%/91.67%'