[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saschaschworm/big-data-and-data-science/blob/master/notebooks/demos/outage-decision-tree-learning.ipynb)

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the outage demo dataset. The first three columns are used as the feature
# matrix and the 'outage' column as the target.
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/demos/outages.csv')
X, y = data.iloc[:, 0:3], data['outage']

# Decision tree hyperparameters, spelled out explicitly for teaching purposes.
# NOTE: 'min_impurity_split' is intentionally not listed. It was deprecated in
# favour of 'min_impurity_decrease' and removed in scikit-learn 1.0, where passing
# it raises a TypeError. Omitting it keeps the previous default behaviour.
hyperparams = {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2,
               'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': None,
               'random_state': 1909, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0}

model = DecisionTreeClassifier(**hyperparams)

In [3]:
# Columns that are one-hot encoded before they reach the model.
# (The variable name keeps its original spelling so other cells keep working.)
categorical_featues = ['vibration', 'error', 'temperature']

# Encoding step for the categorical columns; drop='first' drops the first
# category of every feature.
categorical_transformer = Pipeline(steps=[('onehotencoder', OneHotEncoder(drop='first'))])

# Apply the categorical transformer to the selected columns.
preprocessor = ColumnTransformer(
    transformers=[('categorical_transformer', categorical_transformer, categorical_featues)],
)

# Full estimator: preprocessing followed by the previously configured model.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the complete dataset; the assignment keeps the cell from echoing the estimator.
pipeline = pipeline.fit(X, y)

In [4]:
# A single hand-crafted observation to classify with the fitted pipeline.
instance = pd.DataFrame({
    'vibration': ['medium'],
    'error': ['yes'],
    'temperature': ['low'],
})

# Query both the class probabilities and the predicted label for the instance.
prediction_proba = pipeline.predict_proba(instance)
prediction = pipeline.predict(instance)
f'Prediction result: {prediction} ({prediction_proba})'

"Prediction result: ['no'] ([[1. 0.]])"

In [5]:
# F1 scorer that treats the 'yes' label as the positive class.
custom_scorer = make_scorer(f1_score, pos_label='yes')

# 4-fold cross-validation, keeping the training scores as well.
res_cv = cross_validate(
    pipeline, X, y,
    scoring=custom_scorer,
    cv=4,
    return_train_score=True,
)

# Average the per-fold scores and express them as percentages.
res_f1_tr, res_f1_te = (
    res_cv[fold_key].mean() * 100 for fold_key in ('train_score', 'test_score')
)
f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'

'Average F1 on Training and Test Sets: 100.00%/91.67%'

In [6]:
# Random forest hyperparameters, spelled out explicitly for teaching purposes.
# Two entries differ from older versions of this notebook so that the cell runs
# on current scikit-learn releases:
#   * 'max_features' uses 'sqrt' instead of 'auto'. For classifiers 'auto' was an
#     alias of 'sqrt'; the alias was deprecated in 1.1 and removed in 1.3.
#   * 'min_impurity_split' is omitted. It was removed in scikit-learn 1.0 in
#     favour of 'min_impurity_decrease'.
hyperparams = {
    'n_estimators': 4, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2,
    'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt',
    'max_leaf_nodes': None, 'min_impurity_decrease': 0.0,
    'bootstrap': True, 'oob_score': False, 'n_jobs': None, 'random_state': 1909,
    'verbose': 0, 'warm_start': False, 'class_weight': None}

model = RandomForestClassifier(**hyperparams)

# Rebuild the pipeline around the new model, reusing the existing preprocessor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the complete dataset; the assignment keeps the cell from echoing the estimator.
pipeline = pipeline.fit(X, y)

In [7]:
# Re-run the 4-fold cross-validation for the current pipeline with the same F1 scorer.
res_cv = cross_validate(
    pipeline, X, y, cv=4, scoring=custom_scorer, return_train_score=True
)

# Mean F1 across folds, expressed in percent.
res_f1_tr = 100 * np.mean(res_cv['train_score'])
res_f1_te = 100 * np.mean(res_cv['test_score'])
f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'

'Average F1 on Training and Test Sets: 97.73%/75.00%'

In [8]:
# Discrete uniform sampling distributions for the randomized search.
# scipy's randint excludes the upper bound.
max_depth = randint(1, 10)
n_estimators = randint(100, 500)

# The 'model__' prefix routes each parameter to the 'model' step of the pipeline.
param_distributions = dict(
    model__n_estimators=n_estimators,
    model__max_depth=max_depth,
)

In [9]:
# Randomized hyperparameter search: 5 sampled candidates, each scored with the
# custom F1 scorer in 4-fold cross-validation, using all available cores.
# NOTE: the former 'iid=False' argument is intentionally not passed. The parameter
# was deprecated in scikit-learn 0.22 and removed in 0.24, where it raises a TypeError.
rs = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=5,
                        scoring=custom_scorer, n_jobs=-1, cv=4, random_state=1909)
rs = rs.fit(X, y)

In [10]:
# Display the best parameter combination found by the randomized search.
f'Optimal parameters: {rs.best_params_}'

"Optimal parameters: {'model__max_depth': 6, 'model__n_estimators': 442}"

In [11]:
# Candidate values for the exhaustive grid search.
n_estimators = [100, 200, 300, 400, 500]
# Depths 1 through 10 inclusive. The earlier hand-written list skipped 4, which
# left a gap in an otherwise contiguous range.
max_depth = list(range(1, 11))
# The 'model__' prefix routes each parameter to the 'model' step of the pipeline.
param_grid = {'model__n_estimators': n_estimators, 'model__max_depth': max_depth}

In [12]:
# Exhaustive grid search over param_grid, scored with the custom F1 scorer in
# 4-fold cross-validation, using all available cores.
# NOTE: the former 'iid=False' argument is intentionally not passed. The parameter
# was deprecated in scikit-learn 0.22 and removed in 0.24, where it raises a TypeError.
gs = GridSearchCV(pipeline, param_grid=param_grid, scoring=custom_scorer, n_jobs=-1,
                  cv=4)
gs = gs.fit(X, y)

In [13]:
# Display the best parameter combination found by the grid search.
f'Optimal parameters: {gs.best_params_}'

"Optimal parameters: {'model__max_depth': 3, 'model__n_estimators': 400}"