<a href="https://colab.research.google.com/github/rgumi/dataScience/blob/master/backorders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from scipy.stats import randint

In [0]:
# Load the backorders exercise dataset from the GitHub-hosted CSV.
DATA_URL = 'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/exercises/backorders.csv'
data = pd.read_csv(DATA_URL)

# Features: every column except the last one. Target: 'went_on_backorder'.
X = data.iloc[:, :-1]
y = data['went_on_backorder']

# Baseline random forest; the seed is fixed so that runs are repeatable.
hyperparams = dict(random_state=1909)
model = RandomForestClassifier(**hyperparams)

In [0]:
# Preprocessing: min-max scale the numeric columns, integer-encode the
# categorical column, then chain the preprocessor with the classifier.

categorical_features = ['ppap_risk']
# Pick the numeric columns by name (everything that is not categorical)
# rather than by position, so the split no longer depends on 'ppap_risk'
# being the last column of X. Raises KeyError if 'ppap_risk' is missing.
numeric_features = X.columns.drop(categorical_features)

numeric_transformer = Pipeline([
    ('scaler', MinMaxScaler()),
])
# NOTE: the step is called 'onehotencoder' but it applies an OrdinalEncoder
# (one integer code per category, no one-hot expansion). The step name is
# kept unchanged so existing parameter paths stay valid.
categorical_transformer = Pipeline([
    ('onehotencoder', OrdinalEncoder()),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('n_transformer', numeric_transformer, numeric_features),
    ('c_transformer', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by the random forest. The step name
# 'model' is what the 'model__' parameter prefix refers to during tuning.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])


In [104]:
# Hyperparameter tuning: randomly sample forest size and tree depth and keep
# the combination with the best 10-fold cross-validated score.

# F1-based scorer used for model selection. It was referenced but never
# defined in this notebook, which fails on a fresh kernel.
# NOTE(review): pos_label='Yes' assumes the target is encoded as 'Yes'/'No' --
# confirm with y.unique() and adjust if the labels differ.
custom_scorer = make_scorer(f1_score, pos_label='Yes')

# scipy's randint samples integers from [low, high).
n_estimators = randint(200, 400)
max_depth = randint(75, 125)
# The 'model__' prefix addresses parameters of the pipeline step named 'model'.
param_distributions = {'model__n_estimators': n_estimators, 'model__max_depth': max_depth}

# `iid` is intentionally not passed: the argument was removed in
# scikit-learn 0.24, and the search now always averages the per-fold scores,
# which is the behaviour iid=False asked for.
rs = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=5,
                        scoring=custom_scorer, n_jobs=-1, cv=10, random_state=1909)
rs.fit(X, y)

print(f'Optimal parameters: {rs.best_params_}')

Optimal parameters: {'model__max_depth': 96, 'model__n_estimators': 286}


In [105]:
# Rebuild the pipeline with the parameters chosen by the randomized search and
# estimate its score with 10-fold cross-validation.
# Result of a previous search run, for reference:
# {'model__max_depth': 96, 'model__n_estimators': 286}
hyperparams = {
    'n_estimators': rs.best_params_['model__n_estimators'],
    'criterion': 'gini',
    'max_depth': rs.best_params_['model__max_depth'],
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was removed in
    # scikit-learn 1.3 and is rejected there.
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    # 'min_impurity_split' is deliberately omitted: the parameter was removed
    # in scikit-learn 1.0 and passing it raises a TypeError.
    'bootstrap': True,
    'oob_score': False,
    'n_jobs': None,
    'random_state': 1909,
    'verbose': 0,
    'warm_start': False,
    'class_weight': None,
}

model = RandomForestClassifier(**hyperparams)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# cross_validate fits clones of the estimator, so this explicit fit does not
# influence the scores below; it only leaves `pipeline` fitted on the full
# data for any later use.
pipeline.fit(X, y)

# `custom_scorer` must already exist in the kernel (the scorer used for the
# randomized search); the message below presumes it is F1-based.
res_cv = cross_validate(pipeline, X, y, scoring=custom_scorer, cv=10, return_train_score=True)
res_f1_tr = np.mean(res_cv['train_score']) * 100
res_f1_te = np.mean(res_cv['test_score']) * 100
print(f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%')

Average F1 on Training and Test Sets: 97.38%/89.73%
Average F1 on Training and Test Sets: 97.38%/89.75%
