[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saschaschworm/big-data-and-data-science/blob/master/notebooks/development-exercises/backorder-random-forest.ipynb)

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Load the backorder exercise dataset straight from the course repository.
data = pd.read_csv(
    'https://github.com/saschaschworm/big-data-and-data-science/blob/master/datasets/exercises/backorders.csv?raw=true'
)

# Features are the first ten columns (selected by position); the target is
# the `went_on_backorder` column.
X = data.iloc[:, :10]
y = data['went_on_backorder']

In [3]:
# Column groups are selected by position in `data`: the second-to-last column
# is treated as categorical and the first nine columns as numerical.
# (The list was previously spelled `categorical_featues`; the name is now
# consistent with `numerical_features`.)
categorical_features = list(data.iloc[:, -2:-1].columns)
numerical_features = list(data.iloc[:, 0:9].columns)

# One-hot encode the categorical column, dropping the first category of each
# feature so the resulting dummy columns are not redundant.
categorical_transformer = Pipeline([
    ('onehotencoder', OneHotEncoder(drop='first')),
])

# Rescale every numerical column with min-max scaling.
numerical_transformer = Pipeline([
    ('scaler', MinMaxScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('categorical_transformer', categorical_transformer, categorical_features),
    ('numerical_transformer', numerical_transformer, numerical_features),
])

# Full modelling pipeline: preprocessing followed by a seeded random forest.
# The step names ('preprocessor', 'model') are what hyper-parameter keys such
# as 'model__n_estimators' refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=1909)),
])

In [4]:
# Score candidates with F1 computed for the 'yes' label as the positive class.
custom_scorer = make_scorer(f1_score, pos_label='yes')

# Discrete-uniform search ranges (scipy's randint excludes the upper bound).
n_estimators = randint(200, 400)
max_depth = randint(50, 100)

# Keys use the '<step>__<parameter>' form so they target the pipeline's
# 'model' step.
param_distributions = {
    'model__n_estimators': n_estimators,
    'model__max_depth': max_depth,
}

In [5]:
# Randomised hyper-parameter search over the whole pipeline: 5 sampled
# candidates, each evaluated with 10-fold cross-validation using the custom
# F1 scorer, on all available cores, with a fixed seed for reproducibility.
#
# NOTE: the `iid` argument is intentionally not passed. It was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a
# TypeError. Current releases always average the fold scores unweighted,
# which is the behaviour `iid=False` used to request.
rs = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    scoring=custom_scorer,
    n_jobs=-1,
    cv=10,
    random_state=1909,
)
rs = rs.fit(X, y)

# Last expression of the cell: displayed as the cell output.
f'Optimal parameters: {rs.best_params_}'

"Optimal parameters: {'model__max_depth': 63, 'model__n_estimators': 343}"

In [6]:
# Rebuild the forest with the hyper-parameters selected by the search,
# keeping the same seed as before.
tuned = rs.best_params_
model = RandomForestClassifier(
    random_state=1909,
    n_estimators=tuned['model__n_estimators'],
    max_depth=tuned['model__max_depth'],
)

# Assemble the final pipeline around the existing preprocessor and fit it on
# the complete dataset; `fit` returns the pipeline itself.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X, y)

In [7]:
# 10-fold cross-validation of the tuned pipeline, keeping the training-fold
# scores as well so over-fitting is visible.
# NOTE(review): the same data and cv=10 setting were used to pick the
# hyper-parameters, so the held-out figure is likely optimistic — confirm
# whether a separate hold-out set is wanted.
res_cv = cross_validate(
    pipeline,
    X,
    y,
    scoring=custom_scorer,
    cv=10,
    return_train_score=True,
)

# Mean F1 across folds, expressed as a percentage.
res_f1_tr = 100 * np.mean(res_cv['train_score'])
res_f1_te = 100 * np.mean(res_cv['test_score'])
print(f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}/{res_f1_te:.2f}')

Average F1 on Training and Test Sets: 97.38/89.74


In [8]:
# A single hand-made observation laid out in the same column order as X:
# nine numeric values followed by one categorical value.
prediction_set = pd.DataFrame(
    data=[
        [100, 0, 2, 5, 6, 1, 2, 5, 1, 'no'],
    ],
    columns=X.columns,
)

In [9]:
# Predicted class label for the hand-made observation (shown as cell output).
pipeline.predict(prediction_set)

array(['no'], dtype=object)

In [10]:
# Class-membership probabilities for the same observation, one column per
# class. Columns presumably follow `pipeline.classes_` order ('no', 'yes') —
# verify against `pipeline.classes_` before interpreting.
pipeline.predict_proba(prediction_set)

array([[0.80758017, 0.19241983]])