Preprocessing

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import toml
with open("../config.toml", "r") as f:
    config = toml.load(f)
    
DATA_FOLDER = config["DATA_FOLDER"]
MODEL_FOLDER = config["MODEL_FOLDER"]

random_seed = 46
np.random.seed(random_seed)
random.seed(random_seed)

jobs = -1 #this is the number of cores that the models and test will run on. -1 means that all cores will be used 

#dropping all of the observations that are very likely errors
dataset = pd.read_csv(DATA_FOLDER + "/master_dataset.csv")
dataset = dataset[dataset["cases_hrs"] <= 300]
dataset = dataset[dataset["Total_Hours"] >= 10]


#dropping all of the uneeded columns
to_drop = ["Date", "Total_Hours", "Total_Cases", "B_HrsPct", "B_Cases", "Total_Each_Day", "dry_ratio", "clr_ratio", "frz_ratio", "GO_LIVE_DATE", "LABEL_TYPE"]
dataset_build = dataset.drop(labels=to_drop, axis=1)
dataset_build.rename(columns={"BRNCH_CD":"brnch_cd", "A_HrsPct":"a_hrspct", "C_HrsPct":"c_hrspct", "A_Cases":"a_cases", "C_Cases":"c_cases"}, inplace=True)
dataset_build.to_csv(DATA_FOLDER + "/model_dataset.csv",index=False)

numeric_features = ["a_hrspct",  'c_hrspct', 'a_cases', 'c_cases']
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)
categorical_features=['brnch_cd', 'weekday', 'month']
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(dataset_build.drop(labels="cases_hrs", axis=1),dataset_build['cases_hrs'], random_state=random_seed, train_size = .70)

scores = []


Model Building

In [3]:
#linear regression
reg_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", LinearRegression())]
)
reg_pipe.fit(X_train, y_train)
scores.append(["Reg", reg_pipe.score(X_test, y_test)])

#gradient boosting regressor model
gbr_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(random_state=random_seed))]
)
gbr_pipe.fit(X_train, y_train)
scores.append(["GBR Init", gbr_pipe.score(X_test, y_test)])

#improved gbr
gbr_improved_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, n_estimators=1000, learning_rate=0.3, max_depth=10, random_state=random_seed, loss='squared_error'))]
)
gbr_improved_pipe.fit(X_train, y_train)
scores.append(["GBR Improved", gbr_improved_pipe.score(X_test, y_test)])

#improved gbr 2.0 - lower depth
gbr_improved_pipe_2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, n_estimators=1000, learning_rate=0.3, max_depth=5, random_state=random_seed, loss='squared_error'))]
)
gbr_improved_pipe_2.fit(X_train, y_train)
scores.append(["GBR Improved", gbr_improved_pipe_2.score(X_test, y_test)])

      Iter       Train Loss   Remaining Time 
         1         942.9804            4.58m
         2         758.2974            4.61m
         3         631.9937            4.54m
         4         554.0886            4.55m
         5         501.9105            4.52m
         6         456.6620            4.45m
         7         424.0542            4.39m
         8         397.7424            4.30m
         9         364.3594            4.22m
        10         346.8073            4.18m
        20         234.1818            3.84m
        30         186.0763            3.71m
        40         153.4382            3.61m
        50         129.5386            3.55m
        60         107.3654            3.51m
        70          95.1016            3.46m
        80          84.3506            3.42m
        90          75.3225            3.38m
       100          66.8518            3.34m
       200          31.3407            2.96m
       300          20.3148            2.56m
       40

##### GBR Grid Search
* Takes a while to run

In [4]:
"""
gbrgrid = {'n_estimators':[500,1000,2000], 
           'learning_rate':[0.15,0.3], 
           'max_depth':[5,10]}

crossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)

gbrgrid_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("gbrsearch", GridSearchCV(estimator = GradientBoostingRegressor(), 
                                                                      param_grid = gbrgrid,
                                                                      scoring = 'neg_mean_squared_error',
                                                                      verbose = 1,
                                                                      cv = crossvalidation, n_jobs=jobs))]
)
gbrsearch = gbrgrid_pipe.fit(X_train, y_train)
gbrgrid_pipe.best_params_, gbrgrid_pipe.best_score_
"""

'\ngbrgrid = {\'n_estimators\':[500,1000,2000], \n           \'learning_rate\':[0.15,0.3], \n           \'max_depth\':[5,10]}\n\ncrossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)\n\ngbrgrid_pipe = Pipeline(\n    steps=[("preprocessor", preprocessor), ("gbrsearch", GridSearchCV(estimator = GradientBoostingRegressor(), \n                                                                      param_grid = gbrgrid,\n                                                                      scoring = \'neg_mean_squared_error\',\n                                                                      verbose = 1,\n                                                                      cv = crossvalidation, n_jobs=jobs))]\n)\ngbrsearch = gbrgrid_pipe.fit(X_train, y_train)\ngbrgrid_pipe.best_params_, gbrgrid_pipe.best_score_\n'

In [5]:
#improved gbr
    ## to reduce variablity, depth for gbr should be between 4 - 8
    ## n_estimators = 2000
    ## smaller learning rate = 0.05 - 0.15 is good
    ## 4 minutes
gbr_improved_pipe_3 = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, 
                                                                               n_estimators=2000, 
                                                                               learning_rate=0.05,
                                                                               max_depth=6,  
                                                                               loss='squared_error'))]
)
gbr_improved_pipe_3.fit(X_train, y_train)
scores.append(["GBR Improved 3",gbr_improved_pipe_3.score(X_test, y_test)])


      Iter       Train Loss   Remaining Time 
         1        1211.4200            4.57m
         2        1170.0448            4.53m
         3        1132.6770            4.48m
         4        1097.3747            4.49m
         5        1063.9353            4.49m
         6        1033.3433            4.50m
         7        1004.9712            4.51m
         8         978.5187            4.53m
         9         954.0518            4.53m
        10         929.0276            4.52m
        20         760.7208            4.47m
        30         661.8552            4.45m
        40         595.0819            4.37m
        50         546.9083            4.27m
        60         512.4975            4.17m
        70         485.4619            4.10m
        80         462.8035            4.04m
        90         443.5262            3.98m
       100         426.0728            3.94m
       200         323.1951            3.60m
       300         275.5669            3.35m
       40

##### Extra Trees Regressor
* Best MSE / RMSE = 242.242 / 15.564
    * With DEFAULT paramters, performs pretty well against tuned GBR
    * increasing n_estimators does not change much

In [6]:
#Extra trees base model, 5 1/2 minutes
xtratree_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", ExtraTreesRegressor(verbose=1, n_jobs = jobs))]
)
xtratree_pipe.fit(X_train, y_train)
scores.append(["xtratree",xtratree_pipe.score(X_test, y_test)])


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   57.2s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished


### Random Forest Regressor
* Best MSE / RMSE = 261.028 / 16.156

In [7]:
#random forest grid search
## run if needed
"""
crossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)
randfor_param = {
             'max_depth': [5, 10, 15],
             'n_estimators': [500, 1000, 1500]}
randfor_search = GridSearchCV(RandomForestRegressor(n_jobs=jobs), randfor_param, refit = True, verbose = 3, cv = crossvalidation, scoring = 'neg_mean_squared_error', n_jobs=jobs)
randfor_search.fit(X_train, y_train)
"""

"\ncrossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)\nrandfor_param = {\n             'max_depth': [5, 10, 15],\n             'n_estimators': [500, 1000, 1500]}\nrandfor_search = GridSearchCV(RandomForestRegressor(n_jobs=jobs), randfor_param, refit = True, verbose = 3, cv = crossvalidation, scoring = 'neg_mean_squared_error', n_jobs=jobs)\nrandfor_search.fit(X_train, y_train)\n"

In [8]:
#baseline RFR, outperformed by extratrees regressor - 4 min
randfor_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", RandomForestRegressor(verbose=1, n_jobs=jobs))]
)
randfor_pipe.fit(X_train, y_train)
scores.append(["RFR", randfor_pipe.score(X_test, y_test)])


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   48.1s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished


### Decision tree regressor
* Best MSE / RMSE = 466.094 / 21.589

In [9]:
#baseline Decision tree regressor
dectree_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", DecisionTreeRegressor())]
)
dectree_pipe.fit(X_train, y_train)
scores.append(["Decision Tree", dectree_pipe.score(X_test, y_test)])


In [10]:
scores

[['Reg', 0.6580482257434541],
 ['GBR Init', 0.6106992906929891],
 ['GBR Improved', 0.7829166031320501],
 ['GBR Improved', 0.7849626923522042],
 ['GBR Improved 3', 0.795639625461525],
 ['xtratree', 0.7945109664434109],
 ['RFR', 0.7812300991781476],
 ['Decision Tree', 0.6153347876716004]]

Pickling the models

In [11]:
#pickling the model 
from joblib import dump

dump(reg_pipe, MODEL_FOLDER + "/lin_reg.joblib") 
dump(gbr_pipe, MODEL_FOLDER + "/gbr_init.joblib")
dump(gbr_improved_pipe, MODEL_FOLDER + "/gbr_improved.joblib")
dump(gbr_improved_pipe_2, MODEL_FOLDER + "/gbr_improved_2.joblib")
dump(gbr_improved_pipe_3, MODEL_FOLDER +"/gbr_improved_3.joblib")
dump(randfor_pipe, MODEL_FOLDER + "/rfr.joblib")
dump(dectree_pipe, MODEL_FOLDER + "/dec_tree.joblib")
dump(xtratree_pipe, MODEL_FOLDER + "/xtra_tree.joblib")


['../src/models/xtra_tree.joblib']