Preprocessing

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


random_seed = 46
np.random.seed(random_seed)
random.seed(random_seed)

jobs = -1 #this is the number of cores that the models and test will run on. -1 means that all cores will be used 

#dropping all of the observations that are very likely errors
dataset = pd.read_csv("data/master_dataset.csv")
dataset = dataset[dataset["cases_hrs"] <= 300]
dataset = dataset[dataset["Total_Hours"] >= 10]


#dropping all of the uneeded columns
to_drop = ["Date", "Total_Hours", "Total_Cases", "B_HrsPct", "B_Cases", "Total_Each_Day", "dry_ratio", "clr_ratio", "frz_ratio", "GO_LIVE_DATE", "LABEL_TYPE"]
dataset_build = dataset.drop(labels=to_drop, axis=1)
dataset_build.rename(columns={"BRNCH_CD":"brnch_cd", "A_HrsPct":"a_hrspct", "C_HrsPct":"c_hrspct", "A_Cases":"a_cases", "C_Cases":"c_cases"}, inplace=True)
dataset_build.to_csv("data/model_dataset.csv",index=False)

numeric_features = ["a_hrspct",  'c_hrspct', 'a_cases', 'c_cases']
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)
categorical_features=['brnch_cd', 'weekday', 'month']
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(dataset_build.drop(labels="cases_hrs", axis=1),dataset_build['cases_hrs'], random_state=random_seed, train_size = .70)

scores = []


Model Building

In [2]:
#linear regression
reg_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", LinearRegression())]
)
reg_pipe.fit(X_train, y_train)
scores.append(["Reg", reg_pipe.score(X_test, y_test)])

#gradient boosting regressor model
gbr_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(random_state=random_seed))]
)
gbr_pipe.fit(X_train, y_train)
scores.append(["GBR Init", gbr_pipe.score(X_test, y_test)])

#improved gbr
gbr_improved_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, n_estimators=1000, learning_rate=0.3, max_depth=10, random_state=random_seed, loss='squared_error'))]
)
gbr_improved_pipe.fit(X_train, y_train)
scores.append(["GBR Improved", gbr_improved_pipe.score(X_test, y_test)])

#improved gbr 2.0 - lower depth
gbr_improved_pipe_2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, n_estimators=1000, learning_rate=0.3, max_depth=5, random_state=random_seed, loss='squared_error'))]
)
gbr_improved_pipe_2.fit(X_train, y_train)
scores.append(["GBR Improved", gbr_improved_pipe_2.score(X_test, y_test)])

      Iter       Train Loss   Remaining Time 
         1        3352.2183            4.79m
         2        3131.8694            4.78m
         3        2976.2394            4.88m
         4        2866.8875            4.79m
         5        2776.4135            4.68m
         6        2704.9141            4.56m
         7        2652.6289            4.40m
         8        2617.3720            4.22m
         9        2568.4399            4.14m
        10        2522.4693            4.07m
        20        2179.4723            3.69m
        30        1940.9241            3.50m
        40        1742.9652            3.36m
        50        1592.6744            3.28m
        60        1483.7383            3.20m
        70        1387.9131            3.14m
        80        1299.9089            3.08m
        90        1210.3370            3.04m
       100        1118.3258            3.00m
       200         653.0818            2.60m
       300         465.2375            2.24m
       40

##### GBR Grid Search
* Takes a while to run

In [None]:
"""
gbrgrid = {'n_estimators':[500,1000,2000], 
           'learning_rate':[0.15,0.3], 
           'max_depth':[5,10]}

crossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)

gbrgrid_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("gbrsearch", GridSearchCV(estimator = GradientBoostingRegressor(), 
                                                                      param_grid = gbrgrid,
                                                                      scoring = 'neg_mean_squared_error',
                                                                      verbose = 1,
                                                                      cv = crossvalidation, n_jobs=jobs))]
)
gbrsearch = gbrgrid_pipe.fit(X_train, y_train)
gbrgrid_pipe.best_params_, gbrgrid_pipe.best_score_
"""

In [3]:
#improved gbr
    ## to reduce variablity, depth for gbr should be between 4 - 8
    ## n_estimators = 2000
    ## smaller learning rate = 0.05 - 0.15 is good
    ## 4 minutes
gbr_improved_pipe_3 = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, 
                                                                               n_estimators=2000, 
                                                                               learning_rate=0.05,
                                                                               max_depth=6,  
                                                                               loss='squared_error'))]
)
gbr_improved_pipe_3.fit(X_train, y_train)
scores.append(["GBR Improved 3",gbr_improved_pipe_3.score(X_test, y_test)])


      Iter       Train Loss   Remaining Time 
         1        3672.8376            6.79m
         2        3634.7779            6.64m
         3        3600.4718            6.71m
         4        3566.9995            6.62m
         5        3536.5007            6.57m
         6        3508.7512            6.56m
         7        3484.6493            6.55m
         8        3459.9013            6.51m
         9        3438.6136            6.52m
        10        3415.3945            6.55m
        20        3259.2275            6.51m
        30        3164.2132            6.43m
        40        3104.2060            6.35m
        50        3065.1080            6.19m
        60        3025.0354            5.78m
        70        2993.7219            5.41m
        80        2964.4815            5.14m
        90        2938.8806            4.92m
       100        2916.1085            4.73m
       200        2733.5650            3.78m
       300        2600.1975            3.34m
       40

##### Extra Trees Regressor
* Best MSE / RMSE = 242.242 / 15.564
    * With DEFAULT paramters, performs pretty well against tuned GBR
    * increasing n_estimators does not change much

In [4]:
#Extra trees base model, 5 1/2 minutes
xtratree_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", ExtraTreesRegressor(verbose=1, n_jobs = jobs))]
)
xtratree_pipe.fit(X_train, y_train)
scores.append(["xtratree",xtratree_pipe.score(X_test, y_test)])


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   26.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished


### Random Forest Regressor
* Best MSE / RMSE = 261.028 / 16.156

In [None]:
#random forest grid search
## run if needed
"""
crossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)
randfor_param = {
             'max_depth': [5, 10, 15],
             'n_estimators': [500, 1000, 1500]}
randfor_search = GridSearchCV(RandomForestRegressor(n_jobs=jobs), randfor_param, refit = True, verbose = 3, cv = crossvalidation, scoring = 'neg_mean_squared_error', n_jobs=jobs)
randfor_search.fit(X_train, y_train)
"""

In [6]:
#baseline RFR, outperformed by extratrees regressor - 4 min
randfor_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", RandomForestRegressor(verbose=1, n_jobs=jobs))]
)
randfor_pipe.fit(X_train, y_train)
scores.append(["RFR", randfor_pipe.score(X_test, y_test)])


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.6s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished


### Decision tree regressor
* Best MSE / RMSE = 466.094 / 21.589

In [7]:
#baseline Decision tree regressor
dectree_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", DecisionTreeRegressor())]
)
dectree_pipe.fit(X_train, y_train)
scores.append(["Decision Tree", dectree_pipe.score(X_test, y_test)])


In [8]:
scores

[['Reg', 0.050882787387249384],
 ['GBR Init', 0.1376223526075423],
 ['GBR Improved', -0.08274349859880914],
 ['GBR Improved', 0.04058212754023327],
 ['GBR Improved 3', 0.11209127690946474],
 ['xtratree', 0.03662133309109494],
 ['RFR', 0.08079112462153282],
 ['Decision Tree', -0.6565450546367695]]

Pickling the models

In [9]:
#pickling the model 
from joblib import dump

dump(reg_pipe, "models/lin_reg.joblib") 
dump(gbr_pipe, "models/gbr_init.joblib")
dump(gbr_improved_pipe, "models/gbr_improved.joblib")
dump(gbr_improved_pipe_2, "models/gbr_improved_2.joblib")
dump(gbr_improved_pipe_3, "models/gbr_improved_3.joblib")
dump(randfor_pipe, "models/rfr.joblib")
dump(dectree_pipe, "models/dec_tree.joblib")
dump(xtratree_pipe, "models/xtra_tree.joblib")


['models/xtra_tree.joblib']

[CV 2/3] END .......max_depth=5, n_estimators=500;, score=nan total time=   0.0s
[CV 3/3] END ......max_depth=5, n_estimators=1500;, score=nan total time=   0.0s
[CV 2/3] END ......max_depth=10, n_estimators=500;, score=nan total time=   0.0s
[CV 1/3] END .....max_depth=10, n_estimators=1000;, score=nan total time=   0.0s
[CV 3/3] END .....max_depth=10, n_estimators=1000;, score=nan total time=   0.0s
[CV 3/3] END .....max_depth=10, n_estimators=1500;, score=nan total time=   0.0s
[CV 2/3] END .....max_depth=15, n_estimators=1000;, score=nan total time=   0.0s
[CV 2/3] END ......max_depth=5, n_estimators=1000;, score=nan total time=   0.0s
[CV 3/3] END ......max_depth=15, n_estimators=500;, score=nan total time=   0.0s
[CV 3/3] END .....max_depth=15, n_estimators=1000;, score=nan total time=   0.0s
[CV 3/3] END ......max_depth=5, n_estimators=1000;, score=nan total time=   0.0s
[CV 2/3] END ......max_depth=15, n_estimators=500;, score=nan total time=   0.0s
[CV 1/3] END .....max_depth=