Preprocessing

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


random_seed = 46
np.random.seed(random_seed)
random.seed(random_seed)

#dropping all of the observations that are very likely errors
dataset = pd.read_csv("data/master_dataset.csv")
dataset = dataset[dataset["cases_hrs"] <= 300]
dataset = dataset[dataset["Total_Hours"] >= 10]


#dropping all of the uneeded columns
to_drop = ["Date", "Total_Hours", "Total_Cases", "B_HrsPct", "B_Cases", "Total_Each_Day", "dry_ratio", "clr_ratio", "frz_ratio", "GO_LIVE_DATE", "LABEL_TYPE"]
dataset_build = dataset.drop(labels=to_drop, axis=1)
dataset_build.rename(columns={"BRNCH_CD":"brnch_cd", "A_HrsPct":"a_hrspct", "C_HrsPct":"c_hrspct", "A_Cases":"a_cases", "C_Cases":"c_cases"}, inplace=True)
dataset_build.to_csv("data/model_dataset.csv",index=False)

numeric_features = ["a_hrspct",  'c_hrspct', 'a_cases', 'c_cases']
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)
categorical_features=['brnch_cd', 'weekday', 'month']
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(dataset_build.drop(labels="cases_hrs", axis=1),dataset_build['cases_hrs'], random_state=random_seed, train_size = .70)

scores = []


Model Building

In [21]:
#linear regression
reg_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", LinearRegression())]
)
reg_pipe.fit(X_train, y_train)
scores.append(["Reg", reg_pipe.score(X_test, y_test)])

#gradient boosting regressor model
gbr_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(random_state=random_seed))]
)
gbr_pipe.fit(X_train, y_train)
scores.append(["GBR Init", gbr_pipe.score(X_test, y_test)])

#improved gbr
gbr_improved_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, n_estimators=1000, learning_rate=0.3, max_depth=10, random_state=random_seed, loss='squared_error'))]
)
gbr_improved_pipe.fit(X_train, y_train)
scores.append(["GBR Improved", gbr_improved_pipe.score(X_test, y_test)])

#improved gbr 2.0 - lower depth
gbr_improved_pipe_2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, n_estimators=1000, learning_rate=0.3, max_depth=5, random_state=random_seed, loss='squared_error'))]
)
gbr_improved_pipe_2.fit(X_train, y_train)
scores.append(["GBR Improved", gbr_improved_pipe_2.score(X_test, y_test)])




      Iter       Train Loss   Remaining Time 
         1        3352.2183            4.71m
         2        3131.8694            4.72m
         3        2976.2394            4.71m
         4        2866.8875            4.65m
         5        2776.4135            4.59m
         6        2704.9141            4.50m
         7        2652.6289            4.37m
         8        2617.3720            4.21m
         9        2568.4399            4.13m
        10        2522.4693            4.08m
        20        2179.4723            3.73m
        30        1940.9241            3.55m
        40        1742.9652            3.42m
        50        1592.6744            3.35m
        60        1483.7383            3.27m
        70        1387.9131            3.20m
        80        1299.9089            3.15m
        90        1210.3370            3.11m
       100        1118.3258            3.08m
       200         653.0818            2.68m
       300         465.2375            2.31m
       40

##### GBR Grid Search
* Takes a while to run

In [7]:
gbrgrid = {'n_estimators':[500,1000,2000], 
           'learning_rate':[0.15,0.3], 
           'max_depth':[5,10]}

crossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)

gbrgrid_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("gbrsearch", GridSearchCV(estimator = GradientBoostingRegressor(), 
                                                                      param_grid = gbrgrid,
                                                                      scoring = 'neg_mean_squared_error',
                                                                      verbose = 1,
                                                                      cv = crossvalidation))]
)
gbrsearch = gbrgrid_pipe.fit(X_train, y_train)
gbrgrid_pipe.best_params_, gbrgrid_pipe.best_score_


'\ngbrgrid = {\'n_estimators\':[500,1000,2000], \n           \'learning_rate\':[0.15,0.3], \n           \'max_depth\':[5,10]}\n\ngbrgrid_pipe = Pipeline(\n    steps=[("preprocessor", preprocessor), ("gbrsearch", GridSearchCV(estimator = GradientBoostingRegressor(), \n                                                                      param_grid = gbrgrid,\n                                                                      scoring = \'neg_mean_squared_error\',\n                                                                      verbose = 1,\n                                                                      cv = crossvalidation))]\n)\ngbrsearch = gbrgrid_pipe.fit(X_train, y_train)\n#gbrgrid_pipe.best_params_, gbrgrid_pipe.best_score_\n'

In [11]:
#improved gbr
    ## to reduce variablity, depth for gbr should be between 4 - 8
    ## n_estimators = 2000
    ## smaller learning rate = 0.05 - 0.15 is good
    ## 4 minutes
gbr_improved_pipe_3 = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(verbose=1, 
                                                                               n_estimators=2000, 
                                                                               learning_rate=0.05,
                                                                               max_depth=6,  
                                                                               loss='squared_error'))]
)
gbr_improved_pipe_3.fit(X_train, y_train)
scores.append(["GBR Improved 3",gbr_improved_pipe_3.score(X_test, y_test)])


      Iter       Train Loss   Remaining Time 
         1        3672.8376            4.09m
         2        3634.7779            4.05m
         3        3600.4718            4.03m
         4        3566.9995            4.04m
         5        3536.5007            4.04m
         6        3508.7512            4.06m
         7        3484.6493            4.07m
         8        3459.9013            4.04m
         9        3438.6136            4.01m
        10        3415.3945            4.01m
        20        3259.2275            3.98m
        30        3164.2132            3.92m
        40        3104.2060            3.84m
        50        3065.1080            3.73m
        60        3025.0354            3.67m
        70        2993.7219            3.62m
        80        2964.4815            3.59m
        90        2938.8806            3.56m
       100        2916.1085            3.53m
       200        2733.5650            3.27m
       300        2600.1975            3.06m
       40

##### Extra Trees Regressor
* Best MSE / RMSE = 242.242 / 15.564
    * With DEFAULT paramters, performs pretty well against tuned GBR
    * increasing n_estimators does not change much

In [12]:
#Extra trees base model, 5 1/2 minutes
xtratree_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", ExtraTreesRegressor(verbose=1))]
)
xtratree_pipe.fit(X_train, y_train)
scores.append(["xtratree",xtratree_pipe.score(X_test, y_test)])


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  2.6min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s


### Random Forest Regressor
* Best MSE / RMSE = 261.028 / 16.156

In [None]:
#random forest grid search
## run if needed

crossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)
randfor_param = {
             'max_depth': [5, 10, 15],
             'n_estimators': [500, 1000, 1500]}
randfor_search = GridSearchCV(RandomForestRegressor(), randfor_param, refit = True, verbose = 3, cv = crossvalidation, scoring = 'neg_mean_squared_error')
randfor_search.fit(X_train, y_train)


"\ncrossvalidation=KFold(n_splits=3,shuffle=True,random_state=1)\nrandfor_param = {\n             'max_depth': [5, 10, 15],\n             'n_estimators': [500, 1000, 1500]}\nrandfor_search = GridSearchCV(RandomForestRegressor(), randfor_param, refit = True, verbose = 3, cv = crossvalidation, scoring = 'neg_mean_squared_error')\nrandfor_search.fit(X_train, y_train)\n"

In [13]:
#baseline RFR, outperformed by extratrees regressor - 4 min
randfor_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", RandomForestRegressor(verbose=1))]
)
randfor_pipe.fit(X_train, y_train)
scores.append(["RFR", randfor_pipe.score(X_test, y_test)])


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  2.0min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s


### Decision tree regressor
* Best MSE / RMSE = 466.094 / 21.589

In [14]:
#baseline Decision tree regressor
dectree_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", DecisionTreeRegressor())]
)
dectree_pipe.fit(X_train, y_train)
scores.append(["Decision Tree",dectree_pipe.score(X_test, y_test)])


In [16]:
scores

[['GBR Improved 3', 0.11209127690946474],
 ['xtratree', 0.03662133309109494],
 ['RFR', 0.08079112462153293],
 ['Decision Tree', -0.6565450546367695]]

Pickling the models

In [22]:
#pickling the model 
from joblib import dump

dump(reg_pipe, "models/lin_reg.joblib") 
dump(gbr_pipe, "models/gbr_init.joblib")
dump(gbr_improved_pipe, "models/gbr_improved.joblib")
dump(gbr_improved_pipe_2, "models/gbr_improved_2.joblib")
dump(gbr_improved_pipe_3, "models/gbr_improved_3.joblib")
dump(randfor_pipe, "models/rfr.joblib")
dump(dectree_pipe, "models/dec_tree.joblib")
dump(xtratree_pipe, "models/xtra_tree.joblib")


['models/xtra_tree.joblib']