# Model Selection
1. State the different modeling approaches that you will compare to address the business opportunity.
    - RandomForestRegressor
    - GradientBoostingRegressor
    - LassoCV
    
2. Iterate on your suite of possible models by modifying data transformations, pipeline architectures, hyperparameters and other relevant factors.

3. Re-train your model on all of the data using the selected approach and prepare it for deployment.

4. Articulate your findings in a summary report.
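The RandomForestRegressor performed best, with an R² score of about 0.95 on the held-out test set (compared with about 0.94 for the GradientBoostingRegressor and about 0.52 for LassoCV).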

In [32]:
import os
import time
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from solutionGuidance.cslib import fetch_ts, engineer_features

In [85]:
## Location of the training data, relative to the working directory
data_dir = os.path.join("data", "cs-train")

## Load the time series through the project helper; clean=False is passed
## through unchanged
timeseries_all = fetch_ts(
    data_dir,
    clean=False,
)

... loading ts data from files


In [86]:
## Extract the features (X), the targets (y) and the matching dates from the
## 'all' entry of the loaded time series
X, y, dates = engineer_features(timeseries_all['all'])

## Hold out 25% of the samples for evaluation; the fixed seed keeps the
## shuffled split reproducible
## NOTE(review): rows are shuffled before splitting -- if neighbouring rows of
## this time series overlap in time, the test score may be optimistic; confirm
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [87]:
## Train a random forest model: grid-search the split criterion and the number
## of trees with 5-fold cross-validation on the training split.
## NOTE(review): 'mse'/'mae' are the criterion names of the scikit-learn
## release this notebook was run with; releases >= 1.2 only accept
## 'squared_error'/'absolute_error' -- update the grid when upgrading.
param_grid_rf = {
    'rf__criterion': ['mse', 'mae'],
    'rf__n_estimators': [10, 15, 20, 25, 40, 80, 100, 200],
}
pipe_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor()),
])

## `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22
## (where the default already behaves like iid=False) and removed in 0.24, so
## passing it raises a TypeError on current releases.
grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

## Evaluate the search's best estimator on the held-out test split
y_pred = grid.predict(X_test)
eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
rf_r2_score = r2_score(y_test, y_pred)

print("eval_rsme =", eval_rmse)
print("r2_score = ", rf_r2_score)
print("best params =", grid.best_params_)

eval_rsme = 17695.0
r2_score =  0.9522766670585541
best params = {'rf__criterion': 'mse', 'rf__n_estimators': 10}




In [31]:
## Train a gradient boosting model: grid-search the split criterion and the
## number of boosting stages with 5-fold cross-validation on the training split.
## The "rf" prefix on the step and variable names is kept from the original
## cell so the notebook globals and the printed best_params_ keys are unchanged.
## 'rsme' was removed from the criterion list: it is not a criterion that
## GradientBoostingRegressor accepts, so every fit for those candidates failed
## (scored as NaN or raised, depending on GridSearchCV's error_score) and the
## candidates could never be selected.
## NOTE(review): 'mse'/'mae' are the criterion names of the scikit-learn
## release this notebook was run with; newer releases renamed or dropped them
## -- update the grid when upgrading.
param_grid_rf = {
    'rf__criterion': ['mse', 'mae'],
    'rf__n_estimators': [10, 15, 20, 25, 40, 80, 100, 200],
}
pipe_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', GradientBoostingRegressor()),
])

## `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22
## (where the default already behaves like iid=False) and removed in 0.24, so
## passing it raises a TypeError on current releases.
grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

## Evaluate the search's best estimator on the held-out test split
y_pred = grid.predict(X_test)
eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
rf_r2_score = r2_score(y_test, y_pred)

print("eval_rsme =", eval_rmse)
print("r2_score = ", rf_r2_score)
print("best params =", grid.best_params_)

eval_rsme = 20230.0
r2_score =  0.9376201400246862
best params = {'rf__criterion': 'mse', 'rf__n_estimators': 200}




In [84]:
## Baseline: a LassoCV model with default settings, fitted directly on the
## training split (this cell uses no scaler or pipeline)
model = LassoCV().fit(X_train, y_train)

## Score the held-out test split
y_pred = model.predict(X_test)
rf_r2_score = r2_score(y_test, y_pred)
eval_rmse = round(
    np.sqrt(mean_squared_error(y_test, y_pred))
)

print("eval_rsme =", eval_rmse)
print("r2_score = ", rf_r2_score)


eval_rsme = 55919.0
r2_score =  0.5233891731671572


In [88]:
from cslib import fetch_ts, engineer_features

In [93]:
from model import model_train, model_load


    
## Re-train through the project's model module; test=False is passed through
## (presumably selects the full, non-test training run -- confirm in model.py)
model_train(
    data_dir,
    test=False,
)

## Keep whatever model_load() returns in `model`
model = model_load()

print("model training complete.")




... loading ts data from files




... saving model: models/sl-portugal-0_1.joblib




... saving model: models/sl-united_kingdom-0_1.joblib




... saving model: models/sl-hong_kong-0_1.joblib




... saving model: models/sl-eire-0_1.joblib




... saving model: models/sl-spain-0_1.joblib




... saving model: models/sl-france-0_1.joblib




... saving model: models/sl-singapore-0_1.joblib




... saving model: models/sl-all-0_1.joblib




... saving model: models/sl-norway-0_1.joblib




... saving model: models/sl-germany-0_1.joblib




... saving model: models/sl-netherlands-0_1.joblib
... loading ts data from files
model training complete.
