<center><em>Copyright by Pierian Data Inc.</em></center>
<center><em>For more information, visit us at <a href='http://www.pieriandata.com'>www.pieriandata.com</a></em></center>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Model-Persistence" data-toc-modified-id="Model-Persistence-1">Model Persistence</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1">Imports</a></span></li><li><span><a href="#Data" data-toc-modified-id="Data-1.2">Data</a></span></li><li><span><a href="#Data-Preparation" data-toc-modified-id="Data-Preparation-1.3">Data Preparation</a></span></li><li><span><a href="#Model-Training" data-toc-modified-id="Model-Training-1.4">Model Training</a></span></li><li><span><a href="#Model-Evaluation" data-toc-modified-id="Model-Evaluation-1.5">Model Evaluation</a></span><ul class="toc-item"><li><span><a href="#Hyperparameter-Tuning" data-toc-modified-id="Hyperparameter-Tuning-1.5.1">Hyperparameter Tuning</a></span></li></ul></li><li><span><a href="#Final-Hold-Out-Test-Performance-for-Reporting" data-toc-modified-id="Final-Hold-Out-Test-Performance-for-Reporting-1.6">Final Hold Out Test Performance for Reporting</a></span></li><li><span><a href="#Full-Training" data-toc-modified-id="Full-Training-1.7">Full Training</a></span></li><li><span><a href="#Saving-Model-(and-anything-else-as-pickle-file)" data-toc-modified-id="Saving-Model-(and-anything-else-as-pickle-file)-1.8">Saving Model (and anything else as pickle file)</a></span><ul class="toc-item"><li><span><a href="#Saving-Data-Columns-as-pickle-file" data-toc-modified-id="Saving-Data-Columns-as-pickle-file-1.8.1">Saving Data Columns as pickle file</a></span></li></ul></li><li><span><a href="#Loading-Model-(Model-Persistence)" data-toc-modified-id="Loading-Model-(Model-Persistence)-1.9">Loading Model (Model Persistence)</a></span></li></ul></li></ul></div>

# Model Persistence

## Imports

In [2]:
import pandas as pd

## Data

In [3]:
# Load the advertising dataset. The previous path
# ('../Data/Advertising.csvtising.csv') had a garbled file name.
df = pd.read_csv('../Data/Advertising.csv')

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [7]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TV,200.0,147.0425,85.854236,0.7,74.375,149.75,218.825,296.4
radio,200.0,23.264,14.846809,0.0,9.975,22.9,36.525,49.6
newspaper,200.0,30.554,21.778621,0.3,12.75,25.75,45.1,114.0
sales,200.0,14.0225,5.217457,1.6,10.375,12.9,17.4,27.0


## Data Preparation

In [8]:
# Target: the 'sales' column. Features: every other column of df.
y = df['sales']
X = df.drop(columns='sales')

In [10]:
# Train | VALIDATION | TEST
# 70% | 15% | 15%
from sklearn.model_selection import train_test_split

In [11]:
# First split: keep 70% for training and set the remaining 30% aside.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Second split: halve the set-aside 30% into a validation set and a
# hold-out test set (15% of the full data each).
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [12]:
# Shapes of the three feature splits (train, validation, hold-out test).
tuple(part.shape for part in (X_train, X_validation, X_holdout_test))

((140, 3), (30, 3), (30, 3))

In [13]:
# Shapes of the three target splits (train, validation, hold-out test).
tuple(part.shape for part in (y_train, y_validation, y_holdout_test))

((140,), (30,), (30,))

In [None]:
# Optional step: the features could be scaled here; no scaling is applied in this notebook.

## Model Training

In [39]:
from sklearn.ensemble import RandomForestRegressor

Note: If we are comparing multiple models, it is only fair to use the same data for each one, so it is better to set `random_state` to keep the results reproducible.

In [69]:
# Baseline model: a random forest with 3 trees and a fixed seed.
rf = RandomForestRegressor(
    n_estimators=3,
    random_state=101,
)

In [70]:
# Fit the baseline model on the training split only.
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=3, random_state=101)

## Model Evaluation

In [71]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [72]:
# Predict on the validation split to evaluate the baseline model.
predictions = rf.predict(X=X_validation)

In [73]:
# Mean absolute error of the validation predictions.
MAE = mean_absolute_error(y_true=y_validation, y_pred=predictions)
MAE

0.6833333333333329

In [74]:
# Mean squared error of the validation predictions.
MSE = mean_squared_error(y_true=y_validation, y_pred=predictions)
MSE

0.7106296296296291

In [75]:
import numpy as np

In [76]:
# Root mean squared error: the square root of the MSE computed above.
RMSE = np.sqrt(MSE)
RMSE

0.8429885109713116

In [77]:
# R^2 score of the validation predictions.
r2_score(y_true=y_validation, y_pred=predictions)

0.9761517320104487

### Hyperparameter Tuning

In [78]:
# Tuned candidate: same seed as before, but with 10 trees instead of 3.
# Note that this rebinds `rf`, replacing the 3-tree model.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=101,
)

In [79]:
# Fit the 10-tree model on the training split only.
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=101)

In [80]:
# Predict on the validation split again, now with the 10-tree model.
predictions = rf.predict(X=X_validation)

In [81]:
# Validation mean absolute error for the 10-tree model.
MAE = mean_absolute_error(y_true=y_validation, y_pred=predictions)
MAE

0.4753333333333333

In [82]:
# Validation mean squared error for the 10-tree model.
MSE = mean_squared_error(y_true=y_validation, y_pred=predictions)
MSE

0.3358199999999998

In [83]:
# Validation RMSE for the 10-tree model (square root of the MSE above).
RMSE = np.sqrt(MSE)
RMSE

0.5794997842967673

In [84]:
# Validation R^2 score for the 10-tree model.
r2_score(y_true=y_validation, y_pred=predictions)

0.9887300993058435

## Final Hold Out Test Performance for Reporting

In [85]:
# Predict on the hold-out test split, which was not used for fitting or tuning.
holdout_predictions = rf.predict(X=X_holdout_test)

In [87]:
# Hold-out mean absolute error.
MAE = mean_absolute_error(y_true=y_holdout_test, y_pred=holdout_predictions)
MAE

0.7330000000000001

In [88]:
# Hold-out mean squared error.
MSE = mean_squared_error(y_true=y_holdout_test, y_pred=holdout_predictions)
MSE

0.7388433333333333

In [89]:
# Hold-out RMSE (square root of the hold-out MSE above).
RMSE = np.sqrt(MSE)
RMSE

0.8595599649433036

In [90]:
# Hold-out R^2 score.
r2_score(y_true=y_holdout_test, y_pred=holdout_predictions)

0.9700158407287547

## Full Training

In [91]:
# Final model: same hyperparameters as the 10-tree model evaluated above.
final_model = RandomForestRegressor(
    n_estimators=10,
    random_state=101,
)

In [93]:
# Fit the final model on the full dataset (all rows of X and y).
final_model.fit(X=X, y=y)

RandomForestRegressor(n_estimators=10, random_state=101)

## Saving Model (and anything else as pickle file)

In [94]:
import joblib

In [95]:
# Ensure the target directory exists before writing, so this cell also
# works on a fresh checkout where 'Model/' has not been created yet.
from pathlib import Path

Path('Model').mkdir(parents=True, exist_ok=True)

# Persist the fitted final model to disk.
joblib.dump(final_model, 'Model/final_model.pkl')

['Model/final_model.pkl']

### Saving Data Columns as pickle file

In [96]:
# Column labels of the feature frame the final model was fitted on.
X.columns

Index(['TV', 'radio', 'newspaper'], dtype='object')

In [97]:
# The same column labels as a plain Python list.
X.columns.tolist()

['TV', 'radio', 'newspaper']

In [98]:
# Persist the feature names next to the model so that inputs can be
# rebuilt with the same columns after loading.
joblib.dump(X.columns.tolist(), 'Model/columns_names.pkl')

['Model/columns_names.pkl']

## Loading Model (Model Persistence)

In [99]:
# Read back the feature-name list that was saved alongside the model.
new_columns = joblib.load(filename='Model/columns_names.pkl')

In [100]:
# Display the loaded feature names to confirm the round trip.
new_columns

['TV', 'radio', 'newspaper']

In [101]:
# Reload the persisted model from the path it was saved to above.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load("Model/final_model.pkl")

In [102]:
# Build the input as a DataFrame with the saved column names so the model
# receives the feature names it was fitted with. A bare nested list
# triggers the "X does not have valid feature names" warning.
new_observation = pd.DataFrame([[230.1, 37.8, 69.2]], columns=new_columns)
loaded_model.predict(new_observation)

  "X does not have valid feature names, but"


array([21.9])

In [42]:
# Reload the model from the location it was actually saved to. The bare
# 'final_model.pkl' path did not match the 'Model/' directory used by
# joblib.dump earlier in this notebook.
loaded_model = joblib.load('Model/final_model.pkl')

In [104]:
# Show the first row of df; its TV/radio/newspaper values are the ones passed
# to predict above, so the actual 'sales' value can be compared with the prediction.
df.head(n=1)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1


-----