## Modeling

The basic procedure for modeling with electricity data is as follows:
-  Create the same train-test split for all models
-  Create model
-  Scale all features to have mean zero and unit variance
-  Set up some sort of hyperparameter tuning scheme (either grid search or randomized search)
-  Fit data with model
-  Evaluate model by computing relevant metrics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import randint

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# a list of models that will be tested
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor



In [2]:
# Project root. Defaults to the original author's checkout; set the
# ELECTRICITY_DEMAND_DIR environment variable to run the notebook elsewhere.
# os.path.join(..., '') guarantees a trailing separator, so code that builds
# paths with `WORKING_DIR + 'relative/path'` keeps working either way.
WORKING_DIR = os.path.join(
    os.environ.get(
        'ELECTRICITY_DEMAND_DIR',
        '/Users/rvg/Documents/springboard_ds/springboard_portfolio/Electricity_Demand/'),
    '')

# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
la_df = pd.read_pickle(WORKING_DIR + 'data/LA_df.pkl')

seattle_df = pd.read_pickle(WORKING_DIR + 'data/seattle_df.pkl')

# Target is the 'demand' column (kept as a one-column DataFrame);
# the features are every remaining column.
y_la = la_df[['demand']]
X_la = la_df.drop(['demand'], axis=1)

y_se = seattle_df[['demand']]
X_se = seattle_df.drop(['demand'], axis=1)

# One fixed 80/20 train/test split per city (random_state=42), shared by every model.
X_train_la, X_test_la, y_train_la, y_test_la = train_test_split(X_la, y_la, test_size=.2, random_state=42)
X_train_se, X_test_se, y_train_se, y_test_se = train_test_split(X_se, y_se, test_size=.2, random_state=42)

# Per-city result logs, filled in by the model cells below.
# NOTE: la_r2 / se_r2 receive the *adjusted* R^2 of each model.
la_r2 = []
la_name = []
la_rmse = []

se_r2 = []
se_name = []
se_rmse = []

In [3]:
def evaluate(model, X, y, X_test, y_test, m_name):
    """Score a fitted model on held-out data and print a short report.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict(X_test)`` and ``score(X_test, y_test)``.
    X, y : array-like
        Only their sizes are used: ``len(y)`` is the sample count and
        ``X.shape[1]`` the feature count in the adjusted R^2 correction.
    X_test, y_test : array-like
        Held-out features and target on which R^2 and RMSE are measured.
    m_name : str
        Label printed as the report heading.

    Returns
    -------
    tuple
        ``(r2, adj_r2, rmse)``.
    """
    # get predictions
    y_pred = model.predict(X_test)
    # R^2 as reported by the model's own score(), and root mean squared error
    r2 = model.score(X_test, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    # NOTE(review): n comes from `y` (the callers in this notebook pass the
    # full data set) while r2 is measured on the test split -- confirm that
    # len(y), not len(y_test), is the intended sample size.
    n_samples = len(y)
    n_features = X.shape[1]
    adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

    # print() with a single argument behaves the same on Python 2 and 3.
    print(m_name)
    print('---------------------')
    print('R^2: %.4f' % r2)
    print('adj R^2: %.4f' % adj_r2)
    print('Root MSE: %.4f' % rmse)
    return r2, adj_r2, rmse

## Linear Regression

In [4]:
# Ordinary least squares on standardized features, fitted separately per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
model_name = 'Linear Regression'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # The scaler sits inside the pipeline, so it is fitted on the training split only.
    steps = [('scaler', StandardScaler()),
             ('linearregression', LinearRegression())]
    pipeline = Pipeline(steps)
    pipeline.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    r2, adj_r2, rmse = evaluate(pipeline, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
Linear Regression
---------------------
R^2: 0.6959
adj R^2: 0.6957
Root MSE: 423.9397
time elapsed = 0.03 sec


Seattle Results
---------------------
Linear Regression
---------------------
R^2: 0.5909
adj R^2: 0.5907
Root MSE: 129.8553
time elapsed = 0.03 sec




## ElasticNet

In [5]:
# Elastic-net regression on standardized features, tuned with a 5-fold grid
# search and fitted separately per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
model_name = 'ElasticNet'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # The scaler sits inside the pipeline, so each CV fold scales with its own
    # training portion only.
    steps = [('scaler', StandardScaler()),
             ('elasticnet', ElasticNet())]
    pipeline = Pipeline(steps)

    # Search grid: 30 l1_ratio values x 10 alpha values = 300 candidates.
    parameters = {'elasticnet__l1_ratio': np.linspace(0, 1, 30),
                  'elasticnet__alpha': np.linspace(.1, 10, 10)}

    gm_cv = GridSearchCV(pipeline, parameters, cv=5)
    gm_cv.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    # Pipeline with the best-scoring parameter combination.
    m = gm_cv.best_estimator_
    print(gm_cv.best_params_)

    r2, adj_r2, rmse = evaluate(m, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 1.0}
ElasticNet
---------------------
R^2: 0.6958
adj R^2: 0.6956
Root MSE: 423.9647
time elapsed = 81.79 sec


Seattle Results
---------------------
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 1.0}
ElasticNet
---------------------
R^2: 0.5907
adj R^2: 0.5905
Root MSE: 129.8880
time elapsed = 81.94 sec




## Decision Tree

Tuning the hyperparameters takes quite a long time, so we just use the default settings.

In [6]:
# Decision tree with default hyperparameters (no search is run), fitted
# separately per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
model_name = 'Decision Tree'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # random_state pins the estimator's randomness so a re-run reproduces the
    # same scores; 42 matches the seed used for the train/test split.
    steps = [('scaler', StandardScaler()),
             ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42))]
    pipeline = Pipeline(steps)
    pipeline.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    r2, adj_r2, rmse = evaluate(pipeline, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
Decision Tree
---------------------
R^2: 0.7388
adj R^2: 0.7387
Root MSE: 392.8503
time elapsed = 0.15 sec


Seattle Results
---------------------
Decision Tree
---------------------
R^2: 0.4677
adj R^2: 0.4674
Root MSE: 148.1291
time elapsed = 0.20 sec




## k-NN

Tuning the hyperparameters takes quite a long time, so we just use the default settings.

In [7]:
# k-nearest-neighbours regression with default hyperparameters (no search is
# run), fitted separately per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
model_name = 'k-NN'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # The scaler sits inside the pipeline, so it is fitted on the training
    # split only and the neighbour search runs on standardized features.
    steps = [('scaler', StandardScaler()),
             ('KNeighborsRegressor', KNeighborsRegressor())]
    pipeline = Pipeline(steps)
    pipeline.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    r2, adj_r2, rmse = evaluate(pipeline, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
k-NN
---------------------
R^2: 0.8071
adj R^2: 0.8069
Root MSE: 337.6596
time elapsed = 3.42 sec


Seattle Results
---------------------
k-NN
---------------------
R^2: 0.5997
adj R^2: 0.5995
Root MSE: 128.4466
time elapsed = 4.02 sec




## Random Forest

Tuning also takes a long time here, so we use the default settings.

In [8]:
# Random forest with default hyperparameters (no search is run), fitted
# separately per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
model_name = 'Random Forest'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # random_state pins the estimator's randomness so a re-run reproduces the
    # same scores; 42 matches the seed used for the train/test split.
    steps = [('scaler', StandardScaler()),
             ('RandomForestRegressor', RandomForestRegressor(random_state=42))]
    pipeline = Pipeline(steps)
    pipeline.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    r2, adj_r2, rmse = evaluate(pipeline, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
Random Forest
---------------------
R^2: 0.8395
adj R^2: 0.8395
Root MSE: 307.9225
time elapsed = 0.79 sec


Seattle Results
---------------------
Random Forest
---------------------
R^2: 0.6972
adj R^2: 0.6971
Root MSE: 111.7122
time elapsed = 0.91 sec




## Gradient Boosting

In [9]:
# Gradient boosting with default hyperparameters (no search is run), fitted
# separately per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
model_name = 'Gradient Boosting'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # random_state pins the estimator's randomness so a re-run reproduces the
    # same scores; 42 matches the seed used for the train/test split.
    steps = [('scaler', StandardScaler()),
             ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=42))]
    pipeline = Pipeline(steps)
    pipeline.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    r2, adj_r2, rmse = evaluate(pipeline, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
Gradient Boosting
---------------------
R^2: 0.8042
adj R^2: 0.8041
Root MSE: 340.1589
time elapsed = 1.21 sec


Seattle Results
---------------------
Gradient Boosting
---------------------
R^2: 0.6295
adj R^2: 0.6293
Root MSE: 123.5835
time elapsed = 1.25 sec




## Bagging

In [10]:
# Bagging regressor with default hyperparameters (no search is run), fitted
# separately per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
model_name = 'Bagging'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # random_state pins the estimator's randomness so a re-run reproduces the
    # same scores; 42 matches the seed used for the train/test split.
    steps = [('scaler', StandardScaler()),
             ('BaggingRegressor', BaggingRegressor(random_state=42))]
    pipeline = Pipeline(steps)
    pipeline.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    r2, adj_r2, rmse = evaluate(pipeline, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
Bagging
---------------------
R^2: 0.8411
adj R^2: 0.8410
Root MSE: 306.4654
time elapsed = 0.80 sec


Seattle Results
---------------------
Bagging
---------------------
R^2: 0.6976
adj R^2: 0.6974
Root MSE: 111.6452
time elapsed = 0.92 sec




## AdaBoost

In [11]:
# AdaBoost with default hyperparameters (no search is run), fitted separately
# per city.
# Each entry: (heading, (train X, y), (full X, y), (test X, y),
#              (adjusted-R^2 log, RMSE log, model-name log)).
city_runs = [
    ('LA Results', (X_train_la, y_train_la), (X_la, y_la),
     (X_test_la, y_test_la), (la_r2, la_rmse, la_name)),
    ('Seattle Results', (X_train_se, y_train_se), (X_se, y_se),
     (X_test_se, y_test_se), (se_r2, se_rmse, se_name)),
]
# One spelling for both cities, so the two summary tables use the same label.
model_name = 'AdaBoost'

for heading, fit_data, full_data, test_data, logs in city_runs:
    X_fit, y_fit = fit_data
    X_all, y_all = full_data
    X_eval, y_eval = test_data
    r2_log, rmse_log, name_log = logs

    t_start = time.time()

    # random_state pins the estimator's randomness so a re-run reproduces the
    # same scores; 42 matches the seed used for the train/test split.
    steps = [('scaler', StandardScaler()),
             ('AdaBoostRegressor', AdaBoostRegressor(random_state=42))]
    pipeline = Pipeline(steps)
    pipeline.fit(X_fit, y_fit)

    print(heading)
    print('---------------------')
    r2, adj_r2, rmse = evaluate(pipeline, X_all, y_all, X_eval, y_eval, model_name)

    # The adjusted R^2 (not the raw R^2) is what gets recorded for the summary.
    r2_log.append(adj_r2)
    rmse_log.append(rmse)
    name_log.append(model_name)

    print('time elapsed = %.2f sec' % (time.time() - t_start))
    print('\n')

LA Results
---------------------
AdaBoost
---------------------
R^2: 0.7269
adj R^2: 0.7267
Root MSE: 401.7430
time elapsed = 0.79 sec


Seattle Results
---------------------
AdaBoost
---------------------
R^2: 0.5695
adj R^2: 0.5693
Root MSE: 133.2065
time elapsed = 0.86 sec




## Results

In [14]:
# Collect the per-city logs into one table each and list the models from best
# to worst. NOTE: the 'R^2' column holds the *adjusted* R^2 values that the
# model cells appended to la_r2 / se_r2.
la_results = pd.DataFrame({'Model': la_name, 'R^2': la_r2, 'RMSE': la_rmse})
seattle_results = pd.DataFrame({'Model': se_name, 'R^2': se_r2, 'RMSE': se_rmse})

la_ranked = la_results.sort_values(by='R^2', ascending=False)
seattle_ranked = seattle_results.sort_values(by='R^2', ascending=False)

print('-------LA-------')
print(la_ranked)
print('\n')

print('-------SEATTLE-------')
print(seattle_ranked)

-------LA-------
               Model        RMSE       R^2
6            Bagging  306.465419  0.840969
4      Random Forest  307.922512  0.839453
3               k-NN  337.659565  0.806947
5  Gradient Boosting  340.158915  0.804078
2      Decision Tree  392.850254  0.738680
7           AdaBoost  401.742951  0.726715
0  Linear Regression  423.939663  0.695682
1         ElasticNet  423.964661  0.695646


-------SEATTLE-------
               Model        RMSE       R^2
6            Bagging  111.645219  0.697429
4      Random Forest  111.712214  0.697066
5  Gradient Boosting  123.583500  0.629261
3               k-NN  128.446632  0.599509
0  Linear Regression  129.855254  0.590677
1         ElasticNet  129.887951  0.590471
7           Adaboost  133.206488  0.569277
2      Decision Tree  148.129109  0.467367
