### Predicting Electricity Use for Individual Customer 

##### Random Forest

In [1]:
# Libraries used
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [2]:
# Read in data
data = pd.read_csv("ML_data - filtered.csv")

In [3]:
# View first few rows of data
data.head()

Unnamed: 0,Wk.Day,Month,Day,Year,Numeric_Time,TMP,WSP,WDR,ISO.tot.dmd,NH.RT.MWh,UES.Cap,UES.Sea,LMP.RT.NH,Target
0,Fri,8,26,2016,1.0,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.07
1,Fri,8,26,2016,1.25,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.08
2,Fri,8,26,2016,1.5,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.07
3,Fri,8,26,2016,1.75,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.08
4,Fri,8,26,2016,2.0,74,7,210,14966.4,1207.3,48.669,75.021,19.68,0.08


In [4]:
# Check dataframe data types
data.dtypes

Wk.Day           object
Month             int64
Day               int64
Year              int64
Numeric_Time    float64
TMP               int64
WSP               int64
WDR               int64
ISO.tot.dmd     float64
NH.RT.MWh       float64
UES.Cap         float64
UES.Sea         float64
LMP.RT.NH       float64
Target          float64
dtype: object

In [5]:
# Shape of dataframe
data.shape

(82333, 14)

In [6]:
# Descriptive statistics for each column
data.describe()

Unnamed: 0,Month,Day,Year,Numeric_Time,TMP,WSP,WDR,ISO.tot.dmd,NH.RT.MWh,UES.Cap,UES.Sea,LMP.RT.NH,Target
count,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0,82333.0
mean,7.103336,15.813963,2017.27639,11.874792,49.313544,7.571399,182.717792,13971.873442,1302.010738,48.535705,77.352669,37.03405,0.113365
std,3.497264,8.832236,0.705907,6.927943,18.538303,5.127014,113.521769,2492.388689,255.142331,15.263676,17.887657,38.680914,0.204578
min,1.0,1.0,2016.0,0.0,-6.0,0.0,0.0,9007.88,603.1,5.395,30.578,-133.77,0.0038
25%,4.0,8.0,2017.0,5.75,35.0,5.0,80.0,12208.35,1114.5,37.066,65.02,20.55,0.04
50%,8.0,16.0,2017.0,12.0,49.0,7.0,220.0,13723.31,1314.0,47.861,76.425,27.97,0.05
75%,10.0,23.0,2018.0,18.0,65.0,10.0,280.0,15361.71,1457.1,58.248,86.446,42.65,0.11
max,12.0,31.0,2018.0,23.75,95.0,37.0,360.0,25763.2,2379.5,114.956,166.8,2493.15,2.45


In [7]:
# One-hot encoding
data = pd.get_dummies(data)

In [8]:
# Check dataframe data types
data.dtypes

Month             int64
Day               int64
Year              int64
Numeric_Time    float64
TMP               int64
WSP               int64
WDR               int64
ISO.tot.dmd     float64
NH.RT.MWh       float64
UES.Cap         float64
UES.Sea         float64
LMP.RT.NH       float64
Target          float64
Wk.Day_Fri        uint8
Wk.Day_Mon        uint8
Wk.Day_Sat        uint8
Wk.Day_Sun        uint8
Wk.Day_Thu        uint8
Wk.Day_Tue        uint8
Wk.Day_Wed        uint8
dtype: object

In [9]:
# View first few rows of data after one-hot encoding
data.head()

Unnamed: 0,Month,Day,Year,Numeric_Time,TMP,WSP,WDR,ISO.tot.dmd,NH.RT.MWh,UES.Cap,UES.Sea,LMP.RT.NH,Target,Wk.Day_Fri,Wk.Day_Mon,Wk.Day_Sat,Wk.Day_Sun,Wk.Day_Thu,Wk.Day_Tue,Wk.Day_Wed
0,8,26,2016,1.0,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.07,1,0,0,0,0,0,0
1,8,26,2016,1.25,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.08,1,0,0,0,0,0,0
2,8,26,2016,1.5,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.07,1,0,0,0,0,0,0
3,8,26,2016,1.75,75,7,200,15020.25,1248.8,50.321,78.051,19.8,0.08,1,0,0,0,0,0,0
4,8,26,2016,2.0,74,7,210,14966.4,1207.3,48.669,75.021,19.68,0.08,1,0,0,0,0,0,0


In [10]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, Watts or kWh from data cleaning)
target = np.array(data['Target'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('Target', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [11]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 42)


In [12]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (61749, 19)
Training Target Shape: (61749,)
Testing Predictors Shape: (20584, 19)
Testing Target Shape: (20584,)


In [13]:
# Instantiate model with 100 decision trees
### NOTE - INCREASE THIS TO 1000 DECISION TREES IF RUNTIME ISNT TOO LONG

start_time = time.time()

rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
# Train the model on training data
rf.fit(train_predictors, train_target)

end_time = time.time()

print("--- %s seconds ---" % (end_time - start_time))

--- 77.78699994087219 seconds ---


In [17]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_predictors)

# Calculate max and mean test_target value
test_target_max_1 = max(test_target)
test_target_mean_1 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_1 = max(errors)
mean_absolute_error_1 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_1 = 100 * (errors / test_target)
accuracy_1 = 100 - np.mean(mape_1)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_1, 2), 'kWh')
print('Max Absolute Error:', round(max_absolute_error_1, 2), 'kWh')
print('Mean Target Variable Value:', round(test_target_mean_1, 2), 'kWh')
print('Mean Absolute Error:', round(mean_absolute_error_1, 2), 'kWh')
print('Accuracy:', round(accuracy_1, 2), '%.')

Target Variable Max Value: 2.03 MW
Max Absolute Error: 1.44 MW
Mean Target Variable Value: 0.11 MW
Mean Absolute Error: 0.04 MW
Accuracy: 53.96 %.


##### KNN

In [18]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [19]:
# Calculate and display errors for training data using mean squared error (MSE), 
# mean absolute error (MAE), and root mean squared errot (RMSE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  0.014292998311765372  & mae =  0.03992679622342063  & rmse =  0.11955332831738885


In [20]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  0.024463469050058296  & mae =  0.05424499028371551  & rmse =  0.15640802105409524


In [21]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_2 = max(test_target)
test_target_mean_2 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_2 = max(errors)
mean_absolute_error_2 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_2 = 100 * (errors / test_target)
accuracy_2 = 100 - np.mean(mape_2)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_2, 2), 'kWh')
print('Max Absolute Error:', round(max_absolute_error_2, 2), 'kWh')
print('Mean Target Variable Value:', round(test_target_mean_2, 2), 'kWh')
print('Mean Absolute Error:', round(mean_absolute_error_2, 2), 'kWh')
print('Accuracy:', round(accuracy_2, 2), '%.')

Target Variable Max Value: 2.03 MW
Max Absolute Error: 1.78 MW
Mean Target Variable Value: 0.11 MW
Mean Absolute Error: 0.05 MW
Accuracy: 42.68 %.


### Predicting Electricity Use for UES-Seacoast Service Territory
##### Includes weather data, but less ISO-NE data than the ISO aggregate dataset

##### Random Forest

In [22]:
# Read in data
data = pd.read_csv("ML_data - predict UES - Day-Ahead Shifted.csv")

In [23]:
# View first few rows of data
data.head()

Unnamed: 0,Wk.Day,Month,Day,Year,Numeric_Time,TMP,WSP,WDR,ISO.tot.dmd,NH.RT.MWh,UES.Cap.DayLoad,UES.Sea.DayLoad,LMP.RT.NH,UES.Sea.NextDayLoad
0,Fri,8,26,2016,1,75,7,200,15020.25,1248.8,50.321,78.051,19.8,71.854
1,Fri,8,26,2016,2,74,7,210,14966.4,1207.3,48.669,75.021,19.68,67.551
2,Fri,8,26,2016,3,74,7,210,14762.88,1187.1,47.546,73.552,19.47,65.174
3,Fri,8,26,2016,4,73,6,250,14539.61,1201.0,48.176,73.772,18.84,64.206
4,Fri,8,26,2016,5,72,6,240,14526.14,1281.4,52.154,77.124,18.23,65.02


In [24]:
# Check dataframe data types
data.dtypes

Wk.Day                  object
Month                    int64
Day                      int64
Year                     int64
Numeric_Time             int64
TMP                      int64
WSP                      int64
WDR                      int64
ISO.tot.dmd            float64
NH.RT.MWh              float64
UES.Cap.DayLoad        float64
UES.Sea.DayLoad        float64
LMP.RT.NH              float64
UES.Sea.NextDayLoad    float64
dtype: object

In [25]:
# Shape of dataframe
data.shape

(20568, 14)

In [26]:
# Descriptive statistics for each column
data.describe()

Unnamed: 0,Month,Day,Year,Numeric_Time,TMP,WSP,WDR,ISO.tot.dmd,NH.RT.MWh,UES.Cap.DayLoad,UES.Sea.DayLoad,LMP.RT.NH,UES.Sea.NextDayLoad
count,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0
mean,7.097773,15.795313,2017.275379,11.499951,49.335862,7.574436,182.818748,13972.419631,1302.039605,48.548,77.356042,37.04196,77.321125
std,3.495278,8.823213,0.705894,6.922418,18.539691,5.126797,113.505901,2493.91501,255.267825,15.270029,17.897408,38.697544,17.857973
min,1.0,1.0,2016.0,0.0,-6.0,0.0,0.0,9007.88,603.1,5.395,30.578,-133.77,30.578
25%,4.0,8.0,2017.0,5.75,35.0,5.0,80.0,12208.3225,1114.4,37.0765,65.01875,20.54,65.009
50%,8.0,16.0,2017.0,11.5,49.0,7.0,220.0,13722.705,1314.0,47.8865,76.425,27.97,76.4075
75%,10.0,23.0,2018.0,17.25,65.0,10.0,280.0,15363.17,1457.2,58.261,86.462,42.6625,86.427
max,12.0,31.0,2018.0,23.0,95.0,37.0,360.0,25763.2,2379.5,114.956,166.8,2493.15,166.8


In [27]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                    int64
Day                      int64
Year                     int64
Numeric_Time             int64
TMP                      int64
WSP                      int64
WDR                      int64
ISO.tot.dmd            float64
NH.RT.MWh              float64
UES.Cap.DayLoad        float64
UES.Sea.DayLoad        float64
LMP.RT.NH              float64
UES.Sea.NextDayLoad    float64
Wk.Day_Fri               uint8
Wk.Day_Mon               uint8
Wk.Day_Sat               uint8
Wk.Day_Sun               uint8
Wk.Day_Thu               uint8
Wk.Day_Tue               uint8
Wk.Day_Wed               uint8
dtype: object

In [28]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.NextDayLoad'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.NextDayLoad', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [29]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [30]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (15426, 19)
Training Target Shape: (15426,)
Testing Predictors Shape: (5142, 19)
Testing Target Shape: (5142,)


In [31]:
# Instantiate model with 100 decision trees
### NOTE - INCREASE THIS TO 1000 DECISION TREES IF RUNTIME ISNT TOO LONG

start_time = time.time()

rf = RandomForestRegressor(n_estimators = 300, random_state = 77)
# Train the model on training data
rf.fit(train_predictors, train_target)

end_time = time.time()

print("--- %s seconds ---" % (end_time - start_time))

--- 60.289000034332275 seconds ---


In [34]:
# Use predict method on the test data
predictions = rf.predict(test_predictors)

# Calculate max and mean test_target value
test_target_max_3 = max(test_target)
test_target_mean_3 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_3 = max(errors)
mean_absolute_error_3 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_3 = 100 * (errors / test_target)
accuracy_3 = 100 - np.mean(mape_3)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_3, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_3, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_3, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_3, 2), 'MW')
print('Accuracy:', round(accuracy_3, 2), '%.')

Target Variable Max Value: 166.8 MW
Max Absolute Error: 36.71 MW
Mean Target Variable Value: 77.4 MW
Mean Absolute Error: 3.63 MW
Accuracy: 95.41 %.


In [35]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(predictor_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: UES.Sea.DayLoad      Importance: 0.81
Variable: NH.RT.MWh            Importance: 0.03
Variable: Day                  Importance: 0.02
Variable: Numeric_Time         Importance: 0.02
Variable: TMP                  Importance: 0.02
Variable: ISO.tot.dmd          Importance: 0.02
Variable: LMP.RT.NH            Importance: 0.02
Variable: Month                Importance: 0.01
Variable: WSP                  Importance: 0.01
Variable: WDR                  Importance: 0.01
Variable: UES.Cap.DayLoad      Importance: 0.01
Variable: Wk.Day_Fri           Importance: 0.01
Variable: Wk.Day_Sun           Importance: 0.01
Variable: Year                 Importance: 0.0
Variable: Wk.Day_Mon           Importance: 0.0
Variable: Wk.Day_Sat           Importance: 0.0
Variable: Wk.Day_Thu           Importance: 0.0
Variable: Wk.Day_Tue           Importance: 0.0
Variable: Wk.Day_Wed           Importance: 0.0


In [39]:
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 77,
 'verbose': 0,
 'warm_start': False}


In [41]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [42]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=77, n_jobs = -1)

# Fit the random search model
rf_random.fit(train_predictors, train_target)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 77.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 163.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=77, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [43]:
# View the best parameters after fitting the random search
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [47]:
# Create function for evaluating model performance
def evaluate(model, test_predictors, test_target):
    predictions = model.predict(test_predictors)
    errors = abs(predictions - test_target)
    mape = 100 * np.mean(errors / test_target)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} MW.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [46]:
# Compare base model with results of randomized search to see if there is an improvement
base_model = RandomForestRegressor(n_estimators = 300, random_state = 77)
base_model.fit(train_predictors, train_target)
base_accuracy = evaluate(base_model, test_predictors, test_target)


best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_predictors, test_target)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 3.6298 degrees.
Accuracy = 95.41%.
Model Performance
Average Error: 3.2599 degrees.
Accuracy = 95.86%.
Improvement of 0.48%.


In [53]:
### See if performance can be improved further by using GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [70, 80, 90, 100],
    'max_features': [2, 3, 5],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 4],
    'n_estimators': [100, 300, 600, 1000]
}
# Create a base model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [54]:
# Fit the grid search to the data and view the best parameters
grid_search.fit(train_predictors, train_target)
grid_search.best_params_

Fitting 3 folds for each of 576 candidates, totalling 1728 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 48.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 77.9min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 129.1min
[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed: 162.8min finished


{'bootstrap': False,
 'max_depth': 80,
 'max_features': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [55]:
# Use best estimator from GridSearchCV
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_predictors, test_target)

# Re-run base grid case to be able to compare accuracies
base_model = RandomForestRegressor(n_estimators = 300, random_state = 77)
base_model.fit(train_predictors, train_target)
base_accuracy = evaluate(base_model, test_predictors, test_target)

print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 3.1748 degrees.
Accuracy = 95.97%.
Model Performance
Average Error: 3.6298 degrees.
Accuracy = 95.41%.
Improvement of 0.58%.


##### KNN

In [36]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [37]:
# Calculate and display errors for training data using mean squared error (MSE), 
# mean absolute error (MAE), and root mean squared errot (RMSE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  15.485645117172568  & mae =  2.7132255983404643  & rmse =  3.9351804427716615


In [38]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  26.110479368197904  & mae =  3.5914830649552707  & rmse =  5.109841422999143


In [39]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_4 = max(test_target)
test_target_mean_4 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_4 = max(errors)
mean_absolute_error_4 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_4 = 100 * (errors / test_target)
accuracy_4 = 100 - np.mean(mape_4)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_4, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_4, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_4, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_4, 2), 'MW')
print('Accuracy:', round(accuracy_4, 2), '%.')

Target Variable Max Value: 166.8 MW
Max Absolute Error: 34.85 MW
Mean Target Variable Value: 77.4 MW
Mean Absolute Error: 3.59 MW
Accuracy: 95.28 %.
