### PCA - Customer Metering Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [2]:
# Read in data
data = pd.read_csv("Metering data for PCA.csv")

In [3]:
# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,Numeric_Time,cust_02,cust_03,cust_04,cust_06,cust_07,cust_08,...,cust_12,cust_13,cust_15,cust_16,cust_19,cust_20,cust_21,cust_23,cust_25,cust_26
0,5,17,2018,1.0,0.04,0.03,0.03,0.08,0.07,0.08,...,0.06,0.01,0.01,0.0,0.01,0.03,0.23,0.01,0.03,0.12
1,5,17,2018,1.25,0.08,0.04,0.02,0.09,0.07,0.06,...,0.07,0.04,0.0,0.0,0.02,0.04,0.25,0.01,0.03,0.1
2,5,17,2018,1.5,0.08,0.06,0.04,0.08,0.07,0.06,...,0.05,0.04,0.02,0.0,0.04,0.03,0.06,0.02,0.03,0.1
3,5,17,2018,1.75,0.07,0.07,0.02,0.11,0.06,0.05,...,0.05,0.04,0.03,0.01,0.05,0.02,0.05,0.01,0.02,0.13
4,5,17,2018,2.0,0.07,0.06,0.02,0.12,0.06,0.06,...,0.05,0.01,0.0,0.0,0.02,0.02,0.05,0.02,0.01,0.13


In [4]:
# Check dataframe data types
data.dtypes

Month             int64
Day               int64
Year              int64
Numeric_Time    float64
cust_02         float64
cust_03         float64
cust_04         float64
cust_06         float64
cust_07         float64
cust_08         float64
cust_09         float64
cust_10         float64
cust_11         float64
cust_12         float64
cust_13         float64
cust_15         float64
cust_16         float64
cust_19         float64
cust_20         float64
cust_21         float64
cust_23         float64
cust_25         float64
cust_26         float64
dtype: object

In [5]:
# Summary Stats
data.describe()

Unnamed: 0,Month,Day,Year,Numeric_Time,cust_02,cust_03,cust_04,cust_06,cust_07,cust_08,...,cust_12,cust_13,cust_15,cust_16,cust_19,cust_20,cust_21,cust_23,cust_25,cust_26
count,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,...,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0,34972.0
mean,6.52279,15.719547,2018.373013,11.879697,0.189792,0.090279,0.063588,0.201591,0.094872,0.100635,...,0.090505,0.091943,0.063247,0.063706,0.036963,0.076414,0.069627,0.050945,0.035759,0.098943
std,3.447548,8.793679,0.483612,6.93292,0.266298,0.102451,0.090697,0.139132,0.086829,0.091874,...,0.106706,0.152801,0.084406,0.087769,0.059398,0.117868,0.101351,0.066291,0.049171,0.078752
min,1.0,1.0,2018.0,0.0,0.01,0.0,0.01,0.03,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01
25%,4.0,8.0,2018.0,5.75,0.07,0.03,0.02,0.12,0.05,0.05,...,0.04,0.02,0.02,0.02,0.01,0.02,0.02,0.01,0.02,0.05
50%,7.0,16.0,2018.0,12.0,0.1,0.06,0.03,0.15,0.07,0.08,...,0.06,0.03,0.04,0.04,0.02,0.04,0.04,0.03,0.03,0.09
75%,10.0,23.0,2019.0,18.0,0.14,0.11,0.06,0.23,0.11,0.11,...,0.1,0.07,0.07,0.07,0.04,0.07,0.07,0.06,0.04,0.12
max,12.0,31.0,2019.0,23.75,2.57,1.65,1.24,1.6,1.08,1.25,...,1.57,1.75,0.92,1.51,1.28,1.32,1.22,0.95,1.14,1.07


In [6]:
# Shape of dataframe
data.shape

(34972, 23)

In [7]:
### Isolate data from customers (i.e. drop timestamping data)

# Columns to drop
labels = ['Month', 'Day', 'Year', 'Numeric_Time']

# Drop columns
x = data.drop(labels=labels, axis=1)

# New shape of data
x.shape

(34972, 19)

In [8]:
# First few rows of new dataframe
x.head()

Unnamed: 0,cust_02,cust_03,cust_04,cust_06,cust_07,cust_08,cust_09,cust_10,cust_11,cust_12,cust_13,cust_15,cust_16,cust_19,cust_20,cust_21,cust_23,cust_25,cust_26
0,0.04,0.03,0.03,0.08,0.07,0.08,0.04,0.02,0.03,0.06,0.01,0.01,0.0,0.01,0.03,0.23,0.01,0.03,0.12
1,0.08,0.04,0.02,0.09,0.07,0.06,0.06,0.03,0.03,0.07,0.04,0.0,0.0,0.02,0.04,0.25,0.01,0.03,0.1
2,0.08,0.06,0.04,0.08,0.07,0.06,0.05,0.03,0.04,0.05,0.04,0.02,0.0,0.04,0.03,0.06,0.02,0.03,0.1
3,0.07,0.07,0.02,0.11,0.06,0.05,0.04,0.05,0.03,0.05,0.04,0.03,0.01,0.05,0.02,0.05,0.01,0.02,0.13
4,0.07,0.06,0.02,0.12,0.06,0.06,0.05,0.05,0.04,0.05,0.01,0.0,0.0,0.02,0.02,0.05,0.02,0.01,0.13


In [9]:
### Scale dataframe of customer values using StandardScaler

# Note that x_scaled is an array, not a dataframe
x_scaled = StandardScaler().fit_transform(x)

# Visualize the scaled data using a dataframe
df = pd.DataFrame(x_scaled)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-0.562508,-0.58838,-0.37034,-0.873938,-0.286452,-0.224602,-0.879899,-0.676815,-0.729111,-0.285883,-0.536279,-0.630858,-0.725849,-0.453946,-0.393787,1.582373,-0.617673,-0.11712,0.267393
1,-0.412298,-0.490771,-0.480599,-0.802063,-0.286452,-0.442295,-0.699531,-0.549049,-0.729111,-0.192166,-0.339942,-0.749335,-0.725849,-0.285587,-0.308945,1.779709,-0.617673,-0.11712,0.013427
2,-0.412298,-0.295553,-0.260081,-0.873938,-0.286452,-0.442295,-0.789715,-0.549049,-0.658119,-0.3796,-0.339942,-0.51238,-0.725849,0.051131,-0.393787,-0.094986,-0.46682,-0.11712,0.013427
3,-0.44985,-0.197945,-0.480599,-0.658312,-0.401622,-0.551142,-0.879899,-0.293517,-0.729111,-0.3796,-0.339942,-0.393903,-0.611912,0.219489,-0.478628,-0.193654,-0.617673,-0.320493,0.394376
4,-0.44985,-0.295553,-0.480599,-0.586437,-0.401622,-0.442295,-0.789715,-0.293517,-0.658119,-0.3796,-0.536279,-0.749335,-0.725849,-0.285587,-0.478628,-0.193654,-0.46682,-0.523866,0.394376


In [10]:
# PCA - Specify number of components
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(x_scaled)
principalDF = pd.DataFrame(data = principalComponents,
                           columns = ['principal component 1',
                                      'principal component 2',
                                      'principal component 3',
                                      'principal component 4',
                                      'principal component 5',])
principalDF.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5
0,-1.656775,-0.386175,0.547707,-0.596091,0.315683
1,-1.514116,-0.554265,0.558745,-0.576857,0.544962
2,-1.843253,0.09798,0.224306,-0.22074,0.250541
3,-1.79466,0.245853,0.025536,-0.193884,0.339577
4,-1.87829,-0.017474,-0.067363,-0.307591,-0.100106


In [11]:
# How much variance does above PCA describe?
pca.explained_variance_ratio_

array([0.19596306, 0.06757914, 0.06078037, 0.05793728, 0.05031275])

In [12]:
# PCA - Specify the desired amount of explained variance
pca2 = PCA(n_components=0.95)
principalComponents = pca2.fit_transform(x_scaled)

In [13]:
# How many components are needed to explain 95% of the variance?
# Recall that dataset included metering data from 19 customers
pca2.n_components_

18

### PCA for ISO-NE service territory data

In [14]:
# Read in data
iso_data = pd.read_csv("ISO.aggregate.csv")

In [15]:
# See first few rows of data
iso_data.head()

Unnamed: 0,Settlement.Date,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,01/01/2010,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,01/01/2010,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,01/01/2010,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,01/01/2010,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,01/01/2010,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [16]:
# Check dataframe data types
iso_data.dtypes

Settlement.Date     object
Month                int64
Day                  int64
Year                 int64
TI                   int64
ISO.tot.dmd        float64
LMP.DA.NH          float64
LMP.RT.NH          float64
UES.Cap            float64
UES.Sea            float64
NH.RT.MWh          float64
ME.RT.MWh          float64
VT.RT.MWh          float64
CT.RT.MWh          float64
RI.RT.MWh          float64
MA.SE.RT.MWh       float64
MA.WC.RT.MWh       float64
MA.NE.RT.MWh       float64
Wk.Day              object
dtype: object

In [17]:
# Summary Stats
iso_data.describe()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh
count,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0,78888.0
mean,6.524186,15.727829,2014.0,12.5,14580.323975,44.314016,43.558842,49.921363,76.881151,1319.439378,1306.278982,644.162969,3482.567405,928.714068,1689.279528,1962.351245,2885.513858
std,3.448757,8.800448,2.581691,6.92223,2701.159746,34.641323,41.97141,16.058125,18.111757,265.28396,205.392568,107.796148,771.736129,206.067112,389.917777,378.862822,570.326415
min,1.0,1.0,2010.0,1.0,8665.0,-5.0,-155.11,5.395,6.912,523.2,681.8,302.3,1367.4,371.8,708.7,736.9,1729.7
25%,4.0,8.0,2012.0,6.75,12636.0,26.56,24.2,37.74075,64.2835,1121.2,1143.5,558.9,2937.9,784.1,1413.3,1683.0,2473.4
50%,7.0,16.0,2014.0,12.5,14395.92,36.18,34.46,49.494,76.0395,1333.1,1329.65,648.8,3444.25,916.5,1659.35,1951.6,2869.3
75%,10.0,23.0,2016.0,18.25,16121.0,48.67,47.75,60.21025,86.3,1485.4,1450.2,721.5,3915.3,1032.2,1887.325,2199.0,3210.7
max,12.0,31.0,2018.0,24.0,27762.0,785.0,2493.15,118.547,168.43,2433.3,2034.1,1002.5,7218.8,4314.3,3644.7,3652.3,5657.9


In [18]:
# Shape of dataframe
iso_data.shape

(78888, 19)

In [19]:
### Drop timestamping data

# Columns to drop
labels = ['Settlement.Date', 'Month', 'Day', 'Year', 'TI', 'Wk.Day']

# Drop columns
y = iso_data.drop(labels=labels, axis=1)

# New shape of data
y.shape

(78888, 13)

In [20]:
# First few rows of new dataframe
y.head()

Unnamed: 0,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh
0,13797.0,63.04,63.33,30.142,67.858,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9
1,13560.0,57.61,69.74,27.37,63.276,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0
2,13121.0,53.74,55.39,25.573,59.899,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2
3,13134.0,42.09,60.49,24.728,58.275,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1
4,13144.0,47.95,53.43,24.974,58.516,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2


In [21]:
### Scale dataframe using StandardScaler

# Note that y_scaled is an array, not a dataframe
y_scaled = StandardScaler().fit_transform(y)

# Visualize the scaled data using a dataframe
df = pd.DataFrame(y_scaled)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.289997,0.540571,0.471065,-1.231743,-0.498196,-0.570861,-0.908408,-0.034908,-0.357336,-0.501364,-0.490054,-0.49636,-0.344741
1,-0.377738,0.383821,0.623789,-1.404367,-0.751183,-0.858102,-1.115817,-0.359597,-0.584358,-0.712462,-0.730875,-0.753711,-0.554973
2,-0.540262,0.272104,0.281888,-1.516274,-0.937637,-1.029241,-1.271617,-0.605433,-0.751126,-0.868723,-0.895783,-0.898883,-0.705414
3,-0.535449,-0.064202,0.4034,-1.568896,-1.027303,-1.123103,-1.342701,-0.809524,-0.848569,-0.965779,-0.982725,-0.986778,-0.802026
4,-0.531747,0.104962,0.235189,-1.553576,-1.013997,-1.136297,-1.260419,-0.839209,-0.85764,-0.964323,-0.977596,-0.991529,-0.794837


In [22]:
# PCA - Specify number of components
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(y_scaled)
principalDF = pd.DataFrame(data = principalComponents,
                           columns = ['principal component 1',
                                      'principal component 2',
                                      'principal component 3',
                                      'principal component 4',
                                      'principal component 5',])
principalDF.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5
0,-1.526112,1.148991,0.024611,-0.203553,-0.803992
1,-2.255963,1.287369,0.238645,-0.066054,-0.773
2,-2.856995,1.078013,0.209978,-0.229546,-0.714868
3,-3.175825,0.978905,0.416209,0.032693,-0.704791
4,-3.152832,0.967806,0.312375,-0.173453,-0.616519


In [23]:
pca.explained_variance_ratio_

array([0.81915056, 0.1053449 , 0.02546786, 0.01686609, 0.01358903])

In [24]:
# PCA - Specify the desired amount of explained variance
pca2 = PCA(n_components=0.95)
principalComponents = pca2.fit_transform(y_scaled)

In [25]:
# How many components are needed to explain 95% of the variance?
# Recall that ISO-NE dataset had 13 features
pca2.n_components_

4

### Using PCA to Predict UES Load - Can Accuracy be Maintained?  Can Model Training be Sped Up?
##### KNN used with PCA to predict UES load 1 to 7 days in advance

### Predict UES load - 1 Day Ahead (KNN & PCA)

In [26]:
# Read in data
data = pd.read_csv("ISO.aggregate - 1 day SHIFTED.csv")

# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,UES.Sea.NEXT.DAY.LOAD,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,61.904,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,58.754,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,57.32,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,57.068,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,57.858,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [27]:
# Shape of dataframe
data.shape

(78864, 19)

In [28]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                      int64
Day                        int64
Year                       int64
TI                         int64
ISO.tot.dmd              float64
LMP.DA.NH                float64
LMP.RT.NH                float64
UES.Cap                  float64
UES.Sea                  float64
UES.Sea.NEXT.DAY.LOAD    float64
NH.RT.MWh                float64
ME.RT.MWh                float64
VT.RT.MWh                float64
CT.RT.MWh                float64
RI.RT.MWh                float64
MA.SE.RT.MWh             float64
MA.WC.RT.MWh             float64
MA.NE.RT.MWh             float64
Wk.Day_Fri                 uint8
Wk.Day_Mon                 uint8
Wk.Day_Sat                 uint8
Wk.Day_Sun                 uint8
Wk.Day_Thu                 uint8
Wk.Day_Tue                 uint8
Wk.Day_Wed                 uint8
dtype: object

In [29]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.NEXT.DAY.LOAD'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.NEXT.DAY.LOAD', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [30]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [31]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (59148, 24)
Training Target Shape: (59148,)
Testing Predictors Shape: (19716, 24)
Testing Target Shape: (19716,)


In [32]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

In [33]:
# Make an instance of the Model
pca = PCA(.95)

pca.fit(train_pred_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [34]:
# How many components needed to explain 95% of the variance?
pca.n_components_

12

In [35]:
# Apply PCA transform to training and test set
train_pred_scaled = pca.transform(train_pred_scaled)
test_pred_scaled = pca.transform(test_pred_scaled)

In [36]:
# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [37]:
# Calculate errors for training data using mean squared error (MSE) and mean absolute error (MAE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))

# Display training MSE, MAE, & RMSE
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  17.586731881175357  & mae =  2.6333723879082975  & rmse =  4.1936537626722785


In [38]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  28.583893869868128  & mae =  3.4618506593629546  & rmse =  5.3463907329962455


In [39]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_1 = max(test_target)
test_target_mean_1 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_1 = max(errors)
mean_absolute_error_1 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_1 = 100 * (errors / test_target)
accuracy_1 = 100 - np.mean(mape_1)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_1, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_1, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_1, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_1, 2), 'MW')
print('Accuracy:', round(accuracy_1, 2), '%.')

Target Variable Max Value: 166.65 MW
Max Absolute Error: 47.35 MW
Mean Target Variable Value: 77.05 MW
Mean Absolute Error: 3.46 MW
Accuracy: 95.61 %.


In [40]:
# Grid search for KNN to see if performance can be improved
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
     KNeighborsRegressor(),
     grid_params,
     verbose = 1,
     cv = 3,
     n_jobs = -1
)

gs_knn = gs.fit(train_pred_scaled, train_target)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.5min finished


In [41]:
gs_knn.best_score_

0.9268481339917845

In [42]:
gs_knn.best_estimator_

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='distance')

In [43]:
gs_knn.best_params_

{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

In [44]:
# See if the grid search improved the accuracy of KNN
best_grid = gs_knn.best_estimator_

# Use predict method on the test data
predictions = best_grid.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_GS_1 = max(test_target)
test_target_mean_GS_1 = np.mean(test_target)

# Calculate the absolute errors
errors = abs(predictions - test_target)
max_absolute_error_GS_1 = max(errors)
mean_absolute_error_GS_1 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_GS_1 = 100 * (errors / test_target)
accuracy_GS_1 = 100 - np.mean(mape_GS_1)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_GS_1, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_1, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_1, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_1, 2), 'MW')
print('Accuracy:', round(accuracy_GS_1, 2), '%.')

Target Variable Max Value: 166.65 MW
Max Absolute Error: 39.25 MW
Mean Target Variable Value: 77.05 MW
Mean Absolute Error: 2.54 MW
Accuracy: 96.74 %.


### Predict UES load - 2 Days Ahead (KNN & PCA)

In [45]:
# Read in data
data = pd.read_csv("ISO.aggregate - 2 day SHIFTED.csv")

# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,UES.Sea.2.DAYS.AHEAD,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,67.512,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,64.69,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,63.511,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,63.021,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,63.48,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [46]:
# Shape of dataframe
data.shape

(78840, 19)

In [47]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                     int64
Day                       int64
Year                      int64
TI                        int64
ISO.tot.dmd             float64
LMP.DA.NH               float64
LMP.RT.NH               float64
UES.Cap                 float64
UES.Sea                 float64
UES.Sea.2.DAYS.AHEAD    float64
NH.RT.MWh               float64
ME.RT.MWh               float64
VT.RT.MWh               float64
CT.RT.MWh               float64
RI.RT.MWh               float64
MA.SE.RT.MWh            float64
MA.WC.RT.MWh            float64
MA.NE.RT.MWh            float64
Wk.Day_Fri                uint8
Wk.Day_Mon                uint8
Wk.Day_Sat                uint8
Wk.Day_Sun                uint8
Wk.Day_Thu                uint8
Wk.Day_Tue                uint8
Wk.Day_Wed                uint8
dtype: object

In [48]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.2.DAYS.AHEAD'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.2.DAYS.AHEAD', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [49]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [50]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (59130, 24)
Training Target Shape: (59130,)
Testing Predictors Shape: (19710, 24)
Testing Target Shape: (19710,)


In [51]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Make an instance of the Model
pca = PCA(.95)

# Fit model to data
pca.fit(train_pred_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [52]:
# How many components needed to explain the 95% of the variance?
pca.n_components_

12

In [53]:
# Apply PCA transform to training and test set
train_pred_scaled = pca.transform(train_pred_scaled)
test_pred_scaled = pca.transform(test_pred_scaled)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [54]:
# Calculate errors for training data using mean squared error (MSE) and mean absolute error (MAE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))

# Display training MSE, MAE, & RMSE
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  28.20027254108642  & mae =  3.358518853373922  & rmse =  5.310392880106558


In [55]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  45.093777117789955  & mae =  4.322612146118722  & rmse =  6.715190028419892


In [56]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_2 = max(test_target)
test_target_mean_2 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_2 = max(errors)
mean_absolute_error_2 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_2 = 100 * (errors / test_target)
accuracy_2 = 100 - np.mean(mape_2)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_2, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_2, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_2, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_2, 2), 'MW')
print('Accuracy:', round(accuracy_2, 2), '%.')

Target Variable Max Value: 168.43 MW
Max Absolute Error: 64.56 MW
Mean Target Variable Value: 76.92 MW
Mean Absolute Error: 4.32 MW
Accuracy: 94.48 %.


In [57]:
# Grid search for KNN to see if performance can be improved
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
     KNeighborsRegressor(),
     grid_params,
     verbose = 1,
     cv = 3,
     n_jobs = -1
)

gs_knn = gs.fit(train_pred_scaled, train_target)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.2min finished


In [58]:
print('GRID SEARCH BEST SCORE:')
print(gs_knn.best_score_)
print('')

print('GRID SEARCH BEST ESTIMATOR:')
print(gs_knn.best_estimator_)
print('')

print('GRID SEARCH BEST PARAMETERS:')
print(gs_knn.best_params_)

GRID SEARCH BEST SCORE:
0.8841638292806817

GRID SEARCH BEST ESTIMATOR:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='distance')

GRID SEARCH BEST PARAMETERS:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [59]:
# See if the grid search improved the accuracy of KNN
best_grid = gs_knn.best_estimator_

# Use predict method on the test data
predictions = best_grid.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_GS_2 = max(test_target)
test_target_mean_GS_2 = np.mean(test_target)

# Calculate the absolute errors
errors = abs(predictions - test_target)
max_absolute_error_GS_2 = max(errors)
mean_absolute_error_GS_2 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_GS_2 = 100 * (errors / test_target)
accuracy_GS_2 = 100 - np.mean(mape_GS_2)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_GS_2, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_2, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_2, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_2, 2), 'MW')
print('Accuracy:', round(accuracy_GS_2, 2), '%.')

Target Variable Max Value: 168.43 MW
Max Absolute Error: 66.78 MW
Mean Target Variable Value: 76.92 MW
Mean Absolute Error: 3.1 MW
Accuracy: 95.97 %.


### Predict UES load - 3 Days Ahead (KNN & PCA)

In [60]:
# Read in data
data = pd.read_csv("ISO.aggregate - 3 day SHIFTED.csv")

# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,UES.Sea.3.DAYS.AHEAD,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,61.488,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,59.038,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,57.848,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,58.058,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,61.047,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [61]:
# Shape of dataframe
data.shape

(78816, 19)

In [62]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                     int64
Day                       int64
Year                      int64
TI                        int64
ISO.tot.dmd             float64
LMP.DA.NH               float64
LMP.RT.NH               float64
UES.Cap                 float64
UES.Sea                 float64
UES.Sea.3.DAYS.AHEAD    float64
NH.RT.MWh               float64
ME.RT.MWh               float64
VT.RT.MWh               float64
CT.RT.MWh               float64
RI.RT.MWh               float64
MA.SE.RT.MWh            float64
MA.WC.RT.MWh            float64
MA.NE.RT.MWh            float64
Wk.Day_Fri                uint8
Wk.Day_Mon                uint8
Wk.Day_Sat                uint8
Wk.Day_Sun                uint8
Wk.Day_Thu                uint8
Wk.Day_Tue                uint8
Wk.Day_Wed                uint8
dtype: object

In [63]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.3.DAYS.AHEAD'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.3.DAYS.AHEAD', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [64]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [65]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (59112, 24)
Training Target Shape: (59112,)
Testing Predictors Shape: (19704, 24)
Testing Target Shape: (19704,)


In [66]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Make an instance of the Model
pca = PCA(.95)

# Fit model to data
pca.fit(train_pred_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [67]:
# How many components needed to explain the 95% of the variance?
pca.n_components_

12

In [68]:
# Apply PCA transform to training and test set
train_pred_scaled = pca.transform(train_pred_scaled)
test_pred_scaled = pca.transform(test_pred_scaled)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [69]:
# Calculate errors for training data using mean squared error (MSE) and mean absolute error (MAE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))

# Display training MSE, MAE, & RMSE
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  33.16453798141358  & mae =  3.605440482473947  & rmse =  5.758866032598221


In [70]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  53.62434680920625  & mae =  4.727254029638652  & rmse =  7.322864658670557


In [71]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_3 = max(test_target)
test_target_mean_3 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_3 = max(errors)
mean_absolute_error_3 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_3 = 100 * (errors / test_target)
accuracy_3 = 100 - np.mean(mape_3)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_3, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_3, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_3, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_3, 2), 'MW')
print('Accuracy:', round(accuracy_3, 2), '%.')

Target Variable Max Value: 168.42 MW
Max Absolute Error: 57.36 MW
Mean Target Variable Value: 77.07 MW
Mean Absolute Error: 4.73 MW
Accuracy: 93.96 %.


In [72]:
# Grid search for KNN to see if performance can be improved
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
     KNeighborsRegressor(),
     grid_params,
     verbose = 1,
     cv = 3,
     n_jobs = -1
)

gs_knn = gs.fit(train_pred_scaled, train_target)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.2min finished


In [73]:
print('GRID SEARCH BEST SCORE:')
print(gs_knn.best_score_)
print('')

print('GRID SEARCH BEST ESTIMATOR:')
print(gs_knn.best_estimator_)
print('')

print('GRID SEARCH BEST PARAMETERS:')
print(gs_knn.best_params_)

GRID SEARCH BEST SCORE:
0.8646454298949753

GRID SEARCH BEST ESTIMATOR:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='distance')

GRID SEARCH BEST PARAMETERS:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [74]:
# See if the grid search improved the accuracy of KNN
best_grid = gs_knn.best_estimator_

# Use predict method on the test data
predictions = best_grid.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_GS_3 = max(test_target)
test_target_mean_GS_3 = np.mean(test_target)

# Calculate the absolute errors
errors = abs(predictions - test_target)
max_absolute_error_GS_3 = max(errors)
mean_absolute_error_GS_3 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_GS_3 = 100 * (errors / test_target)
accuracy_GS_3 = 100 - np.mean(mape_GS_3)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_GS_3, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_3, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_3, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_3, 2), 'MW')
print('Accuracy:', round(accuracy_GS_3, 2), '%.')

Target Variable Max Value: 168.42 MW
Max Absolute Error: 62.8 MW
Mean Target Variable Value: 77.07 MW
Mean Absolute Error: 3.35 MW
Accuracy: 95.66 %.


### Predict UES load - 4 Days Ahead (KNN & PCA)

In [75]:
# Read in data
data = pd.read_csv("ISO.aggregate - 4 day SHIFTED.csv")

# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,UES.Sea.4.DAYS.AHEAD,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,61.94,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,59.924,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,58.818,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,58.968,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,61.194,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [76]:
# Shape of dataframe
data.shape

(78792, 19)

In [77]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                     int64
Day                       int64
Year                      int64
TI                        int64
ISO.tot.dmd             float64
LMP.DA.NH               float64
LMP.RT.NH               float64
UES.Cap                 float64
UES.Sea                 float64
UES.Sea.4.DAYS.AHEAD    float64
NH.RT.MWh               float64
ME.RT.MWh               float64
VT.RT.MWh               float64
CT.RT.MWh               float64
RI.RT.MWh               float64
MA.SE.RT.MWh            float64
MA.WC.RT.MWh            float64
MA.NE.RT.MWh            float64
Wk.Day_Fri                uint8
Wk.Day_Mon                uint8
Wk.Day_Sat                uint8
Wk.Day_Sun                uint8
Wk.Day_Thu                uint8
Wk.Day_Tue                uint8
Wk.Day_Wed                uint8
dtype: object

In [78]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.4.DAYS.AHEAD'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.4.DAYS.AHEAD', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [79]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [80]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (59094, 24)
Training Target Shape: (59094,)
Testing Predictors Shape: (19698, 24)
Testing Target Shape: (19698,)


In [81]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Make an instance of the Model
pca = PCA(.95)

# Fit model to data
pca.fit(train_pred_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [82]:
# How many components needed to explain the 95% of the variance?
pca.n_components_

12

In [83]:
# Apply PCA transform to training and test set
train_pred_scaled = pca.transform(train_pred_scaled)
test_pred_scaled = pca.transform(test_pred_scaled)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [84]:
# Calculate errors for training data using mean squared error (MSE) and mean absolute error (MAE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))

# Display training MSE, MAE, & RMSE
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  34.29490408243815  & mae =  3.6945032862896405  & rmse =  5.856185113402594


In [85]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  55.33583674740583  & mae =  4.842033688699361  & rmse =  7.438806137237737


In [86]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_4 = max(test_target)
test_target_mean_4 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_4 = max(errors)
mean_absolute_error_4 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_4 = 100 * (errors / test_target)
accuracy_4 = 100 - np.mean(mape_4)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_4, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_4, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_4, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_4, 2), 'MW')
print('Accuracy:', round(accuracy_4, 2), '%.')

Target Variable Max Value: 168.43 MW
Max Absolute Error: 64.92 MW
Mean Target Variable Value: 77.13 MW
Mean Absolute Error: 4.84 MW
Accuracy: 93.79 %.


In [87]:
# Grid search for KNN to see if performance can be improved
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
     KNeighborsRegressor(),
     grid_params,
     verbose = 1,
     cv = 3,
     n_jobs = -1
)

gs_knn = gs.fit(train_pred_scaled, train_target)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.3min finished


In [88]:
print('GRID SEARCH BEST SCORE:')
print(gs_knn.best_score_)
print('')

print('GRID SEARCH BEST ESTIMATOR:')
print(gs_knn.best_estimator_)
print('')

print('GRID SEARCH BEST PARAMETERS:')
print(gs_knn.best_params_)

GRID SEARCH BEST SCORE:
0.8609302675813164

GRID SEARCH BEST ESTIMATOR:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='distance')

GRID SEARCH BEST PARAMETERS:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [89]:
# See if the grid search improved the accuracy of KNN
best_grid = gs_knn.best_estimator_

# Use predict method on the test data
predictions = best_grid.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_GS_4 = max(test_target)
test_target_mean_GS_4 = np.mean(test_target)

# Calculate the absolute errors
errors = abs(predictions - test_target)
max_absolute_error_GS_4 = max(errors)
mean_absolute_error_GS_4 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_GS_4 = 100 * (errors / test_target)
accuracy_GS_4 = 100 - np.mean(mape_GS_4)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_GS_4, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_4, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_4, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_4, 2), 'MW')
print('Accuracy:', round(accuracy_GS_4, 2), '%.')

Target Variable Max Value: 168.43 MW
Max Absolute Error: 56.06 MW
Mean Target Variable Value: 77.13 MW
Mean Absolute Error: 3.4 MW
Accuracy: 95.57 %.


### Predict UES load - 5 Days Ahead (KNN & PCA)

In [90]:
# Read in data
data = pd.read_csv("ISO.aggregate - 5 day SHIFTED.csv")

# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,UES.Sea.5.DAYS.AHEAD,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,61.512,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,59.122,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,58.31,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,58.611,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,60.988,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [91]:
# Shape of dataframe
data.shape

(78768, 19)

In [92]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                     int64
Day                       int64
Year                      int64
TI                        int64
ISO.tot.dmd             float64
LMP.DA.NH               float64
LMP.RT.NH               float64
UES.Cap                 float64
UES.Sea                 float64
UES.Sea.5.DAYS.AHEAD    float64
NH.RT.MWh               float64
ME.RT.MWh               float64
VT.RT.MWh               float64
CT.RT.MWh               float64
RI.RT.MWh               float64
MA.SE.RT.MWh            float64
MA.WC.RT.MWh            float64
MA.NE.RT.MWh            float64
Wk.Day_Fri                uint8
Wk.Day_Mon                uint8
Wk.Day_Sat                uint8
Wk.Day_Sun                uint8
Wk.Day_Thu                uint8
Wk.Day_Tue                uint8
Wk.Day_Wed                uint8
dtype: object

In [93]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.5.DAYS.AHEAD'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.5.DAYS.AHEAD', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [94]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [95]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (59076, 24)
Training Target Shape: (59076,)
Testing Predictors Shape: (19692, 24)
Testing Target Shape: (19692,)


In [96]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Make an instance of the Model
pca = PCA(.95)

# Fit model to data
pca.fit(train_pred_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [97]:
# How many components needed to explain 95% of the variance?
pca.n_components_

12

In [98]:
# Apply PCA transform to training and test set
train_pred_scaled = pca.transform(train_pred_scaled)
test_pred_scaled = pca.transform(test_pred_scaled)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [99]:
# Calculate errors for training data using mean squared error (MSE) and mean absolute error (MAE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))

# Display training MSE, MAE, & RMSE
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  34.63746432830591  & mae =  3.722092270972984  & rmse =  5.885360169803197


In [100]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  56.513218714462724  & mae =  4.895654966483852  & rmse =  7.517527433569014


In [101]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_5 = max(test_target)
test_target_mean_5 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_5 = max(errors)
mean_absolute_error_5 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_5 = 100 * (errors / test_target)
accuracy_5 = 100 - np.mean(mape_5)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_5, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_5, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_5, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_5, 2), 'MW')
print('Accuracy:', round(accuracy_5, 2), '%.')

Target Variable Max Value: 165.69 MW
Max Absolute Error: 59.0 MW
Mean Target Variable Value: 77.08 MW
Mean Absolute Error: 4.9 MW
Accuracy: 93.7 %.


In [102]:
# Grid search for KNN to see if performance can be improved
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
     KNeighborsRegressor(),
     grid_params,
     verbose = 1,
     cv = 3,
     n_jobs = -1
)

gs_knn = gs.fit(train_pred_scaled, train_target)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.3min finished


In [103]:
print('GRID SEARCH BEST SCORE:')
print(gs_knn.best_score_)
print('')

print('GRID SEARCH BEST ESTIMATOR:')
print(gs_knn.best_estimator_)
print('')

print('GRID SEARCH BEST PARAMETERS:')
print(gs_knn.best_params_)

GRID SEARCH BEST SCORE:
0.8593789429419881

GRID SEARCH BEST ESTIMATOR:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='distance')

GRID SEARCH BEST PARAMETERS:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [104]:
# See if the grid search improved the accuracy of KNN
best_grid = gs_knn.best_estimator_

# Use predict method on the test data
predictions = best_grid.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_GS_5 = max(test_target)
test_target_mean_GS_5 = np.mean(test_target)

# Calculate the absolute errors
errors = abs(predictions - test_target)
max_absolute_error_GS_5 = max(errors)
mean_absolute_error_GS_5 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_GS_5 = 100 * (errors / test_target)
accuracy_GS_5 = 100 - np.mean(mape_GS_5)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_GS_5, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_5, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_5, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_5, 2), 'MW')
print('Accuracy:', round(accuracy_GS_5, 2), '%.')

Target Variable Max Value: 165.69 MW
Max Absolute Error: 64.86 MW
Mean Target Variable Value: 77.08 MW
Mean Absolute Error: 3.41 MW
Accuracy: 95.55 %.


### Predict UES load - 6 Days Ahead (KNN & PCA)

In [105]:
# Read in data
data = pd.read_csv("ISO.aggregate - 6 day SHIFTED.csv")

# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,UES.Sea.6.DAYS.AHEAD,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,62.741,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,60.452,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,59.588,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,59.829,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,62.058,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [106]:
# Shape of Dataframe
data.shape

(78744, 19)

In [107]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                     int64
Day                       int64
Year                      int64
TI                        int64
ISO.tot.dmd             float64
LMP.DA.NH               float64
LMP.RT.NH               float64
UES.Cap                 float64
UES.Sea                 float64
UES.Sea.6.DAYS.AHEAD    float64
NH.RT.MWh               float64
ME.RT.MWh               float64
VT.RT.MWh               float64
CT.RT.MWh               float64
RI.RT.MWh               float64
MA.SE.RT.MWh            float64
MA.WC.RT.MWh            float64
MA.NE.RT.MWh            float64
Wk.Day_Fri                uint8
Wk.Day_Mon                uint8
Wk.Day_Sat                uint8
Wk.Day_Sun                uint8
Wk.Day_Thu                uint8
Wk.Day_Tue                uint8
Wk.Day_Wed                uint8
dtype: object

In [108]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.6.DAYS.AHEAD'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.6.DAYS.AHEAD', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [109]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [110]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (59058, 24)
Training Target Shape: (59058,)
Testing Predictors Shape: (19686, 24)
Testing Target Shape: (19686,)


In [111]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Make an instance of the Model
pca = PCA(.95)

# Fit model to data
pca.fit(train_pred_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [112]:
# How many components needed to explain 95% of the variance?
pca.n_components_

12

In [113]:
# Apply PCA transform to training and test set
train_pred_scaled = pca.transform(train_pred_scaled)
test_pred_scaled = pca.transform(test_pred_scaled)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [114]:
# Calculate errors for training data using mean squared error (MSE) and mean absolute error (MAE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))

# Display training MSE, MAE, & RMSE
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  34.842552807988085  & mae =  3.7013686071319714  & rmse =  5.902758067885562


In [115]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  56.195761342867  & mae =  4.872516326323275  & rmse =  7.496383217450066


In [116]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_6 = max(test_target)
test_target_mean_6 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_6 = max(errors)
mean_absolute_error_6 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_6 = 100 * (errors / test_target)
accuracy_6 = 100 - np.mean(mape_6)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_6, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_6, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_6, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_6, 2), 'MW')
print('Accuracy:', round(accuracy_6, 2), '%.')

Target Variable Max Value: 166.73 MW
Max Absolute Error: 62.24 MW
Mean Target Variable Value: 76.96 MW
Mean Absolute Error: 4.87 MW
Accuracy: 93.7 %.


In [117]:
# Grid search for KNN to see if performance can be improved
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
     KNeighborsRegressor(),
     grid_params,
     verbose = 1,
     cv = 3,
     n_jobs = -1
)

gs_knn = gs.fit(train_pred_scaled, train_target)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.4min finished


In [118]:
print('GRID SEARCH BEST SCORE:')
print(gs_knn.best_score_)
print('')

print('GRID SEARCH BEST ESTIMATOR:')
print(gs_knn.best_estimator_)
print('')

print('GRID SEARCH BEST PARAMETERS:')
print(gs_knn.best_params_)

GRID SEARCH BEST SCORE:
0.8615133978580832

GRID SEARCH BEST ESTIMATOR:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='distance')

GRID SEARCH BEST PARAMETERS:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [119]:
# See if the grid search improved the accuracy of KNN
best_grid = gs_knn.best_estimator_

# Use predict method on the test data
predictions = best_grid.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_GS_6 = max(test_target)
test_target_mean_GS_6 = np.mean(test_target)

# Calculate the absolute errors
errors = abs(predictions - test_target)
max_absolute_error_GS_6 = max(errors)
mean_absolute_error_GS_6 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_GS_6 = 100 * (errors / test_target)
accuracy_GS_6 = 100 - np.mean(mape_GS_6)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_GS_6, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_6, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_6, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_6, 2), 'MW')
print('Accuracy:', round(accuracy_GS_6, 2), '%.')

Target Variable Max Value: 166.73 MW
Max Absolute Error: 59.28 MW
Mean Target Variable Value: 76.96 MW
Mean Absolute Error: 3.38 MW
Accuracy: 95.57 %.


### Predict UES load - 7 Days Ahead (KNN & PCA)

In [120]:
# Read in data
data = pd.read_csv("ISO.aggregate - 7 day SHIFTED.csv")

# See first few rows of data
data.head()

Unnamed: 0,Month,Day,Year,TI,ISO.tot.dmd,LMP.DA.NH,LMP.RT.NH,UES.Cap,UES.Sea,UES.Sea.7.DAYS.AHEAD,NH.RT.MWh,ME.RT.MWh,VT.RT.MWh,CT.RT.MWh,RI.RT.MWh,MA.SE.RT.MWh,MA.WC.RT.MWh,MA.NE.RT.MWh,Wk.Day
0,1,1,2010,1,13797.0,63.04,63.33,30.142,67.858,62.538,1168.0,1119.7,640.4,3206.8,825.4,1498.2,1774.3,2688.9,Fri
1,1,1,2010,2,13560.0,57.61,69.74,27.37,63.276,60.144,1091.8,1077.1,605.4,3031.6,781.9,1404.3,1676.8,2569.0,Fri
2,1,1,2010,3,13121.0,53.74,55.39,25.573,59.899,58.989,1046.4,1045.1,578.9,2902.9,749.7,1340.0,1621.8,2483.2,Fri
3,1,1,2010,4,13134.0,42.09,60.49,24.728,58.275,59.15,1021.5,1030.5,556.9,2827.7,729.7,1306.1,1588.5,2428.1,Fri
4,1,1,2010,5,13144.0,47.95,53.43,24.974,58.516,61.425,1018.0,1047.4,553.7,2820.7,730.0,1308.1,1586.7,2432.2,Fri


In [121]:
# Shape of Dataframe
data.shape

(78720, 19)

In [122]:
# One-hot encoding
data = pd.get_dummies(data)

# Check dataframe data types
data.dtypes

Month                     int64
Day                       int64
Year                      int64
TI                        int64
ISO.tot.dmd             float64
LMP.DA.NH               float64
LMP.RT.NH               float64
UES.Cap                 float64
UES.Sea                 float64
UES.Sea.7.DAYS.AHEAD    float64
NH.RT.MWh               float64
ME.RT.MWh               float64
VT.RT.MWh               float64
CT.RT.MWh               float64
RI.RT.MWh               float64
MA.SE.RT.MWh            float64
MA.WC.RT.MWh            float64
MA.NE.RT.MWh            float64
Wk.Day_Fri                uint8
Wk.Day_Mon                uint8
Wk.Day_Sat                uint8
Wk.Day_Sun                uint8
Wk.Day_Thu                uint8
Wk.Day_Tue                uint8
Wk.Day_Wed                uint8
dtype: object

In [123]:
# Split data into target variable and predictor variables

# Target is the value we want to predict (in this case, MW of UES-Seacoast Load Region)
target = np.array(data['UES.Sea.7.DAYS.AHEAD'])

# Remove the target from the predictors
# axis 1 refers to the columns
predictors = data.drop('UES.Sea.7.DAYS.AHEAD', axis = 1)

# Saving predictor names for later use
predictor_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

In [124]:
# Split the data into training and testing sets using sklearn
train_predictors, test_predictors, train_target, test_target = train_test_split(predictors, target, test_size = 0.25, random_state = 77)


In [125]:
# Check shape of train and test data
print('Training Predictors Shape:', train_predictors.shape)
print('Training Target Shape:', train_target.shape)
print('Testing Predictors Shape:', test_predictors.shape)
print('Testing Target Shape:', test_target.shape)

Training Predictors Shape: (59040, 24)
Training Target Shape: (59040,)
Testing Predictors Shape: (19680, 24)
Testing Target Shape: (19680,)


In [126]:
# Create scaler
scaler = StandardScaler()

# Scale training predictors
train_pred_scaled = scaler.fit_transform(train_predictors)

# Scale test predictors
test_pred_scaled = scaler.transform(test_predictors)

# Make an instance of the Model
pca = PCA(.95)

# Fit model to data
pca.fit(train_pred_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [127]:
# How many components needed to explain 95% of the variance?
pca.n_components_

12

In [128]:
# Apply PCA transform to training and test set
train_pred_scaled = pca.transform(train_pred_scaled)
test_pred_scaled = pca.transform(test_pred_scaled)

# Create KNN model
model = KNeighborsRegressor()

# Train model with scaled training data and target values
model.fit(train_pred_scaled, train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [129]:
# Calculate errors for training data using mean squared error (MSE) and mean absolute error (MAE)
mse = mean_squared_error(train_target, model.predict(train_pred_scaled))
mae = mean_absolute_error(train_target, model.predict(train_pred_scaled))

# Display training MSE, MAE, & RMSE
print("mse = ",mse," & mae = ",mae," & rmse = ", sqrt(mse))

mse =  34.479223141296075  & mae =  3.672042571138211  & rmse =  5.871901152207526


In [130]:
# Calculate and display MSE, MAE, & RMSE on test data
test_mse = mean_squared_error(test_target, model.predict(test_pred_scaled))
test_mae = mean_absolute_error(test_target, model.predict(test_pred_scaled))
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))

mse =  57.370441838321135  & mae =  4.840338983739837  & rmse =  7.574327814289604


In [131]:
# Use predict method on the test data
predictions = model.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_7 = max(test_target)
test_target_mean_7 = np.mean(test_target)

# Calculate the absolute errors, calculate max and mean absolute error
errors = abs(predictions - test_target)
max_absolute_error_7 = max(errors)
mean_absolute_error_7 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_7 = 100 * (errors / test_target)
accuracy_7 = 100 - np.mean(mape_7)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_7, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_7, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_7, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_7, 2), 'MW')
print('Accuracy:', round(accuracy_7, 2), '%.')

Target Variable Max Value: 168.42 MW
Max Absolute Error: 74.58 MW
Mean Target Variable Value: 77.03 MW
Mean Absolute Error: 4.84 MW
Accuracy: 93.82 %.


In [132]:
# Grid search for KNN to see if performance can be improved
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
     KNeighborsRegressor(),
     grid_params,
     verbose = 1,
     cv = 3,
     n_jobs = -1
)

gs_knn = gs.fit(train_pred_scaled, train_target)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.2min finished


In [133]:
print('GRID SEARCH BEST SCORE:')
print(gs_knn.best_score_)
print('')

print('GRID SEARCH BEST ESTIMATOR:')
print(gs_knn.best_estimator_)
print('')

print('GRID SEARCH BEST PARAMETERS:')
print(gs_knn.best_params_)

GRID SEARCH BEST SCORE:
0.8600431996479997

GRID SEARCH BEST ESTIMATOR:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='distance')

GRID SEARCH BEST PARAMETERS:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [134]:
# See if the grid search improved the accuracy of KNN
best_grid = gs_knn.best_estimator_

# Use predict method on the test data
predictions = best_grid.predict(test_pred_scaled)

# Calculate max and mean test_target value
test_target_max_GS_7 = max(test_target)
test_target_mean_GS_7 = np.mean(test_target)

# Calculate the absolute errors
errors = abs(predictions - test_target)
max_absolute_error_GS_7 = max(errors)
mean_absolute_error_GS_7 = np.mean(errors)

# Calculate mean absolute percentage error (MAPE), and Accuracy
mape_GS_7 = 100 * (errors / test_target)
accuracy_GS_7 = 100 - np.mean(mape_GS_7)

# Print out some statistics, including accuracy
print('Target Variable Max Value:', round(test_target_max_GS_7, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_7, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_7, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_7, 2), 'MW')
print('Accuracy:', round(accuracy_GS_7, 2), '%.')

Target Variable Max Value: 168.42 MW
Max Absolute Error: 66.66 MW
Mean Target Variable Value: 77.03 MW
Mean Absolute Error: 3.39 MW
Accuracy: 95.61 %.


#### Compare Performance of Models

In [135]:
### Print out performance metrics from the seven Random Forest Models

print('KNN w/PCA Model 1 - Predict UES Load 1 Day Ahead')
print('Target Variable Max Value:', round(test_target_max_1, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_1, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_1, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_1, 2), 'MW')
print('Accuracy:', round(accuracy_1, 2), '%.')
print('')

print('KNN w/PCA Model 1 after GridSearchCV - Predict UES Load 1 Day Ahead')
print('Target Variable Max Value:', round(test_target_max_GS_1, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_1, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_1, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_1, 2), 'MW')
print('Accuracy:', round(accuracy_GS_1, 2), '%.')
print('')
print('--------------------------------------------------------------------')
print('')

print('KNN w/PCA Model 2 - Predict UES Load 2 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_2, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_2, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_2, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_2, 2), 'MW')
print('Accuracy:', round(accuracy_2, 2), '%.')
print('')

print('KNN w/PCA Model 2 after GridSearchCV - Predict UES Load 2 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_GS_2, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_2, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_2, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_2, 2), 'MW')
print('Accuracy:', round(accuracy_GS_2, 2), '%.')
print('')
print('--------------------------------------------------------------------')
print('')

print('KNN w/PCA Model 3 - Predict UES Load 3 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_3, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_3, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_3, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_3, 2), 'MW')
print('Accuracy:', round(accuracy_3, 2), '%.')
print('')

print('KNN w/PCA Model 3 after GridSearchCV - Predict UES Load 3 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_GS_3, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_3, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_3, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_3, 2), 'MW')
print('Accuracy:', round(accuracy_GS_3, 2), '%.')
print('')
print('--------------------------------------------------------------------')
print('')

print('KNN w/PCA Model 4 - Predict UES Load 4 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_4, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_4, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_4, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_4, 2), 'MW')
print('Accuracy:', round(accuracy_4, 2), '%.')
print('')

print('KNN w/PCA Model 4 after GridSearchCV - Predict UES Load 4 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_GS_4, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_4, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_4, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_4, 2), 'MW')
print('Accuracy:', round(accuracy_GS_4, 2), '%.')
print('')
print('--------------------------------------------------------------------')
print('')

print('KNN w/PCA Model 5 - Predict UES Load 5 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_5, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_5, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_5, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_5, 2), 'MW')
print('Accuracy:', round(accuracy_5, 2), '%.')
print('')

print('KNN w/PCA Model 5 after GridSearchCV - Predict UES Load 5 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_GS_5, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_5, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_5, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_5, 2), 'MW')
print('Accuracy:', round(accuracy_GS_5, 2), '%.')
print('')
print('--------------------------------------------------------------------')
print('')

print('KNN w/PCA Model 6 - Predict UES Load 6 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_6, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_6, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_6, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_6, 2), 'MW')
print('Accuracy:', round(accuracy_6, 2), '%.')
print('')

print('KNN w/PCA Model 6 after GridSearchCV - Predict UES Load 6 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_GS_6, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_6, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_6, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_6, 2), 'MW')
print('Accuracy:', round(accuracy_GS_6, 2), '%.')
print('')
print('--------------------------------------------------------------------')
print('')

print('KNN w/PCA Model 7 - Predict UES Load 7 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_7, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_7, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_7, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_7, 2), 'MW')
print('Accuracy:', round(accuracy_7, 2), '%.')
print('')

print('KNN w/PCA Model 7 after GridSearchCV - Predict UES Load 7 Days Ahead')
print('Target Variable Max Value:', round(test_target_max_GS_7, 2), 'MW')
print('Max Absolute Error:', round(max_absolute_error_GS_7, 2), 'MW')
print('Mean Target Variable Value:', round(test_target_mean_GS_7, 2), 'MW')
print('Mean Absolute Error:', round(mean_absolute_error_GS_7, 2), 'MW')
print('Accuracy:', round(accuracy_GS_7, 2), '%.')
print('')
print('--------------------------------------------------------------------')
print('')

KNN w/PCA Model 1 - Predict UES Load 1 Day Ahead
Target Variable Max Value: 166.65 MW
Max Absolute Error: 47.35 MW
Mean Target Variable Value: 77.05 MW
Mean Absolute Error: 3.46 MW
Accuracy: 95.61 %.

KNN w/PCA Model 1 after GridSearchCV - Predict UES Load 1 Day Ahead
Target Variable Max Value: 166.65 MW
Max Absolute Error: 39.25 MW
Mean Target Variable Value: 77.05 MW
Mean Absolute Error: 2.54 MW
Accuracy: 96.74 %.

--------------------------------------------------------------------

KNN w/PCA Model 2 - Predict UES Load 2 Days Ahead
Target Variable Max Value: 168.43 MW
Max Absolute Error: 64.56 MW
Mean Target Variable Value: 76.92 MW
Mean Absolute Error: 4.32 MW
Accuracy: 94.48 %.

KNN w/PCA Model 2 after GridSearchCV - Predict UES Load 2 Days Ahead
Target Variable Max Value: 168.43 MW
Max Absolute Error: 66.78 MW
Mean Target Variable Value: 76.92 MW
Mean Absolute Error: 3.1 MW
Accuracy: 95.97 %.

--------------------------------------------------------------------

KNN w/PCA Model 3

In [136]:
# Create a dataframe of performance metrics of 7 models
model_number = [1,2,3,4,5,6,7]

test_target_max_values = [test_target_max_1, test_target_max_2, test_target_max_3, test_target_max_4, 
                          test_target_max_6, test_target_max_6, test_target_max_7]

max_absolute_error_values =[max_absolute_error_1, max_absolute_error_2, max_absolute_error_3, max_absolute_error_4, 
                            max_absolute_error_5, max_absolute_error_6, max_absolute_error_7]

max_absolute_error_values_GS =[max_absolute_error_GS_1, max_absolute_error_GS_2, max_absolute_error_GS_3,
                               max_absolute_error_GS_4, max_absolute_error_GS_5, max_absolute_error_GS_6,
                               max_absolute_error_GS_7]

test_target_mean_values = [test_target_mean_1, test_target_mean_2, test_target_mean_3, test_target_mean_4, 
                           test_target_mean_5, test_target_mean_6, test_target_mean_7]

mean_absolute_error_values = [mean_absolute_error_1, mean_absolute_error_2, mean_absolute_error_3, 
                              mean_absolute_error_4, mean_absolute_error_5, mean_absolute_error_6, 
                              mean_absolute_error_7]

mean_absolute_error_values_GS = [mean_absolute_error_GS_1, mean_absolute_error_GS_2, mean_absolute_error_GS_3, 
                                 mean_absolute_error_GS_4, mean_absolute_error_GS_5, mean_absolute_error_GS_6, 
                                 mean_absolute_error_GS_7]

accuracy_values = [accuracy_1, accuracy_2, accuracy_3, accuracy_4, accuracy_5, accuracy_6, accuracy_7]

accuracy_values_GS = [accuracy_GS_1, accuracy_GS_2, accuracy_GS_3, accuracy_GS_4,
                      accuracy_GS_5, accuracy_GS_6, accuracy_GS_7]

### Create pandas dataframe using metrics
metrics_df = {'Model Number': model_number,
              'Test Target Max': test_target_max_values, 
              'Max Absolute Error': max_absolute_error_values,
              'Max Absolute Error w/GS': max_absolute_error_values_GS,
              'Test Target Mean': test_target_mean_values,
              'Mean Absolute Error': mean_absolute_error_values,
              'Mean Absolute Error w/GS': mean_absolute_error_values_GS,
              'Accuracy': accuracy_values,
              'Accuracy w/GS': accuracy_values_GS}

metrics_df = pd.DataFrame(metrics_df)
metrics_df


Unnamed: 0,Model Number,Test Target Max,Max Absolute Error,Max Absolute Error w/GS,Test Target Mean,Mean Absolute Error,Mean Absolute Error w/GS,Accuracy,Accuracy w/GS
0,1,166.649,47.3466,39.250073,77.047343,3.461851,2.54205,95.61177,96.735089
1,2,168.43,64.5622,66.783534,76.918785,4.322612,3.104818,94.480583,95.970021
2,3,168.42,57.3592,62.804764,77.070027,4.727254,3.349472,93.957752,95.655373
3,4,168.43,64.918,56.063384,77.126991,4.842034,3.396261,93.788199,95.574712
4,5,166.733,59.0042,64.864749,77.079134,4.895655,3.405921,93.702712,95.549284
5,6,166.733,62.2408,59.277204,76.964268,4.872516,3.383218,93.699266,95.567376
6,7,168.42,74.5846,66.663609,77.033948,4.840339,3.385984,93.817873,95.609867


In [137]:
# Write out a .csv of the performance metrics
metrics_df.to_csv('metrics_KNN with PCA.csv', index=False)