---

## Imports

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import pickle
import wget, os
import time
import glob
import pytz

---

## Function Definitions

In [5]:
def summarize(df):
    """Print a quick overview of a DataFrame.

    Writes two things to stdout: a line with the number of rows and the
    total count of null cells across all columns, followed by a blank
    line and the first three rows of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to describe.

    Returns
    -------
    None
    """
    row_count = df.shape[0]
    # First .sum() gives per-column null counts, second collapses to a scalar.
    null_count = df.isna().sum().sum()
    preview = df.head(3)

    print(f'{row_count} rows,   {null_count} nulls')
    print(f'\n {preview}')

---

## Read in Consolidated Data & Examine

In [6]:
# Load the consolidated dataset produced upstream.
# NOTE(review): pickle.load executes arbitrary code from the file, so this
# should only ever point at a pickle generated by this project.
consolidated_path = '../data/consolidated_data.pkl'
with open(consolidated_path, mode='rb') as pkl_in:
    df = pickle.load(pkl_in)
df.head()

Unnamed: 0_level_0,date,hr_index,node,dam_price_per_mwh,hasp_price_per_mwh,rtm_price_per_mwh,7da_load_fcast_mw,2da_load_fcast_mw,dam_load_fcast_mw,rtm_load_fcast_mw,...,fres_vis,fres_ceil,rive_temp,rive_wind,rive_vis,rive_ceil,redd_temp,redd_wind,redd_vis,redd_ceil
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 01:00:00-08:00,2016-01-01,2,BAYSHOR2_1_N001,30.38613,25.030977,24.903872,21515.0,22070.77,22188.26,21331.625,...,11265.0,22000.0,133.0,51.0,16093.0,22000.0,94.0,72.0,16093.0,22000.0
2016-01-01 02:00:00-08:00,2016-01-01,3,BAYSHOR2_1_N001,30.29141,24.563878,24.630057,20854.85,21419.4,21581.45,20648.3125,...,11265.0,22000.0,128.0,46.0,16093.0,22000.0,83.0,93.0,16093.0,22000.0
2016-01-01 03:00:00-08:00,2016-01-01,4,BAYSHOR2_1_N001,28.45127,24.775715,24.689828,20495.68,20997.22,21261.0,20389.0,...,9656.0,22000.0,122.0,46.0,16093.0,22000.0,78.0,67.0,16093.0,22000.0
2016-01-01 04:00:00-08:00,2016-01-01,5,BAYSHOR2_1_N001,28.66504,24.92332,23.505206,20490.69,21008.41,21289.56,20443.8125,...,8047.0,22000.0,117.0,41.0,16093.0,22000.0,67.0,72.0,16093.0,22000.0
2016-01-01 05:00:00-08:00,2016-01-01,6,BAYSHOR2_1_N001,29.84836,25.353155,24.506423,20950.76,21464.04,21823.54,20868.125,...,6437.0,22000.0,111.0,51.0,16093.0,22000.0,67.0,72.0,16093.0,22000.0


In [7]:
# Print the row count, total null count, and first three rows of the loaded frame.
summarize(df)

29067 rows,   0 nulls

                                  date  hr_index             node  \
datetime                                                           
2016-01-01 01:00:00-08:00  2016-01-01         2  BAYSHOR2_1_N001   
2016-01-01 02:00:00-08:00  2016-01-01         3  BAYSHOR2_1_N001   
2016-01-01 03:00:00-08:00  2016-01-01         4  BAYSHOR2_1_N001   

                           dam_price_per_mwh  hasp_price_per_mwh  \
datetime                                                           
2016-01-01 01:00:00-08:00           30.38613           25.030977   
2016-01-01 02:00:00-08:00           30.29141           24.563878   
2016-01-01 03:00:00-08:00           28.45127           24.775715   

                           rtm_price_per_mwh  7da_load_fcast_mw  \
datetime                                                          
2016-01-01 01:00:00-08:00          24.903872           21515.00   
2016-01-01 02:00:00-08:00          24.630057           20854.85   
2016-01-01 03:00:00-08:00 

---

## Add `year`, `month`, `day`, and  `hour` Columns  
(drop the `node`, `date`, and `hr_index` columns)

In [8]:
# Remove the columns that are not carried forward. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell idempotent:
# re-running it on an already-trimmed frame no longer raises KeyError.
df = df.drop(columns=['node', 'date', 'hr_index'], errors='ignore')
df.head(3)

Unnamed: 0_level_0,dam_price_per_mwh,hasp_price_per_mwh,rtm_price_per_mwh,7da_load_fcast_mw,2da_load_fcast_mw,dam_load_fcast_mw,rtm_load_fcast_mw,water_acre_feet,sand_temp,sand_wind,...,fres_vis,fres_ceil,rive_temp,rive_wind,rive_vis,rive_ceil,redd_temp,redd_wind,redd_vis,redd_ceil
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 01:00:00-08:00,30.38613,25.030977,24.903872,21515.0,22070.77,22188.26,21331.625,6520032.0,156.0,26.0,...,11265.0,22000.0,133.0,51.0,16093.0,22000.0,94.0,72.0,16093.0,22000.0
2016-01-01 02:00:00-08:00,30.29141,24.563878,24.630057,20854.85,21419.4,21581.45,20648.3125,6325227.0,144.0,21.0,...,11265.0,22000.0,128.0,46.0,16093.0,22000.0,83.0,93.0,16093.0,22000.0
2016-01-01 03:00:00-08:00,28.45127,24.775715,24.689828,20495.68,20997.22,21261.0,20389.0,6520067.0,139.0,0.0,...,9656.0,22000.0,122.0,46.0,16093.0,22000.0,78.0,67.0,16093.0,22000.0


In [9]:
# Add one column per calendar component, each read from the frame's index
# attribute of the same name (index.year -> 'year', and so on).
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(df.index, part)

df.head(3)

Unnamed: 0_level_0,dam_price_per_mwh,hasp_price_per_mwh,rtm_price_per_mwh,7da_load_fcast_mw,2da_load_fcast_mw,dam_load_fcast_mw,rtm_load_fcast_mw,water_acre_feet,sand_temp,sand_wind,...,rive_vis,rive_ceil,redd_temp,redd_wind,redd_vis,redd_ceil,year,month,day,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 01:00:00-08:00,30.38613,25.030977,24.903872,21515.0,22070.77,22188.26,21331.625,6520032.0,156.0,26.0,...,16093.0,22000.0,94.0,72.0,16093.0,22000.0,2016,1,1,1
2016-01-01 02:00:00-08:00,30.29141,24.563878,24.630057,20854.85,21419.4,21581.45,20648.3125,6325227.0,144.0,21.0,...,16093.0,22000.0,83.0,93.0,16093.0,22000.0,2016,1,1,2
2016-01-01 03:00:00-08:00,28.45127,24.775715,24.689828,20495.68,20997.22,21261.0,20389.0,6520067.0,139.0,0.0,...,16093.0,22000.0,78.0,67.0,16093.0,22000.0,2016,1,1,3


In [10]:
# Show the (rows, columns) shape after dropping and adding columns.
df.shape

(29067, 28)

In [11]:
# Persist the pre-processed frame in two formats: a CSV export and a pickle
# of the DataFrame object itself.
df.to_csv(path_or_buf='../data/pre_processed_data.csv')

with open('../data/pre_processed_df.pkl', mode='wb') as pkl_out:
    pickle.dump(df, pkl_out)

---

## Train-test-split and Scale the Data

In [12]:
# Positional split with no shuffling: the first 75% of rows (in their current
# order) become the training set and the remaining rows become the test set.
n_rows = len(df)
train_set_length = int(round(n_rows * 0.75, 0))
test_set_length = n_rows - train_set_length

print(f'\nTrain: {train_set_length} rows\nTest:  {test_set_length} rows')

train, test = df.iloc[:train_set_length], df.iloc[train_set_length:]

print(f'\nTrain set shape: {train.shape}\nTest set shape:  {test.shape}\n')


Train: 21800 rows
Test:  7267 rows

Train set shape: (21800, 28)
Test set shape:  (7267, 28)



In [13]:
# Preview the first three rows of the training split.
train.head(3)

Unnamed: 0_level_0,dam_price_per_mwh,hasp_price_per_mwh,rtm_price_per_mwh,7da_load_fcast_mw,2da_load_fcast_mw,dam_load_fcast_mw,rtm_load_fcast_mw,water_acre_feet,sand_temp,sand_wind,...,rive_vis,rive_ceil,redd_temp,redd_wind,redd_vis,redd_ceil,year,month,day,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 01:00:00-08:00,30.38613,25.030977,24.903872,21515.0,22070.77,22188.26,21331.625,6520032.0,156.0,26.0,...,16093.0,22000.0,94.0,72.0,16093.0,22000.0,2016,1,1,1
2016-01-01 02:00:00-08:00,30.29141,24.563878,24.630057,20854.85,21419.4,21581.45,20648.3125,6325227.0,144.0,21.0,...,16093.0,22000.0,83.0,93.0,16093.0,22000.0,2016,1,1,2
2016-01-01 03:00:00-08:00,28.45127,24.775715,24.689828,20495.68,20997.22,21261.0,20389.0,6520067.0,139.0,0.0,...,16093.0,22000.0,78.0,67.0,16093.0,22000.0,2016,1,1,3


In [14]:
# Preview the first rows of the test split to see where it begins.
test.head()

Unnamed: 0_level_0,dam_price_per_mwh,hasp_price_per_mwh,rtm_price_per_mwh,7da_load_fcast_mw,2da_load_fcast_mw,dam_load_fcast_mw,rtm_load_fcast_mw,water_acre_feet,sand_temp,sand_wind,...,rive_vis,rive_ceil,redd_temp,redd_wind,redd_vis,redd_ceil,year,month,day,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-01 02:00:00-07:00,29.69366,29.104173,23.534568,22053.56,22030.18,22324.73,21792.0625,19621028.0,180.0,31.0,...,16093.0,22000.0,400.0,62.0,16093.0,22000.0,2018,7,1,2
2018-07-01 03:00:00-07:00,29.10472,27.884945,21.094333,21464.51,21430.11,21720.68,21113.125,19618892.0,172.0,31.0,...,16093.0,22000.0,383.0,41.0,16093.0,22000.0,2018,7,1,3
2018-07-01 04:00:00-07:00,29.34643,25.761775,21.418978,21223.45,21218.12,21491.83,20803.625,19615712.0,172.0,31.0,...,16093.0,22000.0,356.0,26.0,16093.0,22000.0,2018,7,1,4
2018-07-01 05:00:00-07:00,30.91718,25.976083,23.791108,21304.13,21259.22,21519.05,20772.125,19612875.0,178.0,15.0,...,16093.0,22000.0,356.0,26.0,16093.0,22000.0,2018,7,1,5
2018-07-01 06:00:00-07:00,26.75131,23.313758,23.203258,21139.35,21041.6,21354.51,20466.75,19610633.0,178.0,15.0,...,16093.0,22000.0,294.0,15.0,16093.0,22000.0,2018,7,1,6


In [15]:
# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits so the test set never influences the scaling.
ss = StandardScaler().fit(train)

train_sc = ss.transform(train)
test_sc = ss.transform(test)

In [16]:
# Show the shape of the scaled training output.
train_sc.shape

(21800, 28)

In [17]:
# Show the shape of the scaled test output.
test_sc.shape

(7267, 28)

In [18]:
def _relabel(scaled_values, template):
    """Wrap scaled values in a DataFrame using the template's columns and index."""
    return pd.DataFrame(scaled_values, columns=template.columns, index=template.index)


# Give the scaler output the column names and index of the corresponding split.
train_sc_df = _relabel(train_sc, train)
test_sc_df = _relabel(test_sc, test)

In [19]:
# Show the shape of the scaled training DataFrame.
train_sc_df.shape

(21800, 28)

In [20]:
# Show the shape of the scaled test DataFrame.
test_sc_df.shape

(7267, 28)

In [21]:
# Preview the first three rows of the scaled training DataFrame.
train_sc_df.head(3)

Unnamed: 0_level_0,dam_price_per_mwh,hasp_price_per_mwh,rtm_price_per_mwh,7da_load_fcast_mw,2da_load_fcast_mw,dam_load_fcast_mw,rtm_load_fcast_mw,water_acre_feet,sand_temp,sand_wind,...,rive_vis,rive_ceil,redd_temp,redd_wind,redd_vis,redd_ceil,year,month,day,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 01:00:00-08:00,-0.114824,-0.184246,-0.139153,-0.846923,-0.744743,-0.742247,-0.706692,-2.749662,-0.708672,-0.166307,...,0.364153,0.667452,-0.403021,1.780942,0.312427,0.609238,-1.066859,-1.447601,-1.676634,-1.518671
2016-01-01 02:00:00-08:00,-0.119411,-0.19565,-0.144865,-0.981831,-0.878188,-0.865296,-0.848378,-2.80159,-1.015979,-0.466297,...,0.364153,0.667452,-0.45053,2.687161,0.312427,0.609238,-1.066859,-1.447601,-1.676634,-1.374074
2016-01-01 03:00:00-08:00,-0.208528,-0.190478,-0.143618,-1.055231,-0.96468,-0.930276,-0.902146,-2.749653,-1.144024,-1.726254,...,0.364153,0.667452,-0.472125,1.565176,0.312427,0.609238,-1.066859,-1.447601,-1.676634,-1.229477


In [22]:
# Create the output folder if it is missing; to_csv does not create parent
# directories, so on a fresh checkout this cell would otherwise fail.
os.makedirs('../data/processed', exist_ok=True)

# Export the unscaled train and test splits as CSV.
train.to_csv('../data/processed/train.csv')
test.to_csv('../data/processed/test.csv')

In [23]:
# Create the output folder if it is missing; open(..., 'wb') does not create
# parent directories, so on a fresh checkout this cell would otherwise fail.
os.makedirs('../data/processed', exist_ok=True)

# Pickle the unscaled and scaled splits, one file per object, each named
# after the variable it holds.
# NOTE(review): the fitted scaler `ss` is not saved here; if a later step needs
# to inverse-transform scaled values it would have to be persisted too — confirm.
for pkl_name, pkl_obj in (
    ('train', train),
    ('test', test),
    ('train_sc_df', train_sc_df),
    ('test_sc_df', test_sc_df),
):
    with open(f'../data/processed/{pkl_name}.pkl', 'wb') as f:
        pickle.dump(pkl_obj, f)