In [38]:
import numpy as np 
import pandas as pd 
import holidays
from fbprophet import Prophet 
from sklearn.model_selection import TimeSeriesSplit

In [29]:
# Load the merged hourly dataset. thousands="," makes pandas treat commas as
# thousands separators when parsing numeric columns.
# Any "Unnamed: N" columns are stale row indexes written by an earlier
# DataFrame.to_csv call that saved the index; drop them so the frame has the
# same columns regardless of how many times the file has been re-saved.
data = pd.read_csv("msa_merged_data.csv", thousands=",").loc[
    :, lambda frame: ~frame.columns.str.startswith("Unnamed")
]
data.head()

Unnamed: 0,BEGIN_DATE_GMT,HE,POOL_PRICE,AIL_DEMAND,Avg_temp,Weighted_Avg_Temp,future 1,future 2,future 3,future 4,WTI spot,dayofweek,month,year
0,2010-01-01 07:00:00,1,51.12,8307.0,-24.1,-22.952601,79.36,80.02,80.63,81.11,79.39,4,1,2010
1,2010-01-01 08:00:00,2,48.79,8186.0,-23.775,-22.588126,79.36,80.02,80.63,81.11,79.39,4,1,2010
2,2010-01-01 09:00:00,3,39.56,8075.0,-23.425,-22.339761,79.36,80.02,80.63,81.11,79.39,4,1,2010
3,2010-01-01 10:00:00,4,36.27,8013.0,-23.0,-21.877969,79.36,80.02,80.63,81.11,79.39,4,1,2010
4,2010-01-01 11:00:00,5,36.16,7982.0,-22.4,-21.527531,79.36,80.02,80.63,81.11,79.39,4,1,2010


In [32]:
# Converting Pool_price to float - no need to rerun

# data["POOL_PRICE"] = data["POOL_PRICE"].replace(" -   ", np.NaN).replace("[\s,]", "", regex=True)
# data["POOL_PRICE"] = pd.to_numeric( data["POOL_PRICE"] )
# data.dtypes

BEGIN_DATE_GMT        object
HE                     int64
POOL_PRICE           float64
AIL_DEMAND           float64
Avg_temp             float64
Weighted_Avg_Temp    float64
future 1             float64
future 2             float64
future 3             float64
future 4             float64
WTI spot             float64
dayofweek              int64
month                  int64
year                   int64
dtype: object

In [33]:
# Report the frame's dimensions, then count missing values per column
rows_and_cols = data.shape
print(rows_and_cols)
data.isna().sum()

(96390, 14)


BEGIN_DATE_GMT         0
HE                     0
POOL_PRICE           150
AIL_DEMAND             0
Avg_temp               0
Weighted_Avg_Temp      0
future 1               0
future 2               0
future 3               0
future 4               0
WTI spot               0
dayofweek              0
month                  0
year                   0
dtype: int64

In [48]:
# Parse the timestamps, then look for gaps in the time series:
# rows whose timestamp is more than one hour after the previous row's.
data["BEGIN_DATE_GMT"] = pd.to_datetime(data["BEGIN_DATE_GMT"])
step = data["BEGIN_DATE_GMT"].diff()
gaps = data.loc[step > pd.Timedelta(hours=1)].index

# For each gap, show the row before it, the row at it, and the row after it
for gap in gaps:
    window = data.iloc[gap - 1:gap + 2]
    print(window["BEGIN_DATE_GMT"], "\n")

7440   2010-11-07 07:00:00
7441   2010-11-07 09:00:00
7442   2010-11-07 10:00:00
Name: BEGIN_DATE_GMT, dtype: datetime64[ns] 

16175   2011-11-06 07:00:00
16176   2011-11-06 09:00:00
16177   2011-11-06 10:00:00
Name: BEGIN_DATE_GMT, dtype: datetime64[ns] 

24910   2012-11-04 07:00:00
24911   2012-11-04 09:00:00
24912   2012-11-04 10:00:00
Name: BEGIN_DATE_GMT, dtype: datetime64[ns] 

33645   2013-11-03 07:00:00
33646   2013-11-03 09:00:00
33647   2013-11-03 10:00:00
Name: BEGIN_DATE_GMT, dtype: datetime64[ns] 

42380   2014-11-02 07:00:00
42381   2014-11-02 09:00:00
42382   2014-11-02 10:00:00
Name: BEGIN_DATE_GMT, dtype: datetime64[ns] 

51115   2015-11-01 07:00:00
51116   2015-11-01 09:00:00
51117   2015-11-01 10:00:00
Name: BEGIN_DATE_GMT, dtype: datetime64[ns] 

60018   2016-11-06 07:00:00
60019   2016-11-06 09:00:00
60020   2016-11-06 10:00:00
Name: BEGIN_DATE_GMT, dtype: datetime64[ns] 

68753   2017-11-05 07:00:00
68754   2017-11-05 09:00:00
68755   2017-11-05 10:00:00
Name: BEG

In [36]:
# Flag Canadian holidays and derive a working-day indicator (both stored as 0/1 ints).
# NOTE(review): the lookup uses the calendar date of BEGIN_DATE_GMT as stored; if these
# timestamps are GMT while holidays are observed in local time, hours close to
# midnight may be flagged on the neighbouring day - confirm.
canada_holidays = holidays.CA()
is_holiday = (
    data["BEGIN_DATE_GMT"].dt.date
    .map(lambda day: day in canada_holidays)
    .astype(bool)
)
data["holiday"] = is_holiday.astype("int64")

# Working day = neither a holiday nor a weekend (dayofweek 5 = Saturday, 6 = Sunday).
# Vectorized boolean logic replaces the row-wise DataFrame.apply, which is slow on
# a frame of this size and yields the same 0/1 values.
is_weekend = data["dayofweek"].isin([5, 6])
data["workingday"] = (~is_holiday & ~is_weekend).astype("int64")

In [23]:
# List the frame's columns to confirm 'holiday' and 'workingday' were added
data.columns

Index(['BEGIN_DATE_GMT', 'HE', 'POOL_PRICE', 'AIL_DEMAND', 'Avg_temp',
       'Weighted_Avg_Temp', 'future 1', 'future 2', 'future 3', 'future 4',
       'WTI spot', 'dayofweek', 'month', 'year', 'holiday', 'workingday'],
      dtype='object')

In [37]:
# Persist the enriched frame. index=False keeps the RangeIndex out of the file;
# without it every save/reload cycle adds another "Unnamed: N" column.
# NOTE(review): this overwrites the same file the notebook loads from, so the
# on-disk input changes between runs - consider writing to a separate file.
data.to_csv("msa_merged_data.csv", index=False)

In [21]:
# Train-test split

In [None]:
# Making FBProphet Datasets
def make_prophet_df(df, y, ds, regressors):
    """Build a frame laid out the way Prophet expects.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing the target, timestamp and regressor columns.
    y : str
        Name of the column in ``df`` to expose as the target column ``"y"``.
    ds : str
        Name of the column in ``df`` to expose as the timestamp column ``"ds"``.
    regressors : iterable of str
        Columns of ``df`` to carry over under their original names.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``"y"``, ``"ds"`` and then each regressor,
        in that order, sharing the index of ``df``.
    """
    columns = {"y": df[y], "ds": df[ds]}
    # Later assignments win on a name clash, matching column-by-column assignment
    for name in regressors:
        columns[name] = df[name]
    return pd.DataFrame(columns)

# Extra regressor columns supplied to Prophet alongside ds / y.
# NOTE(review): the null check above reports 150 missing POOL_PRICE values;
# Prophet refuses NaNs in regressor columns, so those rows presumably need to be
# imputed or dropped before fitting - confirm.
regressors = ["POOL_PRICE", "Weighted_Avg_Temp", "workingday"]

# Prophet model: linear trend, additive seasonality with yearly / weekly / daily
# components left on 'auto', and interval_width=0.95 for the uncertainty intervals.
prophet_model = Prophet(
    growth='linear',
    interval_width=0.95,
    yearly_seasonality='auto',
    weekly_seasonality='auto',
    daily_seasonality='auto',
    seasonality_mode='additive',
)

# Adding regressors. The calls must target `prophet_model`, the object created
# above; the previous code used an undefined name (`prop`), which raises NameError.
# Continuous vars
prophet_model.add_regressor(
    regressors[0],
    prior_scale=20,
    mode='additive',
    standardize=True,
)
prophet_model.add_regressor(
    regressors[1],
    # prior_scale=1,
    mode='additive',
    standardize=True,
)
# Binary vars: standardize='auto' leaves a binary regressor unstandardized
prophet_model.add_regressor(
    regressors[2],
    # prior_scale=10,
    mode='additive',
    standardize='auto',
)



In [24]:
# Development aid: print the docstring of Prophet.add_regressor for reference
help(Prophet.add_regressor)

Help on function add_regressor in module fbprophet.forecaster:

add_regressor(self, name, prior_scale=None, standardize='auto', mode=None)
    Add an additional regressor to be used for fitting and predicting.
    
    The dataframe passed to `fit` and `predict` will have a column with the
    specified name to be used as a regressor. When standardize='auto', the
    regressor will be standardized unless it is binary. The regression
    coefficient is given a prior with the specified scale parameter.
    Decreasing the prior scale will add additional regularization. If no
    prior scale is provided, self.holidays_prior_scale will be used.
    Mode can be specified as either 'additive' or 'multiplicative'. If not
    specified, self.seasonality_mode will be used. 'additive' means the
    effect of the regressor will be added to the trend, 'multiplicative'
    means it will multiply the trend.
    
    Parameters
    ----------
    name: string name of the regressor.
    prior_scale: op