In [1]:
# Standard Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from plotly.offline import iplot

# Greykite functions
from greykite.framework.templates.autogen.forecast_config import *
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.common.features.timeseries_features import *
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

In [10]:
nyc_df = pd.read_csv('nyc_data.csv')
future_df = pd.read_csv('future.csv')
df = pd.concat([nyc_df, future_df])
df = df.reset_index(drop = True)

df = df.rename(columns = {'Demand': 'y'})
df

Unnamed: 0,Date,y,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2015,720.000885,0,0,0,3.68,41.305
1,1/2/2015,581.276773,0,0,0,4.73,131.574
2,1/3/2015,754.117039,0,0,0,7.23,162.700
3,1/4/2015,622.252774,0,0,0,10.96,160.281
4,1/5/2015,785.373319,0,0,0,6.92,51.077
...,...,...,...,...,...,...,...
2218,1/27/2021,,0,0,0,3.33,39.664
2219,1/28/2021,,0,0,0,1.67,195.314
2220,1/29/2021,,0,0,0,-2.78,235.894
2221,1/30/2021,,0,0,0,1.11,152.752


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2223 entries, 0 to 2222
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          2223 non-null   object 
 1   y             2192 non-null   float64
 2   Easter        2223 non-null   int64  
 3   Thanksgiving  2223 non-null   int64  
 4   Christmas     2223 non-null   int64  
 5   Temperature   2223 non-null   float64
 6   Marketing     2223 non-null   float64
dtypes: float64(3), int64(3), object(1)
memory usage: 121.7+ KB


## Silverkite Params

In [13]:
metadata = MetadataParam(time_col = 'Date',
                          value_col = 'y',
                          freq = 'D',
                          train_end_date = pd.to_datetime('2020-12-31'))
metadata

MetadataParam(anomaly_info=None, date_format=None, freq='D', time_col='Date', train_end_date=Timestamp('2020-12-31 00:00:00'), value_col='y')

In [19]:
growth = {'growth_term': ['linear', 'quadratic', 'sqrt']}
growth

{'growth_term': ['linear', 'quadratic', 'sqrt']}

In [20]:
seasonality = {'yearly_seasonality': 'auto',
               'quarterly_seasonality': 'auto',
               'monthly_seasonality': 'auto',
               'weekly_seasonality': 'auto',
               'daily_seasonality': 'auto'}
seasonality

{'yearly_seasonality': 'auto',
 'quarterly_seasonality': 'auto',
 'monthly_seasonality': 'auto',
 'weekly_seasonality': 'auto',
 'daily_seasonality': 'auto'}

In [24]:
get_available_holiday_lookup_countries(['US'])

get_available_holidays_across_countries(countries = ['US'],
                                        year_start=2015,
                                        year_end=2021)

['Christmas Day',
 'Christmas Day (Observed)',
 'Columbus Day',
 'Halloween',
 'Independence Day',
 'Independence Day (Observed)',
 'Juneteenth National Independence Day',
 'Juneteenth National Independence Day (Observed)',
 'Labor Day',
 'Martin Luther King Jr. Day',
 'Memorial Day',
 "New Year's Day",
 "New Year's Day (Observed)",
 'Thanksgiving',
 'Veterans Day',
 'Veterans Day (Observed)',
 "Washington's Birthday"]

In [64]:
events = {'holidays_to_model_separately': ["New Year's Day"],
          'holiday_lookup_countries': ['US'],
          'holiday_pre_num_days': 2,
          'holiday_post_num_days': 2,
          'holiday_pre_post_num_dict': {"New Year's Day": (3, 1)},
          'daily_event_df_dict': {
              'elections': pd.DataFrame({
                  'date': ['2016-11-08', '2020-11-30'],
                  'event_name': 'elections'
              })
          }}

events

{'holidays_to_model_separately': ["New Year's Day"],
 'holiday_lookup_countries': ['US'],
 'holiday_pre_num_days': 2,
 'holiday_post_num_days': 2,
 'holiday_pre_post_num_dict': {"New Year's Day": (3, 1)},
 'daily_event_df_dict': {'elections':          date event_name
  0  2016-11-08  elections
  1  2020-11-30  elections}}

In [65]:
changepoints = {'changepoints_dict': {'method': 'auto'}}
changepoints

{'changepoints_dict': {'method': 'auto'}}

In [66]:
regressors = {'regressor_cols': ['Easter', 'Temperature', 'Marketing']}
regressors

{'regressor_cols': ['Easter', 'Temperature', 'Marketing']}

In [67]:
lagged_regressors = {'lagged_regressor_dict': {'Temperature': 'auto',
                                               'Easter': 'auto',
                                               'Marketing': 'auto'}}
lagged_regressors

{'lagged_regressor_dict': {'Temperature': 'auto',
  'Easter': 'auto',
  'Marketing': 'auto'}}

In [68]:
autoregression = {'autoreg_dict': 'auto'}
autoregression

{'autoreg_dict': 'auto'}

In [69]:
custom = {'fit_algorithm_dict': [{'fit_algorithm': 'linear'},
                                 {'fit_algorithm': 'ridge'},
                                 {'fit_algorithm': 'gradient_boosting'}]
        }
custom                         

{'fit_algorithm_dict': [{'fit_algorithm': 'linear'},
  {'fit_algorithm': 'ridge'},
  {'fit_algorithm': 'gradient_boosting'}]}

## Model Building

In [70]:
model = ModelComponentsParam(growth=growth,
                             seasonality=seasonality,
                             events=events,
                             changepoints=changepoints,
                             regressors=regressors,
                             lagged_regressors=lagged_regressors,
                             autoregression=autoregression,
                             custom=custom)

In [71]:
evaluation_metric = EvaluationMetricParam(cv_selection_metric=
                    EvaluationMetricEnum.RootMeanSquaredError.name)

In [72]:
evaluation_period = EvaluationPeriodParam(
    cv_min_train_periods= df.shape[0]-211,
    cv_expanding_window=True,
    cv_max_splits=50,
    cv_periods_between_splits=16)

In [73]:
config = ForecastConfig(
    model_template = ModelTemplateEnum.SILVERKITE.name,
    forecast_horizon=31,
    metadata_param=metadata,
    model_components_param=model,
    evaluation_metric_param=evaluation_metric,
    evaluation_period_param=evaluation_period)

In [74]:
forecaster = Forecaster()
result = forecaster.run_forecast_config(df = df,
                                        config = config)

Fitting 8 folds for each of 9 candidates, totalling 72 fits



The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly'

In [76]:
cv_results = summarize_grid_search_results(
    grid_search=result.grid_search,
    decimals=2,
    score_func = EvaluationMetricEnum.RootMeanSquaredError.name)

cv_results['params'] = cv_results['params'].astype(str)
cv_results.set_index('params', drop=True, inplace=True)
cv_results

Unnamed: 0_level_0,rank_test_CORR,rank_test_R2,rank_test_MSE,rank_test_RMSE,rank_test_MAE,rank_test_MedAE,rank_test_MAPE,rank_test_MedAPE,rank_test_sMAPE,rank_test_Q80,...,std_test_OutsideTolerance5p,split0_train_OutsideTolerance5p,split1_train_OutsideTolerance5p,split2_train_OutsideTolerance5p,split3_train_OutsideTolerance5p,split4_train_OutsideTolerance5p,split5_train_OutsideTolerance5p,split6_train_OutsideTolerance5p,split7_train_OutsideTolerance5p,std_train_OutsideTolerance5p
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",4,1,1,1,1,2,1,2,1,5,...,0.06,0.45,0.45,0.43,0.43,0.43,0.44,0.44,0.43,0.01
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",2,3,2,2,2,1,3,1,2,4,...,0.07,0.45,0.45,0.43,0.43,0.42,0.44,0.43,0.43,0.01
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",3,2,3,3,3,3,2,3,3,6,...,0.06,0.45,0.45,0.43,0.44,0.43,0.44,0.43,0.43,0.01
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",5,6,6,6,6,6,5,6,6,9,...,0.04,0.45,0.45,0.44,0.44,0.44,0.44,0.44,0.44,0.01
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",1,4,4,4,4,4,4,4,4,7,...,0.06,0.45,0.45,0.43,0.43,0.44,0.43,0.43,0.44,0.01
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",6,5,5,5,5,5,6,5,5,8,...,0.04,0.45,0.45,0.44,0.43,0.44,0.44,0.43,0.44,0.01
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",9,7,7,8,7,7,7,7,7,2,...,0.11,0.37,0.36,0.36,0.36,0.36,0.37,0.37,0.36,0.0
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",7,9,9,9,9,9,9,9,9,3,...,0.11,0.37,0.36,0.36,0.36,0.37,0.36,0.37,0.36,0.0
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",8,8,8,7,8,8,8,8,8,1,...,0.11,0.37,0.36,0.36,0.36,0.35,0.36,0.37,0.36,0.01


In [77]:
cv_results[["rank_test_RMSE", "mean_test_RMSE",
            "param_estimator__fit_algorithm_dict",
            "param_estimator__growth_term"]]

Unnamed: 0_level_0,rank_test_RMSE,mean_test_RMSE,param_estimator__fit_algorithm_dict,param_estimator__growth_term
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",1,47.3,{'fit_algorithm': 'linear'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",2,47.42,{'fit_algorithm': 'linear'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",3,47.43,{'fit_algorithm': 'linear'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",6,48.4,{'fit_algorithm': 'ridge'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",4,47.93,{'fit_algorithm': 'ridge'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",5,48.39,{'fit_algorithm': 'ridge'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",8,56.54,{'fit_algorithm': 'gradient_boosting'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",9,56.62,{'fit_algorithm': 'gradient_boosting'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",7,56.53,{'fit_algorithm': 'gradient_boosting'},sqrt


In [83]:
result.backtest.plot()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [88]:
from greykite.common.constants import SEASONALITY_REGEX
result.forecast.plot_components()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [90]:
df

Unnamed: 0,Date,y,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2015,720.000885,0,0,0,3.68,41.305
1,1/2/2015,581.276773,0,0,0,4.73,131.574
2,1/3/2015,754.117039,0,0,0,7.23,162.700
3,1/4/2015,622.252774,0,0,0,10.96,160.281
4,1/5/2015,785.373319,0,0,0,6.92,51.077
...,...,...,...,...,...,...,...
2218,1/27/2021,,0,0,0,3.33,39.664
2219,1/28/2021,,0,0,0,1.67,195.314
2220,1/29/2021,,0,0,0,-2.78,235.894
2221,1/30/2021,,0,0,0,1.11,152.752


In [91]:
forecast = result.forecast.df[['Date', 'forecast']]
forecast_silverkite = forecast.iloc[-len(future_df):,:]

forecast_silverkite

Unnamed: 0,Date,forecast
2192,2021-01-01,750.528138
2193,2021-01-02,833.85973
2194,2021-01-03,752.893998
2195,2021-01-04,897.355507
2196,2021-01-05,778.988897
2197,2021-01-06,913.949976
2198,2021-01-07,825.744425
2199,2021-01-08,754.156193
2200,2021-01-09,821.35175
2201,2021-01-10,736.587352
