In [None]:
import math, itertools, uuid
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the
# old 'seaborn-darkgrid' alias was removed afterwards, so pick whichever name this
# installation provides (falling back to the default style if neither exists).
_DARKGRID_STYLES = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plt.style.use(next((s for s in _DARKGRID_STYLES if s in plt.style.available), 'default'))

In [None]:
import statsmodels.api as sm
import statsmodels.discrete.discrete_model as dm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from statsmodels.tsa.deterministic import TimeTrend, Seasonality
from statsmodels.tools.eval_measures import rmse, mse, medianabs, meanabs, iqr

In [None]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
# Ignore statsmodels ConvergenceWarning from here on: the cells below fit many models in
# loops, and a non-converged fit is simply kept in the pool rather than reported.
warnings.simplefilter('ignore', ConvergenceWarning)

In [None]:
# Data source: NYC Open Data, "Bicycle Counts for East River Bridges (Historical)"
# https://data.cityofnewyork.us/Transportation/Bicycle-Counts-for-East-River-Bridges-Historical-/gua4-p9wg

In [None]:
# Load the bridge counts: the first row of the file holds the column names and the
# 'Date' column becomes the row index.
bikes = pd.read_csv(
    'bikes.csv',
    header=0,
    index_col='Date',
)

In [None]:
# Re-wrap the index as a DatetimeIndex; freq='infer' asks pandas to detect the sampling
# frequency from the dates themselves.
bikes.index = pd.DatetimeIndex(bikes.index, freq='infer')

In [None]:
bikes.shape

In [None]:
bikes.head()

In [None]:
bikes.tail()

In [None]:
# Line chart of every column of the daily frame, one line per column.
ax = bikes.plot(lw=1, figsize=(9, 6))
ax.set_title('Daily bikes (all bridges)')

In [None]:
# Sum every column over a 4-row rolling window, keep only every 4th window (step=4),
# then move the date index into a regular column so the rows are numbered 0..n-1.
# NOTE(review): the window evaluated at the very first row presumably holds fewer than
# 4 observations and would therefore be NaN — confirm, and drop that row if so.
bikes_roll = bikes.rolling(window=4, step=4).sum().reset_index()


In [None]:
# bikes_roll holds the dates as an ordinary 'Date' column (it was built with
# reset_index()), so name it explicitly as the x-axis: the chart then gets a date axis
# instead of row numbers, and the dates are not treated as one more series to draw.
ax = bikes_roll.plot(x='Date', figsize=(9, 6), lw=1)
ax.set_title('4-day bikes (all bridges)')

# Forecasting pipeline

In [None]:
def naive_forecast(series_train, series_test, h=1, period=1, use_seasonal=True):
    """Naive / seasonal-naive forecast of the next ``h`` observations.

    Parameters
    ----------
    series_train : pandas.Series (or 1-D array-like)
        Observed history; only its trailing values are used.
    series_test : pandas.Series
        Only its index is used: the forecast is labelled with the first ``h``
        entries of ``series_test.index``.
    h : int
        Number of steps to forecast.
    period : int
        Seasonal period. With ``period == 1`` the plain naive forecast is used.
    use_seasonal : bool
        If False, always use the plain naive forecast regardless of ``period``.

    Returns
    -------
    pandas.Series
        Forecast values. Its length is ``h``, or ``len(series_test)`` when the
        test series is shorter than ``h`` (same truncation rule as
        ``simple_reg_forecast``).

    Raises
    ------
    ValueError
        If ``series_train`` is empty.
    """
    history = np.asarray(series_train)
    if len(history) == 0:
        raise ValueError("series_train must contain at least one observation")

    if period == 1 or not use_seasonal:
        # plain naive: repeat the last observed value
        values = np.repeat(history[-1:], h)
    else:
        # seasonal naive: repeat the last observed cycle until h values are available
        last_cycle = history[(len(history) - period):]
        n_cycles = int(np.ceil(h / period))
        values = np.tile(last_cycle, n_cycles)[:h]

    # Label with at most h test positions; assigning the whole test index used to raise
    # whenever series_test was not exactly h long.
    target_index = series_test.index[:h]
    return pd.Series(values[:len(target_index)], index=target_index)

In [None]:
def make_trend_dummies(series_train, period=1,
                       use_trend=True, use_seasonal=True,
                       use_mult_seas=False):
    """Build the deterministic regressors for the index of ``series_train``.

    Only ``series_train.index`` is used. The result always contains the
    ``TimeTrend`` columns (constant only, or constant plus linear trend when
    ``use_trend`` is set). When ``use_seasonal`` is set and ``period > 1`` it also
    contains seasonal dummies ``S2 .. S<period>`` (``S1`` is dropped as the
    reference category) and, if both ``use_mult_seas`` and ``use_trend`` are set,
    one dummy*trend interaction column per retained dummy.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``series_train.index``.
    """
    idx = series_train.index

    # order=1 -> constant + linear trend; order=0 -> just a constant, i.e. overall series mean
    design = TimeTrend(constant=True, order=1 if use_trend else 0).in_sample(idx)

    if not (use_seasonal and period > 1):
        return design

    dummies = Seasonality(period).in_sample(idx)
    dummies.columns = [f'S{k}' for k in range(1, period + 1)]
    dummies = dummies.drop(columns='S1')

    pieces = [design, dummies]
    if use_mult_seas and use_trend:
        # interaction effects: each retained seasonal dummy multiplied by the trend
        interactions = pd.DataFrame(
            [dummies[f'S{k}'] * design['trend'] for k in range(2, period + 1)]
        ).transpose()
        pieces.append(interactions)

    return pd.concat(pieces, axis=1)


In [None]:
def simple_reg_forecast(series_train, series_test, h=1, period=1,
                        use_trend=True, use_seasonal=True, use_mult_seas=False):
    """Forecast with an OLS regression on deterministic trend / seasonal terms.

    Parameters
    ----------
    series_train : pandas.Series
        Observations the regression is fitted on.
    series_test : pandas.Series
        Only its index is used: the forecast is labelled with the first ``h``
        entries of ``series_test.index``.
    h : int
        Number of steps to forecast (capped at ``len(series_test)``).
    period, use_trend, use_seasonal, use_mult_seas
        Passed through to ``make_trend_dummies``.

    Returns
    -------
    pandas.Series
        Predictions for the steps immediately following ``series_train``.
    """
    n_train = len(series_train)
    target_index = series_test.index[:h]
    n_ahead = len(target_index)

    # Build ONE design matrix covering training rows + forecast rows and split it.
    # Generating the forecast rows separately from the test index restarted the trend
    # at 1 and the seasonal cycle at its first phase, so the regression was evaluated
    # at the wrong points in time.
    placeholder = pd.Series(0.0, index=pd.RangeIndex(n_train + n_ahead))
    full_design = make_trend_dummies(placeholder, period=period,
                                     use_trend=use_trend, use_seasonal=use_seasonal,
                                     use_mult_seas=use_mult_seas)

    exog = full_design.iloc[:n_train].set_axis(series_train.index, axis=0)
    exog_predict = full_design.iloc[n_train:].set_axis(target_index, axis=0)

    res = sm.OLS(series_train, exog).fit()
    return res.predict(exog_predict)

## Generate forecast pool

In [None]:
h = 5     # forecast horizon: number of steps every model predicts
per = 7   # seasonal period handed to the naive and regression forecasters
          # NOTE(review): rows of bikes_roll are 4-row sums, so 7 rows is not one week — confirm intent

tr_len = 43  # number of leading rows used for fitting
series_train = bikes_roll.Brooklyn_Bridge.iloc[:tr_len]
series_val = bikes_roll.Brooklyn_Bridge.iloc[tr_len:(tr_len+h)]  # the h rows right after the training span
series_test = bikes_roll.Brooklyn_Bridge.iloc[(tr_len+h):(tr_len+h+h)]  # the h rows after the validation span

# descriptive labels stored with every pool entry
series_name = 'bikes_roll.Brooklyn_Bridge'
series_target = 'raw_count'

# one dict per fitted model/configuration; the cells below append to it
forecast_pool = []

In [None]:
series_train.shape

### naive forecasts

In [None]:
# Two naive benchmarks: repeat the last observation (use_seasonal=False) and repeat the
# last seasonal cycle (use_seasonal=True).
# The forecasts cover the h steps right after series_train, i.e. the validation window
# they are scored against, so they are labelled with series_val's index. Labelling them
# with series_test shifted them h steps away from the ARIMA/SES/Holt forecasts.
for seasonal_flag in (False, True):
    tmp_dict = {'runUUID': uuid.uuid1(),
                'seriesName': series_name,
                'seriesPeriod': per,
                'seriesTarget': series_target,
                'modelFamily': 'Naive',
                'exogX': None,
                'modelParams': {'use_seasonal': seasonal_flag},
                'forecastHorizon': h,
                'forecastResult': naive_forecast(series_train, series_val, h, per,
                                                 use_seasonal=seasonal_flag),
               }
    forecast_pool.append(tmp_dict)

In [None]:
len(forecast_pool)

### Simple regression models
Deterministic linear trend and seasonal dummy regressors, fitted by OLS
(errors assumed uncorrelated)

In [None]:
# Regression specifications, from simplest to richest. Each dict is both recorded as
# modelParams and passed on to simple_reg_forecast.
reg_specs = [
    # just the series level average
    {'use_trend': False, 'use_seasonal': False, 'use_mult_seas': False},
    # linear trend regression
    {'use_trend': True, 'use_seasonal': False, 'use_mult_seas': False},
    # constant + seasonal dummies
    {'use_trend': False, 'use_seasonal': True, 'use_mult_seas': False},
    # linear trend with seasonal dummies
    {'use_trend': True, 'use_seasonal': True, 'use_mult_seas': False},
    # linear trend, seasonal dummies, and interaction effects trend*dummies
    {'use_trend': True, 'use_seasonal': True, 'use_mult_seas': True},
]

# The forecasts cover the h steps right after series_train, i.e. the validation window
# they are scored against, so they are labelled with series_val's index rather than
# series_test's.
for spec in reg_specs:
    tmp_dict = {'runUUID': uuid.uuid1(),
                'seriesName': series_name,
                'seriesPeriod': per,
                'seriesTarget': series_target,
                'modelFamily': 'Regression',
                'exogX': None,
                'modelParams': dict(spec),
                'forecastHorizon': h,
                'forecastResult': simple_reg_forecast(series_train, series_val,
                                                      h=h, period=per, **spec),
               }
    forecast_pool.append(tmp_dict)

In [None]:
len(forecast_pool)

### ARIMA forecasts

In [None]:
# Candidate orders: every (p, d, q) with p, q in 0..4 and d in 0..1, except the two
# combinations that contain neither AR nor MA terms.
p = np.arange(0,5)
q = np.arange(0,5)
d = np.arange(0,2)
excluded_orders = {(0, 0, 0), (0, 1, 0)}
# plain Python ints keep modelParams free of numpy scalar types
orders = [tuple(int(v) for v in o) for o in itertools.product(p, d, q)]
orders = [o for o in orders if o not in excluded_orders]

for o in orders:
    tmp_dict = {'runUUID': uuid.uuid1(),
                'seriesName': series_name,
                'seriesPeriod': per,
                'seriesTarget': series_target,
                'modelFamily': 'ARIMA',
                'exogX': None,
                'modelParams': {'p':o[0], 'd':o[1], 'q':o[2]},
                'forecastHorizon': h,
               }

    try:
        mod = ARIMA(series_train, order=o,
                    enforce_invertibility=False,
                    enforce_stationarity=False)
        res = mod.fit(method_kwargs={'maxiter':1000, "warn_convergence": False})
        tmp_dict['forecastResult'] = res.forecast(h)

    except Exception as err:
        # Keep the pool entry (forecastResult=None marks the failure) but say so;
        # `except Exception` also lets KeyboardInterrupt stop a long run.
        warnings.warn(f'ARIMA{o} failed and is recorded without a forecast: {err!r}',
                      RuntimeWarning)
        tmp_dict['forecastResult'] = None

    forecast_pool.append(tmp_dict)

### Simple exponential smoothing (SES)

In [None]:
# Smoothing levels 0.1 .. 0.9; rounding removes float noise such as 0.30000000000000004
# so the values recorded in modelParams are the intended grid points.
alpha = np.round(np.arange(0.1,1.0,0.1), 1)
mod  = SimpleExpSmoothing(series_train, initialization_method="heuristic")

for a in alpha:
    a = float(a)
    tmp_dict = {'runUUID': uuid.uuid1(),
                'seriesName': series_name,
                'seriesPeriod': per,
                'seriesTarget': series_target,
                'modelFamily': 'SES',
                'exogX': None,
                'modelParams': {'alpha':a, 'trend': 'constant'},
                'forecastHorizon': h,
               }
    try:
        res  = mod.fit(smoothing_level=a)
        tmp_dict['forecastResult'] = res.forecast(h)

    except Exception as err:
        # Keep the pool entry (forecastResult=None marks the failure) but say so.
        warnings.warn(f'SES with alpha={a} failed and is recorded without a forecast: {err!r}',
                      RuntimeWarning)
        tmp_dict['forecastResult'] = None

    forecast_pool.append(tmp_dict)

In [None]:
len(forecast_pool)

### Holt's linear trend method (with and without damped trend)

In [None]:
# Grid of fixed smoothing parameters; rounding removes float noise from np.arange so
# the values recorded in modelParams are the intended grid points.
alpha = np.round(np.arange(0.05,.401,0.05), 2)
beta = np.round(np.arange(0.05,0.201,0.05), 2)

params = [(float(lvl), float(trd)) for lvl, trd in itertools.product(alpha, beta)]
mod  = Holt(series_train, initialization_method="estimated")
modd  = Holt(series_train, damped_trend=True, initialization_method="estimated")

for a in params:
    # For every (alpha, beta) pair add the plain model first, then the damped one.
    # No damping value is passed to fit(), so it is left to the fitting routine.
    for damped, model in ((False, mod), (True, modd)):
        tmp_dict = {'runUUID': uuid.uuid1(),
                    'seriesName': series_name,
                    'seriesPeriod': per,
                    'seriesTarget': series_target,
                    'modelFamily': 'Holt',
                    'exogX': None,
                    'modelParams': {'alpha':a[0], 'beta':a[1], 'trend': 'additive', 'damped_trend':damped},
                    'forecastHorizon': h,
                   }
        try:
            res = model.fit(smoothing_level=a[0], smoothing_trend=a[1])
            tmp_dict['forecastResult'] = res.forecast(h)

        except Exception as err:
            # Keep the pool entry (forecastResult=None marks the failure) but say so.
            warnings.warn(f'Holt (damped={damped}) with alpha={a[0]}, beta={a[1]} failed '
                          f'and is recorded without a forecast: {err!r}', RuntimeWarning)
            tmp_dict['forecastResult'] = None

        forecast_pool.append(tmp_dict)

In [None]:
forecast_df = pd.DataFrame(forecast_pool)
forecast_df.shape

In [None]:
forecast_df.head()

## performance metrics

In [None]:
# Score every pooled forecast against the validation window (rmse, medianabs, meanabs
# and iqr come from statsmodels.tools.eval_measures, imported at the top).
# Entries whose fit failed carry forecastResult=None; they get NaN metrics instead of
# making the metric functions raise.
metric_funcs = {'RMSE': rmse, 'MedianABS': medianabs, 'MeanABS': meanabs, 'IQRE': iqr}

for row in range(len(forecast_df.index)):
    forecast = forecast_df.loc[row, 'forecastResult']
    no_forecast = forecast is None or (np.isscalar(forecast) and pd.isna(forecast))
    for metric_name, metric_func in metric_funcs.items():
        forecast_df.loc[row, metric_name] = (
            np.nan if no_forecast else metric_func(forecast, series_val))
    


In [None]:
forecast_df.tail()

In [None]:
forecast_df.columns

In [None]:
forecast_df[['RMSE', 'MeanABS', 'MedianABS', 'IQRE']].plot(figsize=(9,7))

In [None]:
forecast_df.sort_values(by='RMSE').head(10)

## forecast combination

In [None]:
# Equal-weight combinations of the k best individual forecasts, ranked once by RMSE
# and once by MedianABS.
#
# * Only individual models are ranked: rows already marked 'Combination' are set aside,
#   so a combination never feeds into a later one and re-running this cell does not
#   pile up duplicates.
# * Forecasts are averaged position by position, which is independent of how each
#   model labelled its forecast index.
# * The metrics are computed on the averaged forecast itself, against series_val: every
#   pooled forecast covers the h steps right after series_train, which is the
#   validation window. NOTE(review): this is the same window used for ranking, so the
#   scores are optimistic; an honest hold-out score needs models refitted on
#   train+validation and forecast over series_test.
base_df = forecast_df[forecast_df['modelFamily'] != 'Combination']
K = len(base_df.index)
usable_df = base_df[base_df['forecastResult'].notna()]

combo_rows = []
for k in [3,5,10,20,]:
    for metric in ['RMSE', 'MedianABS']:
        top_k = (usable_df.dropna(subset=[metric])
                          .sort_values(by=metric)['forecastResult']
                          .head(k))
        if top_k.empty:
            continue

        avg_forecast = pd.Series(
            np.mean([np.asarray(f, dtype=float) for f in top_k], axis=0),
            index=series_val.index)

        combo_rows.append({'runUUID': uuid.uuid1(),
                'seriesName': series_name,
                'seriesPeriod': per,
                'seriesTarget': series_target,
                'modelFamily': 'Combination',
                'exogX': None,
                'modelParams': {'weight':'equal', 'topk':k, 'metric':metric},
                'forecastHorizon': h, 'forecastResult': avg_forecast,
                'RMSE': rmse(avg_forecast, series_val),
                'MedianABS': medianabs(avg_forecast, series_val),
                'MeanABS': meanabs(avg_forecast, series_val),
                'IQRE': iqr(avg_forecast, series_val),
                 })

forecast_df = pd.concat([base_df, pd.DataFrame(combo_rows)], ignore_index=True)

In [None]:
ax = series_train.plot( figsize=(10,7))

# Pick the rows by model family instead of fixed "last 8 / last 4" offsets, so the
# colouring stays right if the number of combination rows changes.
is_combo = forecast_df['modelFamily'] == 'Combination'
base_forecasts = forecast_df.loc[~is_combo, 'forecastResult']
combo_forecasts = forecast_df.loc[is_combo, 'forecastResult']
half = len(combo_forecasts) // 2

# gray: individual models; red: first half of the combinations (smaller k);
# purple: second half (larger k)
for forecasts, color in ((base_forecasts, 'gray'),
                         (combo_forecasts.iloc[:half], 'red'),
                         (combo_forecasts.iloc[half:], 'purple')):
    for f in forecasts:
        if f is not None:
            f.plot(ax=ax, color=color)

series_val.plot(ax=ax, color='green')

# draw the last row once more in red so it sits on top of the other lines
last_forecast = forecast_df['forecastResult'].iloc[-1]
if last_forecast is not None:
    last_forecast.plot(ax=ax, color='red')
#plt.xlim(pd.to_datetime('2017-03-27'), pd.to_datetime('2017-07-17'))

In [None]:
len(forecast_df.forecastResult)

In [None]:
forecast_df.sort_values(by='MedianABS').head(10)

In [None]:
forecast_df[forecast_df['modelFamily']=='Combination'].sort_values(by='RMSE').head(10)

In [None]:
forecast_df.shape

In [None]:
# Re-score every row of forecast_df, combination rows included, against the validation
# window so that all rows are measured on the same data (rmse, medianabs, meanabs and
# iqr come from statsmodels.tools.eval_measures, imported at the top).
# Entries without a forecast (forecastResult=None) get NaN metrics instead of making
# the metric functions raise.
metric_funcs = {'RMSE': rmse, 'MedianABS': medianabs, 'MeanABS': meanabs, 'IQRE': iqr}

for row in range(len(forecast_df.index)):
    forecast = forecast_df.loc[row, 'forecastResult']
    no_forecast = forecast is None or (np.isscalar(forecast) and pd.isna(forecast))
    for metric_name, metric_func in metric_funcs.items():
        forecast_df.loc[row, metric_name] = (
            np.nan if no_forecast else metric_func(forecast, series_val))
    


In [None]:
forecast_df[['RMSE', 'MeanABS', 'MedianABS', 'IQRE']].plot(figsize=(9,7))