In [1]:
import ipywidgets as widgets
from ipywidgets import HBox, VBox, Tab, Label, Checkbox, Button
from ipywidgets import FloatSlider, IntSlider, Dropdown, SelectMultiple
from IPython.display import display

import matplotlib.pyplot as plt
import matplotlib; matplotlib.rcParams.update({'font.size': 14})
import seaborn as sns; sns.set_style('whitegrid')
import numpy as np

import pandas as pd
from datetime import datetime

In [2]:
# Load the time-series table and parse its date column into datetimes.
df = pd.read_parquet("cmm_erdos_bootcamp_2020_timeseries.pq", engine='pyarrow')
df['date_val'] = pd.to_datetime(df['date_val'])

# First day of the test period for each forecast-horizon label
# (rows on or after this date are treated as test data by get_train_test).
test_start_dates = dict([
    ('1 month', datetime(2019, 12, 1)),
    ('3 months', datetime(2019, 10, 1)),
    ('6 months', datetime(2019, 7, 1)),
    ('1 year', datetime(2019, 1, 1)),
])

def get_train_test(test_start_date):
    """Split the global ``df`` into training and test data at ``test_start_date``.

    Rows whose ``date_val`` is strictly before ``test_start_date`` form the
    training part; rows on or after it form the test part.

    Returns
    -------
    train_sets : dict
        Maps 'Volume A' / 'Volume B' / 'Volume C' to the training values
        of the matching ``volume_*`` column.
    test_sets : dict
        Same keys, holding the test values.
    test_dates : pandas.Series
        The ``date_val`` column of the test rows.
    """
    column_for = {'Volume A': 'volume_A',
                  'Volume B': 'volume_B',
                  'Volume C': 'volume_C'}

    train_rows = df.loc[df.date_val < test_start_date]
    test_rows = df.loc[df.date_val >= test_start_date]

    def as_arrays(frame):
        # One plain array per volume label.
        return {label: frame[column].values for label, column in column_for.items()}

    return as_arrays(train_rows), as_arrays(test_rows), test_rows.date_val

In [3]:
def mape(actual, prediction):
    """Return the mean absolute percentage error as a fraction (not multiplied by 100).

    Each error is taken relative to the corresponding ``actual`` value.
    """
    relative_error = (actual - prediction) / actual
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error)

def normalized(vals):
    """Centre ``vals`` on its mean and scale by its range (max minus min)."""
    centered = vals - vals.mean()
    value_range = vals.max() - vals.min()
    return centered / value_range

# Explore Data

In [4]:
from statsmodels.tsa.seasonal import STL

## Trend

In [5]:
# Matplotlib colour-cycle code used for each volume series.
colors = dict([('Volume A', 'C1'),
               ('Volume B', 'C2'),
               ('Volume C', 'C3')])

def plot_trends(volumes, normalize=True, reg_holidays=True, holiday_dates=True):
    """Plot the STL trend component of the selected volume series.

    Parameters
    ----------
    volumes : sequence of str
        Any of 'Volume A', 'Volume B', 'Volume C'. Nothing is drawn when empty.
    normalize : bool
        If True, pass each trend through ``normalized`` before plotting.
    reg_holidays : bool
        If True, overwrite every holiday observation with the value found
        7 rows earlier (7 rows later for the first 7 rows) before decomposing.
    holiday_dates : bool
        If True, draw a dashed vertical line at every holiday date.
    """
    if len(volumes) == 0:
        return

    column_for = {'Volume A': 'volume_A',
                  'Volume B': 'volume_B',
                  'Volume C': 'volume_C'}

    holiday_mask = df.is_holiday == 1
    # Row positions rather than index labels, so the substitution below stays
    # correct even if df does not carry a default 0..n-1 index.
    holiday_rows = np.flatnonzero(holiday_mask.to_numpy())

    # Decompose only the series that were actually requested.
    trends = {}
    for volume in volumes:
        series = df[column_for[volume]].values.copy()
        if reg_holidays:
            # Sequential on purpose: a holiday may copy from a row that was
            # itself replaced earlier in this loop.
            for row in holiday_rows:
                source_row = row - 7 if row >= 7 else row + 7
                series[row] = series[source_row]
        trends[volume] = STL(series, period=7).fit().trend

    plt.figure(figsize=(20, 5))
    for volume in volumes:
        trend = trends[volume]
        if normalize:
            trend = normalized(trend)
        plt.plot(df.date_val, trend, label=str(volume), c=colors[volume])
    plt.legend()

    if holiday_dates:
        for holiday in df.date_val[holiday_mask]:
            plt.axvline(holiday, lw=1, ls='--', c='k')

    plt.tight_layout()
    plt.show()

# Controls for the trend plot; each widget feeds the plot_trends argument
# named in the mapping passed to interactive_output below.
normalize_w = Checkbox(value=True, description="Normalize")
reg_holidays_w = Checkbox(value=True, description="Regularize Holidays")
holiday_dates_w = Checkbox(value=True, description="Show Holidays")
volumes_w = SelectMultiple(options=['Volume A', 'Volume B', 'Volume C'], value=['Volume A'])

# Output area that calls plot_trends with the current widget values.
plot_trends_w = widgets.interactive_output(
    plot_trends,
    {
        'normalize': normalize_w,
        'volumes': volumes_w,
        'reg_holidays': reg_holidays_w,
        'holiday_dates': holiday_dates_w,
    },
)

# Layout: a row of controls (checkbox column, label, volume selector) above the plot.
explore_trends_subtab = VBox([
    HBox([
        VBox([normalize_w, reg_holidays_w, holiday_dates_w]),
        Label(value='Select Volumes: '),
        volumes_w,
    ]),
    plot_trends_w,
])

## Seasonality

In [6]:
# Matplotlib colour-cycle code used for each volume series.
colors = dict([('Volume A', 'C1'),
               ('Volume B', 'C2'),
               ('Volume C', 'C3')])

def plot_seasonality(volumes, normalize=True, reg_holidays=True, holiday_dates=True):
    """Plot the STL seasonal component of the selected volume series.

    Parameters
    ----------
    volumes : sequence of str
        Any of 'Volume A', 'Volume B', 'Volume C'. Nothing is drawn when empty.
    normalize : bool
        If True, pass each seasonal component through ``normalized`` before plotting.
    reg_holidays : bool
        If True, overwrite every holiday observation with the value found
        7 rows earlier (7 rows later for the first 7 rows) before decomposing.
    holiday_dates : bool
        If True, draw a dashed vertical line at every holiday date.
    """
    if len(volumes) == 0:
        return

    column_for = {'Volume A': 'volume_A',
                  'Volume B': 'volume_B',
                  'Volume C': 'volume_C'}

    holiday_mask = df.is_holiday == 1
    # Row positions rather than index labels, so the substitution below stays
    # correct even if df does not carry a default 0..n-1 index.
    holiday_rows = np.flatnonzero(holiday_mask.to_numpy())

    # Extract the seasonal component with STL, only for the requested series.
    seasons = {}
    for volume in volumes:
        series = df[column_for[volume]].values.copy()
        if reg_holidays:
            # Sequential on purpose: a holiday may copy from a row that was
            # itself replaced earlier in this loop.
            for row in holiday_rows:
                source_row = row - 7 if row >= 7 else row + 7
                series[row] = series[source_row]
        seasons[volume] = STL(series, period=7).fit().seasonal

    plt.figure(figsize=(20, 5))
    for volume in volumes:
        season = seasons[volume]
        if normalize:
            season = normalized(season)
        plt.plot(df.date_val, season, label=str(volume), c=colors[volume])
    plt.legend()

    if holiday_dates:
        for holiday in df.date_val[holiday_mask]:
            plt.axvline(holiday, lw=1, ls='--', c='k')

    plt.tight_layout()
    plt.show()

# Controls for the seasonality plot; each widget feeds the plot_seasonality
# argument named in the mapping passed to interactive_output below.
normalize_w = Checkbox(description="Normalize", value=True)
reg_holidays_w = Checkbox(description="Regularize Holidays", value=True)
holiday_dates_w = Checkbox(description="Show Holidays", value=True)
volumes_w = SelectMultiple(options=['Volume A', 'Volume B', 'Volume C'], value=['Volume A'])

# Output area that calls plot_seasonality with the current widget values.
plot_seasonality_w = widgets.interactive_output(
    plot_seasonality,
    {
        'normalize': normalize_w,
        'volumes': volumes_w,
        'reg_holidays': reg_holidays_w,
        'holiday_dates': holiday_dates_w,
    },
)

# Layout: a row of controls (checkbox column, label, volume selector) above the plot.
explore_seasonality_subtab = VBox([
    HBox([
        VBox([normalize_w, reg_holidays_w, holiday_dates_w]),
        Label(value="Select Volumes: "),
        volumes_w,
    ]),
    plot_seasonality_w,
])

## Causality

In [7]:
# Empty container: the causality sub-tab has no content yet.
explore_causality_subtab = VBox(children=[])

## Daily Trends

In [8]:
# Empty container: the daily-trend sub-tab has no content yet.
explore_daily_subtab = VBox(children=[])

## Monthly Trends

In [9]:
# Empty container: the monthly-trend sub-tab has no content yet.
explore_monthly_subtab = VBox(children=[])

## Yearly Trends

In [10]:
# Empty container: the yearly-trend sub-tab has no content yet.
explore_yearly_subtab = VBox(children=[])

In [11]:
# Assemble the data-exploration tab strip; titles are assigned by position,
# in the same order as the children.
explore_tab = Tab(children=[
    explore_trends_subtab,
    explore_seasonality_subtab,
    explore_causality_subtab,
    explore_daily_subtab,
    explore_monthly_subtab,
    explore_yearly_subtab,
])
for _tab_index, _tab_title in enumerate(["Trend",
                                         "Seasonal ",
                                         "Causality",
                                         "Daily Trend",
                                         "Monthly Trend",
                                         "Yearly Trend"]):
    explore_tab.set_title(_tab_index, _tab_title)

# Models and Forecasting

## Seasonal Naive Method

In [12]:
# naive seasonal method for benchmarking
def naive_forecast(volume, horizon='1 year', period=7, season_to_repeat=3):
    """Forecast the test window by cyclically repeating one past season of training data.

    The repeated season is the ``period`` consecutive training observations
    that start ``season_to_repeat * period`` observations before the end of
    the training series.

    Parameters
    ----------
    volume : str
        Key into the dictionaries returned by ``get_train_test``.
    horizon : str
        Key of ``test_start_dates`` selecting where the test window begins.
    period : int
        Season length in observations (default 7).
    season_to_repeat : int
        How many seasons back from the end of training to take the pattern
        from (default 3).

    Returns
    -------
    numpy.ndarray
        One forecast value per test observation.

    Raises
    ------
    ValueError
        If ``period`` or ``season_to_repeat`` is not positive, or the
        training series is shorter than ``season_to_repeat * period``.
    """
    if period < 1 or season_to_repeat < 1:
        raise ValueError("period and season_to_repeat must be positive integers")

    train_sets_, test_sets_, _ = get_train_test(test_start_dates[horizon])

    steps = len(test_sets_[volume])
    train = train_sets_[volume]

    lookback = season_to_repeat * period
    if lookback > len(train):
        # A negative start index would silently wrap around to the end of the array.
        raise ValueError(
            f"training series has {len(train)} observations; need at least {lookback}"
        )

    index_base = len(train) - lookback
    indices = index_base + np.arange(0, steps, dtype=int) % period
    return train[indices]

In [13]:
# Abbreviation of each horizon label as it appears in the forecast file names.
horizon_shorter_name = dict([('1 month', '1m'),
                             ('3 months', '3m'),
                             ('6 months', '6m'),
                             ('1 year', '12m')])


def get_sarimax_filename(volume, horizon):
    """Build the relative path of the SARIMAX forecast CSV for a volume and horizon.

    The file name uses the last character of ``volume`` and the abbreviated
    horizon from ``horizon_shorter_name``.
    """
    volume_letter = volume[-1]
    horizon_code = horizon_shorter_name[horizon]
    return f"vasudha/{volume_letter}_{horizon_code}_sarimax.csv"

volumes = ['Volume A', 'Volume B', 'Volume C']
horizons = ['1 month', '3 months', '6 months', '1 year']

# Point forecasts, addressed as sarimax_forecasts[volume][horizon].
sarimax_forecasts = {}

# confidence intervals, addressed the same way
sarimax_lower = {}
sarimax_upper = {}

for volume in volumes:
    sarimax_forecasts[volume] = {}
    sarimax_upper[volume] = {}
    sarimax_lower[volume] = {}

    for horizon in horizons:
        # NOTE(review): the column order forecast/upper/lower is taken from the
        # names= argument below; confirm it against the CSV files.
        csv_df = pd.read_csv(get_sarimax_filename(volume, horizon),
                             header=None, names=["forecast", "upper", "lower"])

        # Store inside the horizon loop so every horizon is kept, not just
        # the last one read for each volume.
        sarimax_forecasts[volume][horizon] = csv_df["forecast"]
        sarimax_upper[volume][horizon] = csv_df["upper"]
        sarimax_lower[volume][horizon] = csv_df["lower"]

In [14]:
# Matplotlib colour-cycle code used for each volume series.
colors = dict([('Volume A', 'C1'),
               ('Volume B', 'C2'),
               ('Volume C', 'C3')])

# Display name of each model -> short code name checked by get_forecast.
models_dic = dict([('Seasonal Naive Method', 'snm'),
                   ('SARIMAX', 'sarimax'),
                   ('sVARMAX', 'svarmax'),
                   ('ARIMA', 'arima')])




def get_forecast(volume, horizon, model):
    """Return the forecast of ``model`` for ``volume`` over the test window of ``horizon``.

    Parameters
    ----------
    volume : str
        Volume label such as 'Volume A'.
    horizon : str
        Horizon label such as '1 year'.
    model : str
        Display name of the model; must be a key of ``models_dic``.

    Returns
    -------
    array-like
        The seasonal-naive forecast, the stored SARIMAX forecast, or an
        all-zero placeholder shaped like the seasonal-naive forecast for any
        other model.
    """
    model_codename = models_dic[model]

    # Compare by value: identity checks against string literals depend on
    # interpreter interning and raise a SyntaxWarning on Python 3.8+.
    if model_codename == "snm":
        return naive_forecast(volume, horizon)
    if model_codename == "sarimax":
        return sarimax_forecasts[volume][horizon]

    # No forecast is wired in for the remaining models; return zeros of the
    # right length so callers can still stack the results.
    return np.zeros_like(naive_forecast(volume, horizon))


def plot_forecast(volumes, horizon, models, merge=False):
    """Plot averaged model forecasts, the test data and the absolute percent error.

    For every selected volume the forecasts of all selected models are
    averaged element-wise and compared with the test data.

    Parameters
    ----------
    volumes : sequence of str
        Volume labels to plot. Nothing is drawn when empty.
    horizon : str
        Key of ``test_start_dates`` selecting the test window.
    models : sequence of str
        Display names of the models to average. Nothing is drawn when empty.
    merge : bool
        If True, forecasts and test data share the top axes (two rows in
        total); otherwise they get separate axes (three rows).
    """
    # An empty model selection would leave nothing to average.
    if len(volumes) == 0 or len(models) == 0:
        return

    _, test_sets, test_date_vals = get_train_test(test_start_dates[horizon])

    ax_height = 3
    num_rows = 2 if merge else 3
    fig, axs = plt.subplots(num_rows, 1, figsize=(15, num_rows * ax_height), sharex=True)
    forecast_ax = axs[0]
    data_ax = axs[0] if merge else axs[1]
    err_ax = axs[-1]

    # Mean error per volume, so the summary covers every plotted series
    # rather than only the last one handled by the loop.
    mean_ape = {}
    for volume in volumes:
        test = test_sets[volume]

        forecasts = np.array([get_forecast(volume, horizon, model) for model in models])
        forecast = np.mean(forecasts, axis=0)
        ape = 100.0 * np.abs(forecast - test) / test
        mean_ape[volume] = np.mean(ape)

        forecast_ax.plot(test_date_vals, forecast, color=colors[volume], ls='--', lw=2,
                         label=f"{volume} (forecast)")
        data_ax.plot(test_date_vals, test, color=colors[volume], lw=1,
                     label=f"{volume} (actual)")
        err_ax.plot(test_date_vals, ape, color=colors[volume], label=volume)

    err_ax.set_ylabel("Absolute Percent Error")

    # Give every axes its own legend, not only the current one.
    for ax in axs:
        ax.legend()

    plt.xticks(rotation=20)
    plt.tight_layout()
    plt.show()

    for volume, value in mean_ape.items():
        print(f"Mean absolute percent error ({volume}): {value:.2f}")

    
 
# Controls for the forecast plot.
horizon_label_w = Label(value="Forecast horizon: ")
horizon_w = Dropdown(value='1 year', options=['1 month', '3 months', '6 months', '1 year'])

volumes_label_w = Label(value="Volumes: ")
volumes_w = SelectMultiple(value=['Volume A'], options=['Volume A', 'Volume B', 'Volume C'])

models_label_w = Label(value="Model(s): ")
models_w = SelectMultiple(value=['Seasonal Naive Method'], options=models_dic.keys())

merge_plots_w = Checkbox(description="Merge forecast and test plots", value=False)

# Output area that calls plot_forecast with the current widget values.
plot_forecast_w = widgets.interactive_output(
    plot_forecast,
    {
        'volumes': volumes_w,
        'horizon': horizon_w,
        'models': models_w,
        'merge': merge_plots_w,
    },
)

# Layout: one row of controls above the plot output.
forecast_subtab = VBox([
    HBox([
        volumes_label_w,
        volumes_w,
        VBox([HBox([horizon_label_w, horizon_w]), merge_plots_w]),
        models_label_w,
        models_w,
    ]),
    plot_forecast_w,
])

# SARIMA

In [15]:
# Tab strip for the modelling section; its single child is the forecast view.
models_tab = Tab(children=[forecast_subtab])
models_tab.set_title(0, "Forecast and Error")

# Summary

In [16]:
# Top-level tab strip: data exploration first, modelling second.
tabs = Tab(children=[explore_tab, models_tab])
for _tab_index, _tab_title in enumerate(["Explore Data", "Models & Forecasts"]):
    tabs.set_title(_tab_index, _tab_title)
display(tabs)

Tab(children=(Tab(children=(VBox(children=(HBox(children=(VBox(children=(Checkbox(value=True, description='Nor…