### Final Model Runs for all stations
- In previous notebooks a hyperparameter Search with cross validation of the basic prophet model with the small set of german-holidays (<20) was performed for each counting-station
- Several stations needed to be excluded, as too many missing values resulted in prophet not being able to run.
- This resulted in 16 stations for which the found hyperparameter values were entered into a table ("../data/hyperparameter_search_complete.csv")

- In the following codeblocks we define functions to apply the best model as found by the hyperparameter search with cross-validation for each station
    - As a proof of concept we train a model for each station with all available data to predict the traffic sum of a single day in March 2022.
    - the model will be trained with all data until the day before the predicted day.
    - the procedure will be repeated for each day of march 2022 to provide a range of predicted dates for the proof of concept in the dashboard
    - As a result, we will save the model objects in a hierarchical dictionary indexed by the stations and the dates
    - Also a dictionary indexed by stations with dataframes containing all predictions for a station and the corresponding previous 7 days will be saved for the final input of the dashboard


In [None]:
### Import modules and necessary objects/functions
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import matplotlib.dates as mdates

from prophet import Prophet
import plotly.express as px
from prophet.plot import add_changepoints_to_plot, plot_cross_validation_metric
from prophet.diagnostics import cross_validation, performance_metrics

from datetime import datetime as dt

import itertools


from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

%matplotlib inline


warnings.filterwarnings('ignore')

import trafficModules as tm

#### Read necessary data
1. DataFrame of hourly traffic counts for each station from 2012-2022
2. DataFrame of hyperparameter values for the final model for each station.

In [None]:
### Hourly traffic counts: the first CSV column is used as index and parsed as dates
traffic_csv = '../data/compiled-zaehlstellen.csv'
df = pd.read_csv(traffic_csv, index_col=0, parse_dates=True)

### Final hyperparameter settings per station, together with the
### station classification
params_csv = "../data/hyperparameter_search_complete.csv"
df_params = pd.read_csv(params_csv)

#### Data-Cleaning of the hyper-parameter values
- Remove stations that are to be excluded because of too many missing values in the daily traffic sums
- Drop unnecessary columns

In [None]:
### Clean df_params
### Station 1054 is listed twice: locate the first of its rows so it can be dropped
first_1054_label = df_params.index[df_params["Zählstelle"] == 1054][0]

df_params = (
    df_params
    .drop(index=first_1054_label)
    .loc[lambda frame: frame["Include"] == 1]  ### keep only Zählstellen for which a model shall be calculated
    .drop(columns=["Include", "Comments"])     ### columns not needed for modelling
    .reset_index(drop=True)                    ### fresh 0..n-1 index, old labels discarded
)

### Strip the quote characters stored around the seasonality mode
df_params["seasonality_mode"] = df_params["seasonality_mode"].str.replace("'", "")


#### Define Function for the preprocessing of traffic data of one station
- In a time-series, the data must have continuous instances for a sequence of equally spaced points in time. Thus missing instances will be replaced with an NaN for the traffic variables
- Daily sums are calculated; the sum of a day is replaced by NaN if more than 2 of its hourly values are NaN

In [None]:
### Function to get daily sums for each station
def daily_sum_station(df, zähl, max_nan_hours=2):
    """Build the daily traffic series of one counting station for Prophet.

    The hourly rows of the station are put on a gap-free hourly grid
    (hours missing from the data become NaN rows) and then aggregated to
    daily sums of ``Gesamt``.  A day with more than ``max_nan_hours``
    missing hourly values gets ``y = NaN`` instead of a sum over
    incomplete data.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly counts of all stations.  Expected to carry a datetime
        index named ``datetime`` and the columns ``Zählstelle``, ``PKW``,
        ``LKW`` and ``Gesamt``.
    zähl : int or str
        Number of the counting station to extract.
    max_nan_hours : int, default 2
        Largest number of missing hourly values a day may have while
        still keeping its daily sum.

    Returns
    -------
    pandas.DataFrame
        One row per day with the columns ``ds`` (day) and ``y`` (daily
        sum, or NaN for days with too many missing hours).
    """
    zähl = str(zähl)  # the query string below needs the station number as text

    ### Filter for one single Zählstelle and insert the hours missing in
    ### the time series as NaN rows.  "h" is the current hourly alias;
    ### the upper-case "H" is deprecated in recent pandas releases.
    hourly = df.query(f'Zählstelle == {zähl}').asfreq(freq="h").reset_index()

    ### Drop unused columns and rename the remaining ones for Prophet
    hourly = hourly.drop(columns=['Zählstelle', 'PKW', 'LKW'])
    hourly = hourly.rename(columns={'Gesamt': 'y', 'datetime': 'ds'})

    ### Flag the hours without a count; summing the flag per day below
    ### yields the number of missing hours of that day
    hourly['NaN'] = hourly['y'].isna()

    ### Daily sums of the traffic and of the missing-hour flag
    daily = hourly.set_index('ds').resample('D').sum()

    ### A day with too many missing hours has no trustworthy sum
    daily.loc[daily['NaN'] > max_nan_hours, 'y'] = np.nan
    daily = daily.drop(columns='NaN')

    return daily.reset_index()

#### Train-Test-Split Function
- As the train-test split changes with each day to be predicted, we need to define a function for splitting accordingly by a provided cutoff-day

In [None]:
### Define Function for time-train test split
def time_split(df, cutoff_1):
    """Split a series into a training and a test part at a cutoff date.

    Rows whose ``ds`` lies on or before ``cutoff_1`` are returned as the
    training part, all rows after it as the test part.  The split works on
    a deep copy of ``df``, so the frame handed in is left untouched.

    Returns the tuple ``(y_train, y_test)``.
    """
    snapshot = df.copy(deep=True)

    # Two explicit masks: up to and including the cutoff / strictly after it
    up_to_cutoff = snapshot.ds <= cutoff_1
    after_cutoff = snapshot.ds > cutoff_1

    return snapshot[up_to_cutoff], snapshot[after_cutoff]

#### Define Cutoff-Days
- these will be given to modelling procedure to split into train/test datasets

In [None]:


### Cutoffs: 32 consecutive days ending on 2022-03-31.
### Neighbouring entries are used as (last training day, predicted day) pairs.
cv_cutoffs = pd.date_range(end=pd.Timestamp("2022-03-31"), periods=32, freq="D")

### Prediction horizon in days: only the next day is predicted
horizon = 1

#### Function to get hyperparameter Values
- from df_params
- for one station.
- indexes the row of the station, and returns a dictionary with the hyperparameter-values
- the dictionary can be passed to the Prophet instance

In [None]:
### Define Function to get hyperparameter values for this station as found by grid search
def get_params_zst(df_params, zähl):
    """Look up the Prophet hyperparameters stored for one counting station.

    Selects the rows of ``df_params`` whose ``Zählstelle`` equals ``zähl``
    and returns the four hyperparameter columns of the first such row as a
    dictionary.  An ``IndexError`` is raised when no row matches.
    """
    hyper_cols = [
        'changepoint_prior_scale',
        'seasonality_prior_scale',
        'holidays_prior_scale',
        'seasonality_mode',
    ]
    is_station = df_params.Zählstelle == zähl
    # One dict per matching row; the first one is the station's setting
    records = df_params.loc[is_station, hyper_cols].to_dict(orient="records")
    return records[0]

#### Comparing the predicted traffic to the previous year
- this enables the user of the dashboard to intuitively interpret the traffic prediction
- A mean of the measured daily sums from the same weekday of the previous year will be calculated
- the difference between the prediction and the mean is divided by the mean to relate the prediction to the mean

In [None]:
### Function for getting the difference from the mean in percentage of the mean
def get_dif_mean_pred(y_train, day_pred, y_hat_):
    """Relate a prediction to the usual traffic on that weekday.

    The last 365 rows of ``y_train`` are taken, the rows falling on the
    same weekday as ``day_pred`` are selected and their ``y`` values are
    averaged.  The result is the difference between ``y_hat_`` and that
    mean, expressed as a fraction of the mean.

    Parameters
    ----------
    y_train : pandas.DataFrame
        Training data with a datetime column ``ds`` and the measured
        values in ``y``.
    day_pred : datetime-like
        Day the prediction was made for; only its weekday is used.
    y_hat_ : float
        Predicted value for ``day_pred``.

    Returns
    -------
    float
        ``(y_hat_ - mean) / mean``; NaN when no mean can be formed (no
        usable values for this weekday) or when the mean is zero.
    """
    day_pred_wd = day_pred.weekday()  # weekday of the day to predict
    y_year = y_train[-365:]  # last 365 rows, i.e. one year of daily data

    # Build the weekday mask from the shortened frame itself so that mask
    # and data always have the same length and index.
    same_weekday = y_year["ds"].dt.weekday == day_pred_wd
    mean_y = y_year.loc[same_weekday, "y"].mean()

    # A relative difference is undefined without a non-zero reference value
    if pd.isna(mean_y) or mean_y == 0:
        return np.nan

    return (y_hat_ - mean_y) / mean_y

#### Core function to train and predict for one day and one station
- Returns the fitted model and a df of `rows_outp` (8) rows including date, y, yhat, date of prediction, and difference to mean

In [None]:
### Function for the actual modelling for one station and one day and prediction in Loop
def model_station_day(params, df_st, cutoff_train, day_pred, rows_outp, country_hol):
    """Fit Prophet on the data up to a cutoff day and predict the next day.

    Parameters
    ----------
    params : dict
        Keyword arguments passed to the ``Prophet`` constructor.
    df_st : pandas.DataFrame
        Daily series of one station with the columns ``ds`` and ``y``.
    cutoff_train : datetime-like
        Last day that is part of the training data.
    day_pred : datetime-like
        Day the prediction is made for.  It is not passed to Prophet (the
        forecast is always the single period following the training data);
        it labels the output and selects the weekday for ``y_dif_mean``.
    rows_outp : int
        Number of trailing rows of the forecast frame to return.
    country_hol : str
        Country code handed to ``Prophet.add_country_holidays``.

    Returns
    -------
    m : Prophet
        The fitted model.
    df_pred : pandas.DataFrame
        The last ``rows_outp`` rows of the forecast (the predicted day is
        the final row) with the added columns ``day_predicted``, ``y``
        (measured value, NaN for the predicted day) and ``y_dif_mean``
        (filled for the predicted day only).
    """
    ### Train on everything up to and including cutoff_train; the rows
    ### after the cutoff are not needed here
    y_train, _ = time_split(df_st, cutoff_train)

    ### Build model and add the country holidays
    m = Prophet(**params, daily_seasonality=False)
    m.add_country_holidays(country_name=country_hol)

    ### Fit inside the trafficModules helper, which presumably silences
    ### the output of the fitting backend
    with tm.suppress_stdout_stderr():
        m.fit(y_train)

    ### Training dates plus one additional period, then the forecast for
    ### all of these dates
    df_pred = m.predict(m.make_future_dataframe(periods=1))

    ### Column for identifying the day for which the model predicted
    df_pred["day_predicted"] = day_pred

    ### Measured values next to the forecast; the predicted day has no
    ### measurement yet.  np.nan is used because the np.NaN alias no longer
    ### exists in NumPy 2.0.
    df_pred["y"] = y_train.y.tolist() + [np.nan]

    ### Shorten as output for dashboard; the previous row labels are kept
    ### in an "index" column
    df_pred = df_pred.tail(rows_outp).reset_index()

    ### Difference of the prediction to the mean traffic of this weekday,
    ### stored on the predicted (last) row only.  Filling the column first
    ### works for any number of returned rows.
    y_hat_ = df_pred.yhat.iloc[-1]
    df_pred["y_dif_mean"] = np.nan
    df_pred.loc[df_pred.index[-1], "y_dif_mean"] = get_dif_mean_pred(y_train, day_pred, y_hat_)

    ### Return model and DataFrame of predictions
    return m, df_pred

#### Function of preprocessing and applying modelling function
- is applied for one station
- gets hyperparameter-values
- preprocesses the target-timeseries:
- applies the model for each day as defined by the range of cutoffs (march 2022)
- saves the necessary part of the resulting data.frame in a list
- returns a dictionary of prophet-model objects (keyed by the predicted day) and the list of dataframes (one df of `rows_outp` (8) rows per predicted day)

In [None]:
### Function for preparing modelling input and start the loop through all cutoffs (dates of one month) for one station
def model_station(zähl, df_params, df, cv_cutoffs, country_hol, rows_outp):
    """Run the day-by-day modelling for one counting station.

    The station's hyperparameters and daily series are prepared once.
    Then every pair of neighbouring entries in ``cv_cutoffs`` is used as
    (last training day, day to predict) for ``model_station_day``.

    Returns
    -------
    dict_m : dict
        Fitted model per predicted day (key: the predicted day).
    ls_df : list of pandas.DataFrame
        One frame per predicted day, reduced to the output columns and
        tagged with the station number.
    """
    ### Station-specific inputs: hyperparameters and daily sums
    params = get_params_zst(df_params, zähl)
    df_st = daily_sum_station(df, zähl)

    ### Columns handed on to the caller
    keep_cols = ['ds', 'yhat_lower', 'yhat_upper', 'yhat', 'y', 'day_predicted',
                 'y_dif_mean', 'Zählstelle']

    ls_df = []   # prediction frames, one per predicted day
    dict_m = {}  # model objects, keyed by predicted day

    ### Each cutoff is the last training day for the entry that follows it
    for cutoff_train, day_pred in zip(cv_cutoffs[:-1], cv_cutoffs[1:]):
        m, df_pred = model_station_day(params, df_st, cutoff_train, day_pred,
                                       country_hol=country_hol, rows_outp=rows_outp)
        ### Column for identifying the Zählstelle
        df_pred["Zählstelle"] = zähl

        ls_df.append(df_pred[keep_cols])
        dict_m[day_pred] = m

    return dict_m, ls_df

#### Loop through all stations and apply model_station
- resulting data-frames will be stored in a dict indexed by the station number
- resulting model-dictionaries will be stored in a dict indexed by the station number

In [None]:
### Results per station: prediction frames and model dictionaries
dict_df_zähl = {}
dict_model_zähl = {}

for zähl in df_params.Zählstelle:
    ### Models and predictions for this station and every day defined by cv_cutoffs
    dict_m, ls_df = model_station(zähl, df_params, df, cv_cutoffs, country_hol="DE", rows_outp=8)

    ### Stack the per-day frames and bring the columns into the output order
    df_pred = pd.concat(ls_df, axis=0)
    df_pred = df_pred[["ds", "yhat_lower", "yhat_upper", "yhat", "y", "y_dif_mean", "day_predicted", "Zählstelle"]]

    ### Store both results under the station number
    dict_model_zähl[zähl] = dict_m
    dict_df_zähl[zähl] = df_pred

    ### Remove the per-station temporaries before the next iteration
    del dict_m, ls_df, df_pred

#### Save dictionary of dfs for dashboard

In [None]:

### Write the dictionary of per-station prediction frames to disk as a pickle file.
### NOTE: pickle files should only be loaded again from trusted sources.
pd.to_pickle(dict_df_zähl, "../data/pred_station_date.pkl")