In [1]:
#Import Required Libraries
import pandas as pd
import numpy as np
import math
from itertools import combinations
from statsmodels.tsa.statespace.sarimax import SARIMAX
from metrics import *
from typing import Callable
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
def transf(x: float) -> float:
    """ 
    This function performs normalization on a single value by taking its natural logarithm

    Zeros are passed through as 0 (log(0) is undefined) and missing values are returned as NaN.

    Arguments
    ----------
    x: float
        the raw (unnormalized) value

    Returned Values
    ----------
    x: float
        np.log(x) for non-zero values, 0 for zero, np.nan for missing values
        
    """ 
    # NaN never compares equal to anything (not even itself), so `x == np.nan` can never be True;
    # missing values have to be detected explicitly.
    if pd.isna(x):
        return np.nan
    if x == 0: #checks for zeros
        return 0
    return np.log(x)
      
def itransf(x: int)-> int:
    """ 
    Undo the log normalization by exponentiating the input

    Arguments
    ----------
    x: int
        the normalized (log-scale) data; anything np.exp accepts

    Returned Values
    ----------
    x: int
        the data mapped back to the original scale, i.e. e raised to the power x
        
    """ 
    restored = np.exp(x)
    return restored

In [3]:
def output_h(temp_results: pd.DataFrame, h: int, n_test:int, horizons: int)-> pd.DataFrame:
    """
    Collect the forecasts belonging to horizon 'h' from the per-window forecast table into one column named "h=<h>".

    For h == 1 the first non-missing value of every window is taken.
    For h > 1 the column starts with the first non-missing value of windows 1 .. h-1, followed by the
    h-th non-missing value of every window.

    Arguments
    ----------
    temp_results : pd.DataFrame
        forecast table with one column per window, named 'window_1', 'window_2', ...
    h : int
        the horizon to extract e.g 1, 2, 3, ... , or 6 for COVID-19 case
    n_test : int
        the size of testing set
    horizons: int
        the total number of horizons e.g 6 in the COVID-19 case

    Returned Values
    ----------
    df_results_python : pd.DataFrame
        single-column DataFrame ("h=<h>") with one row per selected forecast.

    """ 
    column_label = "h=" + str(h)
    window_count = int(n_test - horizons)

    def pick_row(window: int, position: int) -> pd.DataFrame:
        # One-row frame holding the `position`-th non-missing forecast of the given window
        source_column = 'window_' + str(window)
        forecasts = temp_results[[source_column]].dropna()
        forecasts = forecasts.rename(columns={source_column: column_label})
        return pd.DataFrame(forecasts.iloc[position]).transpose()

    # (window number, row position) pairs, in the order the rows appear in the output
    selections = []
    if h == 1:
        selections = [(window, 0) for window in range(1, window_count + 1)]
    if h > 1:
        selections = [(window, 0) for window in range(1, int(h - 1) + 1)]
        selections += [(window, int(h - 1)) for window in range(1, window_count + 1)]

    df_results_python = pd.concat([pick_row(window, position) for window, position in selections], axis=0)
    return df_results_python

In [4]:
def predict_table(n_test, temp_results,horizons)-> pd.DataFrame:
    """
    Build the final forecast table by extracting every horizon with output_h and placing the resulting
    columns side by side.

    Arguments
    ----------
    n_test : int
        the size of testing set
    temp_results : pd.DataFrame
        the per-window forecast table (one 'window_<k>' column per iteration)
    horizons: int
        the total number of horizons e.g 6 in the COVID-19 case
    
    Returned Values
    ----------
    df_final_forecast : pd.DataFrame
        DataFrame with one column per horizon (1 .. horizons), joined column-wise on the row index.

    """ 
    # Extract each horizon first, then join them left to right
    per_horizon = [output_h(temp_results, step, n_test, horizons) for step in range(1, horizons + 1)]

    combined = pd.DataFrame()
    for horizon_frame in per_horizon:
        combined = pd.concat([combined, horizon_frame], axis=1)

    return combined

In [5]:
def smape_results(df_final_forecast: pd.DataFrame ,y: str, file_name:str)-> pd.DataFrame:
    """ 
    This function scores the inverse normalized predictions of every horizon against the original data
    using the `smape` function, and appends the mean of those scores.
    
    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        DataFrame containing the (normalized) predictions, one column per horizon
    y : str
        the target variable (a column of the CSV file)
    file_name : str
        filename to read the original data from; it must contain a 'date' column

    Returned Values
    ----------
    df_results : pd.DataFrame
        single-column DataFrame (named after 'y') with one row per horizon plus a final 'Average' row.
        
    """ 
    df_US = pd.read_csv(file_name)
    df_US.index = pd.to_datetime(df_US['date'])
    df_US = df_US.drop(columns=['date'])

    # Predictions are produced on the normalized scale; bring them back to the original scale
    df_pred = itransf(df_final_forecast)
    h_level = df_pred.columns.to_list()

    # The ground truth is the same for every horizon column, so look it up once
    y_true = df_US.loc[df_pred.index, y]

    scores = [smape(y_true, df_pred[cols]) for cols in h_level]
    scores.append(np.mean(scores))  # average over the horizons

    df_results = pd.DataFrame(scores, columns=[str(y)])
    h_level.append('Average')
    df_results.index = h_level
    return df_results

In [6]:
def mae_results(df_final_forecast: pd.DataFrame ,y: str, file_name:str)-> pd.DataFrame:
    """ 
    This function scores the inverse normalized predictions of every horizon against the original data
    using the `mae` function, and appends the mean of those scores.
    
    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        DataFrame containing the (normalized) predictions, one column per horizon
    y : str
        the target variable (a column of the CSV file)
    file_name : str
        filename to read the original data from; it must contain a 'date' column

    Returned Values
    ----------
    df_results : pd.DataFrame
        single-column DataFrame (named after 'y') with one row per horizon plus a final 'Average' row.
        
    """ 
    df_US = pd.read_csv(file_name)
    df_US.index = pd.to_datetime(df_US['date'])
    df_US = df_US.drop(columns=['date'])

    # Predictions are produced on the normalized scale; bring them back to the original scale
    df_pred = itransf(df_final_forecast)
    h_level = df_pred.columns.to_list()

    # The ground truth is the same for every horizon column, so look it up once
    y_true = df_US.loc[df_pred.index, y]

    scores = [mae(y_true, df_pred[cols]) for cols in h_level]
    scores.append(np.mean(scores))  # average over the horizons

    df_results = pd.DataFrame(scores, columns=[str(y)])
    h_level.append('Average')
    df_results.index = h_level
    return df_results

In [7]:
def normmae_results(df_final_forecast: pd.DataFrame ,y: str, file_name:str)-> pd.DataFrame:
    """ 
    This function scores the predictions of every horizon against the normalized data using the `mae`
    function, and appends the mean of those scores. Both sides of the comparison stay on the
    normalized scale: the target column is passed through `transf` and the predictions are used as given.
    
    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        DataFrame containing the (normalized) predictions, one column per horizon
    y : str
        the target variable (a column of the CSV file)
    file_name : str
        filename to read the original data from; it must contain a 'date' column

    Returned Values
    ----------
    df_results : pd.DataFrame
        single-column DataFrame (named after 'y') with one row per horizon plus a final 'Average' row.
        
    """ 
    df_US = pd.read_csv(file_name)
    df_US.index = pd.to_datetime(df_US['date'])
    df_US = df_US.drop(columns=['date'])
    h_level = df_final_forecast.columns.to_list()

    # Only the target column is compared, so only that column needs to be normalized.
    # The ground truth is the same for every horizon column, so it is computed once.
    y_true = df_US[y].apply(transf).loc[df_final_forecast.index]

    scores = [mae(y_true, df_final_forecast[cols]) for cols in h_level]
    scores.append(np.mean(scores))  # average over the horizons

    df_results = pd.DataFrame(scores, columns=[str(y)])
    h_level.append('Average')
    df_results.index = h_level

    return df_results

In [8]:
def run_model(n_train: int, n_test: int, y: str, x: list[str], horizons: int, file_name: str, m: int, p: int, d: int,
                        q: int, ps: int, ds:int, qs: int, lags: int)-> pd.DataFrame:
    """
    This function uses the SARIMAX model from statsmodels. It predicts 'y' by using 'x' as exogenous variables. 
    Each exogenous variable is shifted by 'lags' rows (the leading gaps are back-filled).
    The model is fitted once on the training set. It then walks through the test set: at every step it
    forecasts 'horizons' time steps ahead and afterwards appends the next test observation to the fitted
    results with refit=False.

    Arguments
    ----------
    n_train : int
        the size of training set
    n_test : int
        the size of testing set
    y : str
        the target variable
    x : list[str]
        the list of exogenous variables
    horizons : int
        the total number of horizons to be forecasted
    file_name : str
        filename to read the data; it must contain a 'date' column
    m : int
        the number of time steps for a single seasonal period
    p : int
        the value for auto regression order
    d : int
        the value for differencing 
    q : int
        the value for moving average order
    ps : int
        the value for seasonal auto regression order
    ds : int
        the value for seasonal differencing 
    qs : int
        the value for seasonal moving average order
    lags: int
        past values/lags for the exogenous variables


    Returned Values
    ----------
    temp_results : pd.DataFrame
      DataFrame with one column per iteration ('window_1', 'window_2', ...), each holding the 'horizons'
      step ahead forecast (on the normalized scale) made at that iteration. There are n_test - horizons columns.

    """
    # Read the CSV file into a DataFrame and set the 'date' column as the index
    df_US = pd.read_csv(file_name)
    df_US.index = pd.to_datetime(df_US['date'])
    df_US = df_US.drop(columns=['date'])

    # Apply normalization to the DataFrame
    df_US_transf = pd.DataFrame()
    for col in df_US.columns:
        df_US_transf[col] = df_US[col].apply(transf)

    df_y = df_US_transf[y]

    # Shift exogenous variables by the specified lags; bfill fills the rows left empty by the shift
    exog_variables = df_US_transf[x].shift(lags).bfill()

    # Split the data into training and testing sets
    col_train = df_y.iloc[0:n_train]
    col_test = df_y.iloc[n_train:]

    exo_train = exog_variables.iloc[0:n_train] #training set for exogenous variable
    exo_test = exog_variables.iloc[n_train:] #testing set for exogenous variable

    # Initialize and fit the SARIMAX model
    model = SARIMAX(col_train, exog=exo_train, order=(p, d, q), seasonal_order=(ps, ds, qs, m),
                    enforce_stationarity=False, enforce_invertibility=False)
    model_fit = model.fit(disp=False)

    temp_results = pd.DataFrame()  # Store forecasts for all iterations

    for i in range(n_test-horizons):  # One iteration per forecast origin in the test set

        window = i + 1
        # Exogenous rows covering the 'horizons' steps that are about to be forecasted
        exo_data = exo_test.iloc[i:i+horizons]

        # Forecast first, using only the observations seen so far
        forecast = model_fit.forecast(steps=horizons, dynamic=True, exog=exo_data)

        # Then update the model with the new observation (refit=False: no new call to fit)
        model_fit = model_fit.append(endog=col_test.iloc[i:i + 1], exog=exo_test.iloc[i:i+ 1], refit=False)

        # Name the forecast column after its window number
        predict_current = (
            forecast.to_frame()
            .rename(columns={'predicted_mean': y})
            .rename(columns={str(y): "window_" + str(window)})
        )
        temp_results = pd.concat([temp_results, predict_current], axis=1)

    return temp_results

In [12]:
#FOR COVID-19
# Exogenous regressors, input file and forecast target for this experiment
x = ['icu_patients','hosp_patients','new_tests','new_cases','people_vaccinated','people_fully_vaccinated']
file_name = 'datasets/covid_till14May22.csv'
y = 'new_deaths'
horizons = 6
train_ratio = 0.8

# Split sizes: the first `train_ratio` share of the rows is the training set, the remainder the test set
df_US = pd.read_csv(file_name)
n_train = round(len(df_US) * train_ratio)
n_test = len(df_US) - n_train

# Fixed settings: seasonal period and the two differencing orders
m = 10
d = 1
ds = 0
lags_list = [15]

# Walk over every (lags, p, q, ps, qs) combination; each range currently holds a single value
for lags, p, q, ps, qs in (
    (lags, p, q, ps, qs)
    for lags in lags_list
    for p in range(5,6)
    for q in range(3,4)
    for ps in range(4,5)
    for qs in range(1,2)
):
    print(p,d,q,ps,ds,qs)
    temp_results = run_model(n_train,n_test,y,x,horizons,file_name,m,p,d,q,ps,ds,qs,lags)
    df_final_forecast = predict_table(n_test, temp_results,horizons)

    # Report the three metrics for every horizon plus their average
    df_smape_results = smape_results(df_final_forecast, y,file_name)
    print(f"sMAPE {df_smape_results.round(2)}")
    df_mae_results = mae_results(df_final_forecast, y,file_name)
    print(f"realMAE {df_mae_results.round(3)}")
    df_normmae_results = normmae_results(df_final_forecast, y,file_name)
    print(f"normMAE {df_normmae_results.round(3)}")

5 1 3 4 0 1
sMAPE          new_deaths
h=1           10.95
h=2           14.84
h=3           19.61
h=4           28.06
h=5           35.35
h=6           42.71
Average       25.25
realMAE          new_deaths
h=1        1391.310
h=2        1835.157
h=3        2156.136
h=4        2822.783
h=5        3605.384
h=6        4536.260
Average    2724.505
normMAE          new_deaths
h=1           0.110
h=2           0.150
h=3           0.198
h=4           0.286
h=5           0.367
h=6           0.451
Average       0.260


In [10]:
#FOR ILI
# Input file and split sizes: the first 80% of the rows is the training set, the remainder the test set
file_name = 'datasets/national_illness.csv'
read_data = pd.read_csv(file_name)
n_train = round(len(read_data) * 0.8)
n_test = len(read_data) - n_train

# Forecast target and exogenous regressors
y = 'ILITOTAL'
x = ['%UNWEIGHTED ILI','% WEIGHTED ILI', 'AGE 0-4', 'AGE 5-24','NUM. OF PROVIDERS', 'OT']
data = "ILI"

# Fixed settings: seasonal period, the two differencing orders and the exogenous lag
m = 13
d = 1
ds = 0
lags= 6

for horizons in [6]:
    print("horizons: ", horizons)
    # Walk over every (p, q, ps, qs) combination; each range currently holds a single value
    for p, q, ps, qs in (
        (p, q, ps, qs)
        for p in range(1,2)
        for q in range(2,3)
        for ps in range(3,4)
        for qs in range(3,4)
    ):
        print(p,d,q,ps,ds,qs)
        temp_results = run_model(n_train,n_test,y,x,horizons,file_name,m,p,d,q,ps,ds,qs,lags)
        df_final_forecast = predict_table(n_test, temp_results,horizons)

        # Only the last row (the average over the horizons) of each metric table is reported
        df_smape_results = smape_results(df_final_forecast, y,file_name)
        print(f"sMAPE {df_smape_results.tail(1)}")
        df_mae_results = mae_results(df_final_forecast, y,file_name)
        print(f"MAE {df_mae_results.tail(1)}")
        df_normmae_results = normmae_results(df_final_forecast, y,file_name)
        print(f"normMAE {df_normmae_results.tail(1)}")
        print("===========================")

horizons:  6
1 1 2 3 0 3
sMAPE          ILITOTAL
Average  19.48722
MAE             ILITOTAL
Average  6228.309162
normMAE          ILITOTAL
Average  0.199342
