In [1]:
#import required libraries
import pandas as pd
import numpy as np
import math
from itertools import combinations
from statsmodels.tsa.statespace.sarimax import SARIMAX
from metrics import *
from typing import Callable
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
def transf(x: float) -> float:
    """
    Log-transform a single observation, passing zeros and missing values through.

    Arguments
    ----------
    x: float
        a single observation (may be 0 or NaN/None)

    Returned Values
    ----------
    float
        np.nan when x is missing, 0 when x is zero, otherwise np.log(x)

    """
    # NaN never compares equal to anything (not even itself), so tests such as
    # `x == np.nan` are always False; missing values are detected with pd.isna.
    if pd.isna(x):
        return np.nan
    if x == 0:  # log(0) is -inf, so zeros are returned unchanged
        return 0
    return np.log(x)
      
def itransf(x: int)-> int:
    """
    Map log-scale values back to the original scale by exponentiating them.

    Arguments
    ----------
    x: int
        value(s) on the log scale; anything np.exp accepts

    Returned Values
    ----------
    x: int
        np.exp(x), i.e. the value(s) on the original scale

    """
    # Note that np.exp(0) is 1, so a log-scale 0 comes back as 1.
    restored = np.exp(x)
    return restored

In [3]:
def forecast_SARIMA(window: int, n_train: int, p: int, d: int, q: int, ps: int, ds: int, qs: int, m: int, y: str, h_max: int, transf: Callable[[float], float], itransf: Callable[[float], float] , file_name: str )-> pd.DataFrame:

    """
    Fit a statsmodels SARIMAX model on one training window and forecast the
    next 'h_max' time steps.

    Arguments
    ----------
    window : int
        window number (1-based); the training data starts at row 'window - 1'
    n_train : int
        the size of training set
    p : int
        the value for auto regression order
    d : int
        the value for differencing
    q : int
        the value for moving average order
    ps : int
        the value for seasonal auto regression order
    ds : int
        the value for seasonal differencing
    qs : int
        the value for seasonal moving average order
    m : int
        the number of time steps for a single seasonal period
    y : str
        the forecasted variable (a column of the data file)
    h_max : int
        the maximum number of forecasted horizons
    transf: Callable[[float], float]
        element-wise transformation applied to the data before fitting
    itransf: Callable[[float], float]
        inverse transformation applied to the forecasts
    file_name: str
        the file name for the data (CSV with a 'date' column)

    Returned Values
    ----------
    df_forecast : pd.DataFrame
        DataFrame with a single column named 'y' holding the forecasts for
        the 'h_max' steps that follow the training window.

    """

    df_US = pd.read_csv(file_name)
    df_US.index = pd.to_datetime(df_US['date'])

    # Only the forecasted column is modelled, so only that column is
    # transformed; other columns (possibly non-numeric) are left untouched.
    df_y = df_US[y].apply(transf)

    # Training data: 'n_train' consecutive rows starting at the window offset.
    # Positional slicing keeps this correct even if dates repeat in the index.
    start = int(window - 1)
    col_train = df_y.iloc[start:start + int(n_train)]

    model = SARIMAX(col_train, order=(p,d,q), seasonal_order=(ps,ds,qs,m), enforce_stationarity=False, enforce_invertibility=False)
    model_fit = model.fit(disp=False)

    # Out-of-sample prediction for the h_max steps right after the training data.
    n_obs = len(col_train)
    forecast = model_fit.predict(n_obs, n_obs + h_max - 1)

    # Bring the forecasts back to the original scale.
    forecast = itransf(forecast)

    # The prediction Series is expected to be named 'predicted_mean'; expose it
    # under the name of the forecasted variable instead.
    df_forecast = forecast.to_frame().rename(columns={'predicted_mean': y})
    return df_forecast

In [4]:
def window_results(n_train: int, n_test: int, p: int, d: int, q: int, ps: int, ds: int, qs: int, m: int, 
                     y: str, h_max: int, transf: Callable[[float], float] ,itransf: Callable[[float], float], file_name: str)-> pd.DataFrame:

    """
    Run forecast_SARIMA for every window and collect the forecasts side by side.

    Arguments
    ----------
    n_train : int
        the size of training set
    n_test : int
        the size of testing set (also the number of windows)
    p : int
        the value for auto regression order
    d : int
        the value for differencing
    q : int
        the value for moving average order
    ps : int
        the value for seasonal auto regression order
    ds : int
        the value for seasonal differencing
    qs : int
        the value for seasonal moving average order
    m : int
        the number of time steps for a single seasonal period
    y : str
        the forecasted variable
    h_max : int
        the maximum number of forecasted horizons
    transf: Callable[[float], float]
        the definition for transformation
    itransf: Callable[[float], float]
        the definition for inverse transformation
    file_name: str
        the file name for the data

    Returned Values
    ----------
    temp_results : pd.DataFrame
        One column per window ('window_1' ... 'window_<n_test>'), indexed by
        the union of all forecast indexes; empty when n_test is 0.

    """
    window_forecasts = []
    for window in range(1, n_test + 1):
        predict_current = forecast_SARIMA(window, n_train, p, d, q, ps, ds, qs, m, y, h_max, transf, itransf, file_name)
        window_forecasts.append(predict_current.rename(columns={str(y): "window_" + str(window)}))

    # pd.concat raises on an empty list, so keep returning an empty frame
    # when there is nothing to forecast.
    if not window_forecasts:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop, which
    # re-copies all previous columns on every iteration.
    temp_results = pd.concat(window_forecasts, axis=1)
    return temp_results

In [5]:
def output_h(temp_results: pd.DataFrame, h: int, n_test:int)-> pd.DataFrame:
    """
    Build the single-column table of predictions for horizon 'h' from the per-window forecasts.

    For h == 1 the first forecast of every window is used. For h > 1 the first
    h-1 rows are filled with the first forecast of windows 1..h-1, followed by
    the h-th forecast of windows 1..(n_test-h+1).

    Arguments
    ----------
    temp_results : pd.DataFrame
        the dataframe that has h weeks ahead prediction for each window
        (columns 'window_1', 'window_2', ...)
    h : int
        horizon value
    n_test : int
        the size of testing set

    Returned Values
    ----------
    df_results_python : pd.DataFrame
        Dataframe with one column named 'h=<h>', one row per selected forecast.

    """
    def _pick(window: int, position: int) -> pd.DataFrame:
        # One-row frame holding the forecast at 'position' of the given window.
        source = 'window_' + str(window)
        forecasts = temp_results[[source]].dropna()
        forecasts = forecasts.rename(columns={source: "h=" + str(h)})
        return pd.DataFrame(forecasts.iloc[position]).transpose()

    n_iter = n_test - h + 1
    picked = []
    if h == 1:
        picked.extend(_pick(window, 0) for window in range(1, int(n_iter) + 1))
    if h > 1:
        # Leading rows that have no h-step-ahead forecast are filled with the
        # first forecast of the first h-1 windows.
        picked.extend(_pick(window, 0) for window in range(1, int(h - 1) + 1))
        picked.extend(_pick(window, int(h - 1)) for window in range(1, int(n_iter) + 1))
    df_results_python = pd.concat(picked, axis=0)
    return df_results_python

In [6]:
def predict_table(n_test: int, temp_results:pd.DataFrame, h_max: int)-> pd.DataFrame:
    """
    Combine the per-horizon prediction columns into one table with a column for each horizon 1..h_max.

    Arguments
    ----------
    n_test : int
        the size of testing set
    temp_results : pd.DataFrame
        the dataframe that has h weeks ahead prediction for each window
    h_max : int
        the maximum number of forecasted horizons

    Returned Values
    ----------
    df_final_forecast : pd.DataFrame
        DataFrame with one column per horizon ('h=1' ... 'h=<h_max>');
        empty when h_max is less than 1.

    """
    horizon_frames = [output_h(temp_results, h, n_test) for h in range(1, h_max + 1)]

    # pd.concat raises on an empty list, so keep returning an empty frame
    # when no horizon was requested.
    if not horizon_frames:
        return pd.DataFrame()

    # Concatenate once rather than repeatedly onto a growing (initially empty) frame.
    df_final_forecast = pd.concat(horizon_frames, axis=1)
    return df_final_forecast

In [7]:
def smape_results(df_final_forecast: pd.DataFrame ,y: str, file_name: str)-> pd.DataFrame:
    """
    Score every horizon column of the forecast table against the observed data with sMAPE.

    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        the predicted DataFrame containing h weeks ahead prediction for each week
    y : str
        the forecasted variable (column of the data file holding the observations)
    file_name: str
        the file name for the data (CSV with a 'date' column)

    Returned Values
    ----------
    df_results : pd.DataFrame
        One column named 'y' with a row per horizon plus an 'Average' row;
        the average is stored as a string formatted to two decimals.

    """
    observed = pd.read_csv(file_name)
    observed.index = pd.to_datetime(observed['date'])

    row_labels = df_final_forecast.columns.to_list()
    # Observations are looked up on the forecast dates for each horizon column.
    scores = [
        smape(observed.loc[df_final_forecast.index][y], df_final_forecast[horizon])
        for horizon in row_labels
    ]
    scores.append("{:.2f}".format(np.mean(scores)))
    row_labels.append('Average')

    df_results = pd.DataFrame(scores, columns=[str(y)])
    df_results.index = row_labels
    return df_results

In [8]:
def mae_results(df_final_forecast: pd.DataFrame ,y: str, file_name: str)-> pd.DataFrame:
    """
    Score every horizon column of the forecast table against the observed data with MAE.

    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        the predicted DataFrame containing h weeks ahead prediction for each week
    y : str
        the forecasted variable (column of the data file holding the observations)
    file_name: str
        the file name for the data (CSV with a 'date' column)

    Returned Values
    ----------
    df_results : pd.DataFrame
        One column named 'y' with a row per horizon plus an 'Average' row;
        the average is stored as a string formatted to two decimals.

    """
    observed = pd.read_csv(file_name)
    observed.index = pd.to_datetime(observed['date'])

    row_labels = df_final_forecast.columns.to_list()
    # Observations are looked up on the forecast dates for each horizon column.
    scores = [
        mae(observed.loc[df_final_forecast.index][y], df_final_forecast[horizon])
        for horizon in row_labels
    ]
    scores.append("{:.2f}".format(np.mean(scores)))
    row_labels.append('Average')

    df_results = pd.DataFrame(scores, columns=[str(y)])
    df_results.index = row_labels
    return df_results

In [9]:
# SARIMA results on the CDC weekly data, scored on all forecast dates (no date alignment)
import warnings
warnings.filterwarnings('ignore')

# Data and evaluation setup
file_name = 'covid_weekly_cdc.csv'
y = 'new_deaths'
n_train, n_test = 40, 79
h_max = 4

# SARIMA orders: (p, d, q) non-seasonal, (ps, ds, qs) seasonal with period m
p, d, q = 4, 1, 0
ps, ds, qs = 1, 0, 0
m = 10

# Rolling-window forecasts, then one column per horizon
temp_results = window_results(n_train, n_test, p, d, q, ps, ds, qs, m, y, h_max, transf, itransf, file_name)
df_final_forecast = predict_table(n_test, temp_results, h_max)

# Per-horizon scores
df_smape_results = smape_results(df_final_forecast, y, file_name)
print("sMAPE {}".format(df_smape_results))
df_mae_results = mae_results(df_final_forecast, y, file_name)
print("MAE {}".format(df_mae_results))

sMAPE         new_deaths
h=1      12.793226
h=2      18.264951
h=3      24.747963
h=4      33.405157
Average      22.30
MAE           new_deaths
h=1      1184.381173
h=2      1838.002248
h=3      2371.215004
h=4      3004.409528
Average      2099.50


In [10]:
def calculate_smape(df_final_forecast: pd.DataFrame, df_US_CDC: pd.DataFrame, model: str, y: str = 'new_deaths', file_name: str = 'covid_weekly_cdc.csv')-> pd.DataFrame:
    """
    This function calculates the sMAPE score of the SARIMA forecasts, restricted to the
    dates that are also present in the CDC model forecasts (dates missing from the CDC
    data are dropped).

    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        the predicted DataFrame from SARIMA model
    df_US_CDC: pd.DataFrame
        the predicted DataFrame from CDC model; only its index is used
    model: str
        the name of the CDC model (not used in the computation; kept so
        existing calls keep working)
    y : str
        the forecasted variable, i.e. the column of the data file holding the
        observations (defaults to 'new_deaths')
    file_name: str
        the file name for the observed data (CSV with a 'date' column)

    Returned Values
    ----------
    df_results : pd.DataFrame
        One column named 'y' with a row per horizon plus an 'Average' row;
        the average is stored as a string formatted to two decimals.

    """
    # Keep only the SARIMA forecast dates that the CDC model also covers.
    common_dates = df_final_forecast.index.intersection(df_US_CDC.index)
    df_final_forecast_aligned = df_final_forecast.loc[common_dates]

    df_US = pd.read_csv(file_name)
    df_US.index = pd.to_datetime(df_US['date'])

    h_level = df_final_forecast_aligned.columns.to_list()
    scores = []
    for cols in h_level:
        # 'y' is a parameter here, so the result no longer depends on a
        # notebook-level global of the same name being defined.
        y_true = df_US.loc[df_final_forecast_aligned.index][y]
        y_pred = df_final_forecast_aligned[cols]
        scores.append(smape(y_true, y_pred))
    scores.append("{:.2f}".format(np.mean(scores)))

    df_results = pd.DataFrame(scores, columns=[str(y)])
    h_level.append('Average')
    df_results.index = h_level
    return df_results

In [11]:
def calculate_mae(df_final_forecast: pd.DataFrame, df_US_CDC: pd.DataFrame, model: str, y: str = 'new_deaths', file_name: str = 'covid_weekly_cdc.csv')-> pd.DataFrame:
    """
    This function calculates the MAE score of the SARIMA forecasts, restricted to the
    dates that are also present in the CDC model forecasts (dates missing from the CDC
    data are dropped).

    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        the predicted DataFrame from SARIMA model
    df_US_CDC: pd.DataFrame
        the predicted DataFrame from CDC model; only its index is used
    model: str
        the name of the CDC model (not used in the computation; kept so
        existing calls keep working)
    y : str
        the forecasted variable, i.e. the column of the data file holding the
        observations (defaults to 'new_deaths')
    file_name: str
        the file name for the observed data (CSV with a 'date' column)

    Returned Values
    ----------
    df_results : pd.DataFrame
        One column named 'y' with a row per horizon plus an 'Average' row;
        the average is stored as a string formatted to two decimals.

    """
    # Keep only the SARIMA forecast dates that the CDC model also covers.
    common_dates = df_final_forecast.index.intersection(df_US_CDC.index)
    df_final_forecast_aligned = df_final_forecast.loc[common_dates]

    df_US = pd.read_csv(file_name)
    df_US.index = pd.to_datetime(df_US['date'])

    h_level = df_final_forecast_aligned.columns.to_list()
    scores = []
    for cols in h_level:
        # 'y' is a parameter here, so the result no longer depends on a
        # notebook-level global of the same name being defined.
        y_true = df_US.loc[df_final_forecast_aligned.index][y]
        y_pred = df_final_forecast_aligned[cols]
        scores.append(mae(y_true, y_pred))
    scores.append("{:.2f}".format(np.mean(scores)))

    df_results = pd.DataFrame(scores, columns=[str(y)])
    h_level.append('Average')
    df_results.index = h_level
    return df_results

In [12]:
def smape_results_for_model(df_final_forecast: pd.DataFrame ,y: str, CDC_file: str)-> pd.DataFrame:
    """
    Score every horizon column of a CDC model's forecast table with sMAPE, using the
    rows of the CDC file whose 'Model' is 'observed' as the ground truth.

    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        the predicted DataFrame containing h weeks ahead prediction for each week
    y : str
        the column of the observed rows to compare against (after filtering,
        only 'Point' is available)
    CDC_file : str
        The path for the CDC data

    Returned Values
    ----------
    df_results : pd.DataFrame
        One column named 'y' with a row per horizon plus an 'Average' row;
        the average is stored as a string formatted to two decimals.

    """
    cdc_raw = pd.read_csv(CDC_file, on_bad_lines='skip')

    # Observed values, indexed by the date they refer to.
    is_observed = cdc_raw["Model"] == 'observed'
    observed_data = cdc_raw.loc[is_observed, ['Date of Forecasted Point', 'Point']]
    observed_data.index = pd.to_datetime(observed_data['Date of Forecasted Point'])
    observed_data = observed_data.drop(columns=['Date of Forecasted Point'])

    row_labels = df_final_forecast.columns.to_list()
    scores = [
        smape(observed_data.loc[df_final_forecast.index][y], df_final_forecast[horizon])
        for horizon in row_labels
    ]
    scores.append("{:.2f}".format(np.mean(scores)))
    row_labels.append('Average')

    df_results = pd.DataFrame(scores, columns=[str(y)])
    df_results.index = row_labels
    return df_results

In [13]:
def mae_results_for_model(df_final_forecast: pd.DataFrame ,y: str, CDC_file: str)-> pd.DataFrame:
    """
    Score every horizon column of a CDC model's forecast table with MAE, using the
    rows of the CDC file whose 'Model' is 'observed' as the ground truth.

    Arguments
    ----------
    df_final_forecast: pd.DataFrame
        the predicted DataFrame containing h weeks ahead prediction for each week
    y : str
        the column of the observed rows to compare against (after filtering,
        only 'Point' is available)
    CDC_file : str
        The path for the CDC data

    Returned Values
    ----------
    df_results : pd.DataFrame
        One column named 'y' with a row per horizon plus an 'Average' row;
        the average is stored as a string formatted to two decimals.

    """
    cdc_raw = pd.read_csv(CDC_file, on_bad_lines='skip')

    # Observed values, indexed by the date they refer to.
    is_observed = cdc_raw["Model"] == 'observed'
    observed_data = cdc_raw.loc[is_observed, ['Date of Forecasted Point', 'Point']]
    observed_data.index = pd.to_datetime(observed_data['Date of Forecasted Point'])
    observed_data = observed_data.drop(columns=['Date of Forecasted Point'])

    row_labels = df_final_forecast.columns.to_list()
    scores = [
        mae(observed_data.loc[df_final_forecast.index][y], df_final_forecast[horizon])
        for horizon in row_labels
    ]
    scores.append("{:.2f}".format(np.mean(scores)))
    row_labels.append('Average')

    df_results = pd.DataFrame(scores, columns=[str(y)])
    df_results.index = row_labels
    return df_results

In [14]:
def cdc_data(model: str, CDC_file: str) -> None:
    """ 
    Print the sMAPE and MAE tables for one CDC model and for SARIMA on the dates the two share.

    The CDC forecasts of 'model' are reshaped into one row per 'Date Model was run' with
    columns 'h=1'..'h=4', restricted to 2020-12-05 .. 2022-06-04, and scored against the
    'observed' rows of the same file. The SARIMA forecasts are then scored on the dates
    that also appear in that CDC table.

    Note: the SARIMA forecasts are read from the notebook-level variable
    'df_final_forecast', which must already exist when this function is called.
    The function also sets the pandas option 'display.max_rows' to 10.
    
    Arguments
    ----------
    model : str
        The name of the CDC model
    CDC_file : str
        The path for the CDC data   

    Returned Values
    ----------
    None
        the results are printed, nothing is returned
    """ 

    df_US_CDC = pd.read_csv(CDC_file, on_bad_lines='skip')  # Read the CDC data; malformed lines are skipped
    df_US_CDC = df_US_CDC[(df_US_CDC['Model'] == model)] # Keep only the rows of the requested model
    df_US_CDC = df_US_CDC[['Date Model was run', 'Point']] # Keep the run date and the point forecast
    df_US_CDC['Date Model was run'] = pd.to_datetime(df_US_CDC['Date Model was run'])

     # Number the Points within each run date 0, 1, 2, ... and unstack that
     # numbering into columns, giving one row per run date.
     # NOTE(review): the position of a Point within its run date is treated as its
     # horizon; this presumably relies on the rows of each run date already being
     # in horizon order in the file (sort_values is not stable by default) - confirm.
    df_US_CDC = df_US_CDC.sort_values('Date Model was run').groupby('Date Model was run')['Point'].apply(lambda df: df.reset_index(drop=True)).unstack().reset_index()

    df_US_CDC.set_index('Date Model was run', inplace=True)
    # Rename columns to indicate the horizon (h=1 to h=4); this assignment
    # requires the unstacked table to have exactly four columns.
    df_US_CDC.columns =['h=1', 'h=2', 'h=3', 'h=4']
    pd.set_option('display.max_rows', 10)  # changes the pandas display option for the whole session

    # Filter the dataframe for the specified date range (label-based slice on the run date)
    start_date = pd.to_datetime('2020-12-05')
    end_date = pd.to_datetime('2022-06-04')
    df_US_CDC_1 = df_US_CDC.loc[start_date:end_date]

   # Print results (sMAPE and MAE) for the specified CDC model.
   # NOTE(review): the table is indexed by 'Date Model was run' while the observed
   # values are looked up by 'Date of Forecasted Point' - confirm this pairing is intended.
    print(model)
    df_smape_results_model = smape_results_for_model(df_US_CDC_1,"Point",CDC_file)
    df_smape_results_model.columns = [model] * len(df_smape_results_model.columns)  # label the column with the model name
    print("{}".format(df_smape_results_model))
    df_mae_results = mae_results_for_model(df_US_CDC_1,"Point", CDC_file)
    df_mae_results.columns = [''] * len(df_mae_results.columns)  # blank column header for the MAE table
    print("{}".format(df_mae_results))

    # Print results for SARIMA, restricted to the dates present in the CDC table.
    # 'df_final_forecast' is not a parameter: it is looked up as a global.
    print("SARIMA")
    smape_result = calculate_smape(df_final_forecast, df_US_CDC_1, model)
    print(smape_result)
    mae_result = calculate_mae(df_final_forecast, df_US_CDC_1, model)
    print(mae_result)

In [15]:
# Define the CDC file name, then print the sMAPE/MAE comparison between the
# "MIT-LCP" CDC model and SARIMA (uses the df_final_forecast computed earlier).
CDC_file = 'concatenated_CDC_20_21_22_23.csv'
cdc_data("MIT-LCP", CDC_file )

MIT-LCP
           MIT-LCP
h=1      14.404509
h=2      13.874602
h=3      12.332699
h=4      15.659168
Average      14.07
                    
h=1      1278.690141
h=2      1276.788732
h=3      1131.605634
h=4      1430.478873
Average      1279.39
SARIMA
        new_deaths
h=1      12.902679
h=2      18.406593
h=3      23.904825
h=4      34.152437
Average      22.34
          new_deaths
h=1       1173.96651
h=2      1843.061037
h=3      2294.944499
h=4      3117.443485
Average      2107.35


In [16]:
# Category 1 models: uncomment a line to print the comparison for that CDC model
# cdc_data("BPagano", CDC_file)
# cdc_data("Columbia", CDC_file)
# cdc_data("MIT-LCP", CDC_file)
# cdc_data("CovidComplete", CDC_file)
# cdc_data("ESG", CDC_file)
# cdc_data("GT-DeepCOVID", CDC_file)
# cdc_data("JHU-APL", CDC_file)
# cdc_data("Karlen", CDC_file)
# cdc_data("MIT-ORC", CDC_file)
# cdc_data("MOBS", CDC_file)
# cdc_data("PSI", CDC_file)
# cdc_data("UCSD-NEU", CDC_file)
# cdc_data("UM", CDC_file)

In [17]:
# Category 2 models: uncomment a line to print the comparison for that CDC model
# cdc_data("Covid19Sim", CDC_file)
# cdc_data("IHME", CDC_file)
# cdc_data("JHU-IDD", CDC_file)
# cdc_data("Microsoft", CDC_file)
# cdc_data("MIT-ISOLAT", CDC_file)
# cdc_data("QJHong", CDC_file)
# cdc_data("UMass-MB", CDC_file)
# cdc_data("Columbia-UNC", CDC_file)
# cdc_data("DDS", CDC_file)
# cdc_data("IEM", CDC_file)
# cdc_data("ISU", CDC_file)
# cdc_data("JCB", CDC_file)
# cdc_data("JHU-CSSE", CDC_file)
# cdc_data("LANL", CDC_file)
# cdc_data("LUcompUncertLab", CDC_file)
# cdc_data("Masaryk", CDC_file)
# cdc_data("NotreDame-Mobility", CDC_file)
# cdc_data("Oliver-Wyman", CDC_file)
# cdc_data("RPI-UW", CDC_file)
# cdc_data("UA", CDC_file)
# cdc_data("UCLA", CDC_file)
# cdc_data("UCM", CDC_file)
# cdc_data("UGA-CEID", CDC_file)
# cdc_data("UpstateSU", CDC_file)
# cdc_data("UT", CDC_file)
# cdc_data("Ensemble", CDC_file)
# cdc_data("ERDC", CDC_file)
# cdc_data("LSHTM", CDC_file)
# cdc_data("LNQ", CDC_file)
# cdc_data("UCLA", CDC_file)
# cdc_data("UCM", CDC_file)
# cdc_data("UGA-CEID", CDC_file)
# cdc_data("UpstateSU", CDC_file)
# cdc_data("UT", CDC_file)