In [4]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import math
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.graphics.tsaplots import plot_acf
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error


In [2]:
df=pd.read_csv('daily_covid_cases.csv')
train_size = int(len(df) * 0.65)
train, test = df.iloc[:train_size], df.iloc[train_size:]

In [11]:
# Function to find optimal lag using ACF heuristic
def find_optimal_lag(series, max_lag=60):
    acf_vals = acf(series, nlags=max_lag)
    threshold = 2 / np.sqrt(len(series))
    significant_lags = np.where(np.abs(acf_vals) > threshold)[0]
    # Exclude lag 0 and return max significant lag
    significant_lags = significant_lags[significant_lags > 0]
    return significant_lags[-1] if len(significant_lags) > 0 else 1

# Find optimal lag
optimal_lag = find_optimal_lag(train['new_cases'])
print(f"Optimal lag based on ACF heuristic: {optimal_lag}")

# Train AR model with optimal lag
model = AutoReg(train['new_cases'], lags=optimal_lag, old_names=False).fit()


# Calculate metrics
def safe_mape(actual, pred):
    ape = np.abs((actual - pred) / actual)
    ape = ape.replace([np.inf, -np.inf], np.nan).dropna()
    return np.mean(ape) * 100 if len(ape) > 0 else np.nan

rmse_optimal = np.sqrt(mean_squared_error(test['new_cases'], predictions))
mape_optimal = safe_mape(test['new_cases'], pd.Series(predictions))

print(f"\nOptimal AR({optimal_lag}) Model Performance:")
print(f"RMSE: {rmse_optimal:.2f} µg/m³")
print(f"MAPE: {mape_optimal:.2f}%")


Optimal lag based on ACF heuristic: 60

Optimal AR(60) Model Performance:
RMSE: 1860.87 µg/m³
MAPE: nan%
