# ARIMA

I decided to try out ARIMA as a baseline because the mean/median has been shown to perform very well, and I had never used ARIMA before, so I wanted to try it out.

ARIMA is a linear regression model that uses lags of the series itself as predictors. The time series must be made stationary (e.g. by differencing) before an ARIMA model is fitted, because linear regression models work best when the predictors are not correlated and are independent of each other.

To summarise, it's very useful for getting a better understanding of the data - but I would not recommend you use it. It's (weirdly?) very slow - at least with Statsmodels - a little fiddly, and the performance is not that great (similar to mean/median). Perhaps it could be useful for imputation of the missing values, or useful in combination with another model.

- AR is the auto regressive term - It refers to the number of lags to be used as predictors (auto - itself / regression - regression)
- MA is the moving average term - It refers to the number of lagged forecast errors that should go into the ARIMA model.



Useful resources:

[Resource 1](https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/)

[Resource 2](https://online.stat.psu.edu/stat510/lesson/4/4.1)



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
import seaborn as sns
import datetime

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss

from statsmodels.stats.diagnostic import acorr_ljungbox #Ljung-Box test of autocorrelation in residuals. Also has Box-Pierce 
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_predict

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the competition data: `row_id` becomes the index and `time` is parsed
# into datetimes for both the training and the test set.
train_df = pd.read_csv(
    "/kaggle/input/tabular-playground-series-mar-2022/train.csv",
    index_col="row_id",
    parse_dates=["time"],
)
test_df = pd.read_csv(
    "/kaggle/input/tabular-playground-series-mar-2022/test.csv",
    index_col="row_id",
    parse_dates=["time"],
)

In [None]:
# Collapse (x, y, direction) into a single string key per roadway, then drop
# the three source columns. The same steps are applied to both frames.
for frame in (train_df, test_df):
    frame["roadway"] = frame["x"].astype(str) + frame["y"].astype(str) + frame["direction"]
    frame.drop(columns=["x", "y", "direction"], inplace=True)

In [None]:
def add_features(df):
    """Return a copy of *df* with calendar features derived from its ``time`` column.

    Added columns:
        minutes   -- minutes since midnight (hour * 60 + minute)
        dayofweek -- day of the week as given by ``Series.dt.dayofweek``
        week      -- ISO week number
        date      -- calendar date (``Series.dt.date``)

    The input frame is left untouched and the ``time`` column is kept.
    """
    new_df = df.copy()
    time = df["time"].dt

    new_df["minutes"] = time.hour * 60 + time.minute
    new_df["dayofweek"] = time.dayofweek
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar().week gives the same ISO week number but as UInt32, so cast
    # back to int64 to keep the dtype the old accessor produced.
    new_df["week"] = time.isocalendar().week.astype("int64")
    new_df["date"] = time.date

    return new_df

In [None]:
# Derive the calendar features for the training and the test frame alike.
train_df_2, test_df_2 = (add_features(frame) for frame in (train_df, test_df))

I have decided to use a time series of Mondays only, so that I am only dealing with daily seasonality and not weekly seasonality.

In [None]:
# Keep Mondays only (dayofweek == 0), removing:
#   Memorial Day - Monday 27 May 1991
#   Labor Day    - Monday 2 September 1991
holidays = [datetime.date(1991, 5, 27), datetime.date(1991, 9, 2)]
is_monday = train_df_2["dayofweek"] == 0
is_holiday = train_df_2["date"].isin(holidays)

# .copy() makes `mon` an independent frame, so columns added to it later do not
# trigger SettingWithCopyWarning or depend on view-vs-copy semantics.
mon = train_df_2[is_monday & ~is_holiday].copy()

In [None]:
# Give every distinct timestamp an integer ID to make plotting easier.
enc = OrdinalEncoder()
mon["time_count"] = enc.fit_transform(mon[["time"]]).astype(int)
mon

In [None]:
# Snapshot of all selected Mondays, taken before the validation split below
# removes week 40; make_preds() fits its final models on this copy.
mon_test = mon.copy()

In [None]:
# Remove September 30th morning (week 40) for validation purposes.
# .copy() detaches the result so the column assignment below does not trigger
# SettingWithCopyWarning.
mon = mon.loc[mon["week"] != 40].copy()

# Using 23rd September afternoon (week 39, minutes >= 720) for validation.
is_val = (mon["week"] == 39) & (mon["minutes"] >= 720)
val_df = mon[is_val]

# val == 1: row is used for validation; val == 0: row is used for fitting.
mon["val"] = is_val.astype(int)

In [None]:
def plot_series(df):
    """Plot ``congestion`` against ``time_count``, coloured by the ``val`` flag.

    *df* must contain the columns ``time_count``, ``congestion`` and ``val``.
    The plot is drawn on a new 25x6 inch figure; nothing is returned.
    """
    # Draw on an explicit Axes instead of relying on pyplot's "current axes".
    fig, ax = plt.subplots(figsize=(25, 6))
    ax.set_title("Time series")
    sns.lineplot(data=df, x="time_count", y="congestion", linewidth=1, hue="val", ax=ax)


In [None]:
def plot_series_diff(df, periods=72):
    """Plot the lag-differenced congestion series and return it.

    Parameters
    ----------
    df : DataFrame with ``congestion`` and ``time_count`` columns.
    periods : lag used for differencing. The default of 72 is the lag the
        notebook uses as the season length.

    Returns
    -------
    Series named ``congestion_diff_<periods>`` holding the differenced values
    of the rows that survive ``dropna()``.
    """
    temp_df = df.copy()
    diff_col = f"congestion_diff_{periods}"

    temp_df[diff_col] = temp_df["congestion"].diff(periods=periods)
    # The first `periods` rows have no predecessor to subtract and become NaN.
    temp_df = temp_df.dropna()

    fig, ax = plt.subplots(figsize=(25, 6))
    ax.set_title(f"Stationary time series (difference {periods})")
    sns.lineplot(data=temp_df, x="time_count", y=diff_col, linewidth=1, ax=ax)

    return temp_df[diff_col]

In [None]:
#KPSS: Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null hypothesis that x is level or trend stationary.

#ADF: The null hypothesis of the Augmented Dickey-Fuller is that there is a unit root, with the alternative that there is no unit root. 
#If the pvalue is above a critical size, then we cannot reject that there is a unit root.

#Note: These tests won't detect seasonal non-stationarity.

def check_stationary(series):
    """Run the ADF and KPSS tests on *series* and print a plain-English verdict.

    For each test the statistic and p-value are printed, followed by an
    interpretation at the 0.05 level. Nothing is returned.
    """
    # Both statsmodels functions return a tuple starting (statistic, p-value, ...).
    adf_stat, adf_p = adfuller(series)[:2]
    kpss_stat, kpss_p = kpss(series)[:2]

    print("ADF Statistic:", adf_stat)
    print("p-value:", adf_p)

    print("KPSS Statistic:", kpss_stat)
    print("p-value:", kpss_p)

    # ADF null hypothesis: unit root present (series is not stationary).
    adf_verdict = (
        "ADF: p<0.05, we can reject the null hypothesis - the time series is likely stationary"
        if adf_p < 0.05
        else "ADF: p>0.05, we cannot reject the null hypothesis the time series is likely NOT stationary."
    )
    print(adf_verdict)

    # KPSS null hypothesis: series is level/trend stationary.
    kpss_verdict = (
        "KPSS: p<0.05, we can reject the null hypothesis that x is level/trend stationary - the time series is likely NOT stationary"
        if kpss_p < 0.05
        else "KPSS: p>0.05, we cannot reject the null hypothesis the time series is likely stationary."
    )
    print(kpss_verdict)

In [None]:
def plot_acf_pacf(df, periods=72, lags=300):
    """Plot the ACF and PACF of the lag-differenced congestion series.

    Parameters
    ----------
    df : DataFrame with a ``congestion`` column.
    periods : lag used for differencing; red vertical guides are drawn at
        every multiple of it up to *lags*.
    lags : number of lags shown in both plots.

    The ACF is drawn in the upper panel and the PACF in the lower one.
    Nothing is returned.
    """
    temp_df = df.copy()
    diff_col = f"congestion_diff_{periods}"
    temp_df[diff_col] = temp_df["congestion"].diff(periods=periods)
    temp_df = temp_df.dropna()

    # Create both panels in one call. Calling plt.subplots() and then
    # plt.subplot(2, 1, i) leaves an extra empty full-figure Axes behind on
    # matplotlib versions that no longer auto-remove overlapping Axes.
    fig, (ax_acf, ax_pacf) = plt.subplots(2, 1, figsize=(25, 12))

    plot_acf(temp_df[diff_col], lags=lags, ax=ax_acf)
    plot_pacf(temp_df[diff_col], lags=lags, ax=ax_pacf)

    # Mark the multiples of the differencing lag on both panels.
    for ax in (ax_acf, ax_pacf):
        for seasonal_lag in range(0, lags + 1, periods):
            ax.axvline(seasonal_lag, color='r', lw=1)

In [None]:
def _draw_forecast_panel(ax, model, y, y_val, baselines, start=None, end=None):
    """Draw observed data, baseline predictions and model predictions on *ax*.

    With *start*/*end* given, the training series is shown from *start* onwards
    and the model is asked for non-dynamic predictions over [start, end].
    Without them the whole series and plot_predict's default range are shown.
    """
    history = y if start is None else y.loc[start:]
    history.plot(ax=ax)
    y_val.plot(ax=ax)
    for baseline in baselines:
        baseline.plot(ax=ax)
    if start is None:
        plot_predict(model, ax=ax)
    else:
        plot_predict(model, start=start, end=end, ax=ax, linewidth=1, dynamic=False)


def plot_forecasts(model, y, y_val, mean_preds, median_preds):
    """Plot model predictions against training/validation data in four panels.

    Parameters
    ----------
    model : fitted results object accepted by statsmodels' ``plot_predict``.
    y, y_val : training and validation Series with a datetime index.
    mean_preds, median_preds : baseline prediction Series for the validation window.

    Panels, top to bottom:
        1. 1980-1-22 .. 1980-1-25
        2. 1980-1-20 .. 1980-1-26
        3. the full series, y axis limited to [0, 100]
        4. 1980-1-24 .. 1980-1-25, additionally showing the median baseline

    Nothing is returned.
    """
    # rename() without inplace returns new Series, so the caller's objects keep
    # their names (the original mutated its arguments).
    y = y.rename("train congestion")
    y_val = y_val.rename("validation congestion")

    # One call creates all four Axes; mixing plt.subplots() with
    # plt.subplot(4, 1, i) can leave a stray full-figure Axes behind.
    fig, axes = plt.subplots(4, 1, figsize=(25, 20))

    _draw_forecast_panel(axes[0], model, y, y_val, [mean_preds],
                         start="1980-1-22", end="1980-1-25")
    _draw_forecast_panel(axes[1], model, y, y_val, [mean_preds],
                         start="1980-1-20", end="1980-1-26")

    _draw_forecast_panel(axes[2], model, y, y_val, [mean_preds])
    axes[2].set_ylim([0, 100])

    _draw_forecast_panel(axes[3], model, y, y_val, [mean_preds, median_preds],
                         start="1980-1-24", end="1980-1-25")

In [None]:
def val_error(model, mean_preds, median_preds, y_val):
    """Return the validation MAE of the model and of the mean/median baselines.

    Parameters
    ----------
    model : fitted results object exposing ``predict(start=..., end=...)``.
    mean_preds, median_preds : baseline predictions aligned with *y_val*.
    y_val : observed validation Series; its index defines the window that the
        model is asked to predict.

    Returns
    -------
    DataFrame with a single ``MAE`` column and the index
    ``["arima", "mean", "median"]``.
    """
    # Predict exactly the timestamps covered by y_val instead of a hard-coded
    # window, so predictions and observations always have the same length.
    preds = model.predict(start=y_val.index[0], end=y_val.index[-1])

    # sklearn's convention is (y_true, y_pred); MAE is symmetric, so the values
    # are the same as with the arguments swapped.
    scores = {
        "arima": mean_absolute_error(y_val, preds),
        "mean": mean_absolute_error(y_val, mean_preds),
        "median": mean_absolute_error(y_val, median_preds),
    }
    return pd.DataFrame({"MAE": list(scores.values())}, index=list(scores))

#val_error(model=arma_res, mean_preds=mean_preds, median_preds=median_preds, y_val=y_val)

In [None]:
def decide_orders(df):
    """Show the diagnostics used to choose the model orders for one series.

    In order: the raw series plot, the differenced series plot, the ADF/KPSS
    stationarity checks on the differenced series, and the ACF/PACF plots.
    """
    plot_series(df)
    differenced = plot_series_diff(df)
    check_stationary(differenced)
    plot_acf_pacf(df)

In [None]:
def sarima(df, order, seasonal_order):
    """Fit a seasonal ARIMA on one series and compare it with mean/median baselines.

    Parameters
    ----------
    df : DataFrame with ``congestion``, ``minutes``, ``time_count`` and ``val``
        columns; rows with ``val == 0`` are used for fitting and rows with
        ``val == 1`` for validation.
    order : non-seasonal (p, d, q) order passed to ``ARIMA``.
    seasonal_order : seasonal (P, D, Q, s) order passed to ``ARIMA``.

    Prints the fit summary, draws the diagnostic and forecast plots, and
    returns the DataFrame of validation MAEs produced by ``val_error``.
    """
    # The original read the global `temp_df` here instead of the `df` argument,
    # so the index length only matched when the caller happened to pass that
    # very global.
    train = df.loc[df["val"] == 0]
    val = df.loc[df["val"] == 1]

    # Wrong dates for now just so it shows up nicer in plots.
    # "20min" replaces the "20T" alias, which is deprecated in recent pandas.
    dates = pd.date_range("1980-1-1", freq="20min", periods=train["time_count"].nunique())
    dates_2 = pd.date_range("1980-1-24 12:00:00", freq="20min", periods=val["time_count"].nunique())
    y = pd.Series(train["congestion"].values, index=dates)
    y_val = pd.Series(val["congestion"].values, index=dates_2)

    arma_model = ARIMA(y, order=order, seasonal_order=seasonal_order, trend="n")
    arma_res = arma_model.fit()

    print(arma_res.summary())
    arma_res.plot_diagnostics(figsize=(10, 10), lags=10, auto_ylims=True)

    # Baselines: per-minute-of-day mean/median of the training rows, restricted
    # to minutes >= 720 to line up with the validation window.
    by_minute = train.groupby(["minutes"])["congestion"]
    mean_preds = pd.Series(by_minute.mean().loc[720:].values, index=dates_2, name="mean prediction")
    median_preds = pd.Series(by_minute.median().loc[720:].values, index=dates_2, name="median prediction")

    plot_forecasts(model=arma_res, y=y, y_val=y_val, mean_preds=mean_preds, median_preds=median_preds)

    val_err = val_error(arma_res, mean_preds, median_preds, y_val)
    return val_err

In [None]:
def sarima_all():
    """Fit the same SARIMA on every roadway in ``mon`` and pick the best model.

    For each roadway a (1,0,1) x (0,1,1,72) model is fitted on the rows with
    ``val == 0`` and compared with the per-minute mean and median baselines on
    the rows with ``val == 1``.

    Returns
    -------
    dict mapping roadway -> ``"arima"``, ``"mean"`` or ``"median"``, whichever
    had the lowest validation MAE.
    """
    model_dict = {}
    for roadway in mon["roadway"].unique():
        print(roadway)
        df = mon[mon["roadway"] == roadway]
        train = df.loc[df["val"] == 0]
        val = df.loc[df["val"] == 1]

        # Wrong dates for now just so it shows up nicer in plots.
        # "20min" replaces the "20T" alias, which is deprecated in recent pandas.
        dates = pd.date_range("1980-1-1", freq="20min", periods=train["time_count"].nunique())
        dates_2 = pd.date_range("1980-1-24 12:00:00", freq="20min", periods=val["time_count"].nunique())

        y = pd.Series(train["congestion"].values, index=dates)
        y_val = pd.Series(val["congestion"].values, index=dates_2)

        # Lets do them all the same for now - not optimal
        arma_model = ARIMA(y, order=(1, 0, 1), seasonal_order=(0, 1, 1, 72), trend="n")
        arma_res = arma_model.fit()

        by_minute = train.groupby(["minutes"])["congestion"]
        mean_preds = pd.Series(by_minute.mean().loc[720:].values, index=dates_2, name="mean prediction")
        median_preds = pd.Series(by_minute.median().loc[720:].values, index=dates_2, name="median prediction")

        val_err = val_error(arma_res, mean_preds, median_preds, y_val)
        print(val_err)
        # Label of the row with the smallest MAE.
        model_dict[roadway] = val_err["MAE"].idxmin()

        arma_res.remove_data()  # RAM keeps increasing each roadway for some reason - maybe this will help?
    return model_dict
    
        

In [None]:
# roadway -> name of the model ("arima", "mean" or "median") with the lowest
# validation MAE, as returned by sarima_all().
best_models = sarima_all()

# Arima Experiments

## 02NB

An example of the decision making process behind selecting the orders for the model, using ACF and PACF.

In [None]:
# Select roadway 02NB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "02NB"]
decide_orders(temp_df)

Observations:
- Significant ACF spikes at lag 1,71,72,73. 
- The PACF tapers to 0 seasonally.
- The PACF has 2 significant PACF spikes at lags 1 and 2.

Non-Seasonal terms: 
- A spike at lag 1 in both the ACF and PACF could indicate either MA(1) or AR(1) terms. Perhaps both - ARMA(1,1). As the early PACF terms might be slightly tapering, an MA(1) term is more likely than an AR(1).
- Perhaps an AR(2) term is also possible.

Seasonal
- There is 1 significant spike at lag 72 in the ACF (spikes at lags 71 and 73 too). A seasonal MA(1) component seems likely.

Possible ARIMA models to consider (in order of most likely):

1. (0,0,1) x (0,1,1)$_{72}$
2. (1,0,1) x (0,1,1)$_{72}$
3. (1,0,0) x (0,1,1)$_{72}$
4. (2,0,0) x (0,1,1)$_{72}$
5. (2,0,1) x (0,1,1)$_{72}$

Notation: non-seasonal order (AR order, differencing order, MA order) x seasonal order (AR order, differencing order, MA order)$_{\text{lags per season}}$

In [None]:
# Candidate 2: (1,0,1) x (0,1,1)_72. The returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

In [None]:
# Candidate 1: (0,0,1) x (0,1,1)_72.
# NOTE: despite its name, `model` receives the validation MAE table that
# sarima() returns, not a fitted model.
model = sarima(temp_df, order = (0,0,1), seasonal_order = (0,1,1,72))

In [None]:
# Candidate 3: (1,0,0) x (0,1,1)_72. `model` holds the MAE table returned by sarima().
model = sarima(temp_df, order = (1,0,0), seasonal_order = (0,1,1,72))

In [None]:
# Candidate: (2,0,0) x (0,1,1)_72. `model` holds the MAE table returned by sarima().
model = sarima(temp_df, order = (2,0,0), seasonal_order = (0,1,1,72))

In [None]:
# Candidate: (2,0,1) x (0,1,1)_72. `model` holds the MAE table returned by sarima().
model = sarima(temp_df, order = (2,0,1), seasonal_order = (0,1,1,72))

(1,0,1) x (0,1,1)$_{72}$ or (0,0,1) x (0,1,1)$_{72}$ seem fine.

Let's try out some more:

## 01NB

In [None]:
# Select roadway 01NB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "01NB"]
decide_orders(temp_df)

- This has the same pattern as 02NB

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 01SB

In [None]:
# Select roadway 01SB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "01SB"]
decide_orders(temp_df)

- Similar pattern

In [None]:
# Fit (0,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (0,0,1), seasonal_order = (0,1,1,72))

## 03NB

In [None]:
# Select roadway 03NB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "03NB"]
decide_orders(temp_df)

In [None]:
# Fit (0,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (0,0,1), seasonal_order = (0,1,1,72))

## 03WB

In [None]:
# Select roadway 03WB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "03WB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 10WB

In [None]:
# Select roadway 10WB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "10WB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 11NB

In [None]:
# Select roadway 11NB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "11NB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 12NB

In [None]:
# Select roadway 12NB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "12NB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 12EB

In [None]:
# Select roadway 12EB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "12EB"]
decide_orders(temp_df)

In [None]:
# Fit (0,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (0,0,1), seasonal_order = (0,1,1,72))

## 12SB

In [None]:
# Select roadway 12SB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "12SB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 13NB

In [None]:
# Select roadway 13NB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "13NB"] 
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 20WB

In [None]:
# Select roadway 20WB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "20WB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 21EB

In [None]:
# Select roadway 21EB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "21EB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 21WB

In [None]:
# Select roadway 21WB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "21WB"] 
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 22NB

In [None]:
# Select roadway 22NB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "22NB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 22EB

In [None]:
# Select roadway 22EB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "22EB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

## 22SB

In [None]:
# Select roadway 22SB and show the series, stationarity tests and ACF/PACF.
temp_df = mon[mon["roadway"] == "22SB"]
decide_orders(temp_df)

In [None]:
# Fit (1,0,1) x (0,1,1)_72 on temp_df; the returned MAE table is the cell output.
sarima(temp_df, order = (1,0,1), seasonal_order = (0,1,1,72))

# Predictions on test day

We use the full time series of Mondays to make predictions on the test day. We use either the mean, median or ARIMA, depending on which model performed best on validation.

In [None]:
import operator

# Report how many roadways each candidate model won on validation.
for model_name in ("mean", "median", "arima"):
    print(model_name, operator.countOf(best_models.values(), model_name))

In [None]:
def make_preds():
    """Predict test-day congestion per roadway with its best validated model.

    Reads the globals ``test_df_2``, ``best_models`` and ``mon_test``. For each
    roadway in ``best_models`` the chosen model ("arima", "mean" or "median")
    produces one prediction per (roadway, minutes) pair for minutes >= 720.

    Returns
    -------
    A copy of ``test_df_2`` left-merged with the predictions on
    ``["roadway", "minutes"]``, i.e. with an added ``congestion`` column.

    Raises
    ------
    ValueError
        If ``best_models`` contains a model name other than the three above.
    """
    test = test_df_2.copy()
    per_roadway = []
    for key, val in best_models.items():
        print(key)
        df = mon_test[mon_test["roadway"] == key]
        afternoon = df[df["minutes"] >= 720]

        if val == "arima":
            # Wrong dates for now just so it shows up nicer in plots.
            # "20min" replaces the "20T" alias, which is deprecated in recent pandas.
            dates = pd.date_range("1980-1-1", freq="20min", periods=df["time_count"].nunique())
            y = pd.Series(df["congestion"].values, index=dates)

            # Lets do them all the same for now - definitely not optimal
            arma_model = ARIMA(y, order=(1, 0, 1), seasonal_order=(0, 1, 1, 72), trend="n")
            arma_res = arma_model.fit()
            preds = arma_res.predict(start="1980-1-25 12:00:00", end="1980-1-25 23:40:00")
            arma_res.remove_data()  # RAM keeps increasing each roadway for some reason - maybe this will help?

            # Re-key the forecast by (roadway, minutes) so it has the same
            # shape as the mean/median baselines.
            p = preds.rename("congestion").reset_index()
            p["minutes"] = p["index"].dt.minute + p["index"].dt.hour * 60
            p["roadway"] = key
            preds = p.groupby(["roadway", "minutes"])["congestion"].first()
        elif val == "mean":
            preds = afternoon.groupby(["roadway", "minutes"])["congestion"].mean()
        elif val == "median":
            preds = afternoon.groupby(["roadway", "minutes"])["congestion"].median()
        else:
            raise ValueError(f"Unknown model {val!r} for roadway {key!r}")

        # Collect and concatenate once. The original only initialised the
        # accumulator when key == "00EB", which fails if that roadway is not
        # the first key or is absent.
        per_roadway.append(preds)

    preds_all = pd.concat(per_roadway)
    test = test.merge(preds_all, how="left", left_on=["roadway", "minutes"], right_index=True)
    return test
pred_df = make_preds()
    

In [None]:
submission = pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv")

# A (roadway, minutes) pair without a prediction is NaN after the left merge in
# make_preds(). Casting NaN to int silently produces garbage values, so stop
# here instead of writing a corrupt submission.
congestion_pred = pred_df["congestion"]
if congestion_pred.isna().any():
    raise ValueError(
        f"{int(congestion_pred.isna().sum())} test rows have no congestion prediction"
    )

# Predictions are assigned positionally, in the row order of pred_df.
submission['congestion'] = congestion_pred.values.round().astype(int)

In [None]:
# Write the submission to submission.csv without the DataFrame index.
submission.to_csv('submission.csv', index=False)