# Overview

This notebook tries out a few ideas; I used Kaggle's time series course as a reference.

https://www.kaggle.com/learn/time-series

In [None]:
from warnings import simplefilter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-mar-2022/train.csv',
        '../input/tabular-playground-series-mar-2022/test.csv',
    )
)

# Checking data

In [None]:
# Distinct values of the `direction` column, in order of first appearance.
pd.unique(train_df['direction'])

In [None]:
# Print the latest and then the earliest timestamp of each table (train first, then test).
for frame in (train_df, test_df):
    stamps = pd.to_datetime(frame['time'])
    print(stamps.max())
    print(stamps.min())

In [None]:
# Distinct coordinate values present in the training data.
for coord in ('x', 'y'):
    print(train_df[coord].unique())

In [None]:
# Preparing Data.
# Build time-indexed copies of both tables: parse `time` into a `datetime`
# index and drop the raw `time` string and the `row_id` column.

train_time = (
    train_df
    .assign(datetime=pd.to_datetime(train_df['time']))
    .drop(columns=['time', 'row_id'])
    .set_index('datetime')
)

test_time = (
    test_df
    .assign(datetime=pd.to_datetime(test_df['time']))
    .drop(columns=['time', 'row_id'])
    .set_index('datetime')
)

The data show a clear seasonality.

In [None]:
# Monthly mean congestion over all rows of the training data.
monthly_congestion = train_time['congestion'].resample('M').mean()
monthly_congestion.plot()

# Position and Congestion

In [None]:
# Average over directions so there is one row per (datetime, x, y).
# `numeric_only=True` keeps the string `direction` column out of the mean:
# pandas >= 2.0 raises a TypeError on it instead of silently dropping it.
train_time_xy = (
    train_time
    .groupby(['datetime', 'x', 'y'], as_index=True)
    .mean(numeric_only=True)
    .reset_index()
    .set_index('datetime')
)

test_time_xy = (
    test_time
    .groupby(['datetime', 'x', 'y'], as_index=True)
    .mean(numeric_only=True)
    .reset_index()
    .set_index('datetime')
)

In [None]:
# One panel per (x, y) position showing the monthly mean congestion there.
fig, plx = plt.subplots(4, 3, figsize=(32, 24))

for posx in range(3):
    for posy in range(4):
        at_position = train_time.query(f'x == {posx} & y == {posy}')
        # Resample only the numeric target: averaging the whole frame would
        # include the string `direction` column, which raises on pandas >= 2.0.
        monthly = at_position['congestion'].resample('M').mean()
        ax = plx[posy, posx]
        ax.plot(monthly)
        ax.set_title(f'x={posx} y={posy}')

In [None]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

simplefilter("ignore")

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'


# annotations: https://stackoverflow.com/a/49238256/5769929
def seasonal_plot(X, y, period, freq, ax=None, ttlpos=''):
    """Plot `y` against `freq` with one coloured line per value of `period`.

    Parameters
    ----------
    X : DataFrame holding the `y`, `period` and `freq` columns.
    y, period, freq : column names in `X` (value, hue grouping, x-axis).
    ax : Matplotlib axes to draw on; a new figure is created when None.
    ttlpos : text appended to the panel title.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        ax = plt.subplots()[1]

    n_periods = X[period].nunique()
    colours = sns.color_palette("husl", n_colors=n_periods)
    ax = sns.lineplot(
        data=X,
        x=freq,
        y=y,
        hue=period,
        palette=colours,
        ci=False,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq}) " + ttlpos)

    # Label each drawn line with a period value, placed just right of the axes
    # at the height of the line's last point. Lines and period values are
    # paired positionally, in the order each sequence yields them.
    for curve, label in zip(ax.lines, X[period].unique()):
        last_y = curve.get_ydata()[-1]
        ax.annotate(
            label,
            xy=(1, last_y),
            xytext=(6, 0),
            xycoords=ax.get_yaxis_transform(),
            textcoords="offset points",
            color=curve.get_color(),
            size=14,
            va="center",
        )
    return ax


def plot_periodogram(ts, detrend='linear', ax=None, ttl='Periodogram', fs=None):
    """Draw the periodogram of `ts` on a log-frequency axis.

    Parameters
    ----------
    ts : 1-D sequence of observations.
    detrend : passed through to `scipy.signal.periodogram`.
    ax : Matplotlib axes to draw on; a new figure is created when None.
    ttl : panel title.
    fs : sampling frequency handed to `scipy.signal.periodogram`. When None it
        defaults to 365.2425, i.e. each sample is treated as one day and
        frequencies are in cycles per year, which is what the fixed tick
        labels ("Annual", "Weekly", ...) assume. For data sampled every
        20 minutes, pass `fs=72 * 365.2425` to make those labels literal.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    if fs is None:
        # Same value the original `pd.Timedelta("1Y") / pd.Timedelta("1D")`
        # produced; the "Y" unit is no longer accepted by pandas >= 2.0.
        fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title(ttl)
    return ax

In [None]:
# Calendar features used as the axes of the seasonal plots.
X = train_time_xy.copy()
X["day"] = X.index.dayofweek  # the x-axis (freq)
# `DatetimeIndex.week` was deprecated and then removed from pandas;
# `isocalendar().week` yields the same ISO week numbers. Converting to a plain
# int64 array sidesteps index alignment and the nullable UInt32 dtype.
X["week"] = X.index.isocalendar().week.to_numpy(dtype="int64")  # the seasonal period (period)
X["dayofyear"] = X.index.dayofyear
X["year"] = X.index.year

# One weekly seasonal plot per (x, y) position.
fig, (ax0) = plt.subplots(4, 3, figsize=(22, 12))

for posx in range(3):
    for posy in range(4):
        X0 = X.query(f'x == {posx} & y == {posy}')
        seasonal_plot(X0, y="congestion", period="week", freq="day", ax=ax0[posy, posx], ttlpos=f'x={posx} y={posy}')

In [None]:
# One yearly seasonal plot (congestion by day of year) per (x, y) position.
fig, (ax0) = plt.subplots(4, 3, figsize=(22, 12))

# np.ndindex(3, 4) walks posx in the outer position and posy in the inner one.
for posx, posy in np.ndindex(3, 4):
    X0 = X.query(f'x == {posx} & y == {posy}')
    seasonal_plot(
        X0,
        y="congestion",
        period="year",
        freq="dayofyear",
        ax=ax0[posy, posx],
        ttlpos=f'x={posx} y={posy}',
    );

In [None]:
# Periodogram of congestion for every (x, y) position.
fig, (ax0) = plt.subplots(4, 3, figsize=(22, 16))

# np.ndindex(3, 4) walks posx in the outer position and posy in the inner one.
for posx, posy in np.ndindex(3, 4):
    X0 = X.query(f'x == {posx} & y == {posy}')
    plot_periodogram(
        X0.congestion,
        ax=ax0[posy, posx],
        ttl=f'Periodogram x={posx} y={posy}',
    );

# Direction and Congestion

In [None]:
# Average over positions so there is one row per (datetime, direction); the
# averaged x/y coordinates carry no meaning afterwards and are dropped.
train_time_dir = (
    train_time
    .groupby(['datetime', 'direction'], as_index=True)
    .mean()
    .reset_index()
    .set_index('datetime')
    .drop(['x', 'y'], axis=1)
)

test_time_dir = (
    test_time
    .groupby(['datetime', 'direction'], as_index=True)
    .mean()
    .reset_index()
    .set_index('datetime')
    .drop(['x', 'y'], axis=1)
)

In [None]:
# Wide table: one congestion column per direction, indexed by datetime.
dirs = train_time_dir['direction'].unique()
wd = train_time_dir.pivot(columns='direction', values='congestion')

# One stacked panel per direction.
fig, alx = plt.subplots(8, 1, figsize=(24, 32))
for i, d in enumerate(dirs):
    ax = alx[i]
    ax.plot(wd[d])
    ax.set_title(f'dir={d}')

In [None]:
# Periodogram of congestion for each direction, one panel per direction.
dirs = train_time_dir['direction'].unique()
fig, (ax0) = plt.subplots(8, 1, figsize=(24, 32))

for i, d in enumerate(dirs):
    is_dir = train_time_dir['direction'] == d
    X0 = train_time_dir[is_dir]
    plot_periodogram(X0['congestion'], ax=ax0[i], ttl=f'Periodogram dir={d}');

Is there a 72-step seasonality?

72 steps × 20 min = 24 h.

That makes sense: it is no surprise that traffic follows a 24-hour cycle.

In [None]:
import statsmodels.api as sm

# ACF (top) and PACF (bottom) for the four cardinal directions, all drawn
# onto the same pair of axes.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

for d in ('NB', 'SB', 'WB', 'EB'):
    series = wd[d]
    fig = sm.graphics.tsa.plot_acf(series, lags=80, ax=ax1)
    fig = sm.graphics.tsa.plot_pacf(series, lags=80, ax=ax2)

In [None]:
# ACF (top) and PACF (bottom) for the four diagonal directions, all drawn
# onto the same pair of axes.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

for d in ('NW', 'SW', 'NE', 'SE'):
    series = wd[d]
    fig = sm.graphics.tsa.plot_acf(series, lags=80, ax=ax1)
    fig = sm.graphics.tsa.plot_pacf(series, lags=80, ax=ax2)

# Test SARIMA model

I think there is room for improvement in the parameters.

In [None]:
# Trial fit on a single direction ('NB').
X = train_time_dir[train_time_dir['direction'] == 'NB']
testX = test_time_dir[test_time_dir['direction'] == 'NB']

# Reduce data volume: model the hourly mean instead of every 20-minute step.
# Only `congestion` is resampled, because averaging the whole frame would also
# hit the string `direction` column, which raises on pandas >= 2.0.
# 'h' is the non-deprecated spelling of the hourly alias.
y = X['congestion'].resample('h').mean().to_frame()

# Seasonal period 24 = one day of hourly steps.
mod = sm.tsa.statespace.SARIMAX(y,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 1, 24),
                                enforce_stationarity=True,
                                enforce_invertibility=True)
results = mod.fit()

results.plot_diagnostics(figsize=(18, 8))
plt.show()

In [None]:
# In-sample one-step-ahead prediction for the last morning of the training data.
# The timestamps are parsed without an explicit `format`: the previous
# '%Y%m%d %H:%M:%S' did not match the dashed date strings and is rejected by
# pandas versions that apply the format strictly.
pred = results.get_prediction(start=pd.Timestamp('1991-09-30 00:00:00'),
                              end=pd.Timestamp('1991-09-30 11:40:00'),
                              dynamic=False)
pred_ci = pred.conf_int()

# Observed series from the previous day onwards, with the prediction and its
# confidence band drawn on top.
ax = y.loc['1991-09-29':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 4))
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('congestion')
plt.legend()
plt.show()

# Build SARIMA model

In [None]:
# Fit one hourly SARIMA model per direction and collect the forecasts in a
# wide frame (one column per direction) on the test period's 20-minute grid.
dir_models = train_time_dir['direction'].unique()

test_times = pd.to_datetime(test_df['time'])
forecasts = pd.DataFrame(
    index=pd.date_range(start=test_times.min(), end=test_times.max(), freq="20min")
)

# Iterate over the directions computed in this cell rather than over `dirs`
# left behind by an earlier plotting cell.
for d in dir_models:
    testX = test_time_dir[test_time_dir['direction'] == d]
    X = train_time_dir[train_time_dir['direction'] == d]
    # Resample only `congestion`: averaging the whole frame would include the
    # string `direction` column, which raises on pandas >= 2.0.
    y = X['congestion'].resample('h').mean().to_frame()
    mod = sm.tsa.statespace.SARIMAX(y,
                                    order=(1, 1, 1),
                                    seasonal_order=(1, 1, 1, 24),
                                    enforce_stationarity=True,
                                    enforce_invertibility=True)
    results = mod.fit()
    # NOTE(review): the horizon is len(testX) *hourly* steps although testX
    # appears to be on a 20-minute grid, so this likely forecasts further than
    # the test window. The surplus is discarded by index alignment below and
    # lets the forward fill reach the final 20-minute slots — confirm before
    # shortening it.
    test_pred = results.forecast(len(testX))

    ax = y.loc['1991-09-29':].plot(label='observed')
    test_pred.plot(ax=ax, label='Forecast(test)')
    ax.legend()
    ax.set_title(f'Direction={d}')

    # Upsample the hourly forecast to 20-minute steps by forward filling;
    # assignment aligns it to the test-period index of `forecasts`.
    forecasts[d] = test_pred.resample('20min').ffill()


In [None]:
# Reshape the wide per-direction forecasts into long (time, direction, forecast)
# rows. A new name is used so the wide `forecasts` frame stays intact and this
# cell can be re-run safely.
cols = ['time', 'direction', 'forecast']
forecasts_long = forecasts.stack().reset_index()
forecasts_long.columns = cols

# Test rows with `time` parsed to datetimes so it matches the forecast keys.
test_sarima = test_df.assign(time=pd.to_datetime(test_df['time']))

# Left join keeps every test row in its original order (an inner join would
# silently drop rows lacking a forecast); `validate` guards against duplicate
# forecast keys multiplying test rows.
test_sarima = pd.merge(
    test_sarima,
    forecasts_long,
    on=['time', 'direction'],
    how='left',
    validate='many_to_one',
)
assert test_sarima['forecast'].notna().all(), "some test rows have no SARIMA forecast"

# Build XGB model

In [None]:
def prepare_features(df):
    """Build the model feature table from a datetime-indexed frame.

    Adds calendar columns derived from the index (`day`, `week`, `dayofyear`,
    `year`, and `timecnt` = hour * 100 + minute), one-hot encodes `direction`,
    and replaces the datetime index with a plain integer index.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex and a `direction` column.

    Returns
    -------
    A new DataFrame; the input frame is left unmodified.
    """
    stamps = df.index
    # `assign` works on a copy, so the caller's frame does not gain columns.
    features = df.assign(
        day=stamps.dayofweek,
        # `DatetimeIndex.week` was removed from pandas; isocalendar() gives the
        # same ISO week numbers. Cast to int64 to avoid the nullable UInt32 dtype.
        week=stamps.isocalendar().week.to_numpy(dtype='int64'),
        dayofyear=stamps.dayofyear,
        year=stamps.year,
        timecnt=stamps.hour * 100 + stamps.minute,
    )
    features = pd.get_dummies(features, columns=['direction'])
    # Dropping the index directly no longer requires it to be named 'datetime'.
    return features.reset_index(drop=True)

In [None]:
# Train a gradient-boosted regressor on the engineered features.
trainx = train_time.copy()
X = prepare_features(trainx)
y = X.pop('congestion')

xgb = XGBRegressor()
xgb.fit(X, y)

# Pass a copy so feature engineering cannot add columns to the shared
# `test_time` frame. Reindexing puts the test matrix in exactly the training
# column order and zero-fills any one-hot column absent from the test data.
testX = prepare_features(test_time.copy()).reindex(columns=X.columns, fill_value=0)
predict = xgb.predict(testX)

# Make submission

In [None]:
# Blend the two models with a simple average and write the submission file.
# A dedicated name is used so the fitted SARIMAX `results` object is not
# overwritten by a DataFrame.
submission = test_sarima.copy()

# `predict` is assigned positionally, so it relies on `test_sarima` having the
# same rows in the same order as the frame the XGB predictions were made from.
submission['preds'] = predict
submission['congestion'] = (submission['forecast'] + submission['preds']) / 2
submission[['row_id', 'congestion']].to_csv('submission.csv', index=False)

# Show a preview instead of dumping the whole frame.
submission.head()