In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Folder holding this competition's CSV files (string ends with a slash so
# file names can be appended directly).
inputdir = "/kaggle/input/tabular-playground-series-mar-2022/"

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Helper for showing long Series/DataFrames without pandas' default row truncation
def longdisp(x):
    """Print the type and length of ``x``, then display it with up to 999 rows.

    The raised ``display.max_rows`` limit applies only while ``x`` is being
    displayed; the previous pandas option value is restored afterwards.
    """
    print(type(x))
    print(len(x))
    # Neither print above depends on the display option, so only the rich
    # display itself needs to run inside the option context.
    with pd.option_context("display.max_rows", 999):
        display(x)

In [None]:
# Load the training set; the first CSV column becomes the row index.
train_path = inputdir + "train.csv"
data = pd.read_csv(train_path, index_col=0)
data

# Reshaping the data
I want to see the congestion as a timeseries for each spot/highway (x,y,direction).
For simplicity, I name each spot by concatenating its x, y and direction values.
Since I don't know if values exist for all timestamps I also add a new time index with entries for all 20 minute intervals.


In [None]:
# Parse the timestamp strings so they can later serve as a real time index.
data["dt"] = pd.to_datetime(data["time"])
# Spot key = x, y and direction concatenated into one string.
# Built with vectorized string concatenation: a row-wise apply(axis=1) calls a
# Python lambda once per row, which is far slower on the full training set and
# produces exactly the same keys.
data["spot"] = data["x"].astype(str) + data["y"].astype(str) + data["direction"]
data

In [None]:
# Wide layout: one row per timestamp, one column per spot, cells hold congestion.
data_pv = data.set_index(["dt", "spot"])["congestion"].unstack("spot")
data_pv

In [None]:
# Complete 20-minute grid over the training period, so that timestamps absent
# from the data show up as explicit gaps after reindexing.
time_idx = pd.date_range(
    start="1991-04-01 00:00:00",
    end="1991-09-30 11:40:00",
    freq="20min",
    name="time",
)
time_idx

In [None]:
# Align the pivoted data to the full time grid; grid timestamps that have no
# data become all-NaN rows.
spot_ts = data_pv.reindex(time_idx, axis="index")
spot_ts

# Missing values

In [None]:
# Boolean mask of the gaps: True wherever a spot has no value at a timestamp.
missing = spot_ts.isna()
# Number of spots without a value at each timestamp, counted directly from the
# mask instead of subtracting from a hard-coded column count of 65.
missing_count = missing.sum(axis=1)
# Total number of missing cells in the whole table.
missing.sum().sum()

In [None]:
# List every timestamp at which at least one spot has no value.
longdisp(missing_count.loc[missing_count.gt(0)])


In [None]:
# Timestamps where some, but not all, spots are missing. The upper bound is the
# actual number of spot columns rather than a hard-coded 65.
longdisp(missing_count[(missing_count > 0) & (missing_count < spot_ts.shape[1])])

Data is missing at 81 timestamps, each time for all locations at once and with no apparent pattern; maybe simply measurement downtime?  
Filling with previous values to have a complete time series.

In [None]:
# Forward-fill each gap with the last observed value of the same spot.
# DataFrame.ffill replaces fillna(method="ffill"), which is deprecated in
# pandas 2.x; rebinding the name instead of using inplace=True avoids mutating
# the frame object that earlier cells displayed.
spot_ts = spot_ts.ffill(axis=0)
# Remaining missing cells; forward fill cannot fill a NaN that has no earlier
# value in its column, so this is the check that the series is now complete.
spot_ts.isna().sum().sum()

# Frequency analysis

I assume there is a daily and weekly periodicity in the data, so I run an FFT to confirm that and maybe find other patterns.  
Measuring frequency in cycles per day. Removing the mean before transforming to avoid a huge constant component.

In [None]:
# Centre every spot's series on zero, then take the real-input FFT along the
# time axis (axis 0), giving one spectrum per spot column.
spot_ts_zero_mean = spot_ts.sub(spot_ts.mean(), axis="columns")
spectra = np.fft.rfft(spot_ts_zero_mean.to_numpy(), axis=0)
print(type(spectra), spectra.shape, sep="\n")

## Interpreting sample interval and frequencies
- The time series length is 13140 with sample interval 20 minutes
- A full FFT calculates 13140 values with base frequency 1/13140, measured in cycles per sample interval. RFFT keeps only the non-negative-frequency half, because the spectrum of real-valued input is conjugate symmetric.
- I prefer cycles/day instead of cycles/20 min, using rfftfreq to create a frequency index based on converting 20 minutes to fraction of a day

In [None]:
# Three 20-minute samples per hour -> 72 samples per day. Expressing the sample
# spacing in days makes rfftfreq return frequencies in cycles/day.
samples_per_day = 24*3
sample_spacing_days = 1.0 / samples_per_day
freq_idx = np.fft.rfftfreq(len(time_idx), d=sample_spacing_days)
# One complex spectrum per spot, indexed by frequency.
spot_freq = pd.DataFrame(data=spectra, columns=spot_ts.columns, index=freq_idx)
spot_freq

## Visualizing the frequency spectra
It would be nice to compare the spectra for different locations, but 65 graphs in one diagram seems too messy. Instead I look only at the amplitude spectrum averaged over all locations.  
Looking in particular for variations with frequencies that are multiples of 1 (daily), and 1/7 (weekly).

In [None]:
# Overview spectrum: magnitude of each complex coefficient, averaged across all
# spot columns for every frequency.
spot_freq_mean = spot_freq.abs().mean(axis="columns")

In [None]:
# Location-averaged amplitude spectrum over the whole frequency range.
fig, ax = plt.subplots()
ax.set(
    xlabel="Frequency (cycles/day)",
    ylabel="Average amplitude spectrum",
)
ax.plot(freq_idx, spot_freq_mean)

In [None]:
# Zoom in on frequencies below 5 cycles/day.
lowfreq_idx = freq_idx[freq_idx < 5]
# freq_idx is ascending, so the selected frequencies are the leading entries.
n_low = len(lowfreq_idx)
fig, ax = plt.subplots()
ax.set(
    xlabel="Frequency (cycles/day)",
    ylabel="Average amplitude spectrum",
)
ax.plot(lowfreq_idx, spot_freq_mean.iloc[:n_low])

In [None]:
# Zoom in further, on frequencies below 0.5 cycles/day (periods longer than
# two days).
lowfreq_idx = freq_idx[freq_idx < 0.5]
# freq_idx is ascending, so the selected frequencies are the leading entries.
n_low = len(lowfreq_idx)
fig, ax = plt.subplots()
ax.set(
    xlabel="Frequency (cycles/day)",
    ylabel="Average amplitude spectrum",
)
ax.plot(lowfreq_idx, spot_freq_mean.iloc[:n_low])

## Conclusions
The daily cycle is clearly visible, as well as a weekly component at 1/7 (0.14).

# Reconstruction and prediction
To get predictions for the second half of 9/30/1991, I reconstruct a time series based on only the low frequency components.
- Take away the higher frequencies above 18 cycles/day (periods shorter than 1h 20min). This is quite arbitrarily chosen to get some smoothing without losing too much detail. This should also remove some aliasing effects.
- Reconstruct a time series with the inverse FFT.

In [None]:
# Low-pass filter in the frequency domain: copy only the bins below
# 18 cycles/day and leave every higher bin at zero.
lpcount = np.sum(freq_idx < 18)
spectra_lp = np.zeros_like(spectra)
spectra_lp[0:lpcount] = spectra[0:lpcount]

# Pass the original series length explicitly. Without n, irfft always returns
# 2*(m-1) samples for m input bins, which is one sample short for an odd-length
# series and would then no longer match time_idx below.
reconstruct = np.fft.irfft(spectra_lp, n=len(time_idx), axis=0)
print(type(reconstruct))
print(reconstruct.shape)

spot_ts_lp = pd.DataFrame(reconstruct, index=time_idx, columns=spot_ts.columns)

# Adding mean value again since it was removed before the FFT
spot_ts_lp = spot_ts_lp + spot_ts.mean()
spot_ts_lp

For predictions I use the reconstructed time series values from the same time interval one week earlier, rounded to integers.

In [None]:
# Load the test rows and derive the same lookup keys as for the training data.
test = pd.read_csv(inputdir + "test.csv", index_col=0)
test["dt"] = pd.to_datetime(test["time"])
# Spot key = x, y and direction concatenated, built with vectorized string
# operations instead of a row-wise apply(axis=1).
test["spot"] = test["x"].astype(str) + test["y"].astype(str) + test["direction"]
# Each prediction is taken from the same time of day exactly one week earlier.
test["previousweek"] = test["dt"] - pd.Timedelta(days=7)
# Scalar lookup (timestamp row, spot column) in the smoothed series, rounded
# to the nearest integer; raises KeyError if a timestamp or spot is unknown.
test["prediction"] = [
    round(spot_ts_lp.at[when, spot])
    for when, spot in zip(test["previousweek"], test["spot"])
]
test["prediction"] = test["prediction"].astype("int32")
test

In [None]:
# Write the submission file: the row index plus the prediction column, with
# the prediction column written under the header "congestion".
submission = test[["prediction"]].rename(columns={"prediction": "congestion"})
submission.to_csv("submission_fft_1.csv")