In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling for the notebook.
#
# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*" and
# matplotlib 3.8 removed the old names, so select whichever whitegrid variant this
# installation actually provides (falling back to the default style if neither exists).
whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(whitegrid_style)

# Figure-level defaults: wide figures with automatic tight layout and bold titles.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(14, 6),
    titlesize=18,
    titleweight='bold',
)
# Axes-level defaults: bold, slightly larger labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)

# Keyword arguments reused by later cells when plotting the observed series:
# light grey line with dark point markers and no legend.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)

In [None]:
# Load the competition CSVs. The `date` column is parsed as datetimes, used as the
# index, and then converted to a daily PeriodIndex via `to_period('D')`.
train_csv = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
test_csv = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'
read_opts = dict(index_col='date', parse_dates=['date'])

train_df = pd.read_csv(train_csv, **read_opts).to_period('D')
test_df = pd.read_csv(test_csv, **read_opts).to_period('D')

train_df.head()

In [None]:
# Preview the first five rows of the raw test frame.
test_df.head(n=5)

In [None]:
# Collapse all rows that share a date into one row holding the per-day mean of each
# numeric column.
#
# numeric_only=True is spelled out because pandas >= 2.0 no longer silently drops
# non-numeric columns in GroupBy.mean() and raises TypeError instead; older versions
# accept the argument too, so the result is the same everywhere.
train_average = train_df.groupby('date').mean(numeric_only=True)
test_average = test_df.groupby('date').mean(numeric_only=True)
train_average

In [None]:
# Centered moving averages of the daily mean sales at three horizons. Each window
# needs roughly half of its observations present before it yields a value, so the
# curves extend toward both ends of the series.
daily_sales = train_average['num_sold']


def _centered_mean(window, min_periods):
    """Return the centered rolling mean of `daily_sales` for the given window size."""
    return daily_sales.rolling(window=window, min_periods=min_periods, center=True).mean()


week_trend = _centered_mean(7, 4)
month_trend = _centered_mean(30, 15)
year_trend = _centered_mean(365, 183)

In [None]:
# Overlay the weekly, monthly and yearly rolling-mean curves on the raw daily averages.
# Every series is drawn on the same explicitly created axes.
_, ax = plt.subplots()
observed = train_average['num_sold']
observed.plot(ax=ax, color='0.5', style='.')
week_trend.plot(color='C0', ax=ax)
month_trend.plot(color='C6', ax=ax)
year_trend.plot(title='Periodic Trends', linewidth=3, color='C3', ax=ax)

In [None]:
def _with_calendar_columns(frame):
    """Return a copy of `frame` with calendar columns derived from its index.

    Adds, in this order: `day` (index.dayofweek), `week` (index.week),
    `year` (index.year) and `dayofyear` (index.dayofyear).
    """
    idx = frame.index
    return frame.assign(
        day=idx.dayofweek,
        week=idx.week,
        year=idx.year,
        dayofyear=idx.dayofyear,
    )


# Same calendar features for the training and test daily averages.
X = _with_calendar_columns(train_average)
test = _with_calendar_columns(test_average)

X.head()

In [None]:
# Preview the first five rows of the test frame with its calendar columns.
test.head(n=5)

In [None]:
# Seasonal plot: mean sales by day of week, one line per week number.
_, ax = plt.subplots()
palette = sns.color_palette('husl', n_colors=X.week.nunique())
# ci=None is the documented way to turn off the confidence band. The previous
# ci=False was treated as a 0%-level interval, so seaborn still bootstrapped every
# group only to draw an invisible zero-width band.
# (On seaborn >= 0.12 the non-deprecated spelling is errorbar=None.)
sns.lineplot(
    x='day',
    y='num_sold',
    hue='week',
    data=X,
    ci=None,
    legend=False,
    palette=palette,
    ax=ax,  # draw on the axes created above instead of relying on pyplot state
)

In [None]:
# Seasonal plot: mean sales by day of year, one line per calendar year.
_, ax = plt.subplots()
palette = sns.color_palette('husl', n_colors=X.year.nunique())
# ci=None is the documented way to turn off the confidence band. The previous
# ci=False was treated as a 0%-level interval rather than "no interval".
# (On seaborn >= 0.12 the non-deprecated spelling is errorbar=None.)
sns.lineplot(
    x='dayofyear',
    y='num_sold',
    hue='year',
    data=X,
    ci=None,
    legend=False,
    palette=palette,
    ax=ax,  # draw on the axes created above instead of relying on pyplot state
)

In [None]:
from scipy.signal import periodogram
# Periodogram of daily mean sales, with frequency expressed in cycles per year.
#
# The sampling frequency is the number of daily observations per year. It used to be
# computed as pd.Timedelta('1Y') / pd.Timedelta('1D'), but pandas >= 2.0 rejects the
# ambiguous 'Y' unit with a ValueError. 365.2425 is the value that expression
# evaluated to (the mean Gregorian year in days).
DAYS_PER_YEAR = 365.2425
fs = DAYS_PER_YEAR
freq, spec = periodogram(
    X.num_sold,
    fs,
    scaling='spectrum',
    detrend='linear',  # remove a linear trend before estimating the spectrum
    window='boxcar',
)
_, ax = plt.subplots()
ax.step(freq, spec, color='purple')
ax.set_xscale('log')
# Tick positions are cycles per year; labels name the corresponding period.
ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
ax.set_xticklabels(
    [
        "Annual (1)",
        "Semiannual (2)",
        "Quarterly (4)",
        "Bimonthly (6)",
        "Monthly (12)",
        "Biweekly (26)",
        "Weekly (52)",
        "Semiweekly (104)",
    ],
    rotation=30,  # keep the long labels from overlapping
)
ax.set_ylabel("Power")
ax.set_title("Periodogram")

In [None]:
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
# Deterministic regressors for the training period: a constant, a linear trend
# (order=1), seasonal indicator columns, and 12 annual Fourier pairs supplied as an
# additional term. drop=True asks statsmodels to remove perfectly collinear columns.
fourier = CalendarFourier(order=12, freq='A')
dp_settings = dict(
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
dp = DeterministicProcess(index=X.index, **dp_settings)
X_train = dp.in_sample()
X_train.head()

In [None]:
# Look at the test frame again before building its model features.
test.head(n=5)

In [None]:
# Build the test features by extending the SAME deterministic process that produced
# X_train.
#
# A second DeterministicProcess on test.index with in_sample() would restart the
# linear trend at 1 and restart the position-based seasonal indicator columns at the
# first test row. The model would then see test features on a different footing from
# the ones it was fitted on. dp.out_of_sample() continues the trend and the seasonal
# cycle from the end of the training index and keeps the same retained columns.
assert test.index[0] == X.index[-1] + 1, (
    "test period must start immediately after the training period"
)
test_dp = dp  # kept so any later reference to `test_dp` still resolves
test_data = dp.out_of_sample(steps=len(test.index), forecast_index=test.index)
test_data.head()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of daily mean sales on the deterministic features and
# plot the in-sample fit over the observed series.
y = X.num_sold
# X_train already contains a `const` column (the DeterministicProcess was built with
# constant=True), so sklearn must not add a second, unidentifiable intercept.
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y)
y_pred = pd.Series(model.predict(X_train), index=X_train.index)
ax = y.plot(**plot_params)
y_pred.plot(ax=ax)  # overlay the fitted values on the same axes

In [None]:
# Predict on the test features and draw observed values, in-sample fit and the
# test-period forecast together on one set of axes.
test_forecast = model.predict(test_data)
y_test = pd.Series(test_forecast, index=test_data.index)
ax = y.plot(**plot_params)
y_pred.plot(ax=ax)
y_test.plot(ax=ax, color='C3')