In this notebook, I am trying to understand seasonality in the data
- without using any statistical techniques
- with statistical decomposition technique

In other notebooks, we have seen how statistical decomposition helps in filtering/displaying different components like trend, seasonality and residuals. Here we are going to focus only on seasonality. 

### Concept of seasonality helps us to understand why time based features (hour of the day, day of the week, season/quarter) are helpful in building models.

At the end of the notebook, we will understand the variation of data across weekdays vs weekends, various months, quarters and seasons using box plots.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.tsa.api as tsa

%matplotlib inline

# Apply the "Solarize_Light2" matplotlib style sheet to the figures drawn below.
matplotlib.style.use("Solarize_Light2")

    
def create_us_season(source_df, target_df, feature_name):
    """
    Add a "us_season" column derived from the month of a datetime feature.

    Season boundaries used:
        winter: December - February (12, 1, 2)
        spring: March - May (3, 4, 5)
        summer: June - August (6, 7, 8)
        fall: September - November (9, 10, 11)

    source_df : DataFrame holding the datetime feature
    target_df : DataFrame that receives the new "us_season" column (modified in place)
    feature_name : Name of the datetime column in source_df

    Returns target_df (the same object that was passed in).
    """
    months_by_season = {
        "winter": (12, 1, 2),
        "spring": (3, 4, 5),
        "summer": (6, 7, 8),
        "fall": (9, 10, 11),
    }
    # Invert to a month -> season lookup so it can be used with Series.map.
    season_of_month = {
        month: season
        for season, months in months_by_season.items()
        for month in months
    }
    month_numbers = source_df.loc[:, feature_name].dt.month
    target_df.loc[:, "us_season"] = month_numbers.map(season_of_month)
    return target_df


def create_date_features(source_df, target_df, feature_name):
    """
    Create new features related to dates

    source_df : DataFrame consisting of the timestamp related feature
    target_df : DataFrame where new features will be added (modified in place)
    feature_name : Name of the feature of date type which needs to be decomposed.

    Columns added to target_df: year, month, quarter, weekofyear, hour, day,
    day_name, dayofweek, day_type, dayofyear, is_month_start, is_month_end,
    is_quarter_start, is_quarter_end, is_year_start, is_year_end, month_year.

    Returns target_df (the same object that was passed in).
    """
    timestamps = source_df.loc[:, feature_name]
    dt = timestamps.dt

    target_df.loc[:, "year"] = dt.year.astype("uint16")
    target_df.loc[:, "month"] = dt.month.astype("uint8")
    target_df.loc[:, "quarter"] = dt.quarter.astype("uint8")
    target_df.loc[:, "weekofyear"] = dt.isocalendar().week.astype("uint8")

    target_df.loc[:, "hour"] = dt.hour.astype("uint8")

    target_df.loc[:, "day"] = dt.day.astype("uint8")
    target_df.loc[:, "day_name"] = dt.day_name()
    target_df.loc[:, "dayofweek"] = dt.dayofweek.astype("uint8")

    # dayofweek is 0 for Monday .. 6 for Sunday, so values below 5 are weekdays.
    target_df.loc[:, "day_type"] = np.where(dt.dayofweek < 5, "week_day", "week_end")

    # dayofyear ranges from 1 to 366, which does not fit in uint8 (max 255);
    # uint16 prevents late-year values from silently wrapping around.
    target_df.loc[:, "dayofyear"] = dt.dayofyear.astype("uint16")

    target_df.loc[:, "is_month_start"] = dt.is_month_start
    target_df.loc[:, "is_month_end"] = dt.is_month_end
    target_df.loc[:, "is_quarter_start"] = dt.is_quarter_start
    target_df.loc[:, "is_quarter_end"] = dt.is_quarter_end
    target_df.loc[:, "is_year_start"] = dt.is_year_start
    target_df.loc[:, "is_year_end"] = dt.is_year_end

    # Period values at monthly resolution (e.g. 2021-07)
    target_df.loc[:, "month_year"] = dt.to_period("M")

    return target_df


def plt_seasonal_decomposition(df, feature, freq, freq_type="daily", model="additive", figsize=(20, 10)):
    """
    Plot the full seasonal decomposition of one column.

    df : DataFrame containing the series to decompose
    feature : Name of the column to decompose
    freq : Number of rows that make up one seasonal cycle (passed as `period`)
    freq_type : Human readable name of the cycle, used only in the title
    model : Decomposition model passed to seasonal_decompose ("additive" by default)
    figsize : Figure size; note this is written to the global plt.rcParams
    """
    plt.rcParams["figure.figsize"] = figsize
    decomposition = tsa.seasonal_decompose(df[feature], model=model, period=freq)
    fig = decomposition.plot()
    # The decomposition figure has several stacked axes; plt.title would only
    # label the last one, so put the heading on the figure as a whole.
    fig.suptitle(f"{model} {freq_type} seasonal decomposition of {feature}", y=1.02)
    plt.show()


def plt_seasonality(df, feature, freq, freq_type="daily", model="additive", figsize=(20, 10)):
    """
    Plot only the seasonal component of a column's seasonal decomposition.

    df : DataFrame containing the series to decompose
    feature : Name of the column to decompose
    freq : Number of rows that make up one seasonal cycle (passed as `period`)
    freq_type : Human readable name of the cycle, used only in the title
    model : Decomposition model passed to seasonal_decompose
    figsize : Figure size; note this is written to the global plt.rcParams
    """
    plt.rcParams["figure.figsize"] = figsize
    seasonal_component = tsa.seasonal_decompose(
        df[feature], period=freq, model=model
    ).seasonal
    seasonal_component.plot(linewidth=0.5, color="blue")
    heading = f"{model} {freq_type} seasonality of {feature}"
    plt.title(heading)
    plt.show()


def plot_trend(df, feature, freq, freq_type="daily", model="additive", figsize=(20, 10)):
    """
    Plot only the trend component of a column's seasonal decomposition.

    df : DataFrame containing the series to decompose
    feature : Name of the column to decompose
    freq : Number of rows that make up one seasonal cycle (passed as `period`)
    freq_type : Human readable name of the cycle, used only in the title
    model : Decomposition model passed to seasonal_decompose
    figsize : Figure size; note this is written to the global plt.rcParams
    """
    plt.rcParams["figure.figsize"] = figsize
    decomposition = tsa.seasonal_decompose(df[feature], model=model, period=freq)
    decomposition.trend.plot(color="blue", linewidth=0.5)
    # The plotted series is the trend, so the title must say so.
    plt.title(f"{model} {freq_type} trend of {feature}")
    plt.show()

    
def plot_ts_line_groupby(df, ts_index_feature, groupby_feature, value_feature, title, xlabel, ylabel, figsize=(15, 8)):
    """
    Overlay one line per group on a single set of axes.

    df : DataFrame holding all the columns named below
    ts_index_feature : Column used as the x-axis (set as index of each group)
    groupby_feature : Column whose distinct values define the groups
    value_feature : Column plotted on the y-axis
    title, xlabel, ylabel : Text for the plot title and axis labels
    figsize : Size of the figure
    """
    _, axes = plt.subplots(figsize=figsize)
    # Thin, semi-transparent lines so that many overlapping groups stay readable.
    line_options = dict(kind="line", alpha=0.3, ax=axes, color="blue", linewidth=0.5)
    for _, group in df.groupby(groupby_feature):
        values = group.set_index(ts_index_feature)[value_feature]
        values.plot(**line_options)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

    
def plot_multiple_seasonalities(df, feature_name, figsize=(20, 6), ts_index_feature="date_time"):
    """
    Plot the seasonal component of a column for several cycle lengths.

    The periods are expressed in rows: 24, 24*7, 24*30 and 24*90, labelled
    "daily", "weekly", "monthly" and "quarterly" (assumes one row per hour -
    TODO confirm against the data). For each period only the first three
    periods' worth of rows are decomposed and plotted.

    df : DataFrame containing feature_name and ts_index_feature
    feature_name : Name of the column to decompose
    figsize : Figure size forwarded to plt_seasonality
    ts_index_feature : Column to use as the index before decomposing
    """
    period_names = ["daily", "weekly", "monthly", "quarterly"]
    periods = [24, 24 * 7, 24 * 30, 24 * 90]

    # Index once; the loop only needs different leading slices of the same frame.
    indexed_df = df.set_index(ts_index_feature)

    for name, period in zip(period_names, periods):
        plt_seasonality(
            indexed_df[0: period * 3],
            feature=feature_name,
            freq=period,
            freq_type=name,
            figsize=figsize,
        )


def plot_boxh_groupby(df, feature_name, by):
    """
    Draw horizontal box plots of one column, one box per group.

    df : DataFrame containing feature_name and the grouping column
    feature_name : Column whose distribution is plotted
    by : Column whose distinct values define the groups
    """
    box_options = {
        "column": feature_name,
        "by": by,
        "vert": False,
        "figsize": (10, 6),
        "color": "blue",
    }
    df.boxplot(**box_options)
    heading = f"Distribution of {feature_name} by {by}"
    plt.title(heading)
    plt.show()


# Load the train and test sets, parsing "date_time" as datetimes.
DATA_DIR = "/kaggle/input/tabular-playground-series-jul-2021/"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv", parse_dates=["date_time"])
test_df = pd.read_csv(f"{DATA_DIR}/test.csv", parse_dates=["date_time"])

# Keep rows in chronological order; the positional slices used later rely on it.
train_df = train_df.sort_values(by="date_time")

test_df = test_df.sort_values(by="date_time")

# Add the date features to both sets (previously train_df was processed twice
# and test_df not at all).
train_df = create_date_features(train_df, train_df, "date_time")
test_df = create_date_features(test_df, test_df, "date_time")
train_df = create_us_season(train_df, train_df, "date_time")

# Understanding Seasonality without Statistics

### Plot target_carbon_monoxide against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="target_carbon_monoxide",
    title="target_carbon_monoxide vs hour of the day",
    xlabel="Hour of the day",
    ylabel="target_carbon_monoxide"
)

1. target_carbon_monoxide has a strong dependency on hour of the day. 
2. During the early morning hours, its value is lower. 
3. The value starts increasing from 6 AM onwards. 
4. There are two sharp, well-defined peaks (around 8 AM & 6 PM). This is the time when people usually travel to and from the office. 

### Plot target_benzene against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="target_benzene",
    title="target_benzene vs hour of the day",
    xlabel="Hour of the day",
    ylabel="target_benzene"
)

target_benzene's behavior is similar to target_carbon_monoxide:

1. Strong dependency on hour of the day. 
2. During the early morning hours, its value is lower. 
3. The value starts increasing from 6 AM onwards. 
4. There are two peaks (around 8 AM & 6 PM). This is the time when people usually travel to and from the office.


### Plot target_nitrogen_oxides against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="target_nitrogen_oxides",
    title="target_nitrogen_oxides vs hour of the day",
    xlabel="hour of the day",
    ylabel="target_nitrogen_oxides"
)

1. target_nitrogen_oxides indicates higher values during the day time compared to night/early morning.
2. The peaks are not that prominent here (unlike target_benzene and target_carbon_monoxide). 
3. Also, the peaks during office hours are a little spread out (diffused).

### Plot deg_C against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="deg_C",
    title="deg_C vs hour of the day",
    xlabel="hour of the day",
    ylabel="deg_C"
)

The temperature is, of course, dependent on the hour of the day. No surprise here.

### Plot relative_humidity against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="relative_humidity",
    title="relative_humidity vs hour of the day",
    xlabel="hour of the day",
    ylabel="relative_humidity"
)

relative_humidity depends on the hour of the day. The value drops during the day.

### Plot absolute_humidity against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="absolute_humidity",
    title="absolute_humidity vs hour of the day",
    xlabel="hour of the day",
    ylabel="absolute_humidity"
)

absolute_humidity doesn't show any dependency on the hour of the day. The values seem to be spread out throughout the day.

### Plot sensor_1 against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="sensor_1",
    title="sensor_1 vs hour of the day",
    xlabel="hour of the day",
    ylabel="sensor_1"
)

1. Sensor_1 has strong dependency with 2 peaks (one in the morning and one in the evening)
2. Value dips during the early morning and increases during the day time

### Plot sensor_2 against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="sensor_2",
    title="sensor_2 vs hour of the day",
    xlabel="hour of the day",
    ylabel="sensor_2"
)

1. Sensor_2 has a strong dependency on the hour of the day, with 2 peaks (one in the morning and one in the evening)
2. Value dips during the early morning and increases during the day time
3. The dip in the value during the early morning seems to be higher compared to sensor 1.

### Plot sensor_3 against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="sensor_3",
    title="sensor_3 vs hour of the day",
    xlabel="hour of the day",
    ylabel="sensor_3"
)

1. Sensor_3 has strong dependency with 2 **downward** peaks (one in the morning and one in the evening)
2. Value increases during the late night/early morning and decreases during the day time

### Plot sensor_4 against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="sensor_4",
    title="sensor_4 vs hour of the day",
    xlabel="hour of the day",
    ylabel="sensor_4"
)

For sensor_4, the plot looks similar to sensor 1 and 2. Strong dependency on hours. Two sharp, well-defined peaks.

### Plot sensor_5 against hour of the day

In [None]:
# One line per "day" group (day of month), with "hour" on the x-axis.
plot_ts_line_groupby(
    train_df, 
    ts_index_feature="hour",
    groupby_feature="day",
    value_feature="sensor_5",
    title="sensor_5 vs hour of the day",
    xlabel="hour of the day",
    ylabel="sensor_5"
)

For sensor_5, the plot looks similar to sensor 1 and 2. Strong dependency on hours. Two sharp, well-defined peaks.

# Understanding Seasonality with Statistical Decomposition

To understand seasonality using statistical decomposition, I will pick one of the variables: **carbon monoxide** (target_carbon_monoxide)

Following is the seasonal decomposition plot where `target_carbon_monoxide` is broken down into trend, seasonality and residuals.

However, next we will focus mostly on seasonality.

In [None]:
# Full decomposition with a period of 24 rows, labelled "daily" (assumes one row per hour - TODO confirm).
plt_seasonal_decomposition(train_df.set_index("date_time"), feature="target_carbon_monoxide", freq=24, freq_type="daily", figsize=(15, 6), model="additive")

### Let's check if there is daily seasonality in `target_carbon_monoxide` data

In [None]:
# Seasonal component only, period of 24 rows, over the whole training set.
plt_seasonality(train_df.set_index("date_time"), feature="target_carbon_monoxide", freq=24, freq_type="daily", figsize=(20, 6), model="additive")

The plot above clearly shows a repetitive pattern indicating daily seasonality. However, the pattern is not clear enough. Let's zoom in. I will pick only 4 days of data to understand how the data repeats itself.

In [None]:
# Same decomposition restricted to the first 24*4 rows (four 24-row cycles).
plt_seasonality(train_df.set_index("date_time")[0:24*4], feature="target_carbon_monoxide", freq=24, freq_type="daily", figsize=(20, 6), model="additive")

The seasonal component clearly shows that there are two peaks: one in the morning and one in the afternoon. This matches with our seasonality plot without statistics.

### Let's check if there is weekly seasonality in `target_carbon_monoxide` data

In [None]:
# Seasonal component with a period of 24*7 rows, labelled "weekly", over the whole training set.
plt_seasonality(train_df.set_index("date_time"), feature="target_carbon_monoxide", freq=24*7, freq_type="weekly", figsize=(20, 6), model="additive")

Again, the plot above clearly shows a repetitive pattern indicating weekly seasonality. To understand the pattern better, I will pick only 2 weeks of data.

In [None]:
# Same decomposition restricted to the first 24*7*2 rows (two 24*7-row cycles).
plt_seasonality(train_df.set_index("date_time")[0:24*7*2], feature="target_carbon_monoxide", freq=24*7, freq_type="weekly", figsize=(20, 6), model="additive")

Clearly, the seasonal component is at its minimum during the weekends and higher on the weekdays (especially on Mondays & Wednesdays)

### Let's check if there is monthly seasonality in `target_carbon_monoxide` data

In [None]:
# Seasonal component with a period of 24*30 rows, labelled "monthly", over the whole training set.
plt_seasonality(train_df.set_index("date_time"), feature="target_carbon_monoxide", freq=24*30, freq_type="monthly", figsize=(20, 6), model="additive")

Again, the plot above clearly shows a repetitive pattern indicating monthly seasonality. To understand the pattern better, I will pick only 3 months of data.

In [None]:
# Same decomposition restricted to the first 24*30*3 rows (three 24*30-row cycles).
plt_seasonality(train_df.set_index("date_time")[0:24*30*3], feature="target_carbon_monoxide", freq=24*30, freq_type="monthly", figsize=(20, 6), model="additive")

However, on a close look, it seems that the seasonality is mostly weekly in nature.

### Let's check if there is quarterly seasonality in `target_carbon_monoxide` data

In [None]:
# Seasonal component with a period of 24*90 rows, labelled "quarterly", over the whole training set.
plt_seasonality(train_df.set_index("date_time"), feature="target_carbon_monoxide", freq=24*90, freq_type="quarterly", figsize=(20, 6), model="additive")

Here, we don't need to zoom in. The plot itself shows a pattern which repeats every three months.

### Overall, `target_carbon_monoxide` data shows clear daily, weekly, quarterly seasonality

## Seasonality for target_benzene

In [None]:
# Daily, weekly, monthly and quarterly seasonal components of target_benzene.
plot_multiple_seasonalities(train_df, feature_name="target_benzene")

For target_benzene, daily, weekly and quarterly seasonalities are evident. I am a little unsure about monthly, but it looks like there are two periodic dips around the 11th and 27th of every month.

### Seasonality for target_nitrogen_oxides

In [None]:
# Daily, weekly, monthly and quarterly seasonal components of target_nitrogen_oxides.
plot_multiple_seasonalities(train_df, feature_name="target_nitrogen_oxides")

For target_nitrogen_oxides, daily, weekly and quarterly seasonalities are evident. For monthly seasonality, it looks like there are two periodic dips (bi-weekly?)

# Boxplot to understand variation of data across different time periods

## Variation of target_carbon_monoxide

In [None]:
# One box plot of target_carbon_monoxide per calendar grouping column.
periods = ["day_name", "day_type", "weekofyear", "month", "quarter", "us_season"]
for period in periods:
    plot_boxh_groupby(df=train_df, feature_name="target_carbon_monoxide", by=period)

- target_carbon_monoxide varies significantly between weekends and weekdays.
- Variation across weekdays is not that prominent to the naked eye.
- Variations across consecutive weeks are not that significant, but weeks that are far apart do show variation.
- Variations across quarters and seasons are also prominent.

### We will have similar observation for other two target variables as well.

## Variation of target_benzene

In [None]:
# One box plot of target_benzene per calendar grouping column.
periods = ["day_name", "day_type", "weekofyear", "month", "quarter", "us_season"]

for period in periods:
    plot_boxh_groupby(df=train_df, feature_name="target_benzene", by=period)

## Variation of target_nitrogen_oxides

In [None]:
# One box plot of target_nitrogen_oxides per calendar grouping column.
periods = ["day_name", "day_type", "weekofyear", "month", "quarter", "us_season"]

for period in periods:
    plot_boxh_groupby(df=train_df, feature_name="target_nitrogen_oxides", by=period)