# Data Preprocessing of the Bike Sharing Demand Dataset

* The following code is based on [https://scikit-learn.org/stable/auto_examples/applications/plot_time_series_lagged_features.html](https://scikit-learn.org/stable/auto_examples/applications/plot_time_series_lagged_features.html)
* We start by loading the data from the OpenML repository.

In [1]:
import numpy as np
import pandas as pd
import copy

from sklearn.datasets import fetch_openml

# Download version 2 of the "Bike_Sharing_Demand" dataset from OpenML and
# keep the combined frame (features and target together) as `df`.
bike_sharing = fetch_openml(
    name="Bike_Sharing_Demand",
    version=2,
    as_frame=True,
    parser="pandas",
)
df = bike_sharing.frame

In [2]:
# Build a styled table of the descriptive statistics: colour gradient on the
# cells, inline display, and a caption rendered at 16px.
summary = df.describe().style.background_gradient()
summary = summary.set_table_attributes("style = 'display: inline'")
summary = summary.set_caption("Statistics of the Dataset")
summary = summary.set_table_styles(
    [{"selector": "caption", "props": [("font-size", "16px")]}]
)
summary

Unnamed: 0,year,month,hour,weekday,temp,feel_temp,humidity,windspeed,count
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,0.502561,6.537775,11.546752,3.003683,20.376474,23.788755,0.627229,12.73654,189.463088
std,0.500008,3.438776,6.914405,2.005771,7.894801,8.592511,0.19293,8.196795,181.387599
min,0.0,1.0,0.0,0.0,0.82,0.0,0.0,0.0,1.0
25%,0.0,4.0,6.0,1.0,13.94,16.665,0.48,7.0015,40.0
50%,1.0,7.0,12.0,3.0,20.5,24.24,0.63,12.998,142.0
75%,1.0,10.0,18.0,5.0,27.06,31.06,0.78,16.9979,281.0
max,1.0,12.0,23.0,6.0,41.0,50.0,1.0,56.9969,977.0


In [3]:
# Preview the first rows of the raw frame, including the categorical columns
# that describe() leaves out of the numeric summary.
df.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed,count
0,spring,0,1,0,False,6,False,clear,9.84,14.395,0.81,0.0,16
1,spring,0,1,1,False,6,False,clear,9.02,13.635,0.8,0.0,40
2,spring,0,1,2,False,6,False,clear,9.02,13.635,0.8,0.0,32
3,spring,0,1,3,False,6,False,clear,9.84,14.395,0.75,0.0,13
4,spring,0,1,4,False,6,False,clear,9.84,14.395,0.75,0.0,1


In [4]:
def target2lastcolumn(df, target_name):
    """
    Return a view of the dataframe in which the target column comes last.

    All other columns keep their original relative order. The input
    dataframe itself is not reordered.

    Args:
        df (pd.DataFrame):
            The dataframe whose columns should be reordered.
        target_name (str):
            The name of the target column.

    Returns:
        pd.DataFrame:
            The dataframe with the target column as the last column.

    Raises:
        KeyError:
            If `target_name` is not a column of `df`.

    Examples:
        >>> import pandas as pd
        >>> df = pd.DataFrame({'target': [5, 6], 'A': [1, 2], 'B': [3, 4]})
        >>> target2lastcolumn(df, 'target')
           A  B  target
        0  1  3       5
        1  2  4       6
    """
    feature_columns = [column for column in df.columns if column != target_name]
    return df[feature_columns + [target_name]]

In [5]:
# One-hot encode the categorical "season" and "weather" columns. The first
# level of each is dropped (drop_first=True) to avoid a redundant indicator.
# pd.get_dummies returns a new DataFrame, so the raw `df` is left untouched
# and no explicit copy is needed.
bike_df = pd.get_dummies(df, columns=["season", "weather"], drop_first=True)
# Move the target column "count" to the end of the dataframe.
bike_df = target2lastcolumn(df=bike_df, target_name="count")
# Save the encoded dataframe to a CSV file without the index column.
bike_df.to_csv("bike_sharing_demand.csv", index=False)
# Keep the preview as the last expression of the cell: a bare expression in
# the middle of a cell is evaluated but never displayed.
bike_df.head()


In [6]:
# Plain (unstyled) descriptive statistics of the raw frame `df`; the encoded
# `bike_df` is a separate object and is not summarised here.
df.describe()

Unnamed: 0,year,month,hour,weekday,temp,feel_temp,humidity,windspeed,count
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,0.502561,6.537775,11.546752,3.003683,20.376474,23.788755,0.627229,12.73654,189.463088
std,0.500008,3.438776,6.914405,2.005771,7.894801,8.592511,0.19293,8.196795,181.387599
min,0.0,1.0,0.0,0.0,0.82,0.0,0.0,0.0,1.0
25%,0.0,4.0,6.0,1.0,13.94,16.665,0.48,7.0015,40.0
50%,1.0,7.0,12.0,3.0,20.5,24.24,0.63,12.998,142.0
75%,1.0,10.0,18.0,5.0,27.06,31.06,0.78,16.9979,281.0
max,1.0,12.0,23.0,6.0,41.0,50.0,1.0,56.9969,977.0


## Generating pandas-engineered lagged features

* Let’s consider the problem of predicting the demand at the next hour given past demands.
* Since the demand is a continuous variable, one could intuitively use any regression model.
* However, we do not have the usual (X_train, y_train) dataset. Instead, we just have the y_train demand data sequentially organized by time.

In [7]:
count = df["count"]

# Every rolling statistic is computed on the series shifted by one hour, so the
# window for a given row ends at the previous hour and never includes the
# value being predicted.
previous_counts = count.shift(1)
last_day = previous_counts.rolling(24)
last_week = previous_counts.rolling(7 * 24)

# Column order: the target first, then point lags, then rolling aggregates.
lagged_df = pd.DataFrame(
    {
        "count": count,
        "lagged_count_1h": previous_counts,
        "lagged_count_2h": count.shift(2),
        "lagged_count_3h": count.shift(3),
        "lagged_count_1d": count.shift(24),
        "lagged_count_1d_1h": count.shift(24 + 1),
        "lagged_count_7d": count.shift(7 * 24),
        "lagged_count_7d_1h": count.shift(7 * 24 + 1),
        "lagged_mean_24h": last_day.mean(),
        "lagged_max_24h": last_day.max(),
        "lagged_min_24h": last_day.min(),
        "lagged_mean_7d": last_week.mean(),
        "lagged_max_7d": last_week.max(),
        "lagged_min_7d": last_week.min(),
    }
)
lagged_df.tail(10)

Unnamed: 0,count,lagged_count_1h,lagged_count_2h,lagged_count_3h,lagged_count_1d,lagged_count_1d_1h,lagged_count_7d,lagged_count_7d_1h,lagged_mean_24h,lagged_max_24h,lagged_min_24h,lagged_mean_7d,lagged_max_7d,lagged_min_7d
17369,247,203.0,224.0,157.0,160.0,169.0,70.0,135.0,93.5,224.0,1.0,67.732143,271.0,1.0
17370,315,247.0,203.0,224.0,138.0,160.0,46.0,70.0,97.125,247.0,1.0,68.785714,271.0,1.0
17371,214,315.0,247.0,203.0,133.0,138.0,33.0,46.0,104.5,315.0,1.0,70.386905,315.0,1.0
17372,164,214.0,315.0,247.0,123.0,133.0,33.0,33.0,107.875,315.0,1.0,71.464286,315.0,1.0
17373,122,164.0,214.0,315.0,125.0,123.0,26.0,33.0,109.583333,315.0,1.0,72.244048,315.0,1.0
17374,119,122.0,164.0,214.0,102.0,125.0,26.0,26.0,109.458333,315.0,1.0,72.815476,315.0,1.0
17375,89,119.0,122.0,164.0,72.0,102.0,18.0,26.0,110.166667,315.0,1.0,73.369048,315.0,1.0
17376,90,89.0,119.0,122.0,47.0,72.0,23.0,18.0,110.875,315.0,1.0,73.791667,315.0,1.0
17377,61,90.0,89.0,119.0,36.0,47.0,22.0,23.0,112.666667,315.0,1.0,74.190476,315.0,1.0
17378,49,61.0,90.0,89.0,49.0,36.0,12.0,22.0,113.708333,315.0,1.0,74.422619,315.0,1.0


In [8]:
# Inspect the first ten rows: lag columns are NaN wherever the shift or the
# rolling window reaches back before the first observation.
lagged_df.head(10)

Unnamed: 0,count,lagged_count_1h,lagged_count_2h,lagged_count_3h,lagged_count_1d,lagged_count_1d_1h,lagged_count_7d,lagged_count_7d_1h,lagged_mean_24h,lagged_max_24h,lagged_min_24h,lagged_mean_7d,lagged_max_7d,lagged_min_7d
0,16,,,,,,,,,,,,,
1,40,16.0,,,,,,,,,,,,
2,32,40.0,16.0,,,,,,,,,,,
3,13,32.0,40.0,16.0,,,,,,,,,,
4,1,13.0,32.0,40.0,,,,,,,,,,
5,1,1.0,13.0,32.0,,,,,,,,,,
6,2,1.0,1.0,13.0,,,,,,,,,,
7,3,2.0,1.0,1.0,,,,,,,,,,
8,8,3.0,2.0,1.0,,,,,,,,,,
9,14,8.0,3.0,2.0,,,,,,,,,,


* We can now separate the lagged features into a matrix X and the target variable (the counts to predict) into an array y with the same first dimension.

In [9]:
# Discard the leading rows that still contain NaN in any lag column, then
# split the frame into the target (y) and the lagged features (X).
lagged_df = lagged_df.dropna()
y = lagged_df["count"]
X = lagged_df.drop(columns="count")
print("X shape: {}\ny shape: {}".format(X.shape, y.shape))

X shape: (17210, 13)
y shape: (17210,)


In [10]:
# First ten feature rows that remain after the NaN rows were dropped.
X.head(10)

Unnamed: 0,lagged_count_1h,lagged_count_2h,lagged_count_3h,lagged_count_1d,lagged_count_1d_1h,lagged_count_7d,lagged_count_7d_1h,lagged_mean_24h,lagged_max_24h,lagged_min_24h,lagged_mean_7d,lagged_max_7d,lagged_min_7d
169,9.0,2.0,5.0,210.0,84.0,40.0,16.0,60.083333,210.0,1.0,56.416667,219.0,1.0
170,15.0,9.0,2.0,134.0,210.0,32.0,40.0,51.958333,187.0,1.0,56.267857,219.0,1.0
171,20.0,15.0,9.0,63.0,134.0,13.0,32.0,47.208333,187.0,1.0,56.196429,219.0,1.0
172,61.0,20.0,15.0,67.0,63.0,1.0,13.0,47.125,187.0,1.0,56.482143,219.0,1.0
173,62.0,61.0,20.0,59.0,67.0,1.0,1.0,46.916667,187.0,1.0,56.845238,219.0,1.0
174,98.0,62.0,61.0,73.0,59.0,2.0,1.0,48.541667,187.0,1.0,57.422619,219.0,1.0
175,102.0,98.0,62.0,50.0,73.0,3.0,2.0,49.75,187.0,1.0,58.017857,219.0,1.0
176,95.0,102.0,98.0,72.0,50.0,8.0,3.0,51.625,187.0,1.0,58.565476,219.0,1.0
177,74.0,95.0,102.0,87.0,72.0,14.0,8.0,51.708333,187.0,1.0,58.958333,219.0,1.0
178,76.0,74.0,95.0,187.0,87.0,36.0,14.0,51.25,187.0,1.0,59.327381,219.0,1.0


In [11]:
# Matching first ten target values; the index is the same as for X.
y.head(10)

169     15
170     20
171     61
172     62
173     98
174    102
175     95
176     74
177     76
178     69
Name: count, dtype: int64

In [12]:
# Append the target y to the feature matrix X (index-aligned, so y ends up as
# the last column) and write the result to a CSV file without the index.
X.join(y).to_csv("bike_sharing_demand_lagged.csv", index=False)


## Naive evaluation of the next hour bike demand regression

* Let’s randomly split our tabularized dataset to train a gradient boosting regression tree (GBRT) model and evaluate it using Mean Absolute Percentage Error (MAPE).
* If our model is aimed at forecasting (i.e., predicting future data from past data), we should not train on data that come later in time than the test data. A random split does not respect this ordering, which is why the evaluation below is only a naive baseline.
* In time series machine learning, the “i.i.d.” (independent and identically distributed) assumption does not hold, because the data points are not independent: they have a temporal relationship.

In [13]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Deliberately naive evaluation: train_test_split shuffles the rows by default,
# so training rows may lie later in time than test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fix the estimator's seed as well. With more than 10,000 training samples the
# default early_stopping="auto" is active and holds out a random validation
# subset, so without random_state the fitted model (and the reported score)
# changes from one run to the next.
model = HistGradientBoostingRegressor(random_state=42).fit(X_train, y_train)

* Let’s take a look at the performance of the model on the held-out test set.

In [14]:
from sklearn.metrics import mean_absolute_percentage_error

# Predict the held-out rows and report the mean absolute percentage error
# (returned as a fraction, not multiplied by 100).
y_pred = model.predict(X_test)
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

0.3965958240845969