# Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engine.datetime import DatetimeFeatures
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures, ExpandingWindowFeatures

%matplotlib inline

# Loading Data

In [2]:
# Load the cleaned (V2) demand table from parquet and preview its first rows.
clean_data_path = r"C:/Users/SRA/Desktop/backup/C/MLgrit/time_series_project/data/2022/V2_Clean_Data.parquet"
data = pd.read_parquet(clean_data_path)
data.head()

Unnamed: 0,passenger_demand,taxi_demand,timestamp
0,4,4,2022-03-31 15:00:00
1,3,3,2022-03-31 16:00:00
2,0,0,2022-03-31 17:00:00
3,0,0,2022-03-31 18:00:00
4,0,0,2022-03-31 19:00:00


In [3]:
# Display the loaded frame; the rendered output elides the middle rows.
data

Unnamed: 0,passenger_demand,taxi_demand,timestamp
0,4,4,2022-03-31 15:00:00
1,3,3,2022-03-31 16:00:00
2,0,0,2022-03-31 17:00:00
3,0,0,2022-03-31 18:00:00
4,0,0,2022-03-31 19:00:00
...,...,...,...
8732,8489,6242,2022-05-31 11:00:00
8733,8714,6444,2022-05-31 12:00:00
8734,9389,6915,2022-05-31 13:00:00
8735,9997,7296,2022-05-31 14:00:00


In [4]:
# Check column dtypes and non-null counts before engineering features.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8737 entries, 0 to 8736
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   passenger_demand  8737 non-null   int64         
 1   taxi_demand       8737 non-null   int64         
 2   timestamp         8737 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 204.9 KB


In [5]:
# Work on a copy: the feature helpers below add columns to `df` in place,
# so `data` keeps the frame exactly as it was loaded.
df = data.copy()

In [6]:
# df.set_index('timestamp', inplace=True)

In [7]:
# The working copy should look identical to `data` at this point.
df

Unnamed: 0,passenger_demand,taxi_demand,timestamp
0,4,4,2022-03-31 15:00:00
1,3,3,2022-03-31 16:00:00
2,0,0,2022-03-31 17:00:00
3,0,0,2022-03-31 18:00:00
4,0,0,2022-03-31 19:00:00
...,...,...,...
8732,8489,6242,2022-05-31 11:00:00
8733,8714,6444,2022-05-31 12:00:00
8734,9389,6915,2022-05-31 13:00:00
8735,9997,7296,2022-05-31 14:00:00


# Add Temporal Features

In [8]:
# DatetimeFeatures?

In [9]:
def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar and clock features derived from ``df['timestamp']``.

    The features are produced by feature_engine's ``DatetimeFeatures`` and
    written onto ``df`` as new columns named ``timestamp_<feature>``
    (e.g. ``timestamp_month``, ``timestamp_hour``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a datetime ``timestamp`` column. It is modified
        in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, now carrying the extra feature columns.
        Returning it honours the declared return type and allows chaining;
        callers that ignore the return value still see the in-place change.
    """
    features_to_extract = [
        "month", "quarter", "semester", "year", "week", "day_of_week", "day_of_month",
        "day_of_year", "weekend", "month_start", "month_end", "quarter_start",
        "quarter_end", "year_start", "year_end", "leap_year", "days_in_month",
        "hour", "minute", "second",
    ]

    # Only the timestamp column is handed to the transformer, so every column
    # it returns is a newly derived feature.
    temporal = DatetimeFeatures(features_to_extract=features_to_extract).fit_transform(df[['timestamp']])
    for col in temporal.columns:
        # Positional copy: `temporal` has one row per row of `df`, in the same order.
        df[col] = temporal[col].to_numpy()
    return df

In [10]:
# Datetime components to derive from the timestamp column, coarse to fine.
features_to_extract = [
    "month",
    "quarter",
    "semester",
    "year",
    "week",
    "day_of_week",
    "day_of_month",
    "day_of_year",
    "weekend",
    "month_start",
    "month_end",
    "quarter_start",
    "quarter_end",
    "year_start",
    "year_end",
    "leap_year",
    "days_in_month",
    "hour",
    "minute",
    "second",
]

In [11]:
# Preview the datetime features on their own before attaching them to `df`.
datetime_encoder = DatetimeFeatures(features_to_extract=features_to_extract)
temporal = datetime_encoder.fit_transform(df[['timestamp']])
temporal.head()

Unnamed: 0,timestamp_month,timestamp_quarter,timestamp_semester,timestamp_year,timestamp_week,timestamp_day_of_week,timestamp_day_of_month,timestamp_day_of_year,timestamp_weekend,timestamp_month_start,timestamp_month_end,timestamp_quarter_start,timestamp_quarter_end,timestamp_year_start,timestamp_year_end,timestamp_leap_year,timestamp_days_in_month,timestamp_hour,timestamp_minute,timestamp_second
0,3,1,1,2022,13,3,31,90,0,0,1,0,1,0,0,0,31,15,0,0
1,3,1,1,2022,13,3,31,90,0,0,1,0,1,0,0,0,31,16,0,0
2,3,1,1,2022,13,3,31,90,0,0,1,0,1,0,0,0,31,17,0,0
3,3,1,1,2022,13,3,31,90,0,0,1,0,1,0,0,0,31,18,0,0
4,3,1,1,2022,13,3,31,90,0,0,1,0,1,0,0,0,31,19,0,0


In [12]:
# Transposed so each extracted feature is a row, which is easier to scan than 20 columns.
temporal.head().T

Unnamed: 0,0,1,2,3,4
timestamp_month,3,3,3,3,3
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_year,2022,2022,2022,2022,2022
timestamp_week,13,13,13,13,13
timestamp_day_of_week,3,3,3,3,3
timestamp_day_of_month,31,31,31,31,31
timestamp_day_of_year,90,90,90,90,90
timestamp_weekend,0,0,0,0,0
timestamp_month_start,0,0,0,0,0


In [13]:
# Attach the datetime feature columns to `df` (the function writes them in place).
add_temporal_features(df)

In [14]:
# The `timestamp_*` columns should now follow the three original columns.
df.head().T

Unnamed: 0,0,1,2,3,4
passenger_demand,4,3,0,0,0
taxi_demand,4,3,0,0,0
timestamp,2022-03-31 15:00:00,2022-03-31 16:00:00,2022-03-31 17:00:00,2022-03-31 18:00:00,2022-03-31 19:00:00
timestamp_month,3,3,3,3,3
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_year,2022,2022,2022,2022,2022
timestamp_week,13,13,13,13,13
timestamp_day_of_week,3,3,3,3,3
timestamp_day_of_month,31,31,31,31,31


# LagFeatures

<b>Lag:</b> In various fields, "lag" often refers to a delay or a period of time between two events or actions. For instance, in economics or finance, it might represent the delay between a policy change and its effects on the economy. In statistics, a lag is a shift in time for a variable in a time series analysis. It's used to observe the relationship between past and present values of a variable.

In [15]:
# LagFeatures?

In [16]:
# Exploratory run: build lag features on a three-column slice and inspect the result.
lag_input = df[['timestamp', 'passenger_demand', 'taxi_demand']]
lagfeatures = LagFeatures(
    variables=None,
    periods=[1, 2, 4, 8, 16, 24],
    freq=None,
    sort_index=True,
    missing_values='raise',
    drop_original=False,
)
features = lagfeatures.fit(lag_input).transform(lag_input)
features

Unnamed: 0,timestamp,passenger_demand,taxi_demand,passenger_demand_lag_1,taxi_demand_lag_1,passenger_demand_lag_2,taxi_demand_lag_2,passenger_demand_lag_4,taxi_demand_lag_4,passenger_demand_lag_8,taxi_demand_lag_8,passenger_demand_lag_16,taxi_demand_lag_16,passenger_demand_lag_24,taxi_demand_lag_24
0,2022-03-31 15:00:00,4,4,,,,,,,,,,,,
1,2022-03-31 16:00:00,3,3,4.0,4.0,,,,,,,,,,
2,2022-03-31 17:00:00,0,0,3.0,3.0,4.0,4.0,,,,,,,,
3,2022-03-31 18:00:00,0,0,0.0,0.0,3.0,3.0,,,,,,,,
4,2022-03-31 19:00:00,0,0,0.0,0.0,0.0,0.0,4.0,4.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8732,2022-05-31 11:00:00,8489,6242,7741.0,5744.0,7626.0,5787.0,5599.0,4318.0,342.0,231.0,7850.0,5418.0,6166.0,4208.0
8733,2022-05-31 12:00:00,8714,6444,8489.0,6242.0,7741.0,5744.0,7174.0,5559.0,454.0,321.0,6974.0,4783.0,7738.0,5135.0
8734,2022-05-31 13:00:00,9389,6915,8714.0,6444.0,8489.0,6242.0,7626.0,5787.0,1104.0,808.0,6762.0,4628.0,8225.0,5475.0
8735,2022-05-31 14:00:00,9997,7296,9389.0,6915.0,8714.0,6444.0,7741.0,5744.0,2763.0,2182.0,5796.0,3980.0,8518.0,5735.0


In [17]:
def add_lag_features(df: pd.DataFrame, periods=(1, 2, 4, 8, 16, 24)) -> pd.DataFrame:
    """Add lagged copies of the demand columns to ``df``.

    feature_engine's ``LagFeatures`` is fitted on the ``timestamp``,
    ``passenger_demand`` and ``taxi_demand`` columns; each generated column
    (named ``<variable>_lag_<k>``) is then written onto ``df`` in place.
    Because ``freq=None``, a lag of ``k`` shifts by ``k`` rows, so the
    earliest rows of every lag column are NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding ``timestamp``, ``passenger_demand`` and ``taxi_demand``.
        It is modified in place.
    periods : iterable of int, optional
        Row offsets to lag by. Defaults to ``(1, 2, 4, 8, 16, 24)``, the values
        this function previously hard-coded.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the lag columns added.
    """
    source_cols = ['timestamp', 'passenger_demand', 'taxi_demand']
    lagfeatures = LagFeatures(variables=None, periods=list(periods), freq=None, sort_index=True,
                              missing_values='raise', drop_original=False)
    features = lagfeatures.fit_transform(df[source_cols])

    # Pick the generated columns by name instead of assuming they start at
    # position 3 of the transformer output.
    new_cols = [col for col in features.columns if col not in source_cols]
    for col in new_cols:
        # The transformer runs with sort_index=True, so its rows may come back
        # in index order rather than in df's row order. Assigning the Series
        # (not its raw values) lets pandas align on the index.
        df[col] = features[col]
    return df

In [18]:
# Attach the lag columns to `df` (the function writes them in place).
add_lag_features(df)

In [19]:
# Inspect the first rows after adding the lag columns (transposed: one row per column).
df.head().T

Unnamed: 0,0,1,2,3,4
passenger_demand,4,3,0,0,0
taxi_demand,4,3,0,0,0
timestamp,2022-03-31 15:00:00,2022-03-31 16:00:00,2022-03-31 17:00:00,2022-03-31 18:00:00,2022-03-31 19:00:00
timestamp_month,3,3,3,3,3
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_year,2022,2022,2022,2022,2022
timestamp_week,13,13,13,13,13
timestamp_day_of_week,3,3,3,3,3
timestamp_day_of_month,31,31,31,31,31


# WindowFeatures

<b>Windowing in Data Analysis:</b> In data analysis or signal processing, a window function is a mathematical function that modifies a time series or sequence of data to emphasize or de-emphasize certain points. It's often used in techniques like Fourier analysis, signal processing, and data smoothing. The function "windows" or narrows the focus to a specific section of the data, reducing the impact of values outside that section.

Both concepts are crucial in various domains, helping to understand patterns, relationships, and the behavior of data over time.

The mean value of the previous 3 months of data is a window feature. The
maximum value of the previous three rows of data is another window feature.

In [20]:
# WindowFeatures?

In [21]:
def add_window_features(df: pd.DataFrame, window=7, functions=('mean', 'std', 'median')) -> pd.DataFrame:
    """Add rolling-window statistics of the demand columns to ``df``.

    feature_engine's ``WindowFeatures`` is fitted on the ``timestamp``,
    ``passenger_demand`` and ``taxi_demand`` columns; each generated column
    (named like ``passenger_demand_window_7_mean``) is then written onto
    ``df`` in place. ``periods=1`` is passed to the transformer, which in this
    notebook's output leaves the first rows of each new column as NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding ``timestamp``, ``passenger_demand`` and ``taxi_demand``.
        It is modified in place.
    window : optional
        Window size forwarded to ``WindowFeatures``. Defaults to ``7``, the
        value this function previously hard-coded.
    functions : str or iterable of str, optional
        Aggregations to compute per window. Defaults to mean, std and median.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the window columns added.
    """
    source_cols = ['timestamp', 'passenger_demand', 'taxi_demand']
    # Accept a single function name as well as any iterable of names.
    function_list = [functions] if isinstance(functions, str) else list(functions)
    window_transformer = WindowFeatures(
        variables=None, window=window, min_periods=1,
        functions=function_list, periods=1, freq=None, sort_index=True,
        missing_values='raise', drop_original=False
    )
    features = window_transformer.fit_transform(df[source_cols])

    # Pick the generated columns by name instead of assuming they start at
    # position 3 of the transformer output.
    new_cols = [col for col in features.columns if col not in source_cols]
    for col in new_cols:
        # sort_index=True may reorder the transformer output; assigning the
        # Series lets pandas align on the index instead of on row position.
        df[col] = features[col]
    return df

In [22]:
# Stand-alone transformer with the same settings as add_window_features, for inspection.
window = WindowFeatures(
    variables=None,
    window=7,
    min_periods=1,
    functions=['mean', 'std', 'median'],
    periods=1,
    freq=None,
    sort_index=True,
    missing_values='raise',
    drop_original=False,
)

In [23]:
# Preview the rolling statistics on the first 10 rows (transposed: one row per column).
window.fit_transform(df[['timestamp', 'passenger_demand', 'taxi_demand']]).head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
timestamp,2022-03-31 15:00:00,2022-03-31 16:00:00,2022-03-31 17:00:00,2022-03-31 18:00:00,2022-03-31 19:00:00,2022-03-31 20:00:00,2022-03-31 21:00:00,2022-03-31 22:00:00,2022-03-31 23:00:00,2022-04-01 00:00:00
passenger_demand,4,3,0,0,0,0,0,2,121,5046
taxi_demand,4,3,0,0,0,0,0,1,65,3509
passenger_demand_window_7_mean,,,,,,,,1.0,0.714286,17.571429
passenger_demand_window_7_std,,,,,,,,1.732051,1.253566,45.613803
passenger_demand_window_7_median,,,,,,,,0.0,0.0,0.0
taxi_demand_window_7_mean,,,,,,,,1.0,0.571429,9.428571
taxi_demand_window_7_std,,,,,,,,1.732051,1.133893,24.50753
taxi_demand_window_7_median,,,,,,,,0.0,0.0,0.0


In [24]:
# Attach the rolling-window columns to `df` (the function writes them in place).
add_window_features(df)

In [25]:
# Inspect the first rows after adding the window columns (transposed: one row per column).
df.head().T

Unnamed: 0,0,1,2,3,4
passenger_demand,4,3,0,0,0
taxi_demand,4,3,0,0,0
timestamp,2022-03-31 15:00:00,2022-03-31 16:00:00,2022-03-31 17:00:00,2022-03-31 18:00:00,2022-03-31 19:00:00
timestamp_month,3,3,3,3,3
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_year,2022,2022,2022,2022,2022
timestamp_week,13,13,13,13,13
timestamp_day_of_week,3,3,3,3,3
timestamp_day_of_month,31,31,31,31,31


# ExpandingWindowFeatures

In [26]:
# ExpandingWindowFeatures?

In [27]:
def add_exp_window_features(df: pd.DataFrame, functions='std') -> pd.DataFrame:
    """Add expanding-window statistics of the demand columns to ``df``.

    feature_engine's ``ExpandingWindowFeatures`` is fitted on the
    ``timestamp``, ``passenger_demand`` and ``taxi_demand`` columns; each
    generated column (named like ``passenger_demand_expanding_std``) is then
    written onto ``df`` in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding ``timestamp``, ``passenger_demand`` and ``taxi_demand``.
        It is modified in place.
    functions : str or iterable of str, optional
        Aggregation(s) to compute over the expanding window. Defaults to
        ``'std'``, the value this function previously hard-coded.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the expanding-window columns added.
    """
    source_cols = ['timestamp', 'passenger_demand', 'taxi_demand']
    # A single name is forwarded unchanged; other iterables are normalised to a list.
    function_arg = functions if isinstance(functions, str) else list(functions)
    expwindow = ExpandingWindowFeatures(
        variables=None, min_periods=None, functions=function_arg,
        periods=1, freq=None, sort_index=True,
        missing_values='raise', drop_original=False
    )
    features = expwindow.fit_transform(df[source_cols])

    # Pick the generated columns by name instead of assuming they start at
    # position 3 of the transformer output.
    new_cols = [col for col in features.columns if col not in source_cols]
    for col in new_cols:
        # sort_index=True may reorder the transformer output; assigning the
        # Series lets pandas align on the index instead of on row position.
        df[col] = features[col]
    return df

In [28]:
# Stand-alone transformer with the same settings as add_exp_window_features, for inspection.
expwindow = ExpandingWindowFeatures(
    variables=None,
    min_periods=None,
    functions='std',
    periods=1,
    freq=None,
    sort_index=True,
    missing_values='raise',
    drop_original=False,
)

In [29]:
# Preview the expanding std on the first 10 rows (transposed: one row per column).
expwindow.fit_transform(df[['timestamp', 'passenger_demand', 'taxi_demand']]).head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
timestamp,2022-03-31 15:00:00,2022-03-31 16:00:00,2022-03-31 17:00:00,2022-03-31 18:00:00,2022-03-31 19:00:00,2022-03-31 20:00:00,2022-03-31 21:00:00,2022-03-31 22:00:00,2022-03-31 23:00:00,2022-04-01 00:00:00
passenger_demand,4,3,0,0,0,0,0,2,121,5046
taxi_demand,4,3,0,0,0,0,0,1,65,3509
passenger_demand_expanding_std,,,0.707107,2.081666,2.061553,1.949359,1.834848,1.732051,1.642081,39.987845
taxi_demand_expanding_std,,,0.707107,2.081666,2.061553,1.949359,1.834848,1.732051,1.603567,21.386003


In [30]:
# Attach the expanding-window columns to `df` (the function writes them in place).
add_exp_window_features(df)

In [31]:
# Inspect the first rows after adding the expanding-window columns (transposed: one row per column).
df.head().T

Unnamed: 0,0,1,2,3,4
passenger_demand,4,3,0,0,0
taxi_demand,4,3,0,0,0
timestamp,2022-03-31 15:00:00,2022-03-31 16:00:00,2022-03-31 17:00:00,2022-03-31 18:00:00,2022-03-31 19:00:00
timestamp_month,3,3,3,3,3
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_year,2022,2022,2022,2022,2022
timestamp_week,13,13,13,13,13
timestamp_day_of_week,3,3,3,3,3
timestamp_day_of_month,31,31,31,31,31


In [32]:
# Drop every row that still contains a NaN. The lag/window/expanding features
# are undefined for the earliest rows (e.g. the 24-step lag needs 24 prior rows),
# so this trims the start of the series.
df.dropna(inplace=True)

In [33]:
# Let's check again for NaN values

In [34]:
# First rows that remain after dropna; the index is not reset, so it no longer starts at 0.
df.head(7).T

Unnamed: 0,24,25,26,27,28,29,30
passenger_demand,12127,11245,13665,14770,13868,11739,11102
taxi_demand,8487,7888,9788,10527,9697,7972,7380
timestamp,2022-04-01 15:00:00,2022-04-01 16:00:00,2022-04-01 17:00:00,2022-04-01 18:00:00,2022-04-01 19:00:00,2022-04-01 20:00:00,2022-04-01 21:00:00
timestamp_month,4,4,4,4,4,4,4
timestamp_quarter,2,2,2,2,2,2,2
timestamp_semester,1,1,1,1,1,1,1
timestamp_year,2022,2022,2022,2022,2022,2022,2022
timestamp_week,13,13,13,13,13,13,13
timestamp_day_of_week,4,4,4,4,4,4,4
timestamp_day_of_month,1,1,1,1,1,1,1


# Let's Save the Feature-Engineered Data

In [35]:
# Persist the feature-engineered frame (V3) as parquet, without writing the index.
features_output_path = r"C:/Users/SRA/Desktop/backup/C/MLgrit/time_series_project/data/2022/V3_FeatureEngineering_Data.parquet"
df.to_parquet(features_output_path, index=False)