In [1]:
import numpy as np
import pandas as pd
from feature_engine.datetime import DatetimeFeatures

In [2]:
# Load the demand table from parquet.
# NOTE(review): this is an absolute Windows path, so the cell only runs on a
# machine that has the same d:/ layout -- consider a configurable data folder.
data = pd.read_parquet('d:/uber-taxi-demand/data/yellow_trip_v2.parquet')

In [3]:
# Preview the loaded frame (pandas abbreviates long frames to the first and last rows).
data

Unnamed: 0,passengerDemand,taxiDemand,timestamp
0,6838,4261,2022-01-01 00:00:00
1,7738,4997,2022-01-01 01:00:00
2,5974,3901,2022-01-01 02:00:00
3,4107,2686,2022-01-01 03:00:00
4,2485,1651,2022-01-01 04:00:00
...,...,...,...
8732,0,0,2022-12-30 20:00:00
8733,0,0,2022-12-30 21:00:00
8734,7092,4547,2022-12-30 22:00:00
8735,6589,4336,2022-12-30 23:00:00


In [4]:
# Check column dtypes and non-null counts before engineering features.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8737 entries, 0 to 8736
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   passengerDemand  8737 non-null   int64         
 1   taxiDemand       8737 non-null   int64         
 2   timestamp        8737 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 204.9 KB


In [5]:
# Work on a copy: the feature helpers below add columns to the frame they are
# given, and `data` should stay as loaded.
df = data.copy()

In [6]:
# Preview the working copy.
df

Unnamed: 0,passengerDemand,taxiDemand,timestamp
0,6838,4261,2022-01-01 00:00:00
1,7738,4997,2022-01-01 01:00:00
2,5974,3901,2022-01-01 02:00:00
3,4107,2686,2022-01-01 03:00:00
4,2485,1651,2022-01-01 04:00:00
...,...,...,...
8732,0,0,2022-12-30 20:00:00
8733,0,0,2022-12-30 21:00:00
8734,7092,4547,2022-12-30 22:00:00
8735,6589,4336,2022-12-30 23:00:00


## Add Temporal Features

Derive calendar attributes (month, quarter, day of week, hour, ...) from the `timestamp` column.

In [7]:
def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar features derived from ``df['timestamp']`` to ``df``.

    ``DatetimeFeatures`` is run on the ``timestamp`` column only, and every
    column it produces (e.g. ``timestamp_month``) is copied onto ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``timestamp`` column. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned so the call can be chained. In a
        notebook, end the calling line with ``;`` if the frame should not be
        displayed.
    """
    features_to_extract = [
        "month", "quarter", "semester", "week", "day_of_week", "day_of_month",
        "day_of_year", "weekend", "month_start", "month_end", "quarter_start",
        "quarter_end", "year_start", "year_end", "hour",
    ]

    extractor = DatetimeFeatures(features_to_extract=features_to_extract)
    temporal = extractor.fit_transform(df[['timestamp']])

    # Assign Series (not raw arrays) so rows are matched on index labels
    # rather than on position.
    for col in temporal.columns:
        df[col] = temporal[col]
    return df

In [8]:
# Calendar attributes to derive from the timestamp column, one per line.
features_to_extract = [
    "month",
    "quarter",
    "semester",
    "week",
    "day_of_week",
    "day_of_month",
    "day_of_year",
    "weekend",
    "month_start",
    "month_end",
    "quarter_start",
    "quarter_end",
    "year_start",
    "year_end",
    "hour",
]


In [9]:
# Exploratory run: extract the calendar features from the timestamp column
# and show the first rows with one feature per line.
temporal = (
    DatetimeFeatures(features_to_extract=features_to_extract)
    .fit_transform(df[['timestamp']])
)
temporal.head().T

Unnamed: 0,0,1,2,3,4
timestamp_month,1,1,1,1,1
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_week,52,52,52,52,52
timestamp_day_of_week,5,5,5,5,5
timestamp_day_of_month,1,1,1,1,1
timestamp_day_of_year,1,1,1,1,1
timestamp_weekend,1,1,1,1,1
timestamp_month_start,1,1,1,1,1
timestamp_month_end,0,0,0,0,0


In [10]:
# Attach the calendar columns to the working frame (the helper writes onto `df` itself).
add_temporal_features(df)

In [11]:
# Transposed preview of the first five rows, so each column reads as one line.
df.head().T

Unnamed: 0,0,1,2,3,4
passengerDemand,6838,7738,5974,4107,2485
taxiDemand,4261,4997,3901,2686,1651
timestamp,2022-01-01 00:00:00,2022-01-01 01:00:00,2022-01-01 02:00:00,2022-01-01 03:00:00,2022-01-01 04:00:00
timestamp_month,1,1,1,1,1
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_week,52,52,52,52,52
timestamp_day_of_week,5,5,5,5,5
timestamp_day_of_month,1,1,1,1,1
timestamp_day_of_year,1,1,1,1,1


<center><b>Lag Features</b></center>

In [12]:
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures, ExpandingWindowFeatures

In [13]:
# Exploratory run: lag both demand columns by 1, 2, 4, 8, 16 and 24 rows and
# inspect the result before wrapping the logic in a helper.
lagfeatures = LagFeatures(
    variables=None,
    periods=[1, 2, 4, 8, 16, 24],
    freq=None,
    sort_index=True,
    missing_values='raise',
    drop_original=False,
)
lagfeatures.fit(
    df[['timestamp', 'passengerDemand', 'taxiDemand']]
)
features = lagfeatures.transform(
    df[['timestamp', 'passengerDemand', 'taxiDemand']]
)
features

Unnamed: 0,timestamp,passengerDemand,taxiDemand,passengerDemand_lag_1,taxiDemand_lag_1,passengerDemand_lag_2,taxiDemand_lag_2,passengerDemand_lag_4,taxiDemand_lag_4,passengerDemand_lag_8,taxiDemand_lag_8,passengerDemand_lag_16,taxiDemand_lag_16,passengerDemand_lag_24,taxiDemand_lag_24
0,2022-01-01 00:00:00,6838,4261,,,,,,,,,,,,
1,2022-01-01 01:00:00,7738,4997,6838.0,4261.0,,,,,,,,,,
2,2022-01-01 02:00:00,5974,3901,7738.0,4997.0,6838.0,4261.0,,,,,,,,
3,2022-01-01 03:00:00,4107,2686,5974.0,3901.0,7738.0,4997.0,,,,,,,,
4,2022-01-01 04:00:00,2485,1651,4107.0,2686.0,5974.0,3901.0,6838.0,4261.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8732,2022-12-30 20:00:00,0,0,0.0,0.0,0.0,0.0,10305.0,6635.0,0.0,0.0,703.0,565.0,0.0,0.0
8733,2022-12-30 21:00:00,0,0,0.0,0.0,0.0,0.0,9550.0,6363.0,8689.0,5779.0,0.0,0.0,0.0,0.0
8734,2022-12-30 22:00:00,7092,4547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6813.0,4500.0
8735,2022-12-30 23:00:00,6589,4336,7092.0,4547.0,0.0,0.0,0.0,0.0,9672.0,6382.0,0.0,0.0,6435.0,4372.0


In [14]:
def add_lag_features(df:pd.DataFrame) -> pd.DataFrame:
    """Add lagged copies of the demand columns to ``df``.

    ``LagFeatures`` is run on the ``timestamp``, ``passengerDemand`` and
    ``taxiDemand`` columns with lags of 1, 2, 4, 8, 16 and 24 rows. Every
    column the transformer adds (named ``<variable>_lag_<n>``) is copied onto
    ``df``. The earliest ``n`` rows of a lag-``n`` column have no history and
    are NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``timestamp``, ``passengerDemand`` and
        ``taxiDemand``. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned so the call can be chained. In a
        notebook, end the calling line with ``;`` if the frame should not be
        displayed.
    """
    base = df[['timestamp', 'passengerDemand', 'taxiDemand']]
    lagfeatures = LagFeatures(
        variables=None, periods=[1, 2, 4, 8, 16, 24], freq=None,
        sort_index=True, missing_values='raise', drop_original=False,
    )
    lagfeatures.fit(base)
    features = lagfeatures.transform(base)

    # Pick the generated columns by name instead of assuming they start at
    # position 3 of the transformer output.
    new_columns = [col for col in features.columns if col not in base.columns]

    # The transformer sorts by index (sort_index=True). Assigning the Series
    # aligns on index labels, so rows stay matched even when `df` itself is
    # not in sorted order; assigning raw `.values` would pair rows by position.
    for col in new_columns:
        df[col] = features[col]
    return df

In [15]:
# Attach the lag columns to the working frame (the helper writes onto `df` itself).
add_lag_features(df)

In [16]:
# Transposed preview of the first five rows after adding the lag columns.
df.head().T

Unnamed: 0,0,1,2,3,4
passengerDemand,6838,7738,5974,4107,2485
taxiDemand,4261,4997,3901,2686,1651
timestamp,2022-01-01 00:00:00,2022-01-01 01:00:00,2022-01-01 02:00:00,2022-01-01 03:00:00,2022-01-01 04:00:00
timestamp_month,1,1,1,1,1
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_week,52,52,52,52,52
timestamp_day_of_week,5,5,5,5,5
timestamp_day_of_month,1,1,1,1,1
timestamp_day_of_year,1,1,1,1,1


<center><b>Window Features</b></center>

In [26]:
def add_window_features(df:pd.DataFrame) -> pd.DataFrame:
    """Add rolling-window statistics of the demand columns to ``df``.

    ``WindowFeatures`` is run on the ``timestamp``, ``passengerDemand`` and
    ``taxiDemand`` columns with a 7-row window, the functions ``mean``,
    ``std`` and ``median``, and ``periods=1``. Every column the transformer
    adds (named ``<variable>_window_7_<function>``) is copied onto ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``timestamp``, ``passengerDemand`` and
        ``taxiDemand``. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned so the call can be chained. In a
        notebook, end the calling line with ``;`` if the frame should not be
        displayed.
    """
    base = df[['timestamp', 'passengerDemand', 'taxiDemand']]
    window = WindowFeatures(
        variables=None, window=7, min_periods=1,
        functions=['mean', 'std', 'median'], periods=1, freq=None, sort_index=True,
        missing_values='raise', drop_original=False
    )
    window.fit(base)
    features = window.transform(base)

    # Pick the generated columns by name instead of assuming they start at
    # position 3 of the transformer output.
    new_columns = [col for col in features.columns if col not in base.columns]

    # The transformer sorts by index (sort_index=True). Assigning the Series
    # aligns on index labels, so rows stay matched even when `df` itself is
    # not in sorted order; assigning raw `.values` would pair rows by position.
    for col in new_columns:
        df[col] = features[col]
    return df

In [27]:
# Stand-alone transformer with the same settings as the helper, used for the
# preview in the next cell.
window = WindowFeatures(
    variables=None,
    window=7,
    min_periods=1,
    functions=['mean', 'std', 'median'],
    periods=1,
    freq=None,
    sort_index=True,
    missing_values='raise',
    drop_original=False,
)

In [28]:
# Preview the first ten rows of the rolling statistics, one feature per line.
window.fit_transform(df[['timestamp', 'passengerDemand', 'taxiDemand']]).head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
timestamp,2022-01-01 00:00:00,2022-01-01 01:00:00,2022-01-01 02:00:00,2022-01-01 03:00:00,2022-01-01 04:00:00,2022-01-01 05:00:00,2022-01-01 06:00:00,2022-01-01 07:00:00,2022-01-01 08:00:00,2022-01-01 09:00:00
passengerDemand,6838,7738,5974,4107,2485,1215,928,1244,1233,1868
taxiDemand,4261,4997,3901,2686,1651,827,657,848,849,1254
passengerDemand_window_7_mean,,,,,,,,4183.571429,3384.428571,2455.142857
passengerDemand_window_7_std,,,,,,,,2744.320126,2655.574066,1912.350162
passengerDemand_window_7_median,,,,,,,,4107.0,2485.0,1244.0
taxiDemand_window_7_mean,,,,,,,,2711.428571,2223.857143,1631.285714
taxiDemand_window_7_std,,,,,,,,1729.016662,1700.201207,1230.584511
taxiDemand_window_7_median,,,,,,,,2686.0,1651.0,849.0


In [29]:
# Attach the rolling-window columns to the working frame (the helper writes onto `df` itself).
add_window_features(df)

In [31]:
# Transposed preview of the first five rows after adding the window columns.
df.head().T

Unnamed: 0,0,1,2,3,4
passengerDemand,6838,7738,5974,4107,2485
taxiDemand,4261,4997,3901,2686,1651
timestamp,2022-01-01 00:00:00,2022-01-01 01:00:00,2022-01-01 02:00:00,2022-01-01 03:00:00,2022-01-01 04:00:00
timestamp_month,1,1,1,1,1
timestamp_quarter,1,1,1,1,1
timestamp_semester,1,1,1,1,1
timestamp_week,52,52,52,52,52
timestamp_day_of_week,5,5,5,5,5
timestamp_day_of_month,1,1,1,1,1
timestamp_day_of_year,1,1,1,1,1


<center><b>Expanding Window Features</b></center>

In [38]:
def add_exp_window_features(df:pd.DataFrame) -> pd.DataFrame:
    """Add expanding-window standard deviations of the demand columns to ``df``.

    ``ExpandingWindowFeatures`` is run on the ``timestamp``,
    ``passengerDemand`` and ``taxiDemand`` columns with ``functions='std'``
    and ``periods=1``. Every column the transformer adds (named
    ``<variable>_expanding_std``) is copied onto ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``timestamp``, ``passengerDemand`` and
        ``taxiDemand``. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned so the call can be chained. In a
        notebook, end the calling line with ``;`` if the frame should not be
        displayed.
    """
    base = df[['timestamp', 'passengerDemand', 'taxiDemand']]
    expwindow = ExpandingWindowFeatures(
        variables=None, min_periods=None, functions='std',
        periods=1, freq=None, sort_index=True,
        missing_values='raise', drop_original=False
    )
    expwindow.fit(base)
    features = expwindow.transform(base)

    # Pick the generated columns by name instead of assuming they start at
    # position 3 of the transformer output.
    new_columns = [col for col in features.columns if col not in base.columns]

    # The transformer sorts by index (sort_index=True). Assigning the Series
    # aligns on index labels, so rows stay matched even when `df` itself is
    # not in sorted order; assigning raw `.values` would pair rows by position.
    for col in new_columns:
        df[col] = features[col]
    return df

In [39]:
# Exploratory run: expanding standard deviation of both demand columns,
# shown for the first rows with one feature per line.
expwindow = ExpandingWindowFeatures(
    variables=None,
    min_periods=None,
    functions='std',
    periods=1,
    freq=None,
    sort_index=True,
    missing_values='raise',
    drop_original=False,
)
expwindow.fit(
    df[['timestamp', 'passengerDemand', 'taxiDemand']]
)
expwindow.transform(
    df[['timestamp', 'passengerDemand', 'taxiDemand']]
).head().T

Unnamed: 0,0,1,2,3,4
timestamp,2022-01-01 00:00:00,2022-01-01 01:00:00,2022-01-01 02:00:00,2022-01-01 03:00:00,2022-01-01 04:00:00
passengerDemand,6838,7738,5974,4107,2485
taxiDemand,4261,4997,3901,2686,1651
passengerDemand_expanding_std,,,636.396103,882.061222,1549.096592
taxiDemand_expanding_std,,,520.430591,558.645982,964.800627


In [40]:
# Attach the expanding-window columns to the working frame (the helper writes onto `df` itself).
add_exp_window_features(df)

In [42]:
# Transposed preview of the first seven rows after all feature helpers have run.
df.head(7).T

Unnamed: 0,0,1,2,3,4,5,6
passengerDemand,6838,7738,5974,4107,2485,1215,928
taxiDemand,4261,4997,3901,2686,1651,827,657
timestamp,2022-01-01 00:00:00,2022-01-01 01:00:00,2022-01-01 02:00:00,2022-01-01 03:00:00,2022-01-01 04:00:00,2022-01-01 05:00:00,2022-01-01 06:00:00
timestamp_month,1,1,1,1,1,1,1
timestamp_quarter,1,1,1,1,1,1,1
timestamp_semester,1,1,1,1,1,1,1
timestamp_week,52,52,52,52,52,52,52
timestamp_day_of_week,5,5,5,5,5,5,5
timestamp_day_of_month,1,1,1,1,1,1,1
timestamp_day_of_year,1,1,1,1,1,1,1


In [43]:
# The lag, rolling-window and expanding columns are NaN for the earliest rows
# (there is no history to compute them from), so drop every row that still
# has a missing value. Rebinding the name instead of using `inplace=True`
# keeps the step explicit and chainable.
df = df.dropna()

In [44]:
# Persist the engineered feature table; `index=False` leaves the row index out of the file.
# NOTE(review): absolute Windows path -- only valid on a machine with the same d:/ layout.
df.to_parquet('d:/uber-taxi-demand/data/yellow_trip_features_v3.parquet', index=False)