In [1]:
import os
import pandas as pd

# Root folder of the project. Defaults to the original local checkout, but can be
# overridden per machine through the TAXI_DEMAND_PROJECT_PATH environment variable
# so the notebook is not tied to a single user's home directory.
# os.path.join(..., '') guarantees a trailing separator, which the code that builds
# sub-paths with `project_path + '...'` depends on.
project_path = os.path.join(
    os.environ.get('TAXI_DEMAND_PROJECT_PATH')
    or '/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/',
    '',
)

In [2]:
# Folder holding the transformed parquet files.
transformed_path = project_path + 'data/transformed/'
# Keep only entries with the .parquet extension: a plain substring match would also
# pick up unrelated files whose name merely contains "parquet".
# os.listdir returns entries in arbitrary order, so sort to make
# transformed_files[0] deterministic.
transformed_files = sorted(
    f for f in os.listdir(transformed_path) if f.endswith('.parquet')
)

## 1. Windowing (Sliding Windows)

- For a particular location, use the previous 24 hours of data to predict the number of rides in the next hour
    - So, window size = features = list of 24 values
        - If we consider one week's history, features = 24*7 = 168 values, plus any additional features required
- Each window slides forward by one step, which usually means one hour
    - So, step size = 1
        - If we let the window slide 1 day, step size = 24

In [3]:
# Load the first file of the (name-sorted) transformed file list to prototype the windowing on.
first_file_path = transformed_path + transformed_files[0]
df = pd.read_parquet(first_file_path)
df

Unnamed: 0,pickup_time,pickup_location,count_pickup_loc
0,2022-01-01 00:00:00,7,6
1,2022-01-01 00:00:00,256,5
2,2022-01-01 00:00:00,125,27
3,2022-01-01 00:00:00,127,2
4,2022-01-01 00:00:00,129,2
...,...,...,...
191203,2022-01-31 23:00:00,255,0
191204,2022-01-31 23:00:00,257,0
191205,2022-01-31 23:00:00,258,0
191206,2022-01-31 23:00:00,259,0


In [4]:
def transform_timeseriesdata_into_features_target(df,window_size,step_size):
    """Slice per-location ride counts into sliding-window features and targets.

    For every value of ``pickup_location`` the rows are ordered by
    ``pickup_time`` and cut into windows of ``window_size`` consecutive rows.
    Each window becomes one feature row; the row immediately after the window
    supplies the target value and the ``pickup_hour`` stored with the features.
    Consecutive windows start ``step_size`` rows apart.

    Windows are built by row position, not by timestamp arithmetic, so the
    column names ("previous N hours") are only accurate when each location has
    exactly one row per hour with no gaps -- this is assumed, not checked.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pickup_time``, ``pickup_location`` and
        ``count_pickup_loc``. Rows may be in any order.
    window_size : int
        Number of past rows used as features (must be >= 1).
    step_size : int
        Number of rows the window advances between samples (must be >= 1).

    Returns
    -------
    features : pandas.DataFrame
        Columns ``rides_previous_{window_size}_hours`` ... ``rides_previous_1_hours``,
        followed by ``pickup_hour`` and ``pickup_location_id``. Rows are grouped
        by location in ascending location order, with a fresh 0..n-1 index.
    target : pandas.DataFrame
        Single column ``target_rides_next_hour``, row-aligned with ``features``.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is smaller than 1.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError('window_size and step_size must both be >= 1')

    col_names = [f'rides_previous_{el}_hours' for el in range(window_size,0,-1)]

    features = []
    target = []

    # groupby splits the frame in one pass (instead of one boolean mask per
    # location) and, with sort=True, yields locations in ascending order.
    for loc_id, df_single_loc in df.groupby('pickup_location', sort=True):
        # The windows are positional, so the rows must be chronological.
        # Sort explicitly rather than trusting the caller's row order; a stable
        # sort keeps the incoming order for rows with equal timestamps.
        df_single_loc = df_single_loc.sort_values('pickup_time', kind='stable')
        rides = df_single_loc['count_pickup_loc'].tolist()
        pickup_hours = df_single_loc['pickup_time'].tolist()

        # Last valid start leaves one row after the window for the target.
        window_starts = range(0, len(rides) - window_size, step_size)
        if len(window_starts) == 0:
            # Not enough history for a single sample; skipping avoids feeding
            # an empty object-dtype frame into the final concat.
            continue

        X = [rides[i:i + window_size] for i in window_starts]
        y = [rides[i + window_size] for i in window_starts]
        y_pickup_hour = [pickup_hours[i + window_size] for i in window_starts]

        features_single_loc = pd.DataFrame(X,columns = col_names)
        features_single_loc['pickup_hour'] = y_pickup_hour
        features_single_loc['pickup_location_id'] = loc_id
        target_single_loc = pd.DataFrame(y,columns = ['target_rides_next_hour'])

        features.append(features_single_loc)
        target.append(target_single_loc)

    if not features:
        # pd.concat raises on an empty list; return correctly-shaped empty frames.
        empty_features = pd.DataFrame(columns = col_names + ['pickup_hour','pickup_location_id'])
        empty_target = pd.DataFrame(columns = ['target_rides_next_hour'])
        return empty_features,empty_target

    # ignore_index replaces the repeated per-location indices with 0..n-1.
    features = pd.concat(features, ignore_index=True)
    target = pd.concat(target, ignore_index=True)

    return features,target

In [5]:
# One week of rows per sample (24 * 7); the window advances 24 rows between samples.
window_size = 24 * 7
step_size = 24
features, target = transform_timeseriesdata_into_features_target(
    df, window_size=window_size, step_size=step_size
)
display(features, target)

Unnamed: 0,rides_previous_168_hours,rides_previous_167_hours,rides_previous_166_hours,rides_previous_165_hours,rides_previous_164_hours,rides_previous_163_hours,rides_previous_162_hours,rides_previous_161_hours,rides_previous_160_hours,rides_previous_159_hours,...,rides_previous_8_hours,rides_previous_7_hours,rides_previous_6_hours,rides_previous_5_hours,rides_previous_4_hours,rides_previous_3_hours,rides_previous_2_hours,rides_previous_1_hours,pickup_hour,pickup_location_id
0,1,1,0,2,0,0,1,2,1,5,...,0,0,0,0,0,0,2,0,2022-01-08 04:00:00,1
1,0,4,1,2,1,2,0,1,1,3,...,1,0,0,0,0,0,0,0,2022-01-09 04:00:00,1
2,0,0,0,2,0,0,0,0,2,1,...,0,0,0,0,0,0,0,0,2022-01-10 04:00:00,1
3,0,0,0,0,1,1,0,0,0,2,...,0,0,0,0,0,0,0,0,2022-01-11 04:00:00,1
4,0,0,0,0,0,0,1,1,1,0,...,1,0,2,0,0,0,0,0,2022-01-12 04:00:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163,1,7,4,1,1,7,9,20,14,15,...,26,32,17,8,5,8,4,6,2022-01-27 00:00:00,265
6164,5,7,2,1,3,7,16,21,12,19,...,29,25,24,6,9,5,3,8,2022-01-28 00:00:00,265
6165,3,3,0,7,2,0,0,2,6,6,...,32,21,20,5,9,4,4,4,2022-01-29 00:00:00,265
6166,1,6,4,2,2,1,1,3,4,14,...,9,3,8,4,1,6,2,5,2022-01-30 00:00:00,265


Unnamed: 0,target_rides_next_hour
0,0
1,0
2,0
3,1
4,0
...,...
6163,3
6164,4
6165,1
6166,5
