## Here we transform the time series data into tabular data by using sliding window technique

In [20]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
ts_data = pd.read_parquet("../data/transformed/ts_data_2022_01.parquet")

In [4]:
ts_data

Unnamed: 0,pickup_hour,rides_count,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4
...,...,...,...
191203,2022-01-31 19:00:00,0,176
191204,2022-01-31 20:00:00,0,176
191205,2022-01-31 21:00:00,0,176
191206,2022-01-31 22:00:00,0,176


In [5]:
ts_data_one_location = ts_data.loc[ts_data["pickup_location_id"] == 43].reset_index(drop=True)

In [6]:
ts_data_one_location.head(15)

Unnamed: 0,pickup_hour,rides_count,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
5,2022-01-01 05:00:00,5,43
6,2022-01-01 06:00:00,3,43
7,2022-01-01 07:00:00,10,43
8,2022-01-01 08:00:00,7,43
9,2022-01-01 09:00:00,19,43


In [8]:
def get_cutoff_indices(data: pd.DataFrame, n_features: int, step_size:int) -> list:
    stop_position = len(data) - 1
    
    # start the first sub-sequence at index position 0
    subseq_first_idx = 0
    subseq_mid_idx = n_features
    subseq_last_idx = n_features + 1
    indices_lst = []
    
    while subseq_last_idx <= stop_position:
        indices_lst.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))
        
        subseq_first_idx += step_size
        subseq_mid_idx += step_size
        subseq_last_idx += step_size
        
    return indices_lst

In [9]:
n_features = 24
step_size = 1

indices_lst = get_cutoff_indices(ts_data_one_location, n_features, step_size)

In [10]:
indices_lst[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [13]:
n_examples = len(indices_lst)
# features
x = np.ndarray(shape = (n_examples, n_features), dtype = np.float32)
# target
y = np.ndarray(shape = (n_examples, 1), dtype = np.float32)
pickup_hours_lst = []

for i, idx in enumerate(indices_lst):
    x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]["rides_count"].values
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides_count"].values
    pickup_hours_lst.append(ts_data_one_location.iloc[idx[1]]["pickup_hour"])

In [14]:
print(f"{x.shape = }")
print(f"{x = }")
print(f"{pickup_hours_lst[:5] = }")

x.shape = (719, 24)
x = array([[ 97.,  60.,  22., ...,  16.,  18.,   6.],
       [ 60.,  22.,   8., ...,  18.,   6.,   3.],
       [ 22.,   8.,   6., ...,   6.,   3.,   1.],
       ...,
       [ 28.,  16.,  13., ..., 102.,  66.,  61.],
       [ 16.,  13.,   8., ...,  66.,  61.,  73.],
       [ 13.,   8.,   1., ...,  61.,  73.,  33.]], dtype=float32)
pickup_hours_lst[:5] = [Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-02 01:00:00'), Timestamp('2022-01-02 02:00:00'), Timestamp('2022-01-02 03:00:00'), Timestamp('2022-01-02 04:00:00')]


Transform numpy array to DataFrame

In [15]:
features_one_location = pd.DataFrame(x, columns = [f"rides_previous_{i + 1}_hour" for i in reversed(range(n_features))])

In [16]:
features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,97.0,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,...,70.0,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0
1,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,...,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0
2,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,...,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0
3,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,...,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0
4,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,77.0,...,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,52.0,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,...,78.0,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0
715,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,...,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0
716,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,...,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0
717,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,9.0,...,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0,73.0


Tranform target "y" numpy array into a pandas dataframe

In [17]:
targets_one_location = pd.DataFrame(y, columns = ["target_rides_next_hour"])

In [18]:
targets_one_location

Unnamed: 0,target_rides_next_hour
0,3.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
714,66.0
715,61.0
716,73.0
717,33.0


Create a funtion that does the data procesing for all locations. this funtion will tranform time series data into tabular data

In [26]:
def tranform_ts_data_into_features_and_target(ts_data: pd.DataFrame, input_seq_len: int, step_size: int) -> pd.DataFrame:
    """
    Slices and transposes data from time-series format into a (features, target) format
    that can use to train supervised ML models
    """
    assert set(ts_data.columns) == {"pickup_hour", "rides_count", "pickup_location_id"}
    
    locations_ids = ts_data["pickup_location_id"].unique()
    features = pd.DataFrame()
    target = pd.DataFrame()
    
    
    for location_id in tqdm(locations_ids):
        
        # keep only ts data for this location_id
        ts_data_one_location = ts_data.loc[ts_data["pickup_location_id"] == location_id, ["pickup_hour", "rides_count"]]
        
        # pre-compute cutoff indices to split datafrane rows
        indices = get_cutoff_indices(ts_data_one_location, input_seq_len, step_size)
        
        # slice and transpose data into numpy arrays for features and target
        n_examples = len(indices)
        x = np.ndarray(shape = (n_examples, input_seq_len), dtype = np.float32)
        y = np.ndarray(shape = (n_examples), dtype = np.float32)
        pickup_hours_lst = []
        
        for i, idx in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]["rides_count"].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides_count"].values
            pickup_hours_lst.append(ts_data_one_location.iloc[idx[1]]["pickup_hour"])
            
        # convert numpy arrays into pandas dataframes for both features and target
        features_one_location = pd.DataFrame(x, columns = [f"rides_previous_{i + 1}_hour" for i in reversed(range(input_seq_len))])
        features_one_location["pickup_hour"] = pickup_hours_lst
        features_one_location["pickup_location_id"] = location_id
        
        # target
        targets_one_location = pd.DataFrame(y, columns = [f"target_rides_next_hour"])
        
        # concatenate features and target into 2 dataframes
        features = pd.concat([features, features_one_location])
        target = pd.concat([target, targets_one_location])
        
    features.reset_index(drop=True, inplace=True)
    target.reset_index(drop=True, inplace=True)
    
    return features, target["target_rides_next_hour"]

In [27]:
# call the function

# 24 hours, 7 days, 1 week ==> 1 week of history
features_df, target_df = tranform_ts_data_into_features_and_target(ts_data, input_seq_len = 24*7*1, step_size = 24)

100%|██████████| 257/257 [00:03<00:00, 79.24it/s]


In [28]:
print(f"{features_df.shape = }")
print(f"{target_df.shape = }")

features_df.shape = (6168, 170)
target_df.shape = (6168,)
