# 1. Load data

In [1]:
import pandas as pd

ts_data =  pd.read_parquet("../data/transformed/ts_data_2022_01.parquet")
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4
...,...,...,...
191203,2022-01-31 19:00:00,0,176
191204,2022-01-31 20:00:00,0,176
191205,2022-01-31 21:00:00,0,176
191206,2022-01-31 22:00:00,0,176


# 2. Transform data

In [9]:
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 43,:].reset_index(drop=True)
ts_data_one_location

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
...,...,...,...
739,2022-01-31 19:00:00,61,43
740,2022-01-31 20:00:00,73,43
741,2022-01-31 21:00:00,33,43
742,2022-01-31 22:00:00,21,43


In [7]:
def get_cutoff_indices(data:pd.DataFrame, n_features:int, step_size:int):
    stop_position = len(data) - 1
    
    #Extraer indices
    subseq_first_idx = 0
    subseq_mid_idx = n_features
    subseq_last_idx = n_features + 1
    indices = []
    
    while subseq_last_idx <= stop_position:
        indices.append((subseq_first_idx,subseq_mid_idx,subseq_last_idx))
        
        subseq_first_idx += step_size
        subseq_mid_idx += step_size
        subseq_last_idx += step_size
        
    return indices
    

In [27]:
n_features = 24
step_size = 1

indices =  get_cutoff_indices(ts_data_one_location,n_features,step_size)
indices[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [26]:
indices[-1]

(48, 720, 721)

In [16]:
len(indices)

16

In [43]:
ts_data_one_location

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
...,...,...,...
739,2022-01-31 19:00:00,61,43
740,2022-01-31 20:00:00,73,43
741,2022-01-31 21:00:00,33,43
742,2022-01-31 22:00:00,21,43


In [44]:
import numpy as np

n_examples = len(indices)
x = np.ndarray(shape=(n_examples,n_features),dtype=np.float32)
y = np.ndarray(shape=(n_examples))

pickup_hours = []

for i, idx in enumerate(indices):
    x[i,:] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

In [45]:
print(f'{x.shape=}')
print(f'{x=}')
print(f'{pickup_hours[:5]=}')

x.shape=(719, 24)
x=array([[ 97.,  60.,  22., ...,  16.,  18.,   6.],
       [ 60.,  22.,   8., ...,  18.,   6.,   3.],
       [ 22.,   8.,   6., ...,   6.,   3.,   1.],
       ...,
       [ 28.,  16.,  13., ..., 102.,  66.,  61.],
       [ 16.,  13.,   8., ...,  66.,  61.,  73.],
       [ 13.,   8.,   1., ...,  61.,  73.,  33.]], dtype=float32)
pickup_hours[:5]=[Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-02 01:00:00'), Timestamp('2022-01-02 02:00:00'), Timestamp('2022-01-02 03:00:00'), Timestamp('2022-01-02 04:00:00')]


In [50]:
features_one_location = pd.DataFrame(
    x,
    columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))]
)
features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,97.0,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,...,70.0,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0
1,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,...,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0
2,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,...,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0
3,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,...,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0
4,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,77.0,...,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,52.0,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,...,78.0,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0
715,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,...,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0
716,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,...,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0
717,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,9.0,...,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0,73.0


In [51]:
targets_one_location = pd.DataFrame(y,columns=[f'target_rides_next_hour'])
targets_one_location

Unnamed: 0,target_rides_next_hour
0,3.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
714,66.0
715,61.0
716,73.0
717,33.0


In [52]:
from tqdm import tqdm
import numpy as np

def transform_ts_data_into_features_and_target (ts_data:pd.DataFrame, input_seq_len:int, step_size:int) -> pd.DataFrame:
    
    assert set(ts_data.columns) == {'pickup_hour','rides','pickup_location_id'}
    
    location_ids = ts_data['pickup_location_id'].unique()
    features = pd.DataFrame() 
    targets = pd.DataFrame()
    
    for location_id in tqdm(location_ids):
        # Get only one location id
        ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == location_id,['pickup_hour','rides']]
        
        # Get indices
        indices = get_cutoff_indices(ts_data_one_location,input_seq_len,step_size)
        
        # Convert ts-data to tabular data (features-target)
        n_examples = len(indices)
        x = np.ndarray(shape=(n_examples,input_seq_len),dtype=np.float32)
        y = np.ndarray(shape=(n_examples))
        pickup_hours = []
        for i, idx in enumerate(indices):
            x[i,:] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])
            
        # Numpy to pandas
        features_one_location = pd.DataFrame(x, 
                                             columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))]
                                             )
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id
        
        targets_one_location = pd.DataFrame(y,columns=[f'target_rides_next_hour'])
        
        # Concatenate results
        features = pd.concat([features,features_one_location])
        targets = pd.concat([targets, targets_one_location])
    
    features.reset_index(inplace=True,drop=True)
    targets.reset_index(inplace=True,drop=True)
    
    return features, targets['target_rides_next_hour']

In [53]:
features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*7*1,
    step_size=24
)

print(f'{features.shape=}')
print(f'{targets.shape=}')

100%|██████████| 257/257 [00:02<00:00, 107.20it/s]

features.shape=(6168, 170)
targets.shape=(6168,)





In [54]:
features

Unnamed: 0,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,rides_previous_160_hour,rides_previous_159_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,2.0,3.0,3.0,7.0,4.0,4.0,7.0,10.0,2022-01-08,4
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3.0,3.0,5.0,7.0,8.0,6.0,7.0,14.0,2022-01-09,4
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,6.0,4.0,3.0,5.0,1.0,1.0,1.0,0.0,2022-01-10,4
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,6.0,3.0,2.0,4.0,1.0,0.0,1.0,2.0,2022-01-11,4
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,1.0,6.0,3.0,2.0,3.0,2.0,4.0,1.0,2022-01-12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-27,176
6164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-28,176
6165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29,176
6166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-30,176


In [11]:
targets

0       16.0
1       18.0
2        0.0
3        0.0
4        2.0
        ... 
6163     0.0
6164     0.0
6165     0.0
6166     0.0
6167     0.0
Name: target_rides_next_hour, Length: 6168, dtype: float64