### import modules

In [4]:
import pandas as pd
from itertools import product

### config

In [5]:
# Root folder holding this project's data. It is the only machine-specific
# value in this config cell: change this one line to run the notebook elsewhere.
DATA_DIR = 'D:/projects/rahnamcollege-ml/demand-prediction/data'

# Location passed to pd.read_parquet when loading the raw ride records.
DATA_FILE_PATHS = f'{DATA_DIR}/input/'

# Parquet file the generated labels are written to.
OUTPUT_PATH = f'{DATA_DIR}/label/label.parquet'

### load data

In [6]:
def load_data(file_paths, start_date=None):
    """Read ride records from parquet and tag each ride with its pickup day.

    Args:
        file_paths: Location handed unchanged to ``pd.read_parquet``.
        start_date: Optional ``'YYYY-MM-DD'`` string. When given (and truthy),
            only rides whose ``date`` string compares strictly greater are
            kept, so rides on ``start_date`` itself are dropped. When omitted,
            every ride is returned.

    Returns:
        The loaded DataFrame with an extra string column ``date`` derived from
        ``tpep_pickup_datetime``. If a filter was applied, the index is
        renumbered from 0.
    """
    rides = pd.read_parquet(file_paths)

    # Calendar day of the pickup, stored as text (e.g. '2023-01-02').
    pickup_day = rides['tpep_pickup_datetime'].dt.date
    rides['date'] = pickup_day.astype(str)

    if not start_date:
        return rides

    # Plain string comparison; for ISO-formatted days this matches
    # chronological order.
    is_after_start = rides['date'] > start_date
    return rides[is_after_start].reset_index(drop=True)


# Load the rides picked up strictly after 2023-01-01 (load_data filters with
# '>'), then show the frame's size and its first rows.
rides_df = load_data(file_paths=DATA_FILE_PATHS, start_date='2023-01-01')
print(rides_df.shape)
rides_df.head()

(12595923, 20)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,date
0,2,2023-01-02 00:00:37,2023-01-02 00:13:15,1.0,9.29,1.0,N,70,4,1,35.9,1.0,0.5,8.18,0.0,1.0,49.08,2.5,0.0,2023-01-02
1,2,2023-01-02 00:01:53,2023-01-02 00:34:16,1.0,20.4,2.0,N,132,238,1,70.0,0.0,0.5,15.86,6.55,1.0,95.16,0.0,1.25,2023-01-02
2,2,2023-01-02 00:04:59,2023-01-02 00:12:03,5.0,1.68,1.0,N,142,229,1,10.0,1.0,0.5,2.25,0.0,1.0,17.25,2.5,0.0,2023-01-02
3,2,2023-01-02 00:00:28,2023-01-02 00:08:45,1.0,1.74,1.0,N,164,224,1,10.7,1.0,0.5,0.0,0.0,1.0,15.7,2.5,0.0,2023-01-02
4,2,2023-01-02 00:00:08,2023-01-02 00:04:30,6.0,0.63,1.0,N,144,231,1,6.5,1.0,0.5,0.0,0.0,1.0,11.5,2.5,0.0,2023-01-02


### aggregate rides and build labels

In [7]:
def labeling(rides_df: pd.DataFrame, count_offset: int = 1) -> pd.DataFrame:
    """Build one label row per (date, pickup location) pair.

    Rides are counted per ``date`` and ``PULocationID``. The counts are then
    laid over the full grid of every date present in ``rides_df`` crossed with
    every pickup location present in ``rides_df``, so a pair without any ride
    still gets a row with a ride count of 0 (before the offset is added).

    Args:
        rides_df: Ride records; must contain the columns ``date`` and
            ``PULocationID``. It is not modified.
        count_offset: Constant added to every count, zero-filled pairs
            included. The default of 1 reproduces the labels this notebook has
            always written; pass 0 for raw ride counts.
            NOTE(review): the notebook does not say why counts are shifted by
            one - confirm the intent with downstream consumers.

    Returns:
        DataFrame with columns ``date``, ``PULocationID`` and ``count``,
        sorted by ``PULocationID`` then ``date``. The index is not reset: it
        holds each row's position in the date-major grid. ``count`` is a float
        column whenever at least one pair had to be zero-filled.
    """
    key_columns = ['date', 'PULocationID']

    ride_counts = rides_df.groupby(key_columns).size().reset_index(name='count')

    # Every observed date paired with every observed pickup location.
    full_grid = pd.DataFrame(
        list(product(rides_df['date'].unique(), rides_df['PULocationID'].unique())),
        columns=key_columns,
    )

    # The right join keeps every grid row; pairs without rides come back as NaN.
    label_df = ride_counts.merge(full_grid, how='right', on=key_columns)

    # Fill only the count column: a frame-wide fillna(0) would also turn a
    # missing key value into 0.
    label_df['count'] = label_df['count'].fillna(0) + count_offset

    return label_df.sort_values(by=['PULocationID', 'date'])


# Build the per-date, per-pickup-location labels, then show the frame's size
# and its first rows.
labels_df = labeling(rides_df=rides_df)
print(labels_df.shape)
labels_df.head()


(31964, 3)


Unnamed: 0,date,PULocationID,count
96,2023-01-02,1,32.0
358,2023-01-03,1,28.0
620,2023-01-04,1,8.0
882,2023-01-05,1,16.0
1144,2023-01-06,1,12.0


### save labels

In [8]:
# Write the labels to the parquet file configured in OUTPUT_PATH.
labels_df.to_parquet(path=OUTPUT_PATH)