# Feature Selection

### Import modules

In [1]:
import numpy as np
import pandas as pd
import warnings
from itertools import product

warnings.simplefilter('ignore')

### Config

In [2]:
DATA_FILE_PATHS = '/workspace/rahnemacollege/Project/Git/demand-prediction/data/input/'

OUTPUT_PATH_FEAT = '/workspace/rahnemacollege/Project/Git/demand-prediction/data/label/feat_p2.parquet'

start_date = '2023-01-01'
end_date = '2023-05-01'

number_interval_per_day = 8

### Load Data

In [3]:
def load_data(file_paths, interval: int, start_date=None, end_date=None):
    df = pd.read_parquet(file_paths)
    df['date'] = df['tpep_pickup_datetime'].dt.date.astype(str)

    if start_date:
        if end_date:
            df = df[(df['date'] >= start_date) & (
                df['date'] < end_date)]
        else:
            df = df[df['date'] > start_date].reset_index(drop=True)
    # Sort the DataFrame based on the 'tpep_pickup_datetime' column in ascending order
    df = df.sort_values(by='date')
    df = df.reset_index(drop=True)

    interval_per_day = int(24/interval)

    # Calculate the start time of each interval
    df['interval_start'] = df['tpep_pickup_datetime'].dt.floor(
        f"{interval_per_day}H")

    # Calculate the end time of each interval
    df['interval_end'] = df['interval_start'] + \
        pd.Timedelta(hours=interval_per_day)

    # Create a new column with the time interval in the desired format
    df['time_interval'] = df['interval_start'].dt.strftime(
        '%H:%M:%S') + ' - ' + df['interval_end'].dt.strftime('%H:%M:%S')

    # Drop 'interval_start' and 'interval_end' columns
    df.drop(columns=['interval_start', 'interval_end'], inplace=True)

    # Create bins for interval numbers from 1 to interval
    df['time_interval_number'] = pd.cut(
        df['tpep_pickup_datetime'].dt.hour, bins=interval, labels=range(1, interval + 1), right=False)

    return df

In [4]:
rides_df = load_data(
    DATA_FILE_PATHS, number_interval_per_day, start_date, end_date)
print(rides_df.shape)
rides_df.head()

(12672629, 22)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,date,time_interval,time_interval_number
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,...,0.5,0.0,0.0,1.0,14.3,2.5,0.0,2023-01-01,00:00:00 - 03:00:00,1
1,1,2023-01-01 16:18:55,2023-01-01 16:26:09,3.0,0.0,1.0,N,107,90,1,...,0.5,2.1,0.0,1.0,12.6,2.5,0.0,2023-01-01,15:00:00 - 18:00:00,6
2,2,2023-01-01 16:59:08,2023-01-01 17:41:59,1.0,19.82,2.0,N,132,238,1,...,0.5,16.36,6.55,1.0,98.16,2.5,1.25,2023-01-01,15:00:00 - 18:00:00,6
3,2,2023-01-01 16:29:59,2023-01-01 16:59:01,1.0,9.36,1.0,N,138,68,1,...,0.5,11.8,6.55,1.0,70.8,2.5,1.25,2023-01-01,15:00:00 - 18:00:00,6
4,2,2023-01-01 16:35:44,2023-01-01 16:53:56,2.0,3.18,1.0,N,114,162,1,...,0.5,4.76,0.0,1.0,28.56,2.5,0.0,2023-01-01,15:00:00 - 18:00:00,6


### Aggregate data and labeling

In [5]:
def labeling_by_interval(rides_df: pd.DataFrame):
    aggregated_df = rides_df.groupby(
        ['date', 'time_interval_number', 'PULocationID']).size().reset_index(name='count')
    unique_dates = rides_df['date'].unique()
    unique_interval = rides_df['time_interval_number'].unique()
    unique_pu_location_ids = rides_df['PULocationID'].unique()
    all_combinations = list(
        product(unique_dates, unique_interval, unique_pu_location_ids))
    combinations_df = pd.DataFrame(all_combinations, columns=[
                                   'date', 'time_interval_number', 'PULocationID'])
    label_df = aggregated_df.merge(combinations_df, how='right', on=[
                                   'date', 'time_interval_number', 'PULocationID']).fillna(0)
    # Sort based on two columns: 'time_interval_number' (ascending) and 'date' (ascending)
    label_df = label_df.sort_values(
        by=['date', 'time_interval_number'], ascending=[True, True])
    return label_df

In [6]:
labels_time_df = labeling_by_interval(rides_df)
print(labels_time_df.shape)
labels_time_df.head()

(251520, 4)


Unnamed: 0,date,time_interval_number,PULocationID,count
0,2023-01-01,1,161,504
1,2023-01-01,1,107,604
2,2023-01-01,1,132,389
3,2023-01-01,1,138,42
4,2023-01-01,1,114,205


### Adding features

In [7]:
def adding_feature(rides_df: pd.DataFrame, interval: int):
    rides_df['date'] = rides_df['date'].astype('datetime64[ns]')
    rides_df['PU_day_of_month'] = rides_df['date'].dt.day.astype(np.uint8)
    rides_df['PU_day_of_week'] = rides_df['date'].dt.weekday.astype(np.uint8)
    rides_df = rides_df.sort_values(
        ['PULocationID', 'date', 'time_interval_number'])
    rides_df['last_day_demand'] = rides_df.groupby(['PULocationID'])[
        'count'].shift(interval)
    rides_df['last_week_demand'] = rides_df.groupby(['PULocationID'])[
        'count'].shift(interval * 7)

    return rides_df

In [8]:
labels_time_df_feat = adding_feature(labels_time_df, number_interval_per_day)
print(labels_time_df_feat.shape)
labels_time_df_feat.head()

(251520, 8)


Unnamed: 0,date,time_interval_number,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day_demand,last_week_demand
58,2023-01-01,1,1,0,1,6,,
1368,2023-01-01,2,1,1,1,6,,
1630,2023-01-01,3,1,1,1,6,,
1892,2023-01-01,4,1,1,1,6,,
844,2023-01-01,5,1,13,1,6,,


### Checking two days of data as a sample

In [9]:
labels_time_df_feat[(labels_time_df_feat['PULocationID'] == 79)].tail(16)

Unnamed: 0,date,time_interval_number,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day_demand,last_week_demand
247342,2023-04-29,1,79,1509,29,5,432.0,1288.0
248652,2023-04-29,2,79,334,29,5,103.0,431.0
248914,2023-04-29,3,79,78,29,5,187.0,74.0
249176,2023-04-29,4,79,294,29,5,186.0,230.0
248128,2023-04-29,5,79,468,29,5,259.0,352.0
247604,2023-04-29,6,79,443,29,5,298.0,472.0
247866,2023-04-29,7,79,532,29,5,598.0,653.0
248390,2023-04-29,8,79,923,29,5,1082.0,1141.0
249438,2023-04-30,1,79,1353,30,6,1509.0,1243.0
250748,2023-04-30,2,79,352,30,6,334.0,323.0


### Save features engineering  data

In [10]:
labels_time_df_feat.to_parquet(OUTPUT_PATH_FEAT)