# Feature Selection

In [2]:
import numpy as np
import pandas as pd
import warnings

# NOTE(review): no category is given, so this silences every warning raised in
# the kernel from here on, not only the ones that are known to be noisy.
warnings.simplefilter('ignore')

### config

In [3]:
# Project root; every path below is derived from it.
BASE_PATH = "D:/projects/rahnamcollege-ml/demand-prediction/"

# Inputs read by this notebook.
LABEL_FILE_PATH = f"{BASE_PATH}data/label/"
ARIMA_PATH = f"{BASE_PATH}data/output/arima_output.xlsx"

# Destination of the engineered feature table.
OUTPUT_PATH = f"{BASE_PATH}data/output/features.parquet"

# NOTE(review): the two constants below are not referenced by any cell visible
# in this notebook -- confirm whether they are still needed.
DATA_FILE_PATHS = f"{BASE_PATH}data/input/label.parquet"
START_DATE = '2023-01-01'

### load label

In [4]:
# Load the label table from LABEL_FILE_PATH. The cells below rely on its
# 'date', 'PULocationID' and 'count' columns.
rides_df = pd.read_parquet(LABEL_FILE_PATH)

# Quick sanity check: row/column count, then the first rows as the cell output.
print(rides_df.shape)
rides_df.head()

(31964, 3)


Unnamed: 0,date,PULocationID,count
96,2023-01-02,1,32.0
358,2023-01-03,1,28.0
620,2023-01-04,1,8.0
882,2023-01-05,1,16.0
1144,2023-01-06,1,12.0


### Add ARIMA

In [5]:
def load_arima(file_paths):
    """Read the ARIMA predictions workbook and shape it for merging.

    Parameters
    ----------
    file_paths
        Location of the Excel file, passed straight to ``pd.read_excel``.
        The first sheet column is used as the index.

    Returns
    -------
    pd.DataFrame
        The sheet contents with ``pred`` renamed to ``arima`` and the
        ``count`` column removed.

    Raises
    ------
    KeyError
        If the sheet has no ``count`` column.
    """
    predictions = pd.read_excel(file_paths, index_col=0)
    return predictions.rename(columns={'pred': 'arima'}).drop(columns='count')
arima_df = load_arima(ARIMA_PATH)

# rides_df is reassigned from itself, so re-running this cell would merge a
# frame that already has an 'arima' column and pandas would emit
# 'arima_x'/'arima_y' instead. Dropping a stale 'arima' column first keeps the
# cell idempotent; on the first run there is nothing to drop.
rides_df = pd.merge(
    rides_df.drop(columns=['arima'], errors='ignore'),
    arima_df,
    on=['date', 'PULocationID'],
)
rides_df.head()

Unnamed: 0,date,PULocationID,count,arima
0,2023-01-02,1,32.0,13
1,2023-01-03,1,28.0,18
2,2023-01-04,1,8.0,17
3,2023-01-05,1,16.0,11
4,2023-01-06,1,12.0,13


### adding calendar features

In [6]:
def adding_feature(rides_df: pd.DataFrame):
    """Derive calendar, lag and ratio features from the merged rides frame.

    Parameters
    ----------
    rides_df : pd.DataFrame
        Must contain ``date``, ``PULocationID``, ``count`` and ``arima``.
        ``date`` may be strings, ``datetime.date`` objects or datetimes.
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        Rows sorted by ``PULocationID`` and ``date`` with the original index
        labels, with these columns added:

        * ``PU_day_of_month``, ``week_of_month``, ``PU_day_of_week``
        * ``last_day_demand`` / ``last_week_demand``: ``count`` shifted by 1
          and 7 rows within each ``PULocationID``
        * ``lag{i}-{i+7}`` for i in 1..4: ratio of the i-row lag to the
          (i+7)-row lag of ``count``
        * ``arima``: overwritten with ``arima / last_week_demand``
        * ``label``: ``count / last_week_demand``

        Rows with any missing value (including the first rows of each
        location, where the lags do not exist yet) are dropped, and ``count``
        is removed.
    """
    df = rides_df.copy()

    # A unit-less ``astype('datetime64')`` raises TypeError on pandas >= 2.0;
    # ``pd.to_datetime`` works on old and new versions alike.
    df['date'] = pd.to_datetime(df['date'])

    day_of_month = df['date'].dt.day
    df['PU_day_of_month'] = day_of_month.astype(np.uint8)
    # Vectorised form of ``(day - 1) // 7 + 1``; cast to int64 so the dtype
    # does not depend on what ``.dt.day`` returns in a given pandas version.
    df['week_of_month'] = ((day_of_month - 1) // 7 + 1).astype(np.int64)
    df['PU_day_of_week'] = df['date'].dt.weekday.astype(np.uint8)

    # Shifts below are positional within each location, so rows must be in
    # chronological order per PULocationID first.
    df = df.sort_values(by=['PULocationID', 'date'])
    demand_by_location = df.groupby(['PULocationID'])['count']

    df['last_day_demand'] = demand_by_location.shift(1)
    df['last_week_demand'] = demand_by_location.shift(7)

    for i in range(1, 5):
        df[f'lag{i}-{i + 7}'] = demand_by_location.shift(i) / demand_by_location.shift(i + 7)

    df = df.dropna()

    # NOTE(review): a zero in ``count`` makes the ratios below (and the lag
    # ratios above) inf or NaN; inf is not removed by dropna and nothing is
    # filtered after this point. Confirm whether zero-demand rows can occur.
    weekly_base = df['last_week_demand']
    df = df.assign(
        arima=df['arima'] / weekly_base,
        label=df['count'] / weekly_base,
    )
    return df.drop(columns=['count'])

# Build the feature table from the merged rides/ARIMA frame, then report its
# shape and preview the first rows.
featured_df = adding_feature(rides_df)
print(featured_df.shape)
featured_df.head()

(28296, 13)


Unnamed: 0,date,PULocationID,arima,PU_day_of_month,week_of_month,PU_day_of_week,last_day_demand,last_week_demand,lag1-8,lag2-9,lag3-10,lag4-11,label
11,2023-01-13,1,0.916667,13,2,4,9.0,12.0,0.5625,1.25,0.178571,0.46875,1.833333
12,2023-01-14,1,2.142857,14,2,5,22.0,7.0,1.833333,0.5625,1.25,0.178571,1.142857
13,2023-01-15,1,0.846154,15,3,6,8.0,13.0,1.142857,1.833333,0.5625,1.25,1.538462
14,2023-01-16,1,1.0,16,3,0,20.0,15.0,1.538462,1.142857,1.833333,0.5625,1.466667
15,2023-01-17,1,3.0,17,3,1,22.0,5.0,1.466667,1.538462,1.142857,1.833333,3.8


### checking one week of data as a sample

In [7]:
# Show the last 8 feature rows for PULocationID 79 as a spot check.
featured_df.loc[featured_df['PULocationID'].eq(79)].tail(8)

Unnamed: 0,date,PULocationID,arima,PU_day_of_month,week_of_month,PU_day_of_week,last_day_demand,last_week_demand,lag1-8,lag2-9,lag3-10,lag4-11,label
9393,2023-04-23,79,0.979427,23,4,6,4642.0,3597.0,0.929515,0.974787,1.005702,0.986155,0.862663
9394,2023-04-24,79,0.945385,24,4,0,3103.0,1300.0,0.862663,0.929515,0.974787,1.005702,1.093846
9395,2023-04-25,79,1.097708,25,4,1,1422.0,1658.0,1.093846,0.862663,0.929515,0.974787,1.012063
9396,2023-04-26,79,1.007559,26,4,2,1678.0,1852.0,1.012063,1.093846,0.862663,0.929515,1.086933
9397,2023-04-27,79,1.02573,27,4,3,2013.0,2293.0,1.086933,1.012063,1.093846,0.862663,0.986044
9398,2023-04-28,79,1.013436,28,4,4,2261.0,2977.0,0.986044,1.086933,1.012063,1.093846,1.056769
9399,2023-04-29,79,0.984489,29,5,5,3146.0,4642.0,1.056769,0.986044,1.086933,1.012063,0.987075
9400,2023-04-30,79,1.000322,30,5,6,4582.0,3103.0,0.987075,1.056769,0.986044,1.086933,1.0796


### save result

In [28]:
# Persist the engineered feature table to OUTPUT_PATH as parquet.
featured_df.to_parquet(OUTPUT_PATH)