In [1]:
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_absolute_error

from src.paths import *
from src.data_split import *

In [2]:
# Load the lag-feature table, then attach the target that is stored in its own parquet file.
df = pd.read_parquet(TRANSFORMED_PATH + "/features.parquet")
df['target_rides_next_hour'] = pd.read_parquet(TRANSFORMED_PATH + "/target.parquet")
df

Unnamed: 0,rides_previous_672_hours,rides_previous_671_hours,rides_previous_670_hours,rides_previous_669_hours,rides_previous_668_hours,rides_previous_667_hours,rides_previous_666_hours,rides_previous_665_hours,rides_previous_664_hours,rides_previous_663_hours,...,rides_previous_7_hours,rides_previous_6_hours,rides_previous_5_hours,rides_previous_4_hours,rides_previous_3_hours,rides_previous_2_hours,rides_previous_1_hours,pickup_hour,pickup_location_id,target_rides_next_hour
0,1,1,0,2,0,0,1,2,1,5,...,0,0,0,0,0,0,0,2022-01-29 04:00:00,1,0
1,0,4,1,2,1,2,0,1,1,3,...,1,0,0,0,0,0,0,2022-01-30 04:00:00,1,0
2,0,0,0,2,0,0,0,0,2,1,...,0,0,0,0,0,0,0,2022-01-31 04:00:00,1,1
3,0,0,0,0,1,1,0,0,0,2,...,1,0,0,0,0,0,0,2022-02-01 04:00:00,1,0
4,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,2022-02-02 04:00:00,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72307,3,4,1,1,1,0,1,2,2,3,...,29,15,4,12,7,11,4,2022-10-27 00:00:00,265,12
72308,9,4,3,3,2,4,2,6,1,3,...,29,17,12,4,5,8,9,2022-10-28 00:00:00,265,3
72309,7,6,3,4,3,5,7,6,5,10,...,29,13,9,10,5,10,7,2022-10-29 00:00:00,265,6
72310,6,5,8,6,6,0,1,2,8,6,...,8,10,7,3,3,6,2,2022-10-30 00:00:00,265,10


In [3]:
# Train/test split: hand the full frame and a cutoff timestamp
# (midnight, 1 June 2022) to the project's train_test_split helper.

cutoff_dt = datetime(2022, 6, 1)
X_train, y_train, X_test, y_test = train_test_split(df, cutoff_dt)

print("Training data : ", X_train.shape, y_train.shape)
print("Test data : ", X_test.shape, y_test.shape)

Training data :  (32284, 674) (32284,)
Test data :  (40028, 674) (40028,)


### Baseline model 1

- How to predict demand for the next hour?
    - Look at the number of rides in the previous hour

In [4]:
class BaselineModelPreviousHour():
    """Naive baseline: forecast next-hour demand as the previous hour's ride count.

    Exposes a scikit-learn style ``fit``/``predict`` pair so it can be used
    interchangeably with trainable estimators.
    """

    def fit(self, X, y):
        """Do nothing: this baseline has no parameters to learn.

        Parameters
        ----------
        X, y
            Ignored; accepted only for API compatibility.

        Returns
        -------
        self
            The model itself, so ``model.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Return the ``rides_previous_1_hours`` column of ``X`` as the forecast.

        Parameters
        ----------
        X
            Column-indexable feature table (e.g. a pandas DataFrame) that
            contains a ``rides_previous_1_hours`` column.

        Returns
        -------
        The selected column, unchanged.
        """
        return X['rides_previous_1_hours']

In [5]:
# Predict on the test set with the previous-hour baseline (its fit() is a no-op, so it is not called).
model_prevhour = BaselineModelPreviousHour()
y_pred_prevhour = model_prevhour.predict(X=X_test)
y_pred_prevhour

0        0
1        0
2        0
3        1
4        0
        ..
40023    4
40024    9
40025    7
40026    2
40027    4
Name: rides_previous_1_hours, Length: 40028, dtype: int64

In [6]:
# Mean absolute error of the previous-hour baseline on the test set.
error_metric_prevhour = mean_absolute_error(y_true=y_test, y_pred=y_pred_prevhour)
print("{:.4f}".format(error_metric_prevhour))

6.0894


### Baseline model 2

- How to predict demand for the next hour?
    - Look at the number of rides for that location at the same time last week (i.e., 24*7 = 168 hours back)
        - We assume that there is weekly seasonality here

In [7]:
class BaselineModelPreviousWeek():
    """Naive seasonal baseline: forecast demand as the ride count 168 hours earlier.

    Exposes a scikit-learn style ``fit``/``predict`` pair so it can be used
    interchangeably with trainable estimators.
    """

    def fit(self, X, y):
        """Do nothing: this baseline has no parameters to learn.

        Parameters
        ----------
        X, y
            Ignored; accepted only for API compatibility.

        Returns
        -------
        self
            The model itself, so ``model.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Return the ``rides_previous_168_hours`` column of ``X`` as the forecast.

        Parameters
        ----------
        X
            Column-indexable feature table (e.g. a pandas DataFrame) that
            contains a ``rides_previous_168_hours`` column (24 * 7 hours back).

        Returns
        -------
        The selected column, unchanged.
        """
        return X['rides_previous_168_hours']

In [8]:
# Predict on the test set with the same-hour-last-week baseline (its fit() is a no-op, so it is not called).
model_prevweek = BaselineModelPreviousWeek()
y_pred_prevweek = model_prevweek.predict(X=X_test)
y_pred_prevweek

0        0
1        1
2        0
3        3
4        0
        ..
40023    1
40024    9
40025    4
40026    5
40027    4
Name: rides_previous_168_hours, Length: 40028, dtype: int64

In [9]:
# Mean absolute error of the previous-week baseline on the test set.
error_metric_prevweek = mean_absolute_error(y_true=y_test, y_pred=y_pred_prevweek)
print("{:.4f}".format(error_metric_prevweek))

3.4201


### Baseline model 3

- How to predict demand for the next hour?
    - Look at the average number of rides for that location at the same time over the last 4 weeks (i.e., 7 days (168 hrs) back, 14 days (336 hrs) back, 21 days (504 hrs) back and 28 days (672 hrs) back)

In [10]:
class BaselineModelLast4Weeks():
    """Naive seasonal baseline: rounded mean of the same hour over the last 4 weeks.

    Exposes a scikit-learn style ``fit``/``predict`` pair so it can be used
    interchangeably with trainable estimators.
    """

    # Lag columns 7, 14, 21 and 28 days (168-hour steps) back.
    _WEEKLY_LAG_COLUMNS = (
        'rides_previous_168_hours',
        'rides_previous_336_hours',
        'rides_previous_504_hours',
        'rides_previous_672_hours',
    )

    def fit(self, X, y):
        """Do nothing: this baseline has no parameters to learn.

        Parameters
        ----------
        X, y
            Ignored; accepted only for API compatibility.

        Returns
        -------
        self
            The model itself, so ``model.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Average the four weekly lag columns of ``X`` and round the result.

        Parameters
        ----------
        X
            Column-indexable feature table (e.g. a pandas DataFrame) that
            contains every column listed in ``_WEEKLY_LAG_COLUMNS``.

        Returns
        -------
        The element-wise mean of the four lag columns, rounded with the
        result's own ``round()`` method.
        """
        first_col, *other_cols = self._WEEKLY_LAG_COLUMNS
        lag_total = X[first_col]
        # Explicit additions (not a skip-NaN reduction) so a missing lag
        # propagates into the forecast rather than shrinking the average.
        for col in other_cols:
            lag_total = lag_total + X[col]
        weekly_mean = lag_total / len(self._WEEKLY_LAG_COLUMNS)
        # NOTE: for pandas/NumPy inputs round() sends exact halves to the
        # nearest even value (0.5 -> 0.0, 1.5 -> 2.0).
        return weekly_mean.round()

In [11]:
# Predict on the test set with the 4-week-average baseline (its fit() is a no-op, so it is not called).
model_last4weeks = BaselineModelLast4Weeks()
y_pred_last4weeks = model_last4weeks.predict(X=X_test)
y_pred_last4weeks

0        1.0
1        0.0
2        1.0
3        1.0
4        0.0
        ... 
40023    5.0
40024    8.0
40025    8.0
40026    6.0
40027    6.0
Length: 40028, dtype: float64

In [12]:
# Mean absolute error of the 4-week-average baseline on the test set.
error_metric_last4weeks = mean_absolute_error(y_true=y_test, y_pred=y_pred_last4weeks)
print("{:.4f}".format(error_metric_last4weeks))

2.8949
