## Create a Baseline Model

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Load the tabular dataset stored as parquet in the transformed-data directory.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
# Display the frame (notebook cell output).
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89300,3.0,0.0,2.0,3.0,2.0,3.0,13.0,8.0,9.0,9.0,...,6.0,5.0,3.0,1.0,6.0,1.0,3.0,2022-12-27,265,3.0
89301,6.0,4.0,0.0,0.0,2.0,0.0,14.0,7.0,8.0,4.0,...,4.0,2.0,1.0,2.0,2.0,2.0,8.0,2022-12-28,265,1.0
89302,7.0,2.0,3.0,4.0,7.0,4.0,10.0,9.0,7.0,11.0,...,2.0,3.0,5.0,1.0,1.0,0.0,8.0,2022-12-29,265,3.0
89303,6.0,5.0,4.0,3.0,0.0,3.0,11.0,12.0,9.0,10.0,...,3.0,3.0,1.0,2.0,0.0,1.0,2.0,2022-12-30,265,7.0


#### Split into Training and Testing

Now we will split the data into training and testing. We will not use the popular train_test_split from sklearn because we have created our own that will not split the data randomly but will instead split by a specified **cutoff_date**.

In [3]:
from src.data_split import train_test_split
from datetime import datetime

# Split by date rather than randomly; cutoff_date must be a datetime object.
# NOTE(review): presumably rows before cutoff_date become the training set and
# the remaining rows the test set - confirm in src.data_split.
X_train, y_train, X_test, y_test = train_test_split(df,
                                                    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
                                                    target_column_name= 'target_rides_next_hour'
                                                    )

# Print the shape of each piece as a quick sanity check on the split.
print(f'{X_train.shape}')
print(f'{y_train.shape}')
print(f'{X_test.shape}')
print(f'{y_test.shape}')


(32595, 674)
(32595,)
(56710, 674)
(56710,)


#### First Baseline Model - Predict Using Last Hour

Since this is our first baseline model, we are going to keep it very simple: we predict the taxi demand to be whatever it was in the previous hour. We implement it as a class. Normally most of my code uses functions, but we want this to follow the same structure as our next models, which will be classes with fit and predict methods.

This class will have fit and predict methods within it. The fit method will actually do nothing since our model is so simple that it will not have to be trained. The predict method will just have to return the **rides_previous_1_hour** column for each value in our **X_test** data. 

In [4]:
import numpy as np

class BaselineModelPreviousHour:
    '''Baseline 1: predict the next hour's rides as the rides of the previous hour.

    The class exposes fit/predict so it can be used the same way as the
    other models in this notebook, even though there is nothing to train.
    '''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        '''Do nothing; this model has no parameters to learn.

        Args:
            X_train: training features (ignored).
            y_train: training target (ignored).
        '''

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        '''Return the `rides_previous_1_hour` column as the predictions.

        Args:
            X_test: features; must contain a `rides_previous_1_hour` column.

        Returns:
            The `rides_previous_1_hour` column of X_test, one value per row,
            with the index of X_test.

        Raises:
            KeyError: if the column is missing from X_test.
        '''
        # Plain column lookup: the prediction for a row is its last observed hour.
        return X_test['rides_previous_1_hour']




In [5]:
# Instantiate the previous-hour baseline; fit() is not called because it does nothing for this model.
model = BaselineModelPreviousHour()
# Make predictions: each prediction is that row's rides_previous_1_hour value.
predictions = model.predict(X_test)
# Display the predictions (notebook cell output).
predictions

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
56705    3.0
56706    8.0
56707    8.0
56708    2.0
56709    7.0
Name: rides_previous_1_hour, Length: 56710, dtype: float32

##### Error Metric - MAE

The mean absolute error (MAE) is the average absolute difference between the predicted values and the actual values. Here it means that, on average, our predicted demand is off by about 6.06 rides. The mean number of rides is roughly 12, so this is a very bad model: an error of 6 may not seem like a lot, but when the average number of rides is only 12 it means we are off by about half of the typical demand.

In [7]:
# Mean of the test target, used as a scale reference when reading the MAE.
y_test.mean()

np.float32(11.848051)

In [6]:
from sklearn.metrics import mean_absolute_error

# MAE of the previous-hour baseline on the test set.
# The `=` inside the braces makes the f-string print the variable name along
# with its value, so the name is spelled `test_mae` like in the later cells.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mea=6.0558


#### Second Baseline Model - Use Last Week's Demand

This model will use the value of last week's demand for this week's demand. This model has an MAE of 3.68 which is much better than the original model. We thought this would be the case when we viewed the central park time series because there was weekly seasonality present in the data. This goes to show that EDA is important.

In [8]:
class BaselineModelPreviousWeek:
    '''Baseline 2: predict the next hour's rides from the value one week back.

    Exposes the same fit/predict interface as the other baseline models.
    '''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        '''Do nothing; this model has no parameters to learn.

        Args:
            X_train: training features (ignored).
            y_train: training target (ignored).
        '''

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        '''Return the `rides_previous_168_hour` column as the predictions.

        Remember that the rides_previous_#_hour columns count backwards, so
        lag 7 * 24 = 168 is the value from one week earlier.

        Args:
            X_test: features; must contain a `rides_previous_168_hour` column.

        Returns:
            That column of X_test, one value per row, with the index of X_test.

        Raises:
            KeyError: if the column is missing from X_test.
        '''
        # 7 days * 24 hours = 168 hours back.
        return X_test[f'rides_previous_{7 * 24}_hour']
    
    

In [9]:
# Create the second baseline (previous-week) model and predict on the test features.
# fit() is not called because it does nothing for this model.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)


In [10]:
# Check the MAE of the previous-week baseline on the test set.
# The `=` inside the curly brackets tells Python to print the variable name together with its value.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=3.6811


Another option would be to create a model based on the previous day's demand, but this model will not be as good as the last-week model: weekends will have more demand than weekdays, so predicting from the same day of the previous week is better.

In [13]:
class Baseline24:
    '''Baseline: predict the next hour's rides from the value 24 hours back.

    Exposes the same fit/predict interface as the other baseline models.
    '''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        '''Do nothing; this model has no parameters to learn.

        Args:
            X_train: training features (ignored).
            y_train: training target (ignored).
        '''

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        '''Return the `rides_previous_24_hour` column as the predictions.

        Remember that the rides_previous_#_hour columns count backwards, so
        lag 1 * 24 = 24 is the value from one day earlier.

        Args:
            X_test: features; must contain a `rides_previous_24_hour` column.

        Returns:
            That column of X_test, one value per row, with the index of X_test.

        Raises:
            KeyError: if the column is missing from X_test.
        '''
        # 1 day * 24 hours = 24 hours back.
        return X_test[f'rides_previous_{1 * 24}_hour']

In [14]:
# Create the previous-day (24-hour lag) baseline, predict on the test features,
# and print its MAE on the test set.
model = Baseline24()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=5.3327


#### Third Baseline Model - Last 4 Weeks' Average

For this baseline model, we will build on our weekly seasonality intuition by getting the average over the last 4 weeks. This model performs the best out of our baseline models.

In [27]:
class BaselineModelPrevious4Weeks:
    '''Baseline 3: predict the next hour's rides as the mean of four weekly lags.

    Exposes the same fit/predict interface as the other baseline models.
    '''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        '''Do nothing; this model has no parameters to learn.

        Args:
            X_train: training features (ignored).
            y_train: training target (ignored).
        '''

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        '''Average the columns lagged by 1, 2, 3 and 4 weeks.

        Args:
            X_test: features; must contain the `rides_previous_168_hour`,
                `rides_previous_336_hour`, `rides_previous_504_hour` and
                `rides_previous_672_hour` columns.

        Returns:
            The row-wise sum of the four columns divided by 4, with the
            index of X_test.

        Raises:
            KeyError: if any of the four columns is missing from X_test.
        '''
        # The parentheses already allow the expression to span lines, so no
        # backslash continuations are needed. Lags are 7, 14, 21 and 28 days
        # expressed in hours.
        return (
            X_test[f'rides_previous_{7 * 24}_hour']
            + X_test[f'rides_previous_{14 * 24}_hour']
            + X_test[f'rides_previous_{21 * 24}_hour']
            + X_test[f'rides_previous_{28 * 24}_hour']
        ) / 4

In [28]:
# Create the 4-week-average baseline, predict on the test features,
# and print its MAE on the test set.
model = BaselineModelPrevious4Weeks()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=3.1963
