In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import src.config as config

We will connect to our project and feature group because this is where we will be reading the data from for our modeling.

In [3]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1049751
Connected. Call `.close()` to terminate connection gracefully.


To read data from the feature group we need to create a feature view. This is how we create our final dataset.

In [4]:
# create feature view (if it doesn't exist yet)
# This feature view only uses one feature group, so the query is trivial
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')


# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view already existed. Skip creation.


Create training data from the feature view data.

In [5]:
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (23.93s) 



The length of this data is the same as the length of the data that we sent to the feature group after adding in the new synthetic data.

In [6]:
len(ts_data)

4124990

In [7]:
# drop `pickup_ts` column
ts_data.drop('pickup_ts', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour`
ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
552612,2023-01-01 00:00:00+00:00,0,1
2486448,2023-01-01 01:00:00+00:00,0,1
260806,2023-01-01 02:00:00+00:00,0,1
791889,2023-01-01 03:00:00+00:00,0,1
1387328,2023-01-01 04:00:00+00:00,0,1
...,...,...,...
258311,2024-10-10 11:00:00+00:00,6,265
258626,2024-10-10 12:00:00+00:00,16,265
259029,2024-10-10 13:00:00+00:00,4,265
259141,2024-10-10 14:00:00+00:00,5,265


In [8]:
len(ts_data.loc[ts_data['pickup_location_id'] == 1])

15566

In [9]:
len(ts_data.loc[ts_data['pickup_location_id'] == 265])

15566

In [10]:
ts_data['pickup_hour'].info()

<class 'pandas.core.series.Series'>
Int64Index: 4124990 entries, 552612 to 259457
Series name: pickup_hour
Non-Null Count    Dtype 
--------------    ----- 
4124990 non-null  object
dtypes: object(1)
memory usage: 62.9+ MB


In [60]:
from src.plot import plot_ts

plot_ts(ts_data, locations=[161])

In [12]:
import pandas as pd
# Convert pickup_hour to datetime
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

In [13]:
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
552612,2023-01-01 00:00:00+00:00,0,1
2486448,2023-01-01 01:00:00+00:00,0,1
260806,2023-01-01 02:00:00+00:00,0,1
791889,2023-01-01 03:00:00+00:00,0,1
1387328,2023-01-01 04:00:00+00:00,0,1
...,...,...,...
258311,2024-10-10 11:00:00+00:00,6,265
258626,2024-10-10 12:00:00+00:00,16,265
259029,2024-10-10 13:00:00+00:00,4,265
259141,2024-10-10 14:00:00+00:00,5,265


In [14]:
from src.data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data, 
    input_seq_len = 24*28*1,   # one month of features
    step_size = 23
)

print(f'{features.shape=}')
print(f'{targets.shape=}')

100%|██████████| 265/265 [00:42<00:00,  6.20it/s]

features.shape=(171720, 674)
targets.shape=(171720,)





In [15]:
features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

In [16]:
print(f'{features_and_target.shape=}')

features_and_target.shape=(171720, 675)


In [17]:
features_and_target

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-29 00:00:00+00:00,1,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,...,1.0,2.0,1.0,1.0,0.0,0.0,0.0,2023-01-29 23:00:00+00:00,1,0.0
2,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,2.0,1.0,2.0,0.0,1.0,0.0,2023-01-30 22:00:00+00:00,1,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,2.0,0.0,0.0,0.0,2023-01-31 21:00:00+00:00,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,1.0,0.0,2.0,1.0,1.0,0.0,2023-02-01 20:00:00+00:00,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171715,3.0,1.0,1.0,2.0,2.0,5.0,5.0,4.0,7.0,2.0,...,5.0,4.0,12.0,12.0,6.0,2.0,0.0,2024-10-06 07:00:00+00:00,265,1.0
171716,1.0,1.0,5.0,1.0,5.0,2.0,5.0,5.0,7.0,10.0,...,1.0,0.0,3.0,4.0,1.0,1.0,3.0,2024-10-07 06:00:00+00:00,265,2.0
171717,2.0,3.0,1.0,7.0,6.0,3.0,5.0,5.0,13.0,11.0,...,3.0,1.0,0.0,1.0,2.0,1.0,1.0,2024-10-08 05:00:00+00:00,265,0.0
171718,3.0,1.0,2.0,1.0,7.0,6.0,3.0,2.0,3.0,6.0,...,3.0,5.0,1.0,4.0,4.0,1.0,0.0,2024-10-09 04:00:00+00:00,265,0.0


In [18]:
features_and_target['pickup_hour'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 171720 entries, 0 to 171719
Series name: pickup_hour
Non-Null Count   Dtype              
--------------   -----              
171720 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1)
memory usage: 1.3 MB


In [19]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# training data -> from January 2023 up until 1 month ago
# test data -> last 1 months
# Convert pickup_hour to datetime
cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*1), utc=True)

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'   
)
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

cutoff_date=Timestamp('2024-09-12 00:00:00+0000', tz='UTC')
X_train.shape=(163770, 674)
y_train.shape=(163770,)
X_test.shape=(7950, 674)
y_test.shape=(7950,)


In [20]:
X_train

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-29 00:00:00+00:00,1
1,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,...,5.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,2023-01-29 23:00:00+00:00,1
2,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,2023-01-30 22:00:00+00:00,1
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,2023-01-31 21:00:00+00:00,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,2.0,1.0,0.0,2.0,1.0,1.0,0.0,2023-02-01 20:00:00+00:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163765,2.0,1.0,2.0,4.0,0.0,7.0,3.0,4.0,3.0,2.0,...,3.0,8.0,1.0,0.0,1.0,1.0,2.0,3.0,2024-09-07 11:00:00+00:00,265
163766,2.0,2.0,7.0,1.0,2.0,2.0,3.0,7.0,10.0,6.0,...,8.0,6.0,6.0,3.0,1.0,1.0,2.0,2.0,2024-09-08 10:00:00+00:00,265
163767,2.0,2.0,4.0,4.0,6.0,3.0,2.0,6.0,8.0,3.0,...,2.0,1.0,1.0,1.0,1.0,5.0,1.0,5.0,2024-09-09 09:00:00+00:00,265
163768,2.0,4.0,3.0,7.0,2.0,3.0,4.0,3.0,3.0,5.0,...,14.0,5.0,4.0,2.0,3.0,1.0,7.0,6.0,2024-09-10 08:00:00+00:00,265


In [21]:
X_test

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-09-12 08:00:00+00:00,1
1,2.0,0.0,0.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2024-09-13 07:00:00+00:00,1
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-09-14 06:00:00+00:00,1
3,2.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2024-09-15 05:00:00+00:00,1
4,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,...,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2024-09-16 04:00:00+00:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7945,3.0,1.0,1.0,2.0,2.0,5.0,5.0,4.0,7.0,2.0,...,4.0,5.0,4.0,12.0,12.0,6.0,2.0,0.0,2024-10-06 07:00:00+00:00,265
7946,1.0,1.0,5.0,1.0,5.0,2.0,5.0,5.0,7.0,10.0,...,9.0,1.0,0.0,3.0,4.0,1.0,1.0,3.0,2024-10-07 06:00:00+00:00,265
7947,2.0,3.0,1.0,7.0,6.0,3.0,5.0,5.0,13.0,11.0,...,2.0,3.0,1.0,0.0,1.0,2.0,1.0,1.0,2024-10-08 05:00:00+00:00,265
7948,3.0,1.0,2.0,1.0,7.0,6.0,3.0,2.0,3.0,6.0,...,2.0,3.0,5.0,1.0,4.0,4.0,1.0,0.0,2024-10-09 04:00:00+00:00,265


Now we create the model using Optuna to optimize the hyperparameters.

In [26]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

# this is where we created the pipeline for feature engineering
from src.model import get_pipeline

# optuna objective
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """

    # lightgbm hyperparameters
    hyperparams = {
        'metric' : 'mae',
        'verbose' : -1,
        'num_leaves' : trial.suggest_int('num_leaves', 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100), 
    }

    # make sure the data is sorted correctly for TimeSeriesSplit
    X_train_sorted = X_train.sort_values(by = 'pickup_hour')

    # TimeSeriesSplit set up
    tss = TimeSeriesSplit(n_splits=3)
    # empty list to hold mae for each split
    scores = []

    
    for train_index, val_index in tss.split(X_train_sorted):

        # split data for training and validation - use copy to avoid warnings
        X_train_ = X_train_sorted.iloc[train_index, :].copy()
        X_val_ = X_train_sorted.iloc[val_index, :].copy()
        y_train_ = y_train.iloc[train_index].copy()
        y_val_ = y_train.iloc[val_index].copy()
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()



In [27]:
# create a study from the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials = 5)

[I 2024-10-10 09:58:58,222] A new study created in memory with name: no-name-1f1cbddd-b9bb-4778-86ff-42d6ff4d9d28


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 224, 'feature_fraction': 0.7337750392651199, 'bagging_fraction': 0.5627425942879254, 'min_child_samples': 92}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 224, 'feature_fraction': 0.7337750392651199, 'bagging_fraction': 0.5627425942879254, 'min_child_samples': 92}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 224, 'feature_fraction': 0.7337750392651199, 'bagging_fraction': 0.5627425942879254, 'min_child_samples': 92}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:


[I 2024-10-10 10:00:17,091] Trial 0 finished with value: 26.88348534744199 and parameters: {'num_leaves': 224, 'feature_fraction': 0.7337750392651199, 'bagging_fraction': 0.5627425942879254, 'min_child_samples': 92}. Best is trial 0 with value: 26.88348534744199.


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 71, 'feature_fraction': 0.4243614808661358, 'bagging_fraction': 0.8451909524895584, 'min_child_samples': 20}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 71, 'feature_fraction': 0.4243614808661358, 'bagging_fraction': 0.8451909524895584, 'min_child_samples': 20}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 71, 'feature_fraction': 0.4243614808661358, 'bagging_fraction': 0.8451909524895584, 'min_child_samples': 20}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:


[I 2024-10-10 10:00:58,255] Trial 1 finished with value: 26.044039752167937 and parameters: {'num_leaves': 71, 'feature_fraction': 0.4243614808661358, 'bagging_fraction': 0.8451909524895584, 'min_child_samples': 20}. Best is trial 1 with value: 26.044039752167937.


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 91, 'feature_fraction': 0.9299489032716728, 'bagging_fraction': 0.3549698142169717, 'min_child_samples': 7}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 91, 'feature_fraction': 0.9299489032716728, 'bagging_fraction': 0.3549698142169717, 'min_child_samples': 7}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 91, 'feature_fraction': 0.9299489032716728, 'bagging_fraction': 0.3549698142169717, 'min_child_samples': 7}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:


[I 2024-10-10 10:01:35,927] Trial 2 finished with value: 26.327505301984655 and parameters: {'num_leaves': 91, 'feature_fraction': 0.9299489032716728, 'bagging_fraction': 0.3549698142169717, 'min_child_samples': 7}. Best is trial 1 with value: 26.044039752167937.


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 87, 'feature_fraction': 0.3399834913148513, 'bagging_fraction': 0.5846990680355046, 'min_child_samples': 94}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 87, 'feature_fraction': 0.3399834913148513, 'bagging_fraction': 0.5846990680355046, 'min_child_samples': 94}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 87, 'feature_fraction': 0.3399834913148513, 'bagging_fraction': 0.5846990680355046, 'min_child_samples': 94}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:


[I 2024-10-10 10:02:09,776] Trial 3 finished with value: 26.140151399920487 and parameters: {'num_leaves': 87, 'feature_fraction': 0.3399834913148513, 'bagging_fraction': 0.5846990680355046, 'min_child_samples': 94}. Best is trial 1 with value: 26.044039752167937.


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 156, 'feature_fraction': 0.6330410239298128, 'bagging_fraction': 0.9631378831795094, 'min_child_samples': 50}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 156, 'feature_fraction': 0.6330410239298128, 'bagging_fraction': 0.9631378831795094, 'min_child_samples': 50}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 156, 'feature_fraction': 0.6330410239298128, 'bagging_fraction': 0.9631378831795094, 'min_child_samples': 50}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:


[I 2024-10-10 10:03:40,527] Trial 4 finished with value: 26.517363642619035 and parameters: {'num_leaves': 156, 'feature_fraction': 0.6330410239298128, 'bagging_fraction': 0.9631378831795094, 'min_child_samples': 50}. Best is trial 1 with value: 26.044039752167937.


In [28]:
# get best params
best_params = study.best_trial.params
print(best_params)

{'num_leaves': 71, 'feature_fraction': 0.4243614808661358, 'bagging_fraction': 0.8451909524895584, 'min_child_samples': 20}


In [29]:
# use full training data
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

Creating pipeline with hyperparameters: {'num_leaves': 71, 'feature_fraction': 0.4243614808661358, 'bagging_fraction': 0.8451909524895584, 'min_child_samples': 20}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created:


This mae is not great but not terrible we want anything less than 30 here. It is something we can monitor.

In [31]:
# make predictions
predictions = pipeline.predict(X_test)
# score them
mae = mean_absolute_error(y_test, predictions)
print(f'{mae=:.4f}')

mae=26.3811


In [33]:
len(predictions)

7950

In [63]:
predictions.max()

185.00831797864427

In [34]:
len(X_test)

7950

In [35]:
X_test

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-09-12 08:00:00+00:00,1,0.00
1,2.0,0.0,0.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2024-09-13 07:00:00+00:00,1,1.50
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-09-14 06:00:00+00:00,1,0.75
3,2.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2024-09-15 05:00:00+00:00,1,0.50
4,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2024-09-16 04:00:00+00:00,1,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7945,3.0,1.0,1.0,2.0,2.0,5.0,5.0,4.0,7.0,2.0,...,5.0,4.0,12.0,12.0,6.0,2.0,0.0,2024-10-06 07:00:00+00:00,265,1.25
7946,1.0,1.0,5.0,1.0,5.0,2.0,5.0,5.0,7.0,10.0,...,1.0,0.0,3.0,4.0,1.0,1.0,3.0,2024-10-07 06:00:00+00:00,265,3.00
7947,2.0,3.0,1.0,7.0,6.0,3.0,5.0,5.0,13.0,11.0,...,3.0,1.0,0.0,1.0,2.0,1.0,1.0,2024-10-08 05:00:00+00:00,265,1.75
7948,3.0,1.0,2.0,1.0,7.0,6.0,3.0,2.0,3.0,6.0,...,3.0,5.0,1.0,4.0,4.0,1.0,0.0,2024-10-09 04:00:00+00:00,265,2.50


In [37]:
pred = pd.Series(predictions)

In [38]:
import pandas as pd
from sklearn.metrics import mean_absolute_error

def compute_mae_by_location(X_test: pd.DataFrame, y_test: pd.Series, y_pred: pd.Series) -> pd.DataFrame:
    """
    Computes the Mean Absolute Error (MAE) for each pickup_location_id in X_test using provided predictions.

    Parameters:
    - X_test: DataFrame containing the test features including pickup_location_id and pickup_hour
    - y_test: Series containing the true values for the test set
    - y_pred: Series containing the predicted values for the test set

    Returns:
    - DataFrame with pickup_location_id and corresponding MAE
    """
    
    # Ensure the data is sorted by pickup_hour
    X_test_sorted = X_test.sort_values(by='pickup_hour')
    # align the indices
    y_test_sorted = y_test.loc[X_test_sorted.index]
    y_pred_sorted = y_pred.loc[X_test_sorted.index]

    # DataFrame to store results
    results = []

    # Get unique pickup_location_ids
    unique_locations = X_test_sorted['pickup_location_id'].unique()

    for location in unique_locations:
        # Filter data for the current pickup_location_id
        indices = X_test_sorted[X_test_sorted['pickup_location_id'] == location].index
        y_test_location = y_test_sorted.loc[indices]
        y_pred_location = y_pred_sorted.loc[indices]
        
        # Compute MAE
        mae = mean_absolute_error(y_test_location, y_pred_location)
        
        # Append results
        results.append({'pickup_location_id': location, 'mae': mae})

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    return results_df

# Example usage:
results_df = compute_mae_by_location(X_test, y_test, pred)
print(results_df)


     pickup_location_id        mae
0                     1  19.184851
1                    14  16.900679
2                    75  20.649307
3                   187  19.079530
4                   186  89.154258
..                  ...        ...
260                 112  23.192035
261                 113  42.098524
262                 148  25.608956
263                 120  19.913742
264                 264  39.266669

[265 rows x 2 columns]


In [62]:
results_df.loc[results_df['pickup_location_id'] == 161]

Unnamed: 0,pickup_location_id,mae
64,161,132.096712


Save this model to our model directory.

In [53]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['C:\\Users\\ryans\\taxi_demand_predictor\\models\\model.pkl']

This is where we create the format for the input and output data for Hopsworks. We just created a sample of predictors and labels from our data to define this format.

In [54]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [55]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

model.save(str(MODELS_DIR / 'model.pkl'))

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/686252 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3400 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/60849 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1049751/models/taxi_demand_predictor_next_hour/2


Model(name: 'taxi_demand_predictor_next_hour', version: 2)