In [11]:
## importar paquetes al realizar modificaciones
%reload_ext autoreload
%autoreload 2

In [12]:
import src.config as config
import hopsworks

# connecto to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/30877
Connected. Call `.close()` to terminate connection gracefully.


In [13]:
# create feature view (if it doesn't exist yet)
# This feature only uses on feature group, so the query is trivial 

try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')

# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view already existed. Skip creation.


In [14]:
ts_data,_ = feature_view.training_data(
    description='Time-series hourly taxi rides'
)

2023-04-20 20:17:59,164 INFO: USE `taxi_demand_project_featurestore`
2023-04-20 20:17:59,635 INFO: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_project_featurestore`.`time_series_hourly_feature_group_1` `fg0`




In [15]:
ts_data.sort_values(by='pickup_hour')

Unnamed: 0,pickup_hour,rides,pickup_location_id
130254,2023-03-24 01:00:00,0,133
92454,2023-03-24 01:00:00,0,203
156064,2023-03-24 01:00:00,0,61
76441,2023-03-24 01:00:00,0,244
167023,2023-03-24 01:00:00,0,181
...,...,...,...
165224,2023-04-21 00:00:00,129,114
14149,2023-04-21 00:00:00,0,96
4528,2023-04-21 00:00:00,59,158
90082,2023-04-21 00:00:00,0,35


In [8]:
from src.data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*28,
    step_size=23
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 257/257 [00:00<00:00, 356.44it/s]

features_and_target.shape=(0, 675)





In [10]:
from datetime import date,timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# training data -> From January 2023 up until 1 months ago
# testdata -> last 1 months

cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*1))

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

TypeError: '<' not supported between instances of 'numpy.ndarray' and 'Timestamp'

In [7]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    
    # pick hyper-parameters
    hyperparams = {
        "metric":'mae',
        "verbose": -1,
        "num_leaves":trial.suggest_int("num_leaves",2,256),
        "feature_fraction":trial.suggest_float("feature_fraction",0.2,1.0),
        "bagging_fraction":trial.suggest_float("bagging_fraction",0.2,1.0),
        "min_child_samples":trial.suggest_int("min_child_samples",3,100)
    }
    
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):
        
        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index,:], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_,y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_,y_pred)
        scores.append(mae)
        
    # Return the mean score 
    return np.array(scores).mean()

In [8]:
study = optuna.create_study(direction="minimize")
study.optimize(objective,n_trials=2)

[32m[I 2023-04-12 09:48:29,871][0m A new study created in memory with name: no-name-087cb7fa-86ee-405e-b7de-6d5bccb5b181[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[32m[I 2023-04-12 09:49:03,097][0m Trial 0 finished with value: 6.714172055022388 and parameters: {'num_leaves': 134, 'feature_fraction': 0.5169279127336126, 'bagging_fraction': 0.9381699717960414, 'min_child_samples': 54}. Best is trial 0 with value: 6.714172055022388.[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[32m[I 2023-04-12 09:49:45,516][0m Trial 1 finished with value: 6.768949918968138 and parameters: {'num_leaves': 225, 'feature_fraction': 0.44913227833831554, 'bagging_fraction': 0.8007680903029362, 'min_child_samples': 41}. Best is trial 0 with value: 6.714172055022388.[0m


In [9]:
best_params = study.best_params
print(f'{best_params=}')

best_params={'num_leaves': 134, 'feature_fraction': 0.5169279127336126, 'bagging_fraction': 0.9381699717960414, 'min_child_samples': 54}


In [10]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train,y_train)



In [12]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test,predictions)
print(f'{test_mae=:.4f}')

test_mae=6.2111


In [16]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['C:\\Users\\SantiagoLopezCarmona\\OneDrive - unimilitar.edu.co\\1-desarrollo-profesional\\1-projects\\taxi-demand-predictor\\models\\model.pkl']

In [17]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema,output_schema=output_schema)

In [21]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name='taxi_demand_predictor_next_hour',
    metrics={'test_mae':test_mae},
    description="LighGBM regressor with a bit of hyperparameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

model.save(MODELS_DIR / 'model.pkl')

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/30877/models/taxi_demand_predictor_next_hour/1


Model(name: 'taxi_demand_predictor_next_hour', version: 1)