In [52]:
import hopsworks
from datetime import datetime,timedelta
import pandas as pd
import optuna
from sklearn.metrics import mean_absolute_error
import joblib
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

from src.data import *
from src.config import *
from src.data_split import *
from src.model import *

#### 1. Hopsworks feature store

In [2]:
### Connecting to Hopsworks Feature Store


hw_project = hopsworks.login(project=HOPSWORKS_PROJECT,api_key_value=HOPSWORKS_API_KEY)
fs = hw_project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/467093
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
### Connect to the Feature Group

fg = fs.get_or_create_feature_group(name=FEATURE_GROUP_NAME,
                             description='Hourly data of taxi demand in NYC in TimeSeries format',
                             version=FEATURE_GROUP_VERSION,
                             primary_key=['pickup_hour','pickup_location_id'],
                             event_time='pickup_hour')

In [4]:
### Create Feature View

try:
    fs.create_feature_view(name=FEATURE_VIEW_NAME,
                           version=FEATURE_VIEW_VERSION,
                           query=fg.select_all())
except:
    print("Feature view already exists")

Feature view already exists


In [5]:
### Connect to Feature View

fv = fs.get_feature_view(name=FEATURE_VIEW_NAME, version=FEATURE_VIEW_VERSION)

In [6]:
%%time

### Get Training data

taxi_data_ts,_ = fv.training_data(description="Hourly taxi rides - Time Series data")

Finished: Reading data from Hopsworks, using ArrowFlight (37.47s) 
CPU times: user 1.12 s, sys: 928 ms, total: 2.05 s
Wall time: 41.2 s




In [7]:
taxi_data_ts.sort_values(by=['pickup_hour', 'pickup_location_id'],inplace=True)
taxi_data_ts.columns = ['pickup_time','pickup_location','count_pickup_loc']

taxi_data_ts.to_parquet(TRANSFORMED_PATH + "rides.parquet") #compression='snappy', index=None   

#### 2. Transform Time Series data into Tabular Data (Features, Target)

In [8]:
%%time

window_size = 672 #1 month i.e., 28 days => 28*24 hours = 672
step_size = 23

features,target = transform_timeseriesdata_into_features_target(window_size,step_size)
print("Features : ",features.shape,"Target : ",target.shape)

Features :  (191202, 674) Target :  (191202, 1)
CPU times: user 50 s, sys: 2.18 s, total: 52.2 s
Wall time: 52.9 s


In [26]:
df = features
df['target_rides_next_hour'] = target
df['pickup_hour'] = pd.to_datetime(df['pickup_hour']).dt.tz_convert(None)
df

Unnamed: 0,rides_previous_672_hours,rides_previous_671_hours,rides_previous_670_hours,rides_previous_669_hours,rides_previous_668_hours,rides_previous_667_hours,rides_previous_666_hours,rides_previous_665_hours,rides_previous_664_hours,rides_previous_663_hours,...,rides_previous_7_hours,rides_previous_6_hours,rides_previous_5_hours,rides_previous_4_hours,rides_previous_3_hours,rides_previous_2_hours,rides_previous_1_hours,pickup_hour,pickup_location_id,target_rides_next_hour
0,0,0,0,0,1,1,0,2,0,0,...,2,0,1,0,0,0,0,2022-01-29 00:00:00,1,0
1,0,0,0,0,0,0,4,1,2,1,...,0,0,0,0,0,1,0,2022-01-29 23:00:00,1,0
2,0,0,0,0,0,0,0,0,0,2,...,2,2,0,1,2,0,0,2022-01-30 22:00:00,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,2,1,0,1,2022-01-31 21:00:00,1,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,2022-02-01 20:00:00,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191197,2,3,5,4,1,4,1,1,1,1,...,3,1,0,2,6,11,25,2024-02-17 08:00:00,265,45
191198,4,2,6,2,8,2,4,0,0,0,...,4,0,0,2,6,4,10,2024-02-18 07:00:00,265,26
191199,14,3,4,8,5,2,4,1,0,2,...,11,10,0,0,2,2,3,2024-02-19 06:00:00,265,12
191200,4,8,3,1,2,2,6,7,5,7,...,17,11,6,0,2,3,3,2024-02-20 05:00:00,265,3


#### 3. Train - Test split

- We have data from 2022, 2023 until November, 2024..
- Last 4 months (16+1 = 17 weeks) can be Test data

In [27]:
cutoff_dt = datetime.now().replace(minute=0,second=0,microsecond=0) - timedelta(weeks=17)
X_train,y_train,X_test,y_test = train_test_split(df,cutoff_dt)

print("Training Subset : ",X_train.shape,y_train.shape)
print("Validation data : ",X_test.shape,y_test.shape)

Training Subset :  (173706, 674) (173706,)
Validation data :  (17496, 674) (17496,)


#### 4. Hyperparameter tuning using Optuna

In [36]:
# Create an objective function that Optuna tries to minimize

def objective(trial):
    
    r_params = {
        "metric": "mae",
        "verbosity": -1,
        "num_leaves":trial.suggest_int('num_leaves',2,256), 
        "bagging_fraction":trial.suggest_float('bagging_fraction',0.2,1.0),
        "feature_fraction":trial.suggest_float('feature_fraction',0.2,1.0),
        "min_data_in_leaf":trial.suggest_int('min_data_in_leaf',3,100)
    }
    
    pipe = get_pipeline(r_params)
    pipe.fit(X_train,y_train)
    
    y_pred_lgb_ht = pipe.predict(X_test)
    error_metric_lgb_ht = mean_absolute_error(y_test,y_pred_lgb_ht)
    print("{:.4f}".format(error_metric_lgb_ht))
    
    return error_metric_lgb_ht

In [38]:
# Run the optimization (i.e., minimization here)

study = optuna.create_study(study_name='lgb_nyc_study',direction='minimize')
study.optimize(objective, n_trials=10, show_progress_bar=True)

[I 2024-02-21 23:12:46,363] A new study created in memory with name: lgb_nyc_study


  0%|          | 0/10 [00:00<?, ?it/s]

3.0789
[I 2024-02-21 23:13:15,754] Trial 0 finished with value: 3.0789035003088956 and parameters: {'num_leaves': 13, 'bagging_fraction': 0.285753624378539, 'feature_fraction': 0.6263750366263874, 'min_data_in_leaf': 49}. Best is trial 0 with value: 3.0789035003088956.
2.8274
[I 2024-02-21 23:13:53,456] Trial 1 finished with value: 2.8273519377348397 and parameters: {'num_leaves': 139, 'bagging_fraction': 0.7133913617913945, 'feature_fraction': 0.7271239425318075, 'min_data_in_leaf': 41}. Best is trial 1 with value: 2.8273519377348397.
2.8489
[I 2024-02-21 23:14:43,598] Trial 2 finished with value: 2.848866648006692 and parameters: {'num_leaves': 203, 'bagging_fraction': 0.5286315296038141, 'feature_fraction': 0.4624010506443738, 'min_data_in_leaf': 82}. Best is trial 1 with value: 2.8273519377348397.
3.3765
[I 2024-02-21 23:15:22,053] Trial 3 finished with value: 3.3764684713929176 and parameters: {'num_leaves': 187, 'bagging_fraction': 0.335809022785319, 'feature_fraction': 0.2892461

In [39]:
print('Best parameters', study.best_params)
print('Best score', study.best_value)
print('Best model', study.best_trial)

Best parameters {'num_leaves': 139, 'bagging_fraction': 0.7133913617913945, 'feature_fraction': 0.7271239425318075, 'min_data_in_leaf': 41}
Best score 2.8273519377348397
Best model FrozenTrial(number=1, state=TrialState.COMPLETE, values=[2.8273519377348397], datetime_start=datetime.datetime(2024, 2, 21, 23, 13, 15, 757996), datetime_complete=datetime.datetime(2024, 2, 21, 23, 13, 53, 456128), params={'num_leaves': 139, 'bagging_fraction': 0.7133913617913945, 'feature_fraction': 0.7271239425318075, 'min_data_in_leaf': 41}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'bagging_fraction': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'min_data_in_leaf': IntDistribution(high=100, log=False, low=3, step=1)}, trial_id=1, value=None)


#### 5. Re-fit the model using the best parameters found

In [40]:
pipe = get_pipeline(study.best_params)
pipe.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.499803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 171905
[LightGBM] [Info] Number of data points in the train set: 173706, number of used features: 678
[LightGBM] [Info] Start training from score 17.162637


In [42]:
y_pred_lgb_ht = pipe.predict(X_test)



#### 6. Evaluate model

In [43]:
error_metric_lgb_ht = mean_absolute_error(y_test,y_pred_lgb_ht)
print("{:.4f}".format(error_metric_lgb_ht))

2.8274


#### 7. Save model

In [51]:
joblib.dump(pipe,MODEL_PATH+'/nyc_taxi_pipe_model.pkl')

['/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/model/nyc_taxi_pipe_model.pkl']

#### 8. Push model to Model Registry in Hopsworks

In [54]:
input_schema = Schema(X_train)
output_schema = Schema(y_train)
ms = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [57]:
hw_project = hopsworks.login(project=HOPSWORKS_PROJECT,api_key_value=HOPSWORKS_API_KEY)
mr = hw_project.get_model_registry()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/467093
Connected. Call `.close()` to terminate connection gracefully.


In [60]:
taxi_model = mr.sklearn.create_model(name="taxi_demand_predictor_next_hour",
                                      metrics={'test_mae': error_metric_lgb_ht},
                                      description="LightGBM regressor with hyperparameter tuning",
                                      input_example=X_train.sample(), 
                                      model_schema=ms)
taxi_model.save(MODEL_PATH+'/nyc_taxi_pipe_model.pkl')

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1292466 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/2071 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/57755 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/467093/models/taxi_demand_predictor_next_hour/1


Model(name: 'taxi_demand_predictor_next_hour', version: 1)