In [1]:
import hopsworks
from datetime import datetime,timedelta
import pandas as pd
import optuna
from sklearn.metrics import mean_absolute_error
import joblib
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

from src.data import *
from src.config import *
from src.data_split import *
from src.model import *

#### 1. Hopsworks feature store

In [3]:
### Connecting to Hopsworks Feature Store


hw_project = hopsworks.login(project=HOPSWORKS_PROJECT,api_key_value=HOPSWORKS_API_KEY)
fs = hw_project.get_feature_store()
mr = hw_project.get_model_registry()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/467093
Connected. Call `.close()` to terminate connection gracefully.
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
### Connect to the Feature Group

fg = fs.get_or_create_feature_group(name=FEATURE_GROUP_NAME,
                             description='Hourly data of taxi demand in NYC in TimeSeries format',
                             version=FEATURE_GROUP_VERSION,
                             primary_key=['pickup_hour','pickup_location_id'],
                             event_time='pickup_hour')

In [5]:
### Create Feature View

try:
    fs.create_feature_view(name=FEATURE_VIEW_NAME,
                           version=FEATURE_VIEW_VERSION,
                           query=fg.select_all())
except:
    print("Feature view already exists")

Feature view already exists


In [6]:
### Connect to Feature View

fv = fs.get_feature_view(name=FEATURE_VIEW_NAME, version=FEATURE_VIEW_VERSION)

In [7]:
### Get Training data i.e., data before last 4 months (16+1 weeks)

fetch_data_to = datetime.now().replace(minute=0,second=0,microsecond=0) - timedelta(weeks=17)

taxi_data_ts = fv.get_batch_data(end_time=fetch_data_to)

Finished: Reading data from Hopsworks, using ArrowFlight (43.04s) 


In [8]:
taxi_data_ts.sort_values(by=['pickup_hour', 'pickup_location_id'],inplace=True)
taxi_data_ts.columns = ['pickup_time','pickup_location','count_pickup_loc']
taxi_data_ts.reset_index(drop=True,inplace=True)
taxi_data_ts.to_parquet(TRANSFORMED_PATH + "rides.parquet") #compression='snappy', index=None   

#### 2. Transform Time Series data into Tabular Data (Features, Target)

In [9]:
%%time

window_size = 672 #1 month i.e., 28 days => 28*24 hours = 672
step_size = 23

features,target = transform_timeseriesdata_into_features_target(window_size,step_size)
print("Features : ",features.shape,"Target : ",target.shape)

Features :  (176850, 674) Target :  (176850, 1)
CPU times: user 49 s, sys: 1.84 s, total: 50.8 s
Wall time: 51.6 s


In [10]:
df = features
df['target_rides_next_hour'] = target
df['pickup_hour'] = pd.to_datetime(df['pickup_hour']).dt.tz_convert(None)
df

Unnamed: 0,rides_previous_672_hours,rides_previous_671_hours,rides_previous_670_hours,rides_previous_669_hours,rides_previous_668_hours,rides_previous_667_hours,rides_previous_666_hours,rides_previous_665_hours,rides_previous_664_hours,rides_previous_663_hours,...,rides_previous_7_hours,rides_previous_6_hours,rides_previous_5_hours,rides_previous_4_hours,rides_previous_3_hours,rides_previous_2_hours,rides_previous_1_hours,pickup_hour,pickup_location_id,target_rides_next_hour
0,0,0,0,0,1,1,0,2,0,0,...,2,0,1,0,0,0,0,2022-01-29 00:00:00,1,0
1,0,0,0,0,0,0,4,1,2,1,...,0,0,0,0,0,1,0,2022-01-29 23:00:00,1,0
2,0,0,0,0,0,0,0,0,0,2,...,2,2,0,1,2,0,0,2022-01-30 22:00:00,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,2,1,0,1,2022-01-31 21:00:00,1,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,2022-02-01 20:00:00,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176845,4,1,1,2,4,4,2,4,7,4,...,1,3,4,5,0,4,0,2023-11-02 02:00:00,265,1
176846,3,3,0,2,1,1,2,1,2,5,...,2,1,5,2,8,6,1,2023-11-03 01:00:00,265,6
176847,1,4,2,4,7,4,2,1,1,2,...,0,0,6,1,1,2,2,2023-11-04 00:00:00,265,8
176848,4,5,4,12,12,6,2,0,1,3,...,2,2,1,1,2,2,2,2023-11-04 23:00:00,265,4


#### 3. Train - Validation split

- Last 4 months (16+1 = 17 weeks) in training data can be Validation data

In [12]:
cutoff_dt = fetch_data_to - timedelta(weeks=17)
X_train,y_train,X_val,y_val = train_test_split(df,cutoff_dt)

print("Training set : ",X_train.shape,y_train.shape)
print("Test set : ",X_val.shape,y_val.shape)

Training set :  (144362, 674) (144362,)
Test set :  (32488, 674) (32488,)


#### 4. Hyperparameter tuning using Optuna

In [15]:
# Create an objective function that Optuna tries to minimize

def objective(trial):
    
    r_params = {
        "metric": "mae",
        "verbosity": -1,
        "num_leaves":trial.suggest_int('num_leaves',2,256), 
        "bagging_fraction":trial.suggest_float('bagging_fraction',0.2,1.0),
        "feature_fraction":trial.suggest_float('feature_fraction',0.2,1.0),
        "min_data_in_leaf":trial.suggest_int('min_data_in_leaf',3,100)
    }
    
    pipe = get_pipeline(r_params)
    pipe.fit(X_train,y_train)
    
    y_pred_lgb_ht = pipe.predict(X_val)
    error_metric_lgb_ht = mean_absolute_error(y_val,y_pred_lgb_ht)
    print("{:.4f}".format(error_metric_lgb_ht))
    
    return error_metric_lgb_ht

In [16]:
# Run the optimization (i.e., minimization here)

study = optuna.create_study(study_name='lgb_nyc_study',direction='minimize')
study.optimize(objective, n_trials=10, show_progress_bar=True)

[I 2024-03-04 21:51:26,448] A new study created in memory with name: lgb_nyc_study


  0%|          | 0/20 [00:00<?, ?it/s]

2.5860
[I 2024-03-04 21:51:59,062] Trial 0 finished with value: 2.586048933052965 and parameters: {'num_leaves': 73, 'bagging_fraction': 0.6660811111618483, 'feature_fraction': 0.8613897403487529, 'min_data_in_leaf': 89}. Best is trial 0 with value: 2.586048933052965.
2.5692
[I 2024-03-04 21:52:31,520] Trial 1 finished with value: 2.5691578011649567 and parameters: {'num_leaves': 115, 'bagging_fraction': 0.30606880023148575, 'feature_fraction': 0.6060448913673584, 'min_data_in_leaf': 63}. Best is trial 1 with value: 2.5691578011649567.
2.6144
[I 2024-03-04 21:53:07,158] Trial 2 finished with value: 2.6144000255191275 and parameters: {'num_leaves': 43, 'bagging_fraction': 0.2502616392936974, 'feature_fraction': 0.64319440511196, 'min_data_in_leaf': 53}. Best is trial 1 with value: 2.5691578011649567.
2.5537
[I 2024-03-04 21:53:42,910] Trial 3 finished with value: 2.5536592982895163 and parameters: {'num_leaves': 92, 'bagging_fraction': 0.9269190209692832, 'feature_fraction': 0.682019067

In [17]:
print('Best parameters', study.best_params)
print('Best score', study.best_value)
print('Best model', study.best_trial)

Best parameters {'num_leaves': 197, 'bagging_fraction': 0.9983768522185472, 'feature_fraction': 0.993289300862745, 'min_data_in_leaf': 26}
Best score 2.5253029291463003
Best model FrozenTrial(number=11, state=TrialState.COMPLETE, values=[2.5253029291463003], datetime_start=datetime.datetime(2024, 3, 4, 21, 58, 16, 714730), datetime_complete=datetime.datetime(2024, 3, 4, 21, 58, 55, 112799), params={'num_leaves': 197, 'bagging_fraction': 0.9983768522185472, 'feature_fraction': 0.993289300862745, 'min_data_in_leaf': 26}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'bagging_fraction': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'min_data_in_leaf': IntDistribution(high=100, log=False, low=3, step=1)}, trial_id=11, value=None)


#### 5. Re-fit the model using the best parameters found

In [18]:
pipe = get_pipeline(study.best_params)
pipe.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.372255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171905
[LightGBM] [Info] Number of data points in the train set: 144362, number of used features: 678
[LightGBM] [Info] Start training from score 17.387325


In [31]:
y_pred_lgb_ht = pipe.predict(X_val)



#### 6. Evaluate model

In [33]:
error_metric_lgb_ht = mean_absolute_error(y_val,y_pred_lgb_ht)
print("{:.4f}".format(error_metric_lgb_ht))

2.5253


#### 7. Save model

In [34]:
joblib.dump(pipe,MODEL_PATH+'/nyc_taxi_pipe_model.pkl')

['/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/model/nyc_taxi_pipe_model.pkl']

#### 8. Push model to Model Registry in Hopsworks

In [35]:
input_schema = Schema(X_train)
output_schema = Schema(y_train)
ms = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [36]:
taxi_model = mr.sklearn.create_model(name="taxi_demand_predictor_next_hour",
                                      metrics={'test_mae': error_metric_lgb_ht},
                                      description="LightGBM regressor with hyperparameter tuning",
                                      input_example=X_train.sample(), 
                                      model_schema=ms)

In [37]:
taxi_model.save(MODEL_PATH+'/nyc_taxi_pipe_model.pkl')

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1783544 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/2568 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/57755 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/467093/models/taxi_demand_predictor_next_hour/52


Model(name: 'taxi_demand_predictor_next_hour', version: 52)