In [1]:
import hopsworks
from datetime import datetime,timedelta
import joblib
from pathlib import Path
from sklearn.metrics import mean_absolute_error

from src.config import *
from src.data import *

from warnings import simplefilter,filterwarnings
from sklearn.exceptions import InconsistentVersionWarning

#Can ignore this
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

#Investigate into this before supressing
filterwarnings(action='ignore', category=InconsistentVersionWarning)

#### 1. Hopsworks feature store

In [7]:
### Connecting to Hopsworks Feature Store


hw_project = hopsworks.login(project=HOPSWORKS_PROJECT,api_key_value=HOPSWORKS_API_KEY)
fs = hw_project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/467093
Connected. Call `.close()` to terminate connection gracefully.


In [8]:
### Connect to Feature View

fv = fs.get_feature_view(name=FEATURE_VIEW_NAME, version=FEATURE_VIEW_VERSION)

In [9]:
### Get Test data i.e., data of last 4 months (16 weeks)

fetch_data_from = datetime.now().replace(minute=0,second=0,microsecond=0) - timedelta(weeks=16)
fetch_data_to = datetime.now().replace(minute=0,second=0,microsecond=0) - timedelta(hours=1)

taxi_test_data_ts = fv.get_batch_data(start_time=fetch_data_from,end_time=fetch_data_to)

Finished: Reading data from Hopsworks, using ArrowFlight (13.38s) 


In [10]:
taxi_test_data_ts.sort_values(by=['pickup_hour', 'pickup_location_id'],inplace=True)
taxi_test_data_ts.columns = ['pickup_time','pickup_location','count_pickup_loc']
taxi_test_data_ts.reset_index(drop=True,inplace=True)
taxi_test_data_ts.to_parquet(TRANSFORMED_PATH + "rides.parquet") #compression='snappy', index=None   

#### 2. Transform Time Series data into Tabular Data (Features, Target)

In [11]:
%%time

window_size = 672 #1 month i.e., 28 days => 28*24 hours = 672
step_size = 23

features,target = transform_timeseriesdata_into_features_target(window_size,step_size)
print("Features : ",features.shape,"Target : ",target.shape)

Features :  (8083, 674) Target :  (8083, 1)
CPU times: user 5.92 s, sys: 182 ms, total: 6.1 s
Wall time: 6.35 s


In [12]:
X_test = features
y_test = target
X_test['pickup_hour'] = pd.to_datetime(X_test['pickup_hour']).dt.tz_convert(None)

df_test = X_test
df_test['target_rides_next_hour'] = y_test

idx = df_test.groupby('pickup_location_id')['pickup_hour'].idxmax()
df_test_final = df_test.loc[idx]
loc_df = pd.DataFrame({'pickup_location_id': range(1, 266)})
df_test_final = loc_df.merge(df_test_final, how='left', on='pickup_location_id').fillna({'pickup_hour': df_test['pickup_hour'].max()})
df_test_final.fillna(0, inplace=True)

X_pred = df_test_final.drop('target_rides_next_hour',axis=1)
y_actual = df_test_final['target_rides_next_hour'].to_frame()
display(X_pred)
display(y_actual)

Unnamed: 0,pickup_location_id,rides_previous_672_hours,rides_previous_671_hours,rides_previous_670_hours,rides_previous_669_hours,rides_previous_668_hours,rides_previous_667_hours,rides_previous_666_hours,rides_previous_665_hours,rides_previous_664_hours,...,rides_previous_9_hours,rides_previous_8_hours,rides_previous_7_hours,rides_previous_6_hours,rides_previous_5_hours,rides_previous_4_hours,rides_previous_3_hours,rides_previous_2_hours,rides_previous_1_hours,pickup_hour
0,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,2024-03-03 17:00:00
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2024-03-03 13:00:00
2,3,1,0,0,2,0,0,0,0,0,...,0,0,1,0,1,1,1,0,2,2024-03-03 17:00:00
3,4,5,2,4,7,0,3,0,1,0,...,4,3,3,6,3,2,4,2,4,2024-03-03 17:00:00
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2024-03-03 17:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,261,11,24,7,8,2,2,0,3,1,...,6,2,10,16,17,11,13,16,17,2024-03-03 17:00:00
261,262,66,46,32,26,7,14,4,2,0,...,106,101,91,80,98,87,107,97,70,2024-03-03 17:00:00
262,263,112,113,78,47,44,40,13,14,8,...,116,98,116,115,93,114,145,144,126,2024-03-03 17:00:00
263,264,23,18,22,8,6,10,1,1,0,...,19,18,20,27,28,29,27,28,24,2024-03-03 17:00:00


Unnamed: 0,target_rides_next_hour
0,0
1,0
2,0
3,4
4,0
...,...
260,13
261,70
262,109
263,26


#### 4. Use model from Model Registry in Hopsworks for predictions

In [13]:
hw_project = hopsworks.login(project=HOPSWORKS_PROJECT,api_key_value=HOPSWORKS_API_KEY)
mr = hw_project.get_model_registry()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/467093
Connected. Call `.close()` to terminate connection gracefully.


In [14]:
#Get the best version of model

#lgb_model_hw = mr.get_model(name=MODEL_NAME,version=MODEL_VERSION)

lgb_model_hw = mr.get_best_model(MODEL_NAME, "test_mae", "min")

In [15]:
#Download the model

lgb_model_hw_path = lgb_model_hw.download()

Downloading model artifact (0 dirs, 3 files)... DONE

In [16]:
#Load the model

lgb_model = joblib.load(Path(lgb_model_hw_path)/'nyc_taxi_pipe_model.pkl')

In [17]:
#Predict using the model

y_pred_lgb = lgb_model.predict(X_pred)



In [18]:
#Prettify the result

y_pred = pd.DataFrame(y_pred_lgb.round(decimals=0).astype(int),columns=['predicted_demand_rides'])
y_pred[['pickup_location_id','pickup_hour']] = X_pred[['pickup_location_id','pickup_hour']]
y_pred = y_pred[['pickup_location_id','pickup_hour','predicted_demand_rides']]
y_pred

Unnamed: 0,pickup_location_id,pickup_hour,predicted_demand_rides
0,1,2024-03-03 17:00:00,0
1,2,2024-03-03 13:00:00,0
2,3,2024-03-03 17:00:00,0
3,4,2024-03-03 17:00:00,4
4,5,2024-03-03 17:00:00,0
...,...,...,...
260,261,2024-03-03 17:00:00,10
261,262,2024-03-03 17:00:00,61
262,263,2024-03-03 17:00:00,112
263,264,2024-03-03 17:00:00,21


#### 5. Evaluate the model

In [19]:
error_metric_lgb = mean_absolute_error(y_actual,y_pred_lgb)
print("{:.4f}".format(error_metric_lgb))

4.8963
