In [36]:
import hopsworks
from datetime import datetime,timedelta
import joblib
from pathlib import Path
from sklearn.metrics import mean_absolute_error

from src.config import *
from src.data import *

from warnings import simplefilter,filterwarnings
from sklearn.exceptions import InconsistentVersionWarning

#Can ignore this
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

#Investigate into this before supressing
filterwarnings(action='ignore', category=InconsistentVersionWarning)

#### 1. Hopsworks feature store

In [2]:
### Connecting to Hopsworks Feature Store


hw_project = hopsworks.login(project=HOPSWORKS_PROJECT,api_key_value=HOPSWORKS_API_KEY)
fs = hw_project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/467093
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
### Connect to Feature View

fv = fs.get_feature_view(name=FEATURE_VIEW_NAME, version=FEATURE_VIEW_VERSION)

In [4]:
### Get Test data i.e., data of last 4 months (16 weeks)

fetch_data_from = datetime.now().replace(minute=0,second=0,microsecond=0) - timedelta(weeks=16)
fetch_data_to = datetime.now().replace(minute=0,second=0,microsecond=0) - timedelta(hours=1)

taxi_test_data_ts = fv.get_batch_data(start_time=fetch_data_from,end_time=fetch_data_to)

Finished: Reading data from Hopsworks, using ArrowFlight (14.50s) 


In [5]:
taxi_test_data_ts.sort_values(by=['pickup_hour', 'pickup_location_id'],inplace=True)
taxi_test_data_ts.columns = ['pickup_time','pickup_location','count_pickup_loc']
taxi_test_data_ts.reset_index(drop=True,inplace=True)
taxi_test_data_ts.to_parquet(TRANSFORMED_PATH + "rides.parquet") #compression='snappy', index=None   

#### 2. Transform Time Series data into Tabular Data (Features, Target)

In [6]:
%%time

window_size = 672 #1 month i.e., 28 days => 28*24 hours = 672
step_size = 23

features,target = transform_timeseriesdata_into_features_target(window_size,step_size)
print("Features : ",features.shape,"Target : ",target.shape)

Features :  (8060, 674) Target :  (8060, 1)
CPU times: user 5.62 s, sys: 142 ms, total: 5.76 s
Wall time: 5.89 s


In [7]:
X_test = features
y_test = target
X_test['pickup_hour'] = pd.to_datetime(X_test['pickup_hour']).dt.tz_convert(None)

df_test = X_test
df_test['target_rides_next_hour'] = y_test

idx = df_test.groupby('pickup_location_id')['pickup_hour'].idxmax()
df_test_final = df_test.loc[idx]
loc_df = pd.DataFrame({'pickup_location_id': range(1, 266)})
df_test_final = loc_df.merge(df_test_final, how='left', on='pickup_location_id').fillna({'pickup_hour': df_test['pickup_hour'].max()})
df_test_final.fillna(0, inplace=True)

X_pred = df_test_final.drop('target_rides_next_hour',axis=1)
y_actual = df_test_final['target_rides_next_hour'].to_frame()
display(X_pred)
display(y_actual)

Unnamed: 0,pickup_location_id,rides_previous_672_hours,rides_previous_671_hours,rides_previous_670_hours,rides_previous_669_hours,rides_previous_668_hours,rides_previous_667_hours,rides_previous_666_hours,rides_previous_665_hours,rides_previous_664_hours,...,rides_previous_9_hours,rides_previous_8_hours,rides_previous_7_hours,rides_previous_6_hours,rides_previous_5_hours,rides_previous_4_hours,rides_previous_3_hours,rides_previous_2_hours,rides_previous_1_hours,pickup_hour
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2024-02-24 16:00:00
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2024-02-24 16:00:00
2,3,1,0,0,0,0,0,0,0,0,...,0,0,1,4,1,1,0,0,0,2024-02-24 16:00:00
3,4,5,4,1,1,2,3,2,1,1,...,4,0,3,4,5,3,3,2,4,2024-02-24 16:00:00
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2024-02-24 16:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,261,12,9,18,8,5,8,1,3,1,...,3,5,7,6,13,20,12,12,9,2024-02-24 16:00:00
261,262,61,40,35,30,23,9,8,5,1,...,99,108,104,87,95,66,76,92,88,2024-02-24 16:00:00
262,263,92,95,82,74,37,37,27,12,6,...,95,122,126,123,116,97,102,119,114,2024-02-24 16:00:00
263,264,28,23,27,16,6,12,8,4,4,...,8,20,16,17,26,25,22,33,23,2024-02-24 16:00:00


Unnamed: 0,target_rides_next_hour
0,0
1,0
2,0
3,1
4,0
...,...
260,10
261,64
262,102
263,21


#### 4. Use model from Model Registry in Hopsworks for predictions

In [8]:
hw_project = hopsworks.login(project=HOPSWORKS_PROJECT,api_key_value=HOPSWORKS_API_KEY)
mr = hw_project.get_model_registry()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/467093
Connected. Call `.close()` to terminate connection gracefully.


In [9]:
#Get the model

lgb_model_hw = mr.get_model(name=MODEL_NAME,version=MODEL_VERSION)

In [10]:
#Download the model

lgb_model_hw_path = lgb_model_hw.download()

Downloading model artifact (0 dirs, 3 files)... DONE

In [11]:
#Load the model

lgb_model = joblib.load(Path(lgb_model_hw_path)/'nyc_taxi_pipe_model.pkl')

In [12]:
#Predict using the model

y_pred_lgb = lgb_model.predict(X_pred)



In [50]:
#Prettify the result

y_pred = pd.DataFrame(y_pred_lgb.round(decimals=0).astype(int),columns=['predicted_demand_rides'])
y_pred[['pickup_location_id','pickup_hour']] = X_pred[['pickup_location_id','pickup_hour']]
y_pred = y_pred[['pickup_location_id','pickup_hour','predicted_demand_rides']]
y_pred

Unnamed: 0,pickup_location_id,pickup_hour,predicted_demand_rides
0,1,2024-02-24 16:00:00,0
1,2,2024-02-24 16:00:00,0
2,3,2024-02-24 16:00:00,0
3,4,2024-02-24 16:00:00,3
4,5,2024-02-24 16:00:00,0
...,...,...,...
260,261,2024-02-24 16:00:00,11
261,262,2024-02-24 16:00:00,74
262,263,2024-02-24 16:00:00,118
263,264,2024-02-24 16:00:00,30


#### 5. Evaluate the model

In [13]:
error_metric_lgb = mean_absolute_error(y_actual,y_pred_lgb)
print("{:.4f}".format(error_metric_lgb))

3.0535
