In [2]:
import src.config as config
import hopsworks

# Log in to the Hopsworks project; project name and API key come from src.config.
project = hopsworks.login(
    api_key_value=config.HOPSWORKS_API_KEY,
    project=config.HOPSWORKS_PROJECT_NAME,
)

# Handle to the feature store that belongs to the project.
feature_store = project.get_feature_store()

# Fetch the feature group identified by the configured name and version.
feature_group = feature_store.get_feature_group(
    version=config.FEATURE_GROUP_VERSION,
    name=config.FEATURE_GROUP_NAME,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/30877
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# Create the feature view (if it doesn't exist yet).
# This feature view only uses one feature group, so the query is trivial.

try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except Exception as error:
    # Best-effort creation: the call is expected to fail when the view is
    # already registered. Catch Exception rather than using a bare `except`
    # so KeyboardInterrupt / SystemExit still propagate, and surface the
    # underlying reason so an unrelated failure (credentials, network, ...)
    # is not silently reported as "already existed".
    print('Feature view already existed. Skip creation.')
    print(f'(create_feature_view raised: {error!r})')

# Retrieve the feature view. If creation failed for any reason other than
# "already exists", this lookup is where the problem becomes visible.
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/30877/fs/30797/fv/time_series_hourly_feature_view/version/1


In [4]:
# Pull the feature view's contents into memory. The call returns a pair;
# only the first element is kept, the second is discarded.
ts_data, _ = feature_view.training_data(description='Time-series hourly taxi rides')

2023-04-10 11:24:10,462 INFO: USE `taxi_demand_project_featurestore`
2023-04-10 11:24:10,955 INFO: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_project_featurestore`.`time_series_hourly_feature_group_1` `fg0`




In [6]:
# Preview the time series in chronological order (displayed, not stored).
ts_data.sort_values('pickup_hour')

Unnamed: 0,pickup_hour,rides,pickup_location_id
1110375,2022-01-01 00:00:00,0,62
1925061,2022-01-01 00:00:00,0,174
1925044,2022-01-01 00:00:00,0,120
773673,2022-01-01 00:00:00,0,175
2563650,2022-01-01 00:00:00,0,216
...,...,...,...
260502,2023-04-10 15:00:00,0,176
260643,2023-04-10 15:00:00,2,25
260644,2023-04-10 15:00:00,0,17
260712,2023-04-10 15:00:00,222,142


In [7]:
from src.data import transform_ts_data_into_features_and_target

# Turn the raw time series into a supervised-learning table.
# input_seq_len = 24*28 = 672 past values per example (28 days, given the
# hourly series); step_size is forwarded unchanged to the helper
# (its exact meaning is defined in src.data).
features, targets = transform_ts_data_into_features_and_target(
    ts_data, input_seq_len=24*28, step_size=23
)

# Attach the target as an extra column on a copy, leaving `features` untouched.
features_and_target = features.assign(target_rides_next_hour=targets)

print(f'{features_and_target.shape=}')

100%|██████████| 262/262 [00:41<00:00,  6.25it/s]

features_and_target.shape=(109088, 675)





In [9]:
from datetime import date,timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# Time-based split around a cutoff placed 28 days before today.
# Intended use (confirm in src.data_split): data before the cutoff is the
# training set, the most recent 28 days are the test set.
# NOTE: the cutoff is derived from today's date, so the split changes from
# one run to the next.
cutoff_date = pd.to_datetime(date.today() - timedelta(days=28))

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target, cutoff_date, target_column_name='target_rides_next_hour'
)

# Report the size of each partition as a quick sanity check.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')


X_train.shape=(103402, 674)
y_train.shape=(103402,)
X_test.shape=(5686, 674)
y_test.shape=(5686,)
