In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import src.config as config

In [30]:
from datetime import datetime, timedelta
import pandas as pd
import pytz

# Current UTC time, floored to the start of the hour.
# The lowercase 'h' alias is used because the uppercase 'H' alias is deprecated
# in recent pandas releases.
current_date = pd.Timestamp.now(tz='UTC').floor('h')

print(f"Current UTC date and time floored to the nearest hour: {current_date}")

# we fetch raw data for the last 28 days, to add redundancy to our data pipeline
fetch_data_to = current_date
fetch_data_from = current_date - timedelta(days=28)
print(f'We will fetch rides from {fetch_data_from=} until {fetch_data_to=}')


Current UTC date and time floored to the nearest hour: 2024-10-09 22:00:00+00:00
We will fetch rides from fetch_data_from=Timestamp('2024-09-11 22:00:00+0000', tz='UTC') until fetch_data_to=Timestamp('2024-10-09 22:00:00+0000', tz='UTC')


In [31]:
from src.data import load_raw_data
from datetime import datetime, timedelta
import pandas as pd
import pytz

def _to_utc(moment: datetime) -> pd.Timestamp:
    """Return `moment` as a UTC timestamp: naive values are taken as UTC, aware ones are converted."""
    stamp = pd.Timestamp(moment)
    if stamp.tzinfo is None:
        return stamp.tz_localize('UTC')
    return stamp.tz_convert('UTC')


def fetch_ride_events_from_data_warehouse(
    from_date: datetime,
    to_date: datetime
) -> pd.DataFrame:
    """
    Simulate production data by sampling historical data from 52 weeks
    (7*52 = 364 days) ago and shifting it forward to the requested window.

    Args:
        from_date: inclusive start of the requested window. Naive values are
            interpreted as UTC; timezone-aware values are converted to UTC.
        to_date: exclusive end of the requested window, handled like `from_date`.

    Returns:
        The rides whose historical `pickup_datetime` falls in
        [from_date - 364 days, to_date - 364 days), with `pickup_datetime`
        moved forward by 364 days, sorted by `pickup_location_id` and then
        `pickup_datetime`. The frame is empty when the window is empty.
    """
    # 52 whole weeks, so the shifted events keep their day of the week
    shift = timedelta(days=7*52)

    from_date = _to_utc(from_date)
    to_date = _to_utc(to_date)
    print(f'Fetching ride events from {from_date} to {to_date}')

    # the same window, one "year" (52 weeks) earlier
    from_date_ = from_date - shift
    to_date_ = to_date - shift

    # every (year, month) file touched by the historical window; the month of
    # `from_date_` is always loaded so that an empty window still returns a
    # frame with the expected columns
    months = [(from_date_.year, from_date_.month)]
    while months[-1] < (to_date_.year, to_date_.month):
        year, month = months[-1]
        months.append((year + 1, 1) if month == 12 else (year, month + 1))

    monthly_rides = []
    for year, month in months:
        raw = load_raw_data(year=year, months=month)
        pickup = pd.to_datetime(raw['pickup_datetime'], utc=True)
        in_window = (pickup >= from_date_) & (pickup < to_date_)
        # `assign` builds a new frame, so the frame returned by `load_raw_data`
        # is left untouched and no write happens on a filtered slice
        monthly_rides.append(
            raw.loc[in_window].assign(pickup_datetime=pickup[in_window] + shift)
        )

    rides = pd.concat(monthly_rides)
    return rides.sort_values(by=['pickup_location_id', 'pickup_datetime'])


In [32]:
# Pull the simulated ride events for the fetch window
rides = fetch_ride_events_from_data_warehouse(
    from_date=fetch_data_from,
    to_date=fetch_data_to,
)

Fetching ride events from 2024-09-11 22:00:00+00:00 to 2024-10-09 22:00:00+00:00
File 2023-09 was already in local storage
File 2023-10 was already in local storage


This is the new data that we will send to the feature group. It is simulated data: the rides that actually took place 52 weeks (364 days) earlier, over our 28-day window, with their timestamps shifted forward to the present.

In [33]:
rides

Unnamed: 0,pickup_datetime,pickup_location_id
1336105,2024-09-12 10:31:48+00:00,1
1345029,2024-09-12 12:50:05+00:00,1
1352845,2024-09-12 13:45:08+00:00,1
1358457,2024-09-12 14:35:41+00:00,1
1369746,2024-09-12 16:35:28+00:00,1
...,...,...
1128536,2024-10-09 20:49:11+00:00,265
1135841,2024-10-09 21:28:50+00:00,265
1135533,2024-10-09 21:45:55+00:00,265
1134460,2024-10-09 21:51:50+00:00,265


In [34]:
from src.data import transform_raw_data_into_ts_data

# Turn the raw ride events into a time series: one row per pickup location and
# hour with the number of rides (the output below includes hours with 0 rides).
ts_data = transform_raw_data_into_ts_data(rides)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)



100%|██████████| 265/265 [00:00<00:00, 468.36it/s]


In [35]:
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2024-09-11 22:00:00+00:00,0,1
1,2024-09-11 23:00:00+00:00,0,1
2,2024-09-12 00:00:00+00:00,0,1
3,2024-09-12 01:00:00+00:00,0,1
4,2024-09-12 02:00:00+00:00,0,1
...,...,...,...
178075,2024-10-09 17:00:00+00:00,4,265
178076,2024-10-09 18:00:00+00:00,7,265
178077,2024-10-09 19:00:00+00:00,2,265
178078,2024-10-09 20:00:00+00:00,3,265


In [36]:
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178080 entries, 0 to 178079
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   pickup_hour         178080 non-null  datetime64[ns, UTC]
 1   rides               178080 non-null  int64              
 2   pickup_location_id  178080 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2)
memory usage: 4.1 MB


In [37]:
# add column with Unix epoch milliseconds
# Computed as the number of whole milliseconds elapsed since the Unix epoch, so
# the result does not depend on the storage resolution of `pickup_hour`
# (`.astype('int64') // 10**6` is only correct for nanosecond resolution).
ts_data['pickup_ts'] = (
    (ts_data['pickup_hour'] - pd.Timestamp('1970-01-01', tz='UTC'))
    // pd.Timedelta(milliseconds=1)
)

In [38]:
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id,pickup_ts
0,2024-09-11 22:00:00+00:00,0,1,1726092000000
1,2024-09-11 23:00:00+00:00,0,1,1726095600000
2,2024-09-12 00:00:00+00:00,0,1,1726099200000
3,2024-09-12 01:00:00+00:00,0,1,1726102800000
4,2024-09-12 02:00:00+00:00,0,1,1726106400000
...,...,...,...,...
178075,2024-10-09 17:00:00+00:00,4,265,1728493200000
178076,2024-10-09 18:00:00+00:00,7,265,1728496800000
178077,2024-10-09 19:00:00+00:00,2,265,1728500400000
178078,2024-10-09 20:00:00+00:00,3,265,1728504000000


In [39]:
import hopsworks

# log in to the Hopsworks project named in the config
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY,
)

# handle to the project's feature store
feature_store = project.get_feature_store()

# settings of the hourly time-series feature group:
# one row per (pickup_location_id, pickup_ts), with pickup_ts as the event time
feature_group_spec = {
    'name': config.FEATURE_GROUP_NAME,
    'version': config.FEATURE_GROUP_VERSION,
    'description': "Time-series data at hourly frequency",
    'primary_key': ['pickup_location_id', 'pickup_ts'],
    'event_time': 'pickup_ts',
}

# get the feature group, or create it with these settings
feature_group = feature_store.get_or_create_feature_group(**feature_group_spec)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1049751
Connected. Call `.close()` to terminate connection gracefully.


In [40]:
len(ts_data)

178080

In [41]:
# push the hourly time series to the feature group, passing the
# "wait_for_job" write option to the client
insert_options = {"wait_for_job": True}
feature_group.insert(ts_data, write_options=insert_options)


Uploading Dataframe: 0.00% |          | Rows 0/178080 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1049751/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x281f9828130>, None)

In [42]:
# Read the feature group back from Hopsworks to inspect what is stored
df = feature_group.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (18.57s) 


In [43]:
len(df)

4120220