In [40]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import src.config as config

In [42]:
from datetime import datetime, timedelta
import pandas as pd
import pytz

# Current time in UTC, truncated down to the start of the hour.
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated in pandas >= 2.2 and emits a FutureWarning there.
current_date = pd.Timestamp.now(tz='UTC').floor('h')

print(f"Current UTC date and time floored to the nearest hour: {current_date}")

# we fetch raw data for the last 28 days, to add redundancy to our data pipeline
# The window is half-open in practice: the fetch function keeps rides with
# fetch_data_from <= pickup_datetime < fetch_data_to.
fetch_data_to = current_date
fetch_data_from = current_date - timedelta(days=28)
print(f'We will fetch rides from {fetch_data_from=} until {fetch_data_to=}')


Current UTC date and time floored to the nearest hour: 2024-10-08 20:00:00+00:00
We will fetch rides from fetch_data_from=Timestamp('2024-09-10 20:00:00+0000', tz='UTC') until fetch_data_to=Timestamp('2024-10-08 20:00:00+0000', tz='UTC')


In [43]:
from src.data import load_raw_data
from datetime import datetime, timedelta
import pandas as pd
import pytz

def fetch_ride_events_from_data_warehouse(
    from_date: datetime,
    to_date: datetime
) -> pd.DataFrame:
    """
    Simulate production ride events by replaying historical data recorded
    exactly 52 weeks (7 * 52 = 364 days) earlier.

    A whole number of weeks is used so that every replayed ride keeps its
    original day of the week.

    Args:
        from_date: start of the requested window (inclusive). A naive value is
            interpreted as UTC; a timezone-aware value is converted to UTC.
        to_date: end of the requested window (exclusive). Same timezone rules
            as `from_date`.

    Returns:
        The rows returned by `load_raw_data` whose `pickup_datetime`, after
        being moved forward by 364 days, falls in `[from_date, to_date)`.
        `pickup_datetime` is timezone-aware (UTC) and the rows are sorted by
        `pickup_location_id`, then `pickup_datetime`. The index of the loaded
        frames is kept as is. An inverted window (`from_date > to_date`)
        yields an empty frame.
    """
    replay_offset = timedelta(days=7 * 52)

    def _to_utc(moment) -> pd.Timestamp:
        # Naive inputs are taken to be UTC already. Aware inputs are converted,
        # which preserves the instant; overwriting tzinfo would instead
        # relabel the wall-clock time and silently move the window.
        stamp = pd.Timestamp(moment)
        if stamp.tzinfo is None:
            return stamp.tz_localize('UTC')
        return stamp.tz_convert('UTC')

    from_date = _to_utc(from_date)
    to_date = _to_utc(to_date)

    # window boundaries in the historical data
    from_date_ = from_date - replay_offset
    to_date_ = to_date - replay_offset
    print(f'Fetching ride events from {from_date} to {to_date}')

    # Raw data is loaded one calendar month at a time, so list every month the
    # historical window touches (not only the first and the last one).
    months = [(from_date_.year, from_date_.month)]
    while months[-1] < (to_date_.year, to_date_.month):
        year, month = months[-1]
        months.append((year + 1, 1) if month == 12 else (year, month + 1))

    monthly_rides = []
    for year, month in months:
        chunk = load_raw_data(year=year, months=month)
        # assign() returns a new frame, so the loaded frame is never mutated
        chunk = chunk.assign(
            pickup_datetime=pd.to_datetime(chunk['pickup_datetime'], utc=True)
        )
        in_window = (
            (chunk['pickup_datetime'] >= from_date_)
            & (chunk['pickup_datetime'] < to_date_)
        )
        monthly_rides.append(chunk[in_window])

    rides = pd.concat(monthly_rides)

    # move pickup_datetime forward by 7*52 days, so the historical rides look
    # as if they happened inside the requested window
    rides = rides.assign(pickup_datetime=rides['pickup_datetime'] + replay_offset)

    return rides.sort_values(by=['pickup_location_id', 'pickup_datetime'])


In [44]:
# Pull the simulated "production" rides for the 28-day window
# [fetch_data_from, fetch_data_to) defined in an earlier cell.
rides = fetch_ride_events_from_data_warehouse(from_date=fetch_data_from, to_date=fetch_data_to)

Fetching ride events from 2024-09-10 20:00:00+00:00 to 2024-10-08 20:00:00+00:00
File 2023-09 was already in local storage
File 2023-10 was already in local storage


In [45]:
# Preview the fetched rides; the fetch function returns them sorted by
# pickup_location_id, then pickup_datetime.
rides

Unnamed: 0,pickup_datetime,pickup_location_id
1177856,2024-09-10 20:47:14+00:00,1
1198858,2024-09-11 04:56:25+00:00,1
1199538,2024-09-11 05:34:18+00:00,1
1200506,2024-09-11 06:45:13+00:00,1
1202836,2024-09-11 07:26:56+00:00,1
...,...,...
1003004,2024-10-08 18:47:21+00:00,265
3412259,2024-10-08 19:10:08+00:00,265
3412178,2024-10-08 19:10:57+00:00,265
1008783,2024-10-08 19:11:01+00:00,265


In [46]:
from src.data import transform_raw_data_into_ts_data

# Turn the raw ride events into time-series data. In the recorded run the
# result has 178080 rows = 265 pickup locations x 672 hours (28 days), with
# hours that had no rides present as rides = 0.
ts_data = transform_raw_data_into_ts_data(rides)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)



100%|██████████| 265/265 [00:00<00:00, 295.74it/s]


In [47]:
# Preview the time-series frame (columns in the recorded run: pickup_hour,
# rides, pickup_location_id).
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2024-09-10 20:00:00+00:00,1,1
1,2024-09-10 21:00:00+00:00,0,1
2,2024-09-10 22:00:00+00:00,0,1
3,2024-09-10 23:00:00+00:00,0,1
4,2024-09-11 00:00:00+00:00,0,1
...,...,...,...
178075,2024-10-08 15:00:00+00:00,8,265
178076,2024-10-08 16:00:00+00:00,6,265
178077,2024-10-08 17:00:00+00:00,5,265
178078,2024-10-08 18:00:00+00:00,7,265


In [48]:
# Check dtypes and non-null counts; the next cell derives an epoch column
# from pickup_hour, which relies on it being a datetime column.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178080 entries, 0 to 178079
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   pickup_hour         178080 non-null  datetime64[ns, UTC]
 1   rides               178080 non-null  int64              
 2   pickup_location_id  178080 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2)
memory usage: 4.1 MB


In [49]:
# add column with Unix epoch milliseconds
# Computed as the elapsed time since 1970-01-01 UTC divided by one millisecond,
# rather than from the raw int64 representation: the raw integers are only
# nanoseconds when the column is stored at nanosecond resolution, so dividing
# them by 10**6 would give the wrong unit for any other resolution.
ts_data['pickup_ts'] = (
    (ts_data['pickup_hour'] - pd.Timestamp('1970-01-01', tz='UTC'))
    // pd.Timedelta(milliseconds=1)
)

In [50]:
# Preview again to confirm the pickup_ts column was added next to pickup_hour.
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id,pickup_ts
0,2024-09-10 20:00:00+00:00,1,1,1725998400000
1,2024-09-10 21:00:00+00:00,0,1,1726002000000
2,2024-09-10 22:00:00+00:00,0,1,1726005600000
3,2024-09-10 23:00:00+00:00,0,1,1726009200000
4,2024-09-11 00:00:00+00:00,0,1,1726012800000
...,...,...,...,...
178075,2024-10-08 15:00:00+00:00,8,265,1728399600000
178076,2024-10-08 16:00:00+00:00,6,265,1728403200000
178077,2024-10-08 17:00:00+00:00,5,265,1728406800000
178078,2024-10-08 18:00:00+00:00,7,265,1728410400000


In [51]:
import hopsworks

# connect to the project
# NOTE: the recorded run of this cell stopped at this call with
# "AttributeError: module 'src.config' has no attribute 'HOPSWORKS_PROJECT_NAME'".
# src.config has to expose HOPSWORKS_PROJECT_NAME before this cell can run;
# presumably the same holds for HOPSWORKS_API_KEY, FEATURE_GROUP_NAME and
# FEATURE_GROUP_VERSION used below — TODO confirm against src/config.py.
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
# Rows are keyed by (pickup_location_id, pickup_ts), and pickup_ts (the epoch
# column added to ts_data earlier) is also declared as the event time.
# This only fetches or creates the feature group; no rows are written here.
feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    primary_key = ['pickup_location_id', 'pickup_ts'],
    event_time='pickup_ts',
)

AttributeError: module 'src.config' has no attribute 'HOPSWORKS_PROJECT_NAME'