In [97]:
import hopsworks

In [98]:
# Name of the Hopsworks project; passed to hopsworks.login() further down
HOPSWORKS_PROJECT_NAME = 'taxi_demand_rs'

In [99]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# load key-value pairs from the .env file located in PARENT_DIR into the
# process environment
load_dotenv(PARENT_DIR / '.env')

# Read the API key from the environment instead of hard-coding it.
# os.environ[...] raises KeyError if HOPSWORKS_API_KEY is not set.
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

I only use data from 2023 onward because I was getting a message about hitting a row limit with the full data set. We will see whether the shorter history causes any issues for the model.

In [100]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

from_year = 2023
to_year = datetime.now().year
print(f'Downloading raw data from {from_year} to {to_year}')

# Download one frame per year, then concatenate once. Growing `rides` with
# pd.concat inside the loop re-copies every previously loaded row on each
# iteration and starts from an empty DataFrame, which newer pandas versions
# warn about.
yearly_rides = [load_raw_data(year) for year in range(from_year, to_year + 1)]

# Fall back to an empty frame if the year range is empty (pd.concat([]) raises)
rides = pd.concat(yearly_rides) if yearly_rides else pd.DataFrame()

# drop the extra references to the per-year frames so they can be freed
del yearly_rides

Downloading raw data from 2023 to 2024
File 2023-01 was already in local storage
File 2023-02 was already in local storage
File 2023-03 was already in local storage
File 2023-04 was already in local storage
File 2023-05 was already in local storage
File 2023-06 was already in local storage
File 2023-07 was already in local storage
File 2023-08 was already in local storage
File 2023-09 was already in local storage
File 2023-10 was already in local storage
File 2023-11 was already in local storage
File 2023-12 was already in local storage
File 2024-01 was already in local storage
File 2024-02 was already in local storage
File 2024-03 was already in local storage
File 2024-04 was already in local storage
File 2024-05 was already in local storage
File 2024-06 was already in local storage
File 2024-07 was already in local storage
Downloading file 2024-08
2024-08 file is not available
Downloading file 2024-09
2024-09 file is not available
Downloading file 2024-10
2024-10 file is not availabl

Now we have all rides from the start of 2023 until the end of July 2024. This is because the site we are getting data from has not uploaded any data past this date.

In [101]:
# Preview the concatenated raw rides
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107
...,...,...
3076898,2024-07-31 23:12:00,243
3076899,2024-07-31 23:10:34,170
3076900,2024-07-31 23:32:00,197
3076901,2024-07-31 23:32:52,230


We now have about 61.7 million rides.

Here is where things get a little bit tricky. From this point on, all "future" data is synthetic: it is real data from 52 weeks (364 days) earlier, with its timestamps shifted forward. The first step is to add the rides for August 1 through September 11, 2024, which are taken from the corresponding period in 2023.

In [102]:
# Start of the synthetic window: August 1st 2024, 00:00 UTC
august = pd.to_datetime("2024-08-01 00:00:00+00:00")

# End of the synthetic window: September 11th 2024, 20:00 UTC
sep = pd.to_datetime('2024-09-11 20:00:00+00:00')

print(f'Fetching synthetic data from {august} until {sep}')


Fetching synthetic data from 2024-08-01 00:00:00+00:00 until 2024-09-11 20:00:00+00:00


In [103]:
from src.data import load_raw_data
from datetime import datetime, timedelta
import pandas as pd
import pytz

def _to_utc_timestamp(moment: datetime) -> pd.Timestamp:
    """Return `moment` as a UTC pd.Timestamp (naive values are taken to be UTC)."""
    stamp = pd.Timestamp(moment)
    if stamp.tzinfo is None:
        return stamp.tz_localize('UTC')
    # convert instead of overwriting tzinfo, so the instant is preserved
    return stamp.tz_convert('UTC')


def fetch_ride_events_from_data_warehouse(
    from_date: datetime,
    to_date: datetime
) -> pd.DataFrame:
    """
    Simulate production ride events by replaying historical data from
    52 weeks (364 days) earlier.

    Every monthly raw file that overlaps the shifted window is loaded, so the
    window may span any number of months.

    Args:
        from_date: inclusive start of the requested window. Naive values are
            interpreted as UTC; aware values are converted to UTC.
        to_date: exclusive end of the requested window (same tz handling).

    Returns:
        The historical rides whose `pickup_datetime` falls in
        [from_date - 364 days, to_date - 364 days), with `pickup_datetime`
        moved forward by 364 days, sorted by `pickup_location_id` and
        `pickup_datetime`. The original row index is kept. The frame is empty
        when `to_date` is not after `from_date`.
    """
    # 52 whole weeks rather than a calendar year, so the day of week is preserved
    shift = timedelta(days=7*52)

    from_date = _to_utc_timestamp(from_date)
    to_date = _to_utc_timestamp(to_date)

    from_date_ = from_date - shift
    to_date_ = to_date - shift
    print(f'Fetching ride events from {from_date} to {to_date}')

    # Enumerate every (year, month) touched by the shifted window. Always load
    # at least the first month so a degenerate window still yields an empty
    # frame with the expected columns.
    first_month = from_date_.year * 12 + (from_date_.month - 1)
    last_month = max(first_month, to_date_.year * 12 + (to_date_.month - 1))

    monthly_rides = []
    for month_index in range(first_month, last_month + 1):
        year, month_zero_based = divmod(month_index, 12)
        one_month = load_raw_data(year=year, months=month_zero_based + 1)
        one_month['pickup_datetime'] = pd.to_datetime(one_month['pickup_datetime'], utc=True)
        monthly_rides.append(one_month)

    rides = pd.concat(monthly_rides)

    # keep only the rides inside the half-open shifted window
    in_window = (rides['pickup_datetime'] >= from_date_) & (rides['pickup_datetime'] < to_date_)
    rides = rides.loc[in_window].copy()

    # move pickup_datetime forward by 7*52 days, so the historical rides look
    # like they happened inside the requested window
    rides['pickup_datetime'] = rides['pickup_datetime'] + shift

    rides.sort_values(by=['pickup_location_id', 'pickup_datetime'], inplace=True)

    return rides

In [104]:
# Build the synthetic rides for the window between `august` and `sep`, then preview them
rides_synthetic = fetch_ride_events_from_data_warehouse(august, sep)
rides_synthetic

Fetching ride events from 2024-08-01 00:00:00+00:00 to 2024-09-11 20:00:00+00:00
File 2023-08 was already in local storage
File 2023-09 was already in local storage


Unnamed: 0,pickup_datetime,pickup_location_id
197584,2024-08-01 01:03:05+00:00,1
199508,2024-08-01 05:11:46+00:00,1
201047,2024-08-01 06:05:50+00:00,1
201048,2024-08-01 06:07:13+00:00,1
219670,2024-08-01 11:02:05+00:00,1
...,...,...
2758685,2024-09-11 19:09:20+00:00,265
1282168,2024-09-11 19:25:23+00:00,265
1282959,2024-09-11 19:28:01+00:00,265
1279910,2024-09-11 19:47:02+00:00,265


In [105]:
# Row count of the real rides, before the synthetic rows are appended
len(rides)

61718296

In [106]:
# Inspect the real rides before the timezone conversion
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107
...,...,...
3076898,2024-07-31 23:12:00,243
3076899,2024-07-31 23:10:34,170
3076900,2024-07-31 23:32:00,197
3076901,2024-07-31 23:32:52,230


In [108]:
# Convert pickup_datetime to datetime and make it timezone-aware (UTC).
# `utc=True` localizes naive values as UTC and leaves values that are already
# UTC unchanged, so this cell can be re-run safely; `.dt.tz_localize('UTC')`
# raises a TypeError once the column is tz-aware.
rides['pickup_datetime'] = pd.to_datetime(rides['pickup_datetime'], utc=True)

In [109]:
# Confirm that pickup_datetime now carries the UTC offset
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10+00:00,161
1,2023-01-01 00:55:08+00:00,43
2,2023-01-01 00:25:04+00:00,48
3,2023-01-01 00:03:48+00:00,138
4,2023-01-01 00:10:29+00:00,107
...,...,...
3076898,2024-07-31 23:12:00+00:00,243
3076899,2024-07-31 23:10:34+00:00,170
3076900,2024-07-31 23:32:00+00:00,197
3076901,2024-07-31 23:32:52+00:00,230


In [107]:
# Inspect the synthetic rides again before appending them
rides_synthetic

Unnamed: 0,pickup_datetime,pickup_location_id
197584,2024-08-01 01:03:05+00:00,1
199508,2024-08-01 05:11:46+00:00,1
201047,2024-08-01 06:05:50+00:00,1
201048,2024-08-01 06:07:13+00:00,1
219670,2024-08-01 11:02:05+00:00,1
...,...,...
2758685,2024-09-11 19:09:20+00:00,265
1282168,2024-09-11 19:25:23+00:00,265
1282959,2024-09-11 19:28:01+00:00,265
1279910,2024-09-11 19:47:02+00:00,265


In [110]:
# Add the synthetic rides to the real rides.
# Only real rides before the start of the synthetic window (`august`) are kept:
# this makes the cell idempotent (re-running it does not append the synthetic
# rows a second time) and prevents double counting if real data for the
# synthetic period ever shows up in the raw files.
rides = pd.concat([rides[rides['pickup_datetime'] < august], rides_synthetic])

In [111]:
# Preview the combined real + synthetic rides
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10+00:00,161
1,2023-01-01 00:55:08+00:00,43
2,2023-01-01 00:25:04+00:00,48
3,2023-01-01 00:03:48+00:00,138
4,2023-01-01 00:10:29+00:00,107
...,...,...
2758685,2024-09-11 19:09:20+00:00,265
1282168,2024-09-11 19:25:23+00:00,265
1282959,2024-09-11 19:28:01+00:00,265
1279910,2024-09-11 19:47:02+00:00,265


In [112]:
# Total number of rides after appending the synthetic rows, with thousands separators
print(f'{len(rides)=:,}')

len(rides)=65,679,100


In [113]:
# Check the column dtypes and memory footprint of the combined frame
rides.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65679100 entries, 0 to 1283488
Data columns (total 2 columns):
 #   Column              Dtype              
---  ------              -----              
 0   pickup_datetime     datetime64[ns, UTC]
 1   pickup_location_id  int64              
dtypes: datetime64[ns, UTC](1), int64(1)
memory usage: 1.5 GB


In [114]:
# Order the rides chronologically, breaking ties by pickup location, and preview the result
sort_keys = ['pickup_datetime', 'pickup_location_id']
rides = rides.sort_values(by=sort_keys)
rides

Unnamed: 0,pickup_datetime,pickup_location_id
2995098,2023-01-01 00:00:00+00:00,42
3497,2023-01-01 00:00:05+00:00,249
2506,2023-01-01 00:00:06+00:00,125
3499,2023-01-01 00:00:08+00:00,42
4475,2023-01-01 00:00:09+00:00,79
...,...,...
1282913,2024-09-11 19:59:57+00:00,144
1288742,2024-09-11 19:59:57+00:00,262
1280651,2024-09-11 19:59:58+00:00,100
1285457,2024-09-11 19:59:58+00:00,162


In [116]:
from src.data import transform_raw_data_into_ts_data

# Convert the raw ride events into time-series data via the project helper
ts_data = transform_raw_data_into_ts_data(rides)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)



100%|██████████| 265/265 [00:03<00:00, 76.00it/s] 


Now we have the rides per hour for each pickup_location.

In [117]:
# Preview the time-series data
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2023-01-01 00:00:00+00:00,0,1
1,2023-01-01 01:00:00+00:00,0,1
2,2023-01-01 02:00:00+00:00,0,1
3,2023-01-01 03:00:00+00:00,0,1
4,2023-01-01 04:00:00+00:00,0,1
...,...,...,...
3942135,2024-09-11 15:00:00+00:00,8,265
3942136,2024-09-11 16:00:00+00:00,3,265
3942137,2024-09-11 17:00:00+00:00,9,265
3942138,2024-09-11 18:00:00+00:00,2,265


In [118]:
# Number of time-series rows for pickup location 265
len(ts_data.loc[ts_data['pickup_location_id'] == 265])

14876

In [119]:
# Make sure pickup_hour is a timezone-aware (UTC) datetime column
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# Add column with Unix epoch milliseconds.
# Dividing the offset from the epoch by a 1 ms Timedelta does not depend on the
# column's datetime resolution; casting to int64 and dividing by 10**6 is only
# correct when the column is stored with nanosecond resolution.
ts_data['pickup_ts'] = (
    (ts_data['pickup_hour'] - pd.Timestamp('1970-01-01', tz='UTC'))
    // pd.Timedelta(milliseconds=1)
)

The cell above adds a **pickup_ts** column (Unix epoch milliseconds), which Hopsworks will use as the event time and as part of the primary key.

In [120]:
# Preview ts_data with the new pickup_ts column
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id,pickup_ts
0,2023-01-01 00:00:00+00:00,0,1,1672531200000
1,2023-01-01 01:00:00+00:00,0,1,1672534800000
2,2023-01-01 02:00:00+00:00,0,1,1672538400000
3,2023-01-01 03:00:00+00:00,0,1,1672542000000
4,2023-01-01 04:00:00+00:00,0,1,1672545600000
...,...,...,...,...
3942135,2024-09-11 15:00:00+00:00,8,265,1726066800000
3942136,2024-09-11 16:00:00+00:00,3,265,1726070400000
3942137,2024-09-11 17:00:00+00:00,9,265,1726074000000
3942138,2024-09-11 18:00:00+00:00,2,265,1726077600000


In [121]:
# Log in to Hopsworks with the project name and API key defined earlier
# and keep a handle to the project
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1049751


In [122]:
# Handle to the project's feature store
feature_store = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.


Now we will write data to the feature group within our feature store.

In [123]:
# Number of rows about to be written to the feature group
len(ts_data)

3942140

In [124]:
# Name and version of the feature group that the time-series data is written to
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 1

In [125]:
# Get the feature group with this name and version, or create it if it does
# not exist yet
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    # composite key: one row per pickup location and hourly timestamp
    primary_key = ['pickup_location_id', 'pickup_ts'],
    # pickup_ts (Unix epoch milliseconds) doubles as the event time
    event_time='pickup_ts',
)

In [126]:
# insert the time-series data into the feature group (version FEATURE_GROUP_VERSION, i.e. 1);
# NOTE(review): wait_for_job=True presumably blocks until the materialization job finishes — confirm against the hsfs docs
feature_group.insert(ts_data, write_options={"wait_for_job": True})


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1049751/fs/1041478/fg/1261916


Uploading Dataframe: 0.00% |          | Rows 0/3942140 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1049751/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x26c481bbf10>, None)

In [127]:
# Read the feature group back to check what was stored
df = feature_group.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (101.44s) 


In [128]:
# Row count read back from the feature group; compare with len(ts_data)
len(df)

3942140

In [129]:
# Last rows of the data read back from the feature group
df.tail()

Unnamed: 0,pickup_hour,rides,pickup_location_id,pickup_ts
3942135,2023-05-09 18:00:00+00:00,193,48,1683655200000
3942136,2024-06-29 10:00:00+00:00,4,152,1719655200000
3942137,2023-10-22 20:00:00+00:00,31,151,1698004800000
3942138,2023-04-13 18:00:00+00:00,444,162,1681408800000
3942139,2024-01-30 21:00:00+00:00,0,219,1706648400000
