In [10]:
import importlib
import os
import sys

# Make the sibling ``scripts`` directory importable from this notebook.
# The path is resolved against the current working directory, and it is only
# appended when missing so that re-running the cell does not add duplicates.
scripts_dir = os.path.abspath(os.path.join(os.getcwd(), "../scripts"))
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Import the helper module, then force a reload so that edits made to
# data_utils.py since the kernel started are picked up.
import data_utils

data_utils = importlib.reload(data_utils)

# Bind the helper from the freshly reloaded module object.
transform_ts_data_into_features_and_target_loop = (
    data_utils.transform_ts_data_into_features_and_target_loop
)


In [11]:
# Load the time series CSV, parse ``pickup_hour`` as datetimes, and use it as
# the index. Chaining ``set_index`` avoids mutating the frame in place.
ts_df = pd.read_csv(
    "../data/ts_data.csv",
    parse_dates=["pickup_hour"],
).set_index("pickup_hour")

print("✅ Time series data shape:", ts_df.shape)

# Preview the first rows as the cell's rich output.
ts_df.head()


✅ Time series data shape: (8748, 205)


Unnamed: 0_level_0,4074.14,4298.05,4461.07,4762.05,4977.03,4993.02,4993.15,5024.10,5033.01,5105.01,...,JC102,JC103,JC104,JC105,JC107,JC108,JC109,JC110,JC115,JC116
pickup_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-01 00:00:00,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,2,0,3,11
2024-01-01 01:00:00,0,0,0,0,0,0,0,0,0,0,...,4,0,2,5,0,0,4,1,4,2
2024-01-01 02:00:00,0,0,0,0,0,0,0,0,0,0,...,2,2,4,1,0,0,2,1,6,4
2024-01-01 03:00:00,0,0,0,0,0,0,0,0,0,0,...,0,2,2,1,0,0,1,2,2,0
2024-01-01 04:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,4,0


In [12]:
# Locations to build feature tables for.
top_location_ids = [
    "HB102",
    "JC115",
    "HB105",
]

# The helper's result is indexed by location id below, so it is expected to
# be a mapping from location id to that location's feature DataFrame.
feature_dfs = transform_ts_data_into_features_and_target_loop(
    ts_df,
    top_location_ids,
)

# Preview the feature table of the first location.
feature_dfs[top_location_ids[0]].head()


📍 Processing location: HB102
📍 Processing location: JC115
📍 Processing location: HB105


Unnamed: 0_level_0,target,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,...,lag_42,lag_43,lag_44,lag_45,lag_46,lag_47,lag_48,hour,dayofweek,is_weekend
pickup_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-03 00:00:00,0,1.0,0.0,1.0,1.0,10.0,18.0,20.0,12.0,10.0,...,0.0,0.0,2.0,2.0,20.0,5.0,5.0,0,2,0
2024-01-03 01:00:00,0,0.0,1.0,0.0,1.0,1.0,10.0,18.0,20.0,12.0,...,0.0,0.0,0.0,2.0,2.0,20.0,5.0,1,2,0
2024-01-03 02:00:00,0,0.0,0.0,1.0,0.0,1.0,1.0,10.0,18.0,20.0,...,0.0,0.0,0.0,0.0,2.0,2.0,20.0,2,2,0
2024-01-03 03:00:00,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,10.0,18.0,...,1.0,0.0,0.0,0.0,0.0,2.0,2.0,3,2,0
2024-01-03 04:00:00,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,10.0,...,1.0,1.0,0.0,0.0,0.0,0.0,2.0,4,2,0


In [13]:
# Persist one feature CSV per location.
# The output directory is created up front: ``DataFrame.to_csv`` does not
# create missing parent directories, so on a fresh checkout the loop would
# otherwise fail on the first write.
features_dir = "../data/features"
os.makedirs(features_dir, exist_ok=True)

for location_id in top_location_ids:
    output_path = f"{features_dir}/{location_id}.csv"
    # index=True keeps the pickup_hour index as a column in the CSV.
    feature_dfs[location_id].to_csv(output_path, index=True)  # ✅ save pickup_hour index
    print(f"✅ Saved features for {location_id} to {output_path}")


✅ Saved features for HB102 to ../data/features/HB102.csv
✅ Saved features for JC115 to ../data/features/JC115.csv
✅ Saved features for HB105 to ../data/features/HB105.csv


In [14]:
import hopsworks
from hsfs.feature_group import FeatureGroup

# Step 1: Combine all features into one dataframe with a location_id column.
# Each per-location CSV is read back from disk (pickup_hour parsed as
# datetimes) and tagged with the location it belongs to.
combined_dfs = []

for location_id in top_location_ids:
    df = pd.read_csv(f"../data/features/{location_id}.csv", parse_dates=["pickup_hour"])
    df["location_id"] = location_id
    combined_dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it
# every location contributes its own 0..n-1 labels, so row labels repeat and
# label-based access on the combined frame becomes ambiguous.
all_features_df = pd.concat(combined_dfs, ignore_index=True)
print("✅ Combined features shape:", all_features_df.shape)

# Step 2: Log in and store to Hopsworks.
# NOTE(review): no credentials are passed here, so login() presumably relies
# on an environment variable, cached key, or prompt — confirm.
project = hopsworks.login()
fs = project.get_feature_store()

# Step 3: Define and save the unified feature group.
# Rows are keyed by (pickup_hour, location_id); pickup_hour is also the
# event time of each row.
fg = fs.get_or_create_feature_group(
    name="citi_bike_features",
    version=1,
    description="Combined features for top 3 Citi Bike locations",
    primary_key=["pickup_hour", "location_id"],
    event_time="pickup_hour"
)

# Step 4: Insert the data.
# NOTE(review): overwrite=True presumably replaces the existing contents of
# the feature group on every run — confirm this is intended.
fg.insert(all_features_df, overwrite=True)
print("✅ Successfully saved unified feature group: citi_bike_features (v1)")


  from .autonotebook import tqdm as notebook_tqdm


✅ Combined features shape: (26100, 54)
2025-05-07 16:59:04,350 INFO: Initializing external client
2025-05-07 16:59:04,350 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-07 16:59:05,631 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215691
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1215691/fs/1203317/fg/1454423


Uploading Dataframe: 100.00% |██████████| Rows 26100/26100 | Elapsed Time: 00:11 | Remaining Time: 00:00


Launching job: citi_bike_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1215691/jobs/named/citi_bike_features_1_offline_fg_materialization/executions
✅ Successfully saved unified feature group: citi_bike_features (v1)
