In [2]:
import os
import sys
import pandas as pd

# Go one directory up and into 'scripts'
scripts_path = os.path.abspath(os.path.join(os.getcwd(), "..", "scripts"))
print("✅ Corrected path:", scripts_path)

# Add to path and confirm
sys.path.append(scripts_path)
print("✅ sys.path updated.")

# Try import again
from data_utils import transform_ts_data_into_features_and_target_loop


✅ Corrected path: /Users/snehitha/citi_bike_project/scripts
✅ sys.path updated.


In [3]:
# Step 2: Load the timeseries data
ts_df = pd.read_csv("../data/ts_data.csv", parse_dates=["pickup_hour"])
ts_df.set_index("pickup_hour", inplace=True)

# Step 3: Define the 3 location IDs to use
location_ids = ["HB102", "HB105", "JC115"]

# Step 4: Call the utility function to generate lag features
feature_dfs = transform_ts_data_into_features_and_target_loop(ts_df, location_ids)


📍 Processing location: HB102
📍 Processing location: HB105
📍 Processing location: JC115


In [4]:
feature_dfs.keys()


dict_keys(['HB102', 'HB105', 'JC115'])

In [5]:
import hopsworks

# Step 1: Combine per-location feature DataFrames into one DataFrame
combined_dfs = []
for location_id, df in feature_dfs.items():
    df = df.copy()
    df["location_id"] = location_id
    df.reset_index(inplace=True)  # So pickup_hour becomes a column again
    combined_dfs.append(df)

all_features_df = pd.concat(combined_dfs)
print("✅ Combined shape:", all_features_df.shape)

# Step 2: Login to Hopsworks
project = hopsworks.login()
fs = project.get_feature_store()

# Step 3: Define a version 2 of your feature group
fg = fs.get_or_create_feature_group(
    name="citi_bike_features",
    version=2,
    description="Lag and time features for 3 Citi Bike locations (v2)",
    primary_key=["pickup_hour", "location_id"],
    event_time="pickup_hour"
)

# Step 4: Insert data into the feature group
fg.insert(all_features_df, overwrite=True)
print("✅ Saved version 2 to Hopsworks.")


  from .autonotebook import tqdm as notebook_tqdm


✅ Combined shape: (26100, 54)
2025-05-08 16:26:56,855 INFO: Initializing external client
2025-05-08 16:26:56,856 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-08 16:26:58,084 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215691
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1215691/fs/1203317/fg/1454445


Uploading Dataframe: 100.00% |██████████| Rows 26100/26100 | Elapsed Time: 00:14 | Remaining Time: 00:00


Launching job: citi_bike_features_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1215691/jobs/named/citi_bike_features_2_offline_fg_materialization/executions
✅ Saved version 2 to Hopsworks.


In [10]:
# Check keys and columns from each dataframe in the dictionary
for loc_id, df in feature_dfs.items():
    print(f"\n📍 Location ID: {loc_id}")
    print("➡️ Columns:", df.columns.tolist())



📍 Location ID: HB102
➡️ Columns: ['target', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10', 'lag_11', 'lag_12', 'lag_13', 'lag_14', 'lag_15', 'lag_16', 'lag_17', 'lag_18', 'lag_19', 'lag_20', 'lag_21', 'lag_22', 'lag_23', 'lag_24', 'lag_25', 'lag_26', 'lag_27', 'lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32', 'lag_33', 'lag_34', 'lag_35', 'lag_36', 'lag_37', 'lag_38', 'lag_39', 'lag_40', 'lag_41', 'lag_42', 'lag_43', 'lag_44', 'lag_45', 'lag_46', 'lag_47', 'lag_48', 'hour', 'dayofweek', 'is_weekend']

📍 Location ID: HB105
➡️ Columns: ['target', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10', 'lag_11', 'lag_12', 'lag_13', 'lag_14', 'lag_15', 'lag_16', 'lag_17', 'lag_18', 'lag_19', 'lag_20', 'lag_21', 'lag_22', 'lag_23', 'lag_24', 'lag_25', 'lag_26', 'lag_27', 'lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32', 'lag_33', 'lag_34', 'lag_35', 'lag_36', 'lag_37', 'lag_38', 'lag_39', 'lag_40', 'lag_41', 'la

In [9]:
import hopsworks
import pandas as pd

# Log in to Hopsworks
project = hopsworks.login()
fs = project.get_feature_store()

# ✅ Recombine feature frames
all_features_df = pd.concat(feature_dfs.values())
all_features_df = all_features_df.reset_index()  # restore 'pickup_hour' as a column

# ✅ Fix data types
all_features_df["location_id"] = all_features_df["location_id"].astype(str)
all_features_df["pickup_hour"] = pd.to_datetime(all_features_df["pickup_hour"])

# ✅ Drop rows with missing values in primary key columns (safety)
all_features_df.dropna(subset=["pickup_hour", "location_id"], inplace=True)

# ✅ Confirm
print("✅ Combined shape:", all_features_df.shape)
print("✅ Columns:", all_features_df.columns.tolist())
print("✅ pickup_hour type:", all_features_df["pickup_hour"].dtype)
print("✅ location_id type:", all_features_df["location_id"].dtype)

# ✅ Register to Hopsworks
fg = fs.get_or_create_feature_group(
    name="citi_bike_features",
    version=2,
    description="Lag and time features for 3 Citi Bike locations (v2)",
    primary_key=["pickup_hour", "location_id"],
    event_time="pickup_hour",
)

# ✅ Insert into feature store
fg.insert(all_features_df, overwrite=True)
print("✅ Successfully pushed cleaned features to Hopsworks.")


2025-05-08 17:13:44,277 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-08 17:13:44,296 INFO: Initializing external client
2025-05-08 17:13:44,297 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-08 17:13:45,407 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215691


KeyError: 'location_id'