In [1]:
import pandas as pd

# Read the pivoted time series (one column per location) and index the rows
# by their pickup hour. set_index returns a new frame, so nothing is mutated.
ts_df = pd.read_csv(
    "../data/ts_data.csv",
    parse_dates=["pickup_hour"],
).set_index("pickup_hour")

# Total the rides in every column, then keep the three largest totals.
location_totals = ts_df.sum()
top_locations = location_totals.sort_values(ascending=False).head(3)
top_location_ids = top_locations.index.tolist()

print("✅ Top 3 Start Station IDs:", top_location_ids)
top_locations


✅ Top 3 Start Station IDs: ['HB102', 'JC115', 'HB105']


HB102    54287
JC115    46147
HB105    25785
dtype: int64

In [2]:
# Create features and targets for top locations
N_LAGS = 28  # number of lag columns built per location (lag_1 ... lag_28)


def make_lag_features(series, n_lags=N_LAGS):
    """Build a modeling table holding a target and its lagged values.

    Parameters
    ----------
    series : pd.Series
        Ride counts for a single location, ordered by time.
    n_lags : int, optional
        How many lag columns to create. Column ``lag_k`` is ``series``
        shifted down by ``k`` rows (this equals "k hours earlier" only if the
        index is a gap-free hourly sequence — TODO confirm for the input).

    Returns
    -------
    pd.DataFrame
        A new frame with the column ``target`` followed by ``lag_1`` through
        ``lag_{n_lags}``. Every row containing a missing value is dropped,
        which removes the first ``n_lags`` rows whose history is incomplete
        (and any row touched by a missing value in ``series`` itself).
    """
    lagged = {f"lag_{lag}": series.shift(lag) for lag in range(1, n_lags + 1)}
    # to_frame/assign/dropna each return a new object, so the source data is
    # never modified in place and re-running the cell gives the same result.
    return series.to_frame("target").assign(**lagged).dropna()


feature_dfs = {}

for location_id in top_location_ids:
    print(f"📍 Processing location: {location_id}")
    feature_dfs[location_id] = make_lag_features(ts_df[location_id])

# Show one example
feature_dfs[top_location_ids[0]].head()


📍 Processing location: HB102
📍 Processing location: JC115
📍 Processing location: HB105


Unnamed: 0_level_0,target,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,...,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24,lag_25,lag_26,lag_27,lag_28
pickup_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 04:00:00,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,4.0,...,1.0,0.0,0.0,0.0,0.0,2.0,2.0,20.0,5.0,5.0
2024-01-02 05:00:00,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,...,1.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,20.0,5.0
2024-01-02 06:00:00,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,5.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,20.0
2024-01-02 07:00:00,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,5.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0
2024-01-02 08:00:00,4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,5.0,5.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0


In [4]:
import os

In [5]:
# Write every location's feature table to its own CSV file.
# NOTE(review): the input was read from "../data/ts_data.csv", but this
# directory is "data/features" relative to the working directory — confirm
# that writing next to the notebook (not under ../data) is intended.
output_dir = "data/features"
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

for location_id in feature_dfs:
    df_feat = feature_dfs[location_id]
    out_path = os.path.join(output_dir, f"{location_id}.csv")
    # The index is written as the first CSV column (to_csv default).
    df_feat.to_csv(out_path)
    print(f"✅ Saved features for {location_id} to {out_path}")


✅ Saved features for HB102 to data/features/HB102.csv
✅ Saved features for JC115 to data/features/JC115.csv
✅ Saved features for HB105 to data/features/HB105.csv
