In [1]:
import pandas as pd

In [2]:
# -----------------------------
# Load cleaned EDA output
# -----------------------------
DATA_PATH = "../data/processed/cleaned_ferry_ridership.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Ensure datetime and sorting
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

In [4]:
df.head(10)

Unnamed: 0,Date,Hour,Route,Direction,Stop,Boardings,TypeDay
0,2017-07-01,7,ER,SB,East 34th Street,0,Weekend
1,2017-07-01,17,SB,SB,Sunset Park/BAT,1,Weekend
2,2017-07-01,17,SB,SB,Red Hook/Atlantic Basin,4,Weekend
3,2017-07-01,17,SB,SB,Dumbo/BBP Pier 1,16,Weekend
4,2017-07-01,17,SB,SB,Bay Ridge,0,Weekend
5,2017-07-01,17,SB,SB,Atlantic Ave/BBP Pier 6,13,Weekend
6,2017-07-01,17,SB,NB,Governors Island,0,Weekend
7,2017-07-01,17,SB,NB,Wall St/Pier 11,0,Weekend
8,2017-07-01,17,SB,NB,Sunset Park/BAT,11,Weekend
9,2017-07-01,17,SB,NB,Red Hook/Atlantic Basin,0,Weekend


In [5]:
daily_df = (
    df.groupby("Date", as_index=False)["Boardings"]
      .sum()
      .sort_values("Date")
      .reset_index(drop=True)
)

daily_df.head()

Unnamed: 0,Date,Boardings
0,2017-07-01,13086
1,2017-07-02,21709
2,2017-07-03,16934
3,2017-07-04,13260
4,2017-07-05,13222


In [6]:
# -----------------------------
# Calendar Features
# -----------------------------
daily_df["year"] = daily_df["Date"].dt.year
daily_df["month"] = daily_df["Date"].dt.month
daily_df["day_of_week"] = daily_df["Date"].dt.dayofweek
daily_df["is_weekend"] = daily_df["day_of_week"].isin([5, 6]).astype(int)

In [None]:
# -----------------------------
# Lag Features(Core)
# -----------------------------
daily_df["lag_1"] = daily_df["Boardings"].shift(1)
daily_df["lag_7"] = daily_df["Boardings"].shift(7)
daily_df["lag_14"] = daily_df["Boardings"].shift(14)
daily_df["lag_30"] = daily_df["Boardings"].shift(30)

In [8]:
# -----------------------------
# Rolling Statistics
# (shifted to avoid leakage)
# -----------------------------
daily_df["rolling_mean_7"] = (daily_df["Boardings"].shift(1).rolling(window=7).mean())
daily_df["rolling_mean_30"] = (daily_df["Boardings"].shift(1).rolling(window=30).mean())

daily_df["rolling_std_7"] = (daily_df["Boardings"].shift(1).rolling(window=7).std())


In [9]:
daily_df.isnull().sum()

Date                0
Boardings           0
year                0
month               0
day_of_week         0
is_weekend          0
lag_1               1
lag_7               7
lag_14             14
lag_30             30
rolling_mean_7      7
rolling_mean_30    30
rolling_std_7       7
dtype: int64

In [10]:
# -----------------------------
# Drop rows with NaNs from lagging
# -----------------------------
df_fe = daily_df.dropna().reset_index(drop=True)
df_fe.head(10)


Unnamed: 0,Date,Boardings,year,month,day_of_week,is_weekend,lag_1,lag_7,lag_14,lag_30,rolling_mean_7,rolling_mean_30,rolling_std_7
0,2017-07-31,15374,2017,7,0,0,23297.0,7913.0,14107.0,13086.0,14201.142857,15168.266667,4731.559765
1,2017-08-01,15244,2017,8,1,0,15374.0,11548.0,14843.0,21709.0,15267.0,15244.533333,3834.241342
2,2017-08-02,11989,2017,8,2,0,15244.0,16024.0,14749.0,16934.0,15795.0,15029.033333,3474.346941
3,2017-08-03,14906,2017,8,3,0,11989.0,13036.0,13819.0,13260.0,15218.571429,14864.2,3753.527801
4,2017-08-04,15698,2017,8,4,0,14906.0,13771.0,15995.0,13222.0,15485.714286,14919.066667,3637.039809
5,2017-08-05,17482,2017,8,5,1,15698.0,13819.0,19529.0,12461.0,15761.0,15001.6,3557.684078
6,2017-08-06,23425,2017,8,6,1,17482.0,23297.0,14695.0,9436.0,16284.285714,15168.966667,3493.240554
7,2017-08-07,6327,2017,8,0,0,23425.0,15374.0,7913.0,19143.0,16302.571429,15635.266667,3536.139037
8,2017-08-08,13937,2017,8,1,0,6327.0,15244.0,11548.0,20664.0,15010.142857,15208.066667,5195.87745
9,2017-08-09,17719,2017,8,2,0,13937.0,11989.0,16024.0,14062.0,14823.428571,14983.833333,5209.538779


In [11]:
# -----------------------------
# Separate features and target
# -----------------------------
feature_cols = [
    "year",
    "month",
    "day_of_week",
    "is_weekend",
    "lag_1",
    "lag_7",
    "lag_14",
    "lag_30",
    "rolling_mean_7",
    "rolling_mean_30",
    "rolling_std_7",
]
target_col = "Boardings"

In [12]:
X = df_fe[feature_cols]
y = df_fe[target_col]

In [15]:
# -----------------------------
# Save engineered dataset
# -----------------------------
output_path = "../data/processed/ml_features_v1.csv"
df_fe[feature_cols + [target_col]].to_csv(output_path, index=False)

print("Feature engineering complete.")
print(f"Saved file to: {output_path}")
print(f"Final dataset shape: {df_fe.shape}")


Feature engineering complete.
Saved file to: ../data/processed/ml_features_v1.csv
Final dataset shape: (3014, 13)


In [14]:
df_fe.head(10)

Unnamed: 0,Date,Boardings,year,month,day_of_week,is_weekend,lag_1,lag_7,lag_14,lag_30,rolling_mean_7,rolling_mean_30,rolling_std_7
0,2017-07-31,15374,2017,7,0,0,23297.0,7913.0,14107.0,13086.0,14201.142857,15168.266667,4731.559765
1,2017-08-01,15244,2017,8,1,0,15374.0,11548.0,14843.0,21709.0,15267.0,15244.533333,3834.241342
2,2017-08-02,11989,2017,8,2,0,15244.0,16024.0,14749.0,16934.0,15795.0,15029.033333,3474.346941
3,2017-08-03,14906,2017,8,3,0,11989.0,13036.0,13819.0,13260.0,15218.571429,14864.2,3753.527801
4,2017-08-04,15698,2017,8,4,0,14906.0,13771.0,15995.0,13222.0,15485.714286,14919.066667,3637.039809
5,2017-08-05,17482,2017,8,5,1,15698.0,13819.0,19529.0,12461.0,15761.0,15001.6,3557.684078
6,2017-08-06,23425,2017,8,6,1,17482.0,23297.0,14695.0,9436.0,16284.285714,15168.966667,3493.240554
7,2017-08-07,6327,2017,8,0,0,23425.0,15374.0,7913.0,19143.0,16302.571429,15635.266667,3536.139037
8,2017-08-08,13937,2017,8,1,0,6327.0,15244.0,11548.0,20664.0,15010.142857,15208.066667,5195.87745
9,2017-08-09,17719,2017,8,2,0,13937.0,11989.0,16024.0,14062.0,14823.428571,14983.833333,5209.538779
