In [1]:
import os
import pandas as pd
import numpy as np


In [2]:

# ----------------------------------------------------------------------
# 0. PATH HANDLING
# ----------------------------------------------------------------------
def safe_path(preferred, fallback):
    try:
        os.makedirs(os.path.dirname(preferred), exist_ok=True)
        return preferred
    except Exception:
        os.makedirs(os.path.dirname(fallback), exist_ok=True)
        return fallback
import os
import pandas as pd
import numpy as np

# ----------------------------------------------------------------------
# 0. PATHS 
# ----------------------------------------------------------------------
MERGED_PATH       = "../data/processed/taxi_weather_zones_merged_2023_sample.parquet"
ENGINEERED_OUT    = "../data/processed/engineered_features_2023_sample.parquet"
FEATURE_LIST_OUT  = "../data/processed/feature_list_sample.txt"
FEATURE_STATS_OUT = "../data/processed/feature_statistics_sample.csv"

os.makedirs("../data/processed", exist_ok=True)

print("Merged input      :", MERGED_PATH)
print("Engineered output :", ENGINEERED_OUT)

# ----------------------------------------------------------------------
# 1. LOAD MERGED DATASET
# ----------------------------------------------------------------------
df = pd.read_parquet(MERGED_PATH)


# ----------------------------------------------------------------------
# 1. LOAD MERGED DATASET
# ----------------------------------------------------------------------
df = pd.read_parquet(MERGED_PATH)

print("\n=== Loaded merged dataset ===")
print(df.shape)
print(df.head())

# ----------------------------------------------------------------------
# 2. TEMPORAL FEATURES
# ----------------------------------------------------------------------
dt = df["tpep_pickup_datetime"]

df["hour"] = dt.dt.hour
df["day_of_week"] = dt.dt.weekday  # 0 = Monday
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)
df["month"] = dt.dt.month
df["day_of_month"] = dt.dt.day
df["week_of_year"] = dt.dt.isocalendar().week.astype(int)
df["quarter"] = dt.dt.quarter

# US Holiday List 
us_holidays = {
    "2023-01-01", "2023-07-04", "2023-12-25", "2023-11-23",
    "2023-05-29", "2023-09-04", "2023-11-11"
}
df["is_holiday"] = df["pickup_date"].astype(str).isin(us_holidays).astype(int)

# ----------------------------------------------------------------------
# 3. TRIP-BASED FEATURES
# ----------------------------------------------------------------------
df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"], errors="coerce")

df["trip_duration_minutes"] = (
    (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
)

df["avg_speed_mph"] = df["trip_distance"] / (df["trip_duration_minutes"] / 60)
df["fare_per_mile"] = df["fare_amount"] / df["trip_distance"]
df["fare_per_minute"] = df["fare_amount"] / df["trip_duration_minutes"]
df["distance_squared"] = df["trip_distance"] ** 2
df["duration_squared"] = df["trip_duration_minutes"] ** 2

# ----------------------------------------------------------------------
# 4. WEATHER FEATURES
# ----------------------------------------------------------------------
df["temp_fahrenheit"] = df["temperature_avg"]

df["is_raining"] = (df["precipitation"] > 0).astype(int)

def temp_category(t):
    if pd.isna(t): return np.nan
    if t < 32: return "cold"
    if t > 75: return "hot"
    return "moderate"

df["temp_category"] = df["temp_fahrenheit"].apply(temp_category)

# Monthly average temp
monthly_mean_temp = df.groupby("month")["temp_fahrenheit"].transform("mean")
df["temp_deviation"] = df["temp_fahrenheit"] - monthly_mean_temp

df["extreme_weather"] = (
    (df["is_raining"] == 1) |
    (df["temp_fahrenheit"] < 20) |
    (df["temp_fahrenheit"] > 90)
).astype(int)

# ----------------------------------------------------------------------
# 5. SPATIAL FEATURES
# ----------------------------------------------------------------------
df["same_borough"] = (df["pickup_borough"] == df["dropoff_borough"]).astype(int)

df["zone_pair_encoded"] = (
    df["pickup_zone"].astype(str) + "_" +
    df["dropoff_zone"].astype(str)
)

# Pickup density (per zone per hour)
df["pickup_density"] = (
    df.groupby(["pickup_zone", "hour"])["pickup_zone"].transform("count")
)

# Traffic volume category (from congestion surcharge)
def traffic_cat(c):
    if c == 0: return "low"
    if c == 2.50: return "medium"
    if c == 2.75: return "high"
    return "other"

if "congestion_surcharge" in df.columns:
    df["traffic_volume_category"] = df["congestion_surcharge"].apply(traffic_cat)
else:
    df["traffic_volume_category"] = "unknown"

# ----------------------------------------------------------------------
# 6. DEMAND-BASED FEATURES
# ----------------------------------------------------------------------
df["hourly_pickup_count"] = df.groupby(["pickup_date", "hour"])["pickup_date"].transform("count")

df["zone_hourly_pickup_count"] = df.groupby(
    ["pickup_zone", "pickup_date", "hour"]
)["pickup_zone"].transform("count")

df["is_peak_hour"] = (
    ((df["hour"].between(7, 10)) | (df["hour"].between(17, 20))) &
    (df["is_weekend"] == 0)
).astype(int)

df["is_late_night"] = df["hour"].between(0, 5).astype(int)

avg_hourly = df.groupby("hour")["hourly_pickup_count"].transform("mean")
df["surge_likelihood"] = df["hourly_pickup_count"] / avg_hourly

# ----------------------------------------------------------------------
# 7. DERIVED FEATURES
# ----------------------------------------------------------------------
df["fare_per_distance_per_minute"] = df["fare_amount"] / (
    df["trip_distance"] * df["trip_duration_minutes"]
)

df["trip_complexity"] = (
    df["trip_distance"] * df["avg_speed_mph"] * df["is_raining"]
)

def time_of_day(h):
    if 5 <= h < 12: return "morning"
    if 12 <= h < 17: return "afternoon"
    if 17 <= h < 21: return "evening"
    return "night"

df["time_of_day_factor"] = df["hour"].apply(time_of_day)

def day_type(row):
    if row["is_holiday"] == 1: return "holiday"
    if row["is_weekend"] == 1: return "weekend"
    return "weekday"

df["day_type"] = df.apply(day_type, axis=1)

# ----------------------------------------------------------------------
# 8. OUTLIER FLAGS
# ----------------------------------------------------------------------
fare_95 = df["fare_amount"].quantile(0.95)
dist_95 = df["trip_distance"].quantile(0.95)

df["extreme_fare_flag"] = (df["fare_amount"] > fare_95).astype(int)
df["extreme_distance_flag"] = (df["trip_distance"] > dist_95).astype(int)
df["zero_distance_flag"] = (df["trip_distance"] <= 0).astype(int)
df["zero_fare_flag"] = (df["fare_amount"] <= 0).astype(int)

# ----------------------------------------------------------------------
# 9. INTERACTION FEATURES
# ----------------------------------------------------------------------
df["rain_rush_hour"] = df["is_raining"] * df["is_peak_hour"]
df["weather_distance"] = df["extreme_weather"] * df["trip_distance"]
df["peak_hour_distance"] = df["is_peak_hour"] * df["trip_distance"]

# ----------------------------------------------------------------------
# 10. CLEANING & VALIDATION
# ----------------------------------------------------------------------
df = df[
    (df["trip_distance"] > 0) &
    (df["fare_amount"] > 0) &
    (df["trip_duration_minutes"] > 0)
].copy()

df.replace([np.inf, -np.inf], np.nan, inplace=True)

for col in df.select_dtypes(include="number").columns:
    df[col].fillna(df[col].mean(), inplace=True)

for col in df.select_dtypes(include="object").columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# ----------------------------------------------------------------------
# 11. FEATURE SELECTION & PREPARATION
# ----------------------------------------------------------------------


# drop_cols = [
#     "tpep_pickup_datetime", "tpep_dropoff_datetime"
# ]
# df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

# numerical_features = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
# categorical_features = df.select_dtypes(include=["object", "category"]).columns.tolist()

# ----------------------------------------------------------------------
# 11. REMOVE TARGET-LEAKING FEATURES FROM MODELING DATA
# ----------------------------------------------------------------------
# Features derived directly from the target `fare_amount`
leaky_fare_features = [
    "fare_per_mile",
    "fare_per_minute",
    "fare_per_distance_per_minute",
    "extreme_fare_flag",
    "zero_fare_flag",
]

# Financial components that let the model reconstruct the fare/total
leaky_financial_features = [
    "total_amount",
    "tip_amount",
    "tolls_amount",
    "mta_tax",
    "extra",
    "congestion_surcharge",
    "airport_fee",
]

leaky_features = leaky_fare_features + leaky_financial_features

existing_leaky = [c for c in leaky_features if c in df.columns]
if existing_leaky:
    print("\nDropping target/financial-leaking features from engineered dataset used for modeling:")
    print(existing_leaky)
    df = df.drop(columns=existing_leaky)
else:
    print("\nNo target/financial-leaking features found to drop.")

# Now recompute feature type lists AFTER dropping
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = df.select_dtypes(include=["object", "category"]).columns.tolist()


# ----------------------------------------------------------------------
# 12. SAVE OUTPUTS
# ----------------------------------------------------------------------
df.to_parquet(ENGINEERED_OUT, index=False)
print("\n Saved engineered dataset to:", ENGINEERED_OUT)

# Feature list
with open(FEATURE_LIST_OUT, "w") as f:
    for col in df.columns:
        dtype = "numerical" if col in numerical_features else "categorical"
        f.write(f"{col} | {dtype}\n")

print(" Saved feature list:", FEATURE_LIST_OUT)

# Feature statistics
stats = df[numerical_features].describe().T
stats.to_csv(FEATURE_STATS_OUT)
print(" Saved feature statistics:", FEATURE_STATS_OUT)

# ----------------------------------------------------------------------
# 13. SUMMARY OUTPUT
# ----------------------------------------------------------------------
print("\n=== FINAL SUMMARY ===")
print("Total rows  :", len(df))
print("Total cols  :", len(df.columns))
print("Numerical   :", len(numerical_features))
print("Categorical :", len(categorical_features))

print("\nMissing values per feature:")
print(df.isna().sum().sort_values(ascending=False).head(20))

print("\nSample of engineered data:")
print(df.head())

print("\n Feature engineering completed successfully!")


Merged input      : ../data/processed/taxi_weather_zones_merged_2023_sample.parquet
Engineered output : ../data/processed/engineered_features_2023_sample.parquet

=== Loaded merged dataset ===
(1960211, 29)
   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2023-01-01 00:32:10   2023-01-01 00:40:36              1.0   
1         2  2023-01-01 00:55:08   2023-01-01 01:01:27              1.0   
2         2  2023-01-01 00:25:04   2023-01-01 00:37:49              1.0   
3         1  2023-01-01 00:03:48   2023-01-01 00:13:25              0.0   
4         2  2023-01-01 00:10:29   2023-01-01 00:21:19              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           0.97         1.0                  N           161           141   
1           1.10         1.0                  N            43           237   
2           2.51         1.0                  N            48           238   
3           1.90         1

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)



Dropping target/financial-leaking features from engineered dataset used for modeling:
['fare_per_mile', 'fare_per_minute', 'fare_per_distance_per_minute', 'extreme_fare_flag', 'zero_fare_flag', 'total_amount', 'tip_amount', 'tolls_amount', 'mta_tax', 'extra', 'congestion_surcharge', 'airport_fee']

 Saved engineered dataset to: ../data/processed/engineered_features_2023_sample.parquet
 Saved feature list: ../data/processed/feature_list_sample.txt
 Saved feature statistics: ../data/processed/feature_statistics_sample.csv

=== FINAL SUMMARY ===
Total rows  : 1960167
Total cols  : 54
Numerical   : 36
Categorical : 10

Missing values per feature:
Airport_fee                 1960167
VendorID                          0
hourly_pickup_count               0
trip_duration_minutes             0
avg_speed_mph                     0
distance_squared                  0
duration_squared                  0
temp_fahrenheit                   0
temp_deviation                    0
extreme_weather         