In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

print("Processed files:")
print(os.listdir("../data/processed"))


Processed files:
['taxi_weather_zones_merged_2023_sample.parquet', 'train_test_split_indices.csv', 'data_schema_sample.txt', 'taxi_weather_zones_merged_2023_sample.csv']


In [2]:

# =========================
# 1. Paths & loading
# =========================
MERGED_PATH = "../data/processed/taxi_weather_zones_merged_2023_sample.csv"
OUTPUT_DIR = "../output/eda_visualizations"
SUMMARY_PATH = "../output/eda_summary.txt"

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.dirname(SUMMARY_PATH), exist_ok=True)

print("Merged dataset path:", MERGED_PATH)
print("Visualization output dir:", OUTPUT_DIR)
print("Summary file:", SUMMARY_PATH)

# Load merged dataset
df = pd.read_csv(MERGED_PATH, parse_dates=["tpep_pickup_datetime", "pickup_date"])

# =========================
# 2. Basic overview
# =========================
print("\n=== Basic Info ===")
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

print("\n=== Data Types ===")
print(df.dtypes)

print("\n=== Memory Usage (MB) ===")
print(df.memory_usage(deep=True).sum() / (1024**2))

print("\n=== Missing Values ===")
missing_pct = df.isna().mean() * 100
missing_summary = pd.DataFrame({
    "missing_count": df.isna().sum(),
    "missing_pct": missing_pct
})
print(missing_summary)

print("\n=== Duplicate Rows ===")
print("Duplicate count:", df.duplicated().sum())

# =========================
# 3. Univariate analysis
# =========================
num_cols = ["fare_amount", "trip_distance", "temperature_avg", "precipitation"]

print("\n=== Numerical Descriptive Stats ===")
print(df[num_cols].describe(percentiles=[0.25, 0.5, 0.75]))

outliers = {
    "fares_over_500": (df["fare_amount"] > 500).sum(),
    "distance_over_100_miles": (df["trip_distance"] > 100).sum()
}
print("\n=== Outliers ===")
print(outliers)

print("\n=== Categorical Distributions ===")
if "pickup_borough" in df.columns:
    print("\nPickup borough (%):")
    print(df["pickup_borough"].value_counts(normalize=True) * 100)

if "temp_category" in df.columns:
    print("\nTemp category (%):")
    print(df["temp_category"].value_counts(normalize=True) * 100)

if "is_raining" in df.columns:
    print("\nIs raining (%):")
    print(df["is_raining"].value_counts(normalize=True) * 100)

# =========================
# 4. Bivariate analysis
# =========================
df["hour"] = df["tpep_pickup_datetime"].dt.hour

corr_fd = df[["fare_amount", "trip_distance"]].corr().iloc[0, 1]
print("\nFare vs Distance Correlation:", corr_fd)

avg_fare_by_hour = df.groupby("hour")["fare_amount"].mean()
avg_fare_by_borough = df.groupby("pickup_borough")["fare_amount"].mean()
avg_fare_rain = df.groupby("is_raining")["fare_amount"].mean()
avg_fare_temp_cat = df.groupby("temp_category")["fare_amount"].mean()

print("\n=== Avg Fare By Hour ===")
print(avg_fare_by_hour)

print("\n=== Avg Fare By Borough ===")
print(avg_fare_by_borough)

print("\n=== Avg Fare: Rain vs No Rain ===")
print(avg_fare_rain)

print("\n=== Avg Fare by Temp Category ===")
print(avg_fare_temp_cat)

# =========================
# 5. Temporal analysis
# =========================
df["day_of_week"] = df["tpep_pickup_datetime"].dt.day_name()
df["month"] = df["tpep_pickup_datetime"].dt.month

pickups_by_hour = df["hour"].value_counts().sort_index()
pickups_by_day = df["day_of_week"].value_counts()
pickups_by_month = df["month"].value_counts().sort_index()

print("\n=== Temporal Patterns ===")
print("Pickups by hour:\n", pickups_by_hour.head(24))
print("\nPickups by day:\n", pickups_by_day)
print("\nPickups by month:\n", pickups_by_month)

# =========================
# 6. Spatial analysis
# =========================
top_pickup_zones = df["pickup_zone"].value_counts().head(10)
top_fare_zones = (
    df.groupby("pickup_zone")["fare_amount"]
      .mean()
      .sort_values(ascending=False)
      .head(10)
)
zone_density = df["pickup_zone"].value_counts()

print("\n=== Spatial Analysis ===")
print("Top 10 pickup zones:\n", top_pickup_zones)
print("\nTop 10 zones by avg fare:\n", top_fare_zones)

# =========================
# 7. Data quality checks
# =========================
print("\n=== Data Quality Checks ===")
print("Zero/negative fare rows:", (df["fare_amount"] <= 0).sum())
print("Zero/negative distance rows:", (df["trip_distance"] <= 0).sum())

# =========================
# 8. Visualizations
# =========================
sns.set(style="whitegrid")

# Fare histogram
plt.figure(figsize=(8, 6))
sns.histplot(df["fare_amount"], bins=80, kde=True)
plt.title("Fare Amount Distribution")
plt.xlabel("Fare Amount ($)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/fare_distribution.png")
plt.close()

# Distance histogram
plt.figure(figsize=(8, 6))
sns.histplot(df["trip_distance"], bins=80, kde=True)
plt.title("Trip Distance Distribution")
plt.xlabel("Trip Distance (miles)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/distance_distribution.png")
plt.close()

# Trips by borough
if "pickup_borough" in df.columns:
    plt.figure(figsize=(8, 6))
    df["pickup_borough"].value_counts().plot(kind="bar")
    plt.title("Trips by Borough")
    plt.xlabel("Borough")
    plt.ylabel("Trip Count")
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/trips_by_borough.png")
    plt.close()

# Boxplot: fare by rain
if "is_raining" in df.columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x="is_raining", y="fare_amount", data=df)
    plt.title("Fare vs Weather (Rainy vs Not Rainy)")
    plt.xlabel("Is Raining (0 = No, 1 = Yes)")
    plt.ylabel("Fare Amount ($)")
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/fare_vs_rain.png")
    plt.close()

# Line: avg fare by hour
plt.figure(figsize=(8, 6))
avg_fare_by_hour.plot(kind="line", marker="o")
plt.title("Average Fare by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Average Fare ($)")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/avg_fare_by_hour.png")
plt.close()

# Heatmap: hour x day_of_week
pivot = df.pivot_table(
    index="day_of_week",
    columns="hour",
    values="fare_amount",
    aggfunc="mean"
)
# Ensure consistent ordering of days
day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
pivot = pivot.reindex(day_order)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot, cmap="viridis")
plt.title("Avg Fare: Hour vs Day of Week")
plt.xlabel("Hour of Day")
plt.ylabel("Day of Week")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/fare_heatmap_hour_day.png")
plt.close()

print("\n Visualizations saved in:", OUTPUT_DIR)

# =========================
# 9. Summary report
# =========================
with open(SUMMARY_PATH, "w") as f:
    f.write("=== NYC Taxi + Weather + Zones EDA Summary ===\n\n")
    f.write(f"Dataset shape: {df.shape}\n\n")

    f.write("=== Missing Value Summary (%): ===\n")
    f.write(str(missing_pct) + "\n\n")

    f.write("=== Numerical Stats ===\n")
    f.write(str(df[num_cols].describe()) + "\n\n")

    f.write("=== Outliers ===\n")
    f.write(str(outliers) + "\n\n")

    f.write("=== Top Pickup Zones ===\n")
    f.write(str(top_pickup_zones) + "\n\n")

    f.write("=== Top Zones by Avg Fare ===\n")
    f.write(str(top_fare_zones) + "\n\n")

    f.write("=== Temporal Patterns ===\n")
    f.write("Pickups by hour:\n")
    f.write(str(pickups_by_hour) + "\n\n")
    f.write("Pickups by day of week:\n")
    f.write(str(pickups_by_day) + "\n\n")
    f.write("Pickups by month:\n")
    f.write(str(pickups_by_month) + "\n\n")

    f.write("=== Key Observations ===\n")
    f.write("- Check impact of extreme fares and distances on modeling.\n")
    f.write("- Hour, day_of_week, and month show clear temporal patterns.\n")
    f.write("- Borough and zone have different average faresâ€”useful for features.\n")
    f.write("- Weather (rain, temp_category) may have moderate effect on fares.\n")

print("\n EDA summary saved to:", SUMMARY_PATH)
print(" EDA completed successfully!")


Merged dataset path: ../data/processed/taxi_weather_zones_merged_2023_sample.csv
Visualization output dir: ../output/eda_visualizations
Summary file: ../output/eda_summary.txt

=== Basic Info ===
Shape: (1960211, 29)

Columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'Airport_fee', 'pickup_date', 'pickup_borough', 'pickup_zone', 'temperature_avg', 'precipitation', 'is_raining', 'temp_category', 'dropoff_borough', 'dropoff_zone']

=== Data Types ===
VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime            object
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag   