In [9]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression

# Create output directory
os.makedirs("figures", exist_ok=True)

# Load TLC trip data
df_taxi = pd.read_parquet("tlc_trip.parquet")
df_taxi['tpep_pickup_datetime'] = pd.to_datetime(df_taxi['tpep_pickup_datetime'])
df_taxi['date'] = df_taxi['tpep_pickup_datetime'].dt.date
df_taxi = df_taxi[(df_taxi['trip_distance'] > 0) ]

# Load weather data
df_weather = pd.read_csv("weather.csv")
df_weather['time'] = pd.to_datetime(df_weather['time'])
df_weather['date'] = df_weather['time'].dt.date
df_temp = df_weather.groupby('date')['temperature_2m (°C)'].max().reset_index()
df_temp.columns = ['date', 'max_temp']

# Aggregate trip count by date and pickup zone
df_daily = df_taxi.groupby(['date', 'PULocationID']).agg(
    trip_count=('VendorID', 'count')
).reset_index()

# Merge weather into trip data
df_temp['date'] = pd.to_datetime(df_temp['date'])
df_daily['date'] = pd.to_datetime(df_daily['date'])
df_merged = pd.merge(df_daily, df_temp, on='date', how='left')
df_merged['is_hot_day'] = df_merged['max_temp'] > 32

# Pivot to calculate zone-level hot/cool day averages
df_zone = df_merged.pivot_table(
    index='PULocationID',
    columns='is_hot_day',
    values='trip_count',
    aggfunc='mean'
).reset_index()
df_zone.columns = ['PULocationID', 'normal_day_avg', 'hot_day_avg']
df_zone['diff'] = df_zone['hot_day_avg'] - df_zone['normal_day_avg']

# Load Taxi Zones shapefile
gdf_zone = gpd.read_file("taxi_zones.shp").to_crs("EPSG:4326")
gdf_zone = gdf_zone.rename(columns={'LocationID': 'PULocationID'})
gdf_joined = pd.merge(gdf_zone, df_zone, on='PULocationID', how='left')




import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


### 正式

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression

# Create a directory to save figures
os.makedirs("figures", exist_ok=True)

# Load TLC taxi trip data
df_taxi = pd.read_parquet("tlc_trip.parquet")
df_taxi['tpep_pickup_datetime'] = pd.to_datetime(df_taxi['tpep_pickup_datetime'])
df_taxi['hour'] = df_taxi['tpep_pickup_datetime'].dt.hour
df_taxi['date'] = df_taxi['tpep_pickup_datetime'].dt.date
df_taxi['weekday'] = df_taxi['tpep_pickup_datetime'].dt.weekday
df_taxi = df_taxi[(df_taxi['trip_distance'] > 0)]

# Load weather data
df_weather = pd.read_csv("weather.csv")
df_weather['time'] = pd.to_datetime(df_weather['time'])
df_weather['date'] = df_weather['time'].dt.date
df_temp = df_weather.groupby('date')['temperature_2m (°C)'].max().reset_index()
df_temp.columns = ['date', 'max_temp']

# Merge weather data with taxi data
df_temp['date'] = pd.to_datetime(df_temp['date'])
df_taxi['date'] = pd.to_datetime(df_taxi['date'])
df_merged = pd.merge(df_taxi, df_temp, on='date', how='left')
df_merged['is_hot_day'] = df_merged['max_temp'] > 32

# Plot: Average daily trip counts on hot days
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
gdf_joined.plot(column='hot_day_avg', cmap='OrRd', legend=True, ax=ax)
ax.set_title("Average Daily Trips on Hot Days (Temp > 32°C)", fontsize=14)
plt.axis("off")
plt.tight_layout()
plt.savefig("figures/hot_day_trip_heatmap.png")
plt.close()

# === Figure 1: Trip volume during 14:00–17:00 on hot vs normal days ===
df_afternoon = df_merged[df_merged['hour'].between(14, 17)]
df_grouped = df_afternoon.groupby(['is_hot_day', 'date']).size().reset_index(name='trip_count')
df_grouped['is_hot_day'] = df_grouped['is_hot_day'].map({True: 'Hot Days', False: 'Normal Days'})

plt.figure(figsize=(10, 6))
sns.boxplot(data=df_grouped, x='is_hot_day', y='trip_count', palette='Set2')
plt.title("Trip Volume During 14:00–17:00 on Hot vs Normal Days")
plt.xlabel("")
plt.ylabel("Trip Count")
plt.tight_layout()
plt.savefig("figures/trip_drop_14_17_hot_vs_normal.png")
plt.close()

# === Figure 2: Trip volume on hot days - Weekday vs Weekend ===
df_merged['is_weekend'] = df_merged['weekday'] >= 5
df_city = df_merged.groupby(['date', 'is_weekend']).agg(
    trip_count=('VendorID', 'count'),
    max_temp=('max_temp', 'mean')
).reset_index()
df_city['is_hot_day'] = df_city['max_temp'] > 32

plt.figure(figsize=(10, 6))
sns.boxplot(data=df_city[df_city['is_hot_day']], x='is_weekend', y='trip_count', palette='Set1')
plt.xticks([0, 1], ['Weekday', 'Weekend'])
plt.title("Trip Volume on Hot Days: Weekday vs Weekend")
plt.ylabel("Total Trips")
plt.tight_layout()
plt.savefig("figures/trip_hotday_weekday_vs_weekend.png")
plt.close()

# === Figure 3: Linear regression - Temperature vs Total Trip Count ===
df_temp_city = df_merged.groupby('date').agg(
    total_trips=('VendorID', 'count'),
    max_temp=('max_temp', 'mean')
).reset_index().dropna()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_grouped, x='is_hot_day', y='trip_count', palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_city[df_city['is_hot_day']], x='is_weekend', y='trip_count', palette='Set1')


In [None]:
# Prepare data for regression
X = df_temp_city[['max_temp']]
y = df_temp_city['total_trips']
model = LinearRegression().fit(X, y)

# Plot regression: total trips vs. max temperature
sns.lmplot(data=df_temp_city, x='max_temp', y='total_trips', height=6, aspect=1.5)
plt.title("Regression: Total Trips vs. Max Temperature")
plt.xlabel("Max Temperature (°C)")
plt.ylabel("Total Trips")
plt.tight_layout()
plt.savefig("figures/regression_temp_vs_trips.png")
plt.close()

# === Figure 4: Top 10 pickup zones with the most significant drop in trip count on hot days ===
df_daily = df_taxi.groupby(['date', 'PULocationID']).size().reset_index(name='trip_count')
df_daily = pd.merge(df_daily, df_temp, on='date', how='left')
df_daily['is_hot_day'] = df_daily['max_temp'] > 32

# Create pivot table comparing average trips on normal vs hot days
pivot = df_daily.pivot_table(
    index='PULocationID',
    columns='is_hot_day',
    values='trip_count',
    aggfunc='mean'
).reset_index()

pivot.columns = ['PULocationID', 'normal_day_avg', 'hot_day_avg']
pivot['diff'] = pivot['hot_day_avg'] - pivot['normal_day_avg']

# Sort and select top 10 zones with the largest negative difference
top_diff = pivot.sort_values('diff').dropna().head(10)


In [None]:
# Extract the top 10 zones with the most significant drop in trip volume
top10_sensitive = pivot.sort_values('diff').dropna().head(10)

# Save the result as a CSV file
top10_sensitive.to_csv("figures/top10_sensitive_zones_diff.csv", index=False)
