In [None]:
import os

import geopandas as gpd
import pandas as pd


In [None]:
# Directory prefix that is prepended, by plain string concatenation, to every
# input file name read in this notebook, so it must end with a path separator.
# Set the BUS_DATA_DIR environment variable to read the inputs from another
# folder; when it is unset the original hard-coded location is used.
# os.path.join(..., "") appends the trailing separator only if it is missing.
path_pref = os.path.join(
    os.environ.get("BUS_DATA_DIR", "/Users/joeyshoyer/Downloads/"), ""
)

In [None]:
# Read the four "182_*" GeoJSON layers found under path_pref, in this order:
# midday variance, midday speeds, PM-peak speeds, AM-peak speeds.
variance_gdf, midday_gdf, pm_peak_gdf, am_peak_gdf = (
    gpd.read_file(path_pref + geojson_name)
    for geojson_name in (
        "182_Midday_variance.geojson",
        "182_Midday_speeds.geojson",
        "182_PM_Peak_speeds.geojson",
        "182_AM_Peak_speeds.geojson",
    )
)


In [None]:
# Keep only rows that carry both a stop_id and a route_id; the same filter is
# applied to all four layers and each name is rebound to its filtered copy.
variance_gdf, midday_gdf, pm_peak_gdf, am_peak_gdf = (
    layer.dropna(subset=['stop_id', 'route_id'])
    for layer in (variance_gdf, midday_gdf, pm_peak_gdf, am_peak_gdf)
)


In [None]:
# Add a composite key "<stop_id>_<route_id>" (both parts cast to str) to every
# layer; it is the column the layers are later de-duplicated and joined on.
for layer in (variance_gdf, midday_gdf, pm_peak_gdf, am_peak_gdf):
    stop_part = layer['stop_id'].astype(str)
    route_part = layer['route_id'].astype(str)
    layer['stop_route_id'] = stop_part + '_' + route_part


In [None]:
# Trim every layer down to the columns used further on. Only the midday layer
# keeps geometry, miles_from_last, route_short_name, route_id and stop_name;
# the variance and peak layers drop them (presumably so the later merges on
# stop_route_id do not duplicate those columns - confirm).
variance_gdf = variance_gdf.drop(columns=[
    "geometry", 'id', 'shape_id', 'stop_sequence', 'fast_slow_ratio',
    'trips_per_hour', 'miles_from_last', 'route_short_name', 'route_id',
    'stop_name', 'stop_id',
])
midday_gdf = midday_gdf.drop(columns=[
    'id', 'shape_id', 'stop_sequence', 'direction_id', 'fast_slow_ratio',
    'trips_per_hour', 'time_formatted', 'organization_name', 'p20_mph',
    'p80_mph', 'stop_id',
])

# The PM-peak and AM-peak layers shed exactly the same set of columns.
_peak_drop_cols = [
    "geometry", 'id', 'shape_id', 'stop_sequence', 'direction_id',
    'fast_slow_ratio', 'trips_per_hour', 'miles_from_last', 'time_formatted',
    'organization_name', 'route_short_name', 'route_id', 'stop_name',
    'p20_mph', 'p80_mph', 'stop_id',
]
pm_peak_gdf = pm_peak_gdf.drop(columns=_peak_drop_cols)
am_peak_gdf = am_peak_gdf.drop(columns=_peak_drop_cols)

In [None]:
# Make stop_route_id unique in every layer by keeping the first row seen for
# each key and discarding the rest, so the later joins are one-to-one.
# NOTE(review): direction_id / shape_id were dropped earlier, so rows that
# differ only in those fields collapse here - confirm that is acceptable.
midday_gdf = midday_gdf.drop_duplicates(subset=['stop_route_id'], keep='first')
pm_peak_gdf = pm_peak_gdf.drop_duplicates(subset=['stop_route_id'], keep='first')
am_peak_gdf = am_peak_gdf.drop_duplicates(subset=['stop_route_id'], keep='first')
variance_gdf = variance_gdf.drop_duplicates(subset=['stop_route_id'], keep='first')

In [None]:
# Sanity check: columns, dtypes and non-null counts of the cleaned midday layer.
midday_gdf.info()

In [None]:
# Sanity check: columns, dtypes and non-null counts of the cleaned PM-peak layer.
pm_peak_gdf.info()


In [None]:
# Sanity check: columns, dtypes and non-null counts of the cleaned AM-peak layer.
am_peak_gdf.info()

In [None]:
# Inner-join the midday and PM-peak layers on stop_route_id, so only keys
# present in both survive. Columns that exist in both inputs get the
# '_midday' (left) and '_pm' (right) suffixes.
merged_gdf = midday_gdf.merge(pm_peak_gdf, on="stop_route_id", how="inner", suffixes=('_midday', '_pm'))


In [None]:
# Inspect the midday + PM-peak join result (columns, dtypes, non-null counts).
merged_gdf.info()

In [None]:
# Inner-join the AM-peak layer onto the midday + PM result by stop_route_id.
# suffixes=('', '_am') leaves left-hand column names untouched and appends
# '_am' only to right-hand columns whose names collide with a left-hand one.
# NOTE(review): the rename of 'p50_mph' in the next cell implies the AM speed
# column does not collide here and therefore arrives un-suffixed - confirm.
merged_gdf = merged_gdf.merge(am_peak_gdf, on="stop_route_id", how="inner", suffixes=('', '_am'))


In [None]:
#merged_gdf = merged_gdf.merge(variance_gdf, on="stop_route_id", how="inner", suffixes=('', '_var'))


In [None]:
# Rename the un-suffixed 'p50_mph' column (presumably the AM-peak value brought
# in by the previous merge - confirm) to 'p50_mph_am' so it lines up with
# p50_mph_midday and p50_mph_pm.
merged_gdf = merged_gdf.rename(columns={'p50_mph': 'p50_mph_am'})

In [None]:
# Preview the first rows of the three-period merged table.
merged_gdf.head()

In [None]:
# Confirm the column set and non-null counts after the AM merge and rename.
merged_gdf.info()

In [None]:
# Mean of the p50_mph column for each period, taken over every row of
# merged_gdf; the three values are printed for a quick look.
mean_midday, mean_pm, mean_am = (
    merged_gdf[speed_col].mean()
    for speed_col in ('p50_mph_midday', 'p50_mph_pm', 'p50_mph_am')
)

print(mean_midday, mean_pm, mean_am)

In [None]:
# Signed gap between each period's mean p50 speed and the row's own p50 speed:
# positive when the row is slower than the mean, negative when it is faster.
merged_gdf = merged_gdf.assign(
    diff_from_avg_midday=mean_midday - merged_gdf['p50_mph_midday'],
    diff_from_avg_pm=mean_pm - merged_gdf['p50_mph_pm'],
    diff_from_avg_am=mean_am - merged_gdf['p50_mph_am'],
)

In [None]:
# Preview the table with the new diff_from_avg_* columns.
merged_gdf.head()


In [None]:
# Load the ridership records and cast line_name to str (it is later matched
# against route_short_name), then preview the first rows.
json_df = pd.read_json(path_pref + "ridership.json").astype({'line_name': str})
json_df.head()

In [None]:
# Order the records by line_name (ascending) and, within a line, by year and
# month (both descending) so that each line's newest record comes first.
# NOTE(review): this assumes year and month sort chronologically (e.g. are
# numeric) - confirm against the ridership file.
json_df_sorted = json_df.sort_values(
    by=['line_name', 'year', 'month'],
    ascending=[True, False, False],
)

# Keep just the first row of every line_name, i.e. its most recent record.
_is_older_record = json_df_sorted.duplicated(subset='line_name', keep='first')
json_df_most_recent = json_df_sorted[~_is_older_record]

# Show the resulting one-row-per-line table.
print(json_df_most_recent)

In [None]:
# Columns, dtypes and non-null counts of the one-row-per-line ridership table.
json_df_most_recent.info()

In [None]:
# The 30 lines with the highest est_wkday_ridership in their most recent record.
json_df_most_recent.sort_values(by='est_wkday_ridership', ascending=False).head(30)

In [None]:
# Attach each line's most recent ridership record to every segment row by
# matching route_short_name to line_name. It is a left join, so segments whose
# route has no ridership record are kept with missing ridership values.
merged_gdf = merged_gdf.merge(json_df_most_recent, left_on='route_short_name', right_on='line_name', how='left')

In [None]:
# Preview the segment table with the ridership columns attached.
merged_gdf.head()

In [None]:
# Check non-null counts to see how many rows found a ridership match.
merged_gdf.info()

In [None]:
# Time lost (in minutes) on each segment for midday, pm, and am: the time
# needed to cover miles_from_last at the segment's own p50 speed minus the time
# needed at the mean p50 speed of that period (mean_midday / mean_pm / mean_am).
# Positive values mean the segment is slower than the mean, negative values
# mean it is faster. Assumes miles_from_last is in miles and p50_mph_* in miles
# per hour, as the column names say.
#
# The distance must NOT be divided by the speed *difference* (diff_from_avg_*):
# that is not a travel time, it grows without bound as a segment approaches the
# mean speed, and it ranks the slowest segments as the least costly.
for _speed_col, _mean_speed, _lost_col in (
    ('p50_mph_midday', mean_midday, 'time_lost_midday'),
    ('p50_mph_pm', mean_pm, 'time_lost_pm'),
    ('p50_mph_am', mean_am, 'time_lost_am'),
):
    # A speed of zero (or less) has no finite travel time; mask it to NaN so
    # the division cannot produce inf.
    _segment_speed = merged_gdf[_speed_col].where(merged_gdf[_speed_col] > 0)
    merged_gdf[_lost_col] = (
        merged_gdf['miles_from_last'] / _segment_speed
        - merged_gdf['miles_from_last'] / _mean_speed
    ) * 60


In [None]:

# Scale each period's per-segment time lost by the route's estimated weekday
# ridership to obtain rider-minutes lost.
# NOTE(review): every segment of a route is weighted by the same
# est_wkday_ridership value - confirm that is the intended weighting.
_weekday_riders = merged_gdf['est_wkday_ridership']
merged_gdf['ridership_minutes_lost_midday'] = merged_gdf['time_lost_midday'].mul(_weekday_riders)
merged_gdf['ridership_minutes_lost_pm'] = merged_gdf['time_lost_pm'].mul(_weekday_riders)
merged_gdf['ridership_minutes_lost_am'] = merged_gdf['time_lost_am'].mul(_weekday_riders)


In [None]:
# Show the rider-minutes-lost columns next to the stop and route names.
merged_gdf[['stop_name', 'route_short_name', 'ridership_minutes_lost_midday', 'ridership_minutes_lost_pm', 'ridership_minutes_lost_am']]


In [None]:
# Passenger-hours wasted per mile travelled: convert rider-minutes lost to
# hours (divide by 60), then normalise by the segment length miles_from_last.
merged_gdf['passenger_hour_per_mi_midday'] = (
    merged_gdf['ridership_minutes_lost_midday'].div(60).div(merged_gdf['miles_from_last'])
)
merged_gdf['passenger_hour_per_mi_pm'] = (
    merged_gdf['ridership_minutes_lost_pm'].div(60).div(merged_gdf['miles_from_last'])
)
merged_gdf['passenger_hour_per_mi_am'] = (
    merged_gdf['ridership_minutes_lost_am'].div(60).div(merged_gdf['miles_from_last'])
)


In [None]:
# Ten segments with the largest AM-peak passenger-hours wasted per mile.
print("Top 10 segments with highest (passenger-hour wasted) / (mile travelled) for AM peak:")
print(
    merged_gdf[['stop_route_id', 'route_short_name', 'stop_name', 'passenger_hour_per_mi_am']]
    .sort_values('passenger_hour_per_mi_am', ascending=False)
    .head(10)
)

In [None]:
# Ten segments with the largest PM-peak passenger-hours wasted per mile.
print("\nTop 10 segments with highest (passenger-hour wasted) / (mile travelled) for PM peak:")
print(
    merged_gdf[['stop_route_id', 'route_short_name', 'stop_name', 'passenger_hour_per_mi_pm']]
    .sort_values('passenger_hour_per_mi_pm', ascending=False)
    .head(10)
)

In [None]:
# Ten segments with the largest midday passenger-hours wasted per mile.
print("\nTop 10 segments with highest (passenger-hour wasted) / (mile travelled) for Midday:")
print(
    merged_gdf[['stop_route_id', 'route_short_name', 'stop_name', 'passenger_hour_per_mi_midday']]
    .sort_values('passenger_hour_per_mi_midday', ascending=False)
    .head(10)
)

In [None]:
# Per-route summary: the mean passenger-hours-per-mile of the route's segments
# for each period, plus the route's ridership (first value in the group, on the
# assumption that ridership is the same for all segments of a route).
route_aggregated = (
    merged_gdf
    .groupby('route_short_name')
    .agg(
        passenger_hour_per_mi_am=('passenger_hour_per_mi_am', 'mean'),
        passenger_hour_per_mi_pm=('passenger_hour_per_mi_pm', 'mean'),
        passenger_hour_per_mi_midday=('passenger_hour_per_mi_midday', 'mean'),
        est_wkday_ridership=('est_wkday_ridership', 'first'),
    )
    .reset_index()
)

In [None]:
# Ten routes with the largest mean AM-peak passenger-hours wasted per mile.
print("\nTop 10 routes with highest average (passenger-hour wasted) / (mile travelled) for AM peak:")
print(
    route_aggregated[['route_short_name', 'passenger_hour_per_mi_am', 'est_wkday_ridership']]
    .sort_values('passenger_hour_per_mi_am', ascending=False)
    .head(10)
)


In [None]:
# Ten routes with the largest mean PM-peak passenger-hours wasted per mile.
print("\nTop 10 routes with highest average (passenger-hour wasted) / (mile travelled) for PM peak:")
print(
    route_aggregated[['route_short_name', 'passenger_hour_per_mi_pm', 'est_wkday_ridership']]
    .sort_values('passenger_hour_per_mi_pm', ascending=False)
    .head(10)
)


In [None]:

# Ten routes with the largest mean midday passenger-hours wasted per mile.
print("\nTop 10 routes with highest average (passenger-hour wasted) / (mile travelled) for Midday:")
print(
    route_aggregated[['route_short_name', 'passenger_hour_per_mi_midday', 'est_wkday_ridership']]
    .sort_values('passenger_hour_per_mi_midday', ascending=False)
    .head(10)
)

In [None]:
# Express each period's rider-minutes lost in hours (minutes / 60).
merged_gdf['ridership_hours_lost_midday'] = merged_gdf['ridership_minutes_lost_midday'].div(60)
merged_gdf['ridership_hours_lost_pm'] = merged_gdf['ridership_minutes_lost_pm'].div(60)
merged_gdf['ridership_hours_lost_am'] = merged_gdf['ridership_minutes_lost_am'].div(60)


In [None]:
# Per-route totals: sum of rider-hours lost over the route's segments for each
# period, plus the route's ridership (first value in the group, on the
# assumption that ridership is the same for all segments of a route).
# NOTE(review): in the cells visible here this result is reassigned by the
# later total_ridership_hours_lost aggregation before anything reads it.
route_aggregated = (
    merged_gdf
    .groupby('route_short_name')
    .agg(
        ridership_hours_lost_midday=('ridership_hours_lost_midday', 'sum'),
        ridership_hours_lost_pm=('ridership_hours_lost_pm', 'sum'),
        ridership_hours_lost_am=('ridership_hours_lost_am', 'sum'),
        est_wkday_ridership=('est_wkday_ridership', 'first'),
    )
    .reset_index()
)

In [None]:
# Rider-hours lost per segment summed over the three periods
# (midday + pm + am, added in that order).
merged_gdf['total_ridership_hours_lost'] = (
    merged_gdf['ridership_hours_lost_midday']
    .add(merged_gdf['ridership_hours_lost_pm'])
    .add(merged_gdf['ridership_hours_lost_am'])
)


In [None]:
# Per-route grand total of rider-hours lost (summed over the route's segments),
# plus the route's ridership (first value in the group, on the assumption that
# ridership is the same for all segments of a route).
route_aggregated = (
    merged_gdf
    .groupby('route_short_name')
    .agg(
        total_ridership_hours_lost=('total_ridership_hours_lost', 'sum'),
        est_wkday_ridership=('est_wkday_ridership', 'first'),
    )
    .reset_index()
)

# Rank the routes from the most to the fewest rider-hours lost.
route_aggregated_sorted = route_aggregated.sort_values(
    by='total_ridership_hours_lost',
    ascending=False,
)


In [None]:
# Print the first 20 rows of the ranked route table (largest totals first).
print("Top 20 routes with highest total ridership hours wasted:")
_top_routes = route_aggregated_sorted.head(20)
print(_top_routes[['route_short_name', 'total_ridership_hours_lost', 'est_wkday_ridership']])


In [None]:
# Final check of the segment table's columns and non-null counts before export.
merged_gdf.info()

In [None]:
# Preview the rows that are about to be exported.
merged_gdf.head()

In [None]:
import json

# Obtain the GeoJSON-like mapping that the frame exposes through the
# __geo_interface__ protocol.
geojson_dict = merged_gdf.__geo_interface__

# Serialise the mapping and write it to bus_segments.geojson in the current
# working directory, replacing any existing file of that name.
with open("bus_segments.geojson", "w") as geojson_file:
    geojson_file.write(json.dumps(geojson_dict))

print("Data saved successfully to bus_segments.geojson")