In [241]:
import geopandas as gpd
import pandas as pd
from collections import defaultdict
import numpy as np
import json

In [242]:
path_pref = "/Users/joeyshoyer/Downloads/"

In [243]:
# Load the stops.geojson manually
with open(path_pref + "stops.geojson") as f:
    stops_data = json.load(f)

# Manually extract stop_id, stop_name, route_ids, and coordinates
stops_list = []
for feature in stops_data['features']:
    stop_id = int(feature['properties'].get('stop_id'))
    stop_name = feature['properties'].get('stop_name')
    route_ids = feature['properties'].get('route_ids')
    
    stops_list.append({
        'stop_id': stop_id,
        'stop_name': stop_name,
        'route_ids': route_ids,
    })

# Convert to DataFrame if needed
import pandas as pd
stops_df = pd.DataFrame(stops_list)

# Inspect the DataFrame
print(stops_df)


        stop_id                                         stop_name  \
0             1                               Paramount / Slauson   
1             3                                  Jefferson / 10th   
2             6                        120th / Augustus F Hawkins   
3             7               120th / Martin Luther King Hospital   
4            12                                 15054 Sherman Way   
...         ...                                               ...   
12013  63500003  Dodger Express Loading Zone #3  (Harbor Gateway)   
12014  63500004    Dodger Express Loading Zone #2 (Union Station)   
12015      2319                        Hawthorne / Lennox Station   
12016     30022                       Sofi Stadium Transit Center   
12017      5873           Downtown Inglewood Station Express Stop   

                             route_ids  
0                          [265-13183]  
1                           [35-13183]  
2      [53-13183, 55-13183, 120-13183]  
3      

In [244]:
# Define the function to handle conversion for a single route_id
def safe_convert(route_id):
    try:
        return int(route_id.split('-')[0])
    except (ValueError, AttributeError):
        return np.nan

# Function to apply conversion to each item in the list
def convert_route_ids(route_ids):
    if isinstance(route_ids, list):
        return [safe_convert(route_id) for route_id in route_ids]
    return route_ids


In [245]:
stops_df['route_ids'] = stops_df['route_ids'].apply(convert_route_ids)


In [246]:
# Create a dictionary to map stop_id to route_ids
stop_to_routes = stops_df.set_index('stop_id')['route_ids'].to_dict()
stop_to_routes

{1: [265],
 3: [35],
 6: [53, 55, 120],
 7: [53, 55, 120],
 12: [162],
 16: [96],
 18: [106],
 19: [665],
 25: [62],
 29: [33],
 31: [16],
 33: [236],
 38: [20, 4, 720],
 40: [94, 10, 81, 28],
 42: [18],
 47: [234],
 49: [205],
 51: [115],
 55: [14],
 56: [14],
 57: [108],
 59: [108],
 60: [108],
 62: [108],
 63: [161],
 65: [161],
 68: [182],
 69: [224],
 70: [487],
 73: [76],
 77: [611],
 78: [611],
 79: [611],
 85: [267],
 86: [267],
 87: [603],
 88: [603],
 89: [603],
 90: [603],
 91: [603],
 92: [127],
 93: [127, 128],
 99: [78, 14],
 101: [111],
 102: [111],
 104: [158],
 108: [158],
 110: [158],
 111: [158],
 112: [158],
 115: [162],
 117: [209],
 118: [209],
 126: [209],
 127: [209],
 128: [209],
 136: [92],
 138: [78],
 139: [761],
 141: [211, 344],
 144: [232],
 146: [232],
 147: [211, 344],
 148: [344],
 149: [60, 260],
 150: [344],
 152: [167, 158, 244],
 157: [260],
 158: [260],
 159: [224],
 160: [158],
 161: [260],
 162: [611, 260],
 163: [260, 117],
 164: [260],
 165: [

In [247]:
ridership_data = gpd.read_file(path_pref + "Average_weekday_ridership.geojson")
ridership_data

Unnamed: 0,OBJECTID,Ons,STOP_ID,STOP_NAME,geometry
0,1,117,16470,WASHINGTON / UNION,POINT (-13166973.810 4033981.923)
1,2,224,16514,WASHINGTON / BONNIE BRAE,POINT (-13167273.705 4034148.367)
2,3,53,16448,WASHINGTON / NEW ENGLAND,POINT (-13167700.059 4034196.998)
3,4,387,16471,WASHINGTON / VERMONT,POINT (-13168193.983 4034193.639)
4,5,24,16451,WASHINGTON / BUDLONG,POINT (-13168656.961 4034191.624)
...,...,...,...,...,...
22199,22200,95,522,Second Ave and San Bernardino Rd S,POINT (-13123231.032 4040896.180)
22200,22201,15,528,Seventh Ave and Bonelli St S,POINT (-13133939.078 4032226.012)
22201,22202,134,530,Seventh Ave and Clark Ave N,POINT (-13134136.224 4031903.659)
22202,22203,81,531,Seventh Ave and Clark Ave S,POINT (-13134131.215 4031956.846)


In [248]:
# Load the GeoJSON files
variance_gdf = gpd.read_file(path_pref + "182_Midday_variance.geojson")
midday_gdf = gpd.read_file(path_pref + "182_Midday_speeds.geojson")
pm_peak_gdf = gpd.read_file(path_pref + "182_PM_Peak_speeds.geojson")
am_peak_gdf = gpd.read_file(path_pref + "182_AM_Peak_speeds.geojson")


In [249]:
# Remove rows with missing or None values in stop_id or route_id for each GeoDataFrame
variance_gdf = variance_gdf.dropna(subset=['stop_id', 'route_id'])
midday_gdf = midday_gdf.dropna(subset=['stop_id', 'route_id'])
pm_peak_gdf = pm_peak_gdf.dropna(subset=['stop_id', 'route_id'])
am_peak_gdf = am_peak_gdf.dropna(subset=['stop_id', 'route_id'])


In [250]:
am_peak_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 22046 entries, 0 to 22887
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   id                 22046 non-null  object  
 1   stop_id            22046 non-null  object  
 2   stop_name          22046 non-null  object  
 3   shape_id           22046 non-null  object  
 4   stop_sequence      22046 non-null  float64 
 5   route_id           22046 non-null  object  
 6   route_short_name   21708 non-null  object  
 7   direction_id       22046 non-null  float64 
 8   p50_mph            22046 non-null  float64 
 9   p20_mph            22046 non-null  float64 
 10  p80_mph            22046 non-null  float64 
 11  fast_slow_ratio    22046 non-null  float64 
 12  trips_per_hour     22046 non-null  float64 
 13  miles_from_last    22046 non-null  float64 
 14  time_formatted     22046 non-null  object  
 15  organization_name  22046 non-null  object  
 16  g

In [251]:
variance_gdf.head()

Unnamed: 0,id,fast_slow_ratio,p20_mph,p80_mph,miles_from_last,route_short_name,trips_per_hour,shape_id,stop_sequence,stop_id,route_id,stop_name,geometry
0,12347,1.0,24.2,24.2,0.2,611,0.2,6110027_DEC23,3.0,9177,611-13172,Wilcox / Cecelia,"POLYGON ((-118.17941 33.95786, -118.17938 33.9..."
1,12348,1.0,8.0,8.0,0.2,611,0.2,6110027_DEC23,4.0,142002,611-13172,Wilcox / Santa Ana,"POLYGON ((-118.17902 33.95814, -118.17916 33.9..."
2,12349,1.0,12.5,12.5,0.1,611,0.2,6110027_DEC23,5.0,8352,611-13172,Wilcox / Elizabeth,"POLYGON ((-118.17869 33.96034, -118.17882 33.9..."
3,12350,1.0,17.9,17.9,0.2,611,0.2,6110027_DEC23,6.0,8351,611-13172,Wilcox / Clara,"POLYGON ((-118.17841 33.96219, -118.17854 33.9..."
4,12351,1.0,14.5,14.5,0.2,611,0.2,6110027_DEC23,7.0,8354,611-13172,Wilcox / Live Oak,"POLYGON ((-118.17805 33.96448, -118.17819 33.9..."


In [252]:
# Create the new column in each GeoDataFrame
variance_gdf['stop_route_id'] = variance_gdf['stop_id'].astype(str) + '_' + variance_gdf['route_id'].astype(str)
midday_gdf['stop_route_id'] = midday_gdf['stop_id'].astype(str) + '_' + midday_gdf['route_id'].astype(str)
pm_peak_gdf['stop_route_id'] = pm_peak_gdf['stop_id'].astype(str) + '_' + pm_peak_gdf['route_id'].astype(str)
am_peak_gdf['stop_route_id'] = am_peak_gdf['stop_id'].astype(str) + '_' + am_peak_gdf['route_id'].astype(str)

In [253]:
# Create the new column in each GeoDataFrame
midday_gdf['stop_route_dir_id'] = midday_gdf['stop_id'].astype(str) + '_' + midday_gdf['route_id'].astype(str) + '_' + midday_gdf['direction_id'].astype(str)
pm_peak_gdf['stop_route_dir_id'] = pm_peak_gdf['stop_id'].astype(str) + '_' + pm_peak_gdf['route_id'].astype(str) + '_' + pm_peak_gdf['direction_id'].astype(str)
am_peak_gdf['stop_route_dir_id'] = am_peak_gdf['stop_id'].astype(str) + '_' + am_peak_gdf['route_id'].astype(str) + '_' + am_peak_gdf['direction_id'].astype(str)


In [254]:
am_peak_gdf.head()

Unnamed: 0,id,stop_id,stop_name,shape_id,stop_sequence,route_id,route_short_name,direction_id,p50_mph,p20_mph,p80_mph,fast_slow_ratio,trips_per_hour,miles_from_last,time_formatted,organization_name,geometry,stop_route_id,stop_route_dir_id
0,9280,19100,Lankershim / Chandler,1550041_DEC23,2.0,155-13172,155,1.0,4.2,4.2,4.2,1.0,0.7,0.3,4:17,Los Angeles County Metropolitan Transportation...,"POLYGON ((-118.37669 34.16816, -118.37662 34.1...",19100_155-13172,19100_155-13172_1.0
1,9281,140751,Magnolia / Lankershim,1550041_DEC23,3.0,155-13172,155,1.0,22.3,22.3,22.3,1.0,0.7,0.3,0:48,Los Angeles County Metropolitan Transportation...,"POLYGON ((-118.37683 34.16769, -118.37660 34.1...",140751_155-13172,140751_155-13172_1.0
2,9282,3076,Magnolia / Tujunga,1550041_DEC23,4.0,155-13172,155,1.0,22.3,22.3,22.3,1.0,0.7,0.2,0:32,Los Angeles County Metropolitan Transportation...,"POLYGON ((-118.37518 34.16533, -118.37536 34.1...",3076_155-13172,3076_155-13172_1.0
3,9283,20293,Magnolia / Lemp,1550041_DEC23,5.0,155-13172,155,1.0,18.0,18.0,18.0,1.0,0.7,0.3,1:00,Los Angeles County Metropolitan Transportation...,"POLYGON ((-118.37929 34.16531, -118.37947 34.1...",20293_155-13172,20293_155-13172_1.0
4,9284,20294,Magnolia / Colfax,1550041_DEC23,6.0,155-13172,155,1.0,16.1,16.1,16.1,1.0,0.7,0.2,0:44,Los Angeles County Metropolitan Transportation...,"POLYGON ((-118.38479 34.16531, -118.38498 34.1...",20294_155-13172,20294_155-13172_1.0


In [255]:
variance_gdf = variance_gdf.drop(columns=["geometry", 'id', 'shape_id', 'stop_sequence', 'fast_slow_ratio', 'trips_per_hour', 'miles_from_last', 'route_short_name', 'route_id', 'stop_name', 'stop_id'])
midday_gdf = midday_gdf.drop(columns=['id', 'shape_id', 'fast_slow_ratio', 'trips_per_hour', 'time_formatted', 'organization_name', 'p20_mph', 'p80_mph'])
pm_peak_gdf = pm_peak_gdf.drop(columns=["geometry", 'id', 'shape_id', 'stop_sequence', 'direction_id', 'fast_slow_ratio', 'trips_per_hour', 'miles_from_last', 'time_formatted', 'organization_name', 'route_short_name', 'route_id', 'stop_name', 'p20_mph', 'p80_mph', 'stop_id'])
am_peak_gdf = am_peak_gdf.drop(columns=["geometry", 'id', 'shape_id', 'stop_sequence', 'direction_id', 'fast_slow_ratio', 'trips_per_hour', 'miles_from_last', 'time_formatted', 'organization_name', 'route_short_name', 'route_id', 'stop_name', 'p20_mph', 'p80_mph', 'stop_id'])

In [256]:
midday_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 18535 entries, 0 to 19224
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   stop_id            18535 non-null  object  
 1   stop_name          18535 non-null  object  
 2   stop_sequence      18535 non-null  float64 
 3   route_id           18535 non-null  object  
 4   route_short_name   18199 non-null  object  
 5   direction_id       18535 non-null  float64 
 6   p50_mph            18535 non-null  float64 
 7   miles_from_last    18535 non-null  float64 
 8   geometry           18535 non-null  geometry
 9   stop_route_id      18535 non-null  object  
 10  stop_route_dir_id  18535 non-null  object  
dtypes: float64(4), geometry(1), object(6)
memory usage: 1.7+ MB


In [257]:
# Check for duplicates in pm_peak_gdf
midday_gdf.drop_duplicates(subset=['stop_route_dir_id'], keep='first', inplace=True)
pm_peak_gdf.drop_duplicates(subset=['stop_route_dir_id'], keep='first', inplace=True)
am_peak_gdf.drop_duplicates(subset=['stop_route_dir_id'], keep='first', inplace=True)
variance_gdf.drop_duplicates(subset=['stop_route_id'], keep='first', inplace=True)

In [258]:
midday_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 13865 entries, 0 to 19166
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   stop_id            13865 non-null  object  
 1   stop_name          13865 non-null  object  
 2   stop_sequence      13865 non-null  float64 
 3   route_id           13865 non-null  object  
 4   route_short_name   13587 non-null  object  
 5   direction_id       13865 non-null  float64 
 6   p50_mph            13865 non-null  float64 
 7   miles_from_last    13865 non-null  float64 
 8   geometry           13865 non-null  geometry
 9   stop_route_id      13865 non-null  object  
 10  stop_route_dir_id  13865 non-null  object  
dtypes: float64(4), geometry(1), object(6)
memory usage: 1.3+ MB


In [259]:
pm_peak_gdf.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 14081 entries, 0 to 23060
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   p50_mph            14081 non-null  float64
 1   stop_route_id      14081 non-null  object 
 2   stop_route_dir_id  14081 non-null  object 
dtypes: float64(1), object(2)
memory usage: 440.0+ KB


In [260]:
am_peak_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 14021 entries, 0 to 22776
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   p50_mph            14021 non-null  float64
 1   stop_route_id      14021 non-null  object 
 2   stop_route_dir_id  14021 non-null  object 
dtypes: float64(1), object(2)
memory usage: 438.2+ KB


In [261]:
# Merge on stop_name
merged_gdf = midday_gdf.merge(pm_peak_gdf, on="stop_route_dir_id", how="inner", suffixes=('_midday', '_pm'))

In [262]:
merged_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13830 entries, 0 to 13829
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   stop_id               13830 non-null  object  
 1   stop_name             13830 non-null  object  
 2   stop_sequence         13830 non-null  float64 
 3   route_id              13830 non-null  object  
 4   route_short_name      13552 non-null  object  
 5   direction_id          13830 non-null  float64 
 6   p50_mph_midday        13830 non-null  float64 
 7   miles_from_last       13830 non-null  float64 
 8   geometry              13830 non-null  geometry
 9   stop_route_id_midday  13830 non-null  object  
 10  stop_route_dir_id     13830 non-null  object  
 11  p50_mph_pm            13830 non-null  float64 
 12  stop_route_id_pm      13830 non-null  object  
dtypes: float64(5), geometry(1), object(7)
memory usage: 1.4+ MB


In [263]:
merged_gdf = merged_gdf.merge(am_peak_gdf, on="stop_route_dir_id", how="inner", suffixes=('', '_am'))


In [264]:
merged_gdf = merged_gdf.rename(columns={'p50_mph': 'p50_mph_am'})

In [265]:
merged_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13743 entries, 0 to 13742
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   stop_id               13743 non-null  object  
 1   stop_name             13743 non-null  object  
 2   stop_sequence         13743 non-null  float64 
 3   route_id              13743 non-null  object  
 4   route_short_name      13465 non-null  object  
 5   direction_id          13743 non-null  float64 
 6   p50_mph_midday        13743 non-null  float64 
 7   miles_from_last       13743 non-null  float64 
 8   geometry              13743 non-null  geometry
 9   stop_route_id_midday  13743 non-null  object  
 10  stop_route_dir_id     13743 non-null  object  
 11  p50_mph_pm            13743 non-null  float64 
 12  stop_route_id_pm      13743 non-null  object  
 13  p50_mph_am            13743 non-null  float64 
 14  stop_route_id         13743 non-null  object  

In [266]:
variance_gdf = variance_gdf.rename(columns={'p20_mph': 'p20_mph_var', 'p80_mph': 'p80_mph_var'})
variance_gdf.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 13710 entries, 0 to 19166
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   p20_mph_var    13710 non-null  float64
 1   p80_mph_var    13710 non-null  float64
 2   stop_route_id  13710 non-null  object 
dtypes: float64(2), object(1)
memory usage: 428.4+ KB


In [267]:
merged_gdf = merged_gdf.merge(variance_gdf, on="stop_route_id", how="inner", suffixes=('', '_var'))


In [268]:
merged_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13743 entries, 0 to 13742
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   stop_id               13743 non-null  object  
 1   stop_name             13743 non-null  object  
 2   stop_sequence         13743 non-null  float64 
 3   route_id              13743 non-null  object  
 4   route_short_name      13465 non-null  object  
 5   direction_id          13743 non-null  float64 
 6   p50_mph_midday        13743 non-null  float64 
 7   miles_from_last       13743 non-null  float64 
 8   geometry              13743 non-null  geometry
 9   stop_route_id_midday  13743 non-null  object  
 10  stop_route_dir_id     13743 non-null  object  
 11  p50_mph_pm            13743 non-null  float64 
 12  stop_route_id_pm      13743 non-null  object  
 13  p50_mph_am            13743 non-null  float64 
 14  stop_route_id         13743 non-null  object  

In [269]:
merged_gdf.head()


Unnamed: 0,stop_id,stop_name,stop_sequence,route_id,route_short_name,direction_id,p50_mph_midday,miles_from_last,geometry,stop_route_id_midday,stop_route_dir_id,p50_mph_pm,stop_route_id_pm,p50_mph_am,stop_route_id,p20_mph_var,p80_mph_var
0,7120,Victory / Canoga,3.0,169-13172,169,0.0,8.7,0.3,"POLYGON ((-118.59655 34.19092, -118.59646 34.1...",7120_169-13172,7120_169-13172_0.0,7.3,7120_169-13172,10.3,7120_169-13172,8.1,8.8
1,15495,Victory / Owensmouth,4.0,169-13172,169,0.0,8.7,0.2,"POLYGON ((-118.60154 34.18788, -118.60175 34.1...",15495_169-13172,15495_169-13172_0.0,8.8,15495_169-13172,8.7,15495_169-13172,8.2,16.4
2,6598,Topanga Canyon / Victory,5.0,169-13172,169,0.0,7.0,0.3,"POLYGON ((-118.60604 34.18699, -118.60604 34.1...",6598_169-13172,6598_169-13172_0.0,6.6,6598_169-13172,6.7,6598_169-13172,6.9,7.5
3,14946,Topanga Canyon / Erwin,6.0,169-13172,169,0.0,9.7,0.3,"POLYGON ((-118.60637 34.18667, -118.60621 34.1...",14946_169-13172,14946_169-13172_0.0,9.7,14946_169-13172,18.2,14946_169-13172,9.6,10.6
4,4190,Topanga Canyon / Oxnard,7.0,169-13172,169,0.0,18.5,0.3,"POLYGON ((-118.60587 34.17989, -118.60548 34.1...",4190_169-13172,4190_169-13172_0.0,13.1,4190_169-13172,15.0,4190_169-13172,16.0,19.7


In [270]:
merged_gdf = merged_gdf.drop(columns=['stop_route_id_midday', 'stop_route_id_pm'])
merged_gdf.head()

Unnamed: 0,stop_id,stop_name,stop_sequence,route_id,route_short_name,direction_id,p50_mph_midday,miles_from_last,geometry,stop_route_dir_id,p50_mph_pm,p50_mph_am,stop_route_id,p20_mph_var,p80_mph_var
0,7120,Victory / Canoga,3.0,169-13172,169,0.0,8.7,0.3,"POLYGON ((-118.59655 34.19092, -118.59646 34.1...",7120_169-13172_0.0,7.3,10.3,7120_169-13172,8.1,8.8
1,15495,Victory / Owensmouth,4.0,169-13172,169,0.0,8.7,0.2,"POLYGON ((-118.60154 34.18788, -118.60175 34.1...",15495_169-13172_0.0,8.8,8.7,15495_169-13172,8.2,16.4
2,6598,Topanga Canyon / Victory,5.0,169-13172,169,0.0,7.0,0.3,"POLYGON ((-118.60604 34.18699, -118.60604 34.1...",6598_169-13172_0.0,6.6,6.7,6598_169-13172,6.9,7.5
3,14946,Topanga Canyon / Erwin,6.0,169-13172,169,0.0,9.7,0.3,"POLYGON ((-118.60637 34.18667, -118.60621 34.1...",14946_169-13172_0.0,9.7,18.2,14946_169-13172,9.6,10.6
4,4190,Topanga Canyon / Oxnard,7.0,169-13172,169,0.0,18.5,0.3,"POLYGON ((-118.60587 34.17989, -118.60548 34.1...",4190_169-13172_0.0,13.1,15.0,4190_169-13172,16.0,19.7


In [271]:
# Calculate the mean of each p50_mph column
mean_midday = merged_gdf['p50_mph_midday'].mean()
mean_pm = merged_gdf['p50_mph_pm'].mean()
mean_am = merged_gdf['p50_mph_am'].mean()


print(mean_midday, mean_pm, mean_am)

14.860867350651242 13.888990758931822 15.307494724587063


In [272]:
# Calculate the difference from the average for each p50_mph column
# merged_gdf['diff_from_avg_midday'] = mean_midday - merged_gdf['p50_mph_midday']
# merged_gdf['diff_from_avg_pm'] = mean_pm - merged_gdf['p50_mph_pm']
# merged_gdf['diff_from_avg_am'] = mean_am - merged_gdf['p50_mph_am']

merged_gdf['diff_from_avg_midday'] = merged_gdf['p80_mph_var'] - merged_gdf['p50_mph_midday']
merged_gdf['diff_from_avg_pm'] = merged_gdf['p80_mph_var'] - merged_gdf['p50_mph_pm']
merged_gdf['diff_from_avg_am'] = merged_gdf['p80_mph_var'] - merged_gdf['p50_mph_am']

In [273]:
merged_gdf.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13743 entries, 0 to 13742
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   stop_id               13743 non-null  object  
 1   stop_name             13743 non-null  object  
 2   stop_sequence         13743 non-null  float64 
 3   route_id              13743 non-null  object  
 4   route_short_name      13465 non-null  object  
 5   direction_id          13743 non-null  float64 
 6   p50_mph_midday        13743 non-null  float64 
 7   miles_from_last       13743 non-null  float64 
 8   geometry              13743 non-null  geometry
 9   stop_route_dir_id     13743 non-null  object  
 10  p50_mph_pm            13743 non-null  float64 
 11  p50_mph_am            13743 non-null  float64 
 12  stop_route_id         13743 non-null  object  
 13  p20_mph_var           13743 non-null  float64 
 14  p80_mph_var           13743 non-null  float64 

In [274]:
merged_gdf.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13743 entries, 0 to 13742
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   stop_id               13743 non-null  object  
 1   stop_name             13743 non-null  object  
 2   stop_sequence         13743 non-null  float64 
 3   route_id              13743 non-null  object  
 4   route_short_name      13465 non-null  object  
 5   direction_id          13743 non-null  float64 
 6   p50_mph_midday        13743 non-null  float64 
 7   miles_from_last       13743 non-null  float64 
 8   geometry              13743 non-null  geometry
 9   stop_route_dir_id     13743 non-null  object  
 10  p50_mph_pm            13743 non-null  float64 
 11  p50_mph_am            13743 non-null  float64 
 12  stop_route_id         13743 non-null  object  
 13  p20_mph_var           13743 non-null  float64 
 14  p80_mph_var           13743 non-null  float64 

In [275]:
json_df = pd.read_json(path_pref + "ridership.json")
json_df['line_name'] = json_df['line_name'].astype(str)
json_df.head()

Unnamed: 0,year,month,line_name,est_wkday_ridership,est_sat_ridership,est_sun_ridership
0,2009,1,2,21816.0,13442.0,8924.0
1,2009,1,4,20393.0,17941.0,12765.0
2,2009,1,10,13337.0,7603.0,4961.0
3,2009,1,14,16026.0,10696.0,8161.0
4,2009,1,16,26706.0,19855.0,14266.0


In [276]:
# Step 1: Sort by line_name, year, and month in descending order
json_df_sorted = json_df.sort_values(by=['line_name', 'year', 'month'], ascending=[True, False, False])

# Step 2: Drop duplicates based on line_name, keeping the first (most recent) record
json_df_most_recent = json_df_sorted.drop_duplicates(subset='line_name', keep='first')

# Step 3: Inspect the result
print(json_df_most_recent)

       year  month line_name  est_wkday_ridership  est_sat_ridership  \
25443  2024      3        10               7440.0             3664.0   
25468  2024      3       102               1485.0             1135.0   
25469  2024      3       105              14733.0             8856.0   
25470  2024      3       106               4118.0             1860.0   
25471  2024      3       108              14496.0             7759.0   
...     ...    ...       ...                  ...                ...   
25557  2024      3       910              16579.0             7809.0   
25465  2024      3        92               5877.0             3483.0   
3927   2010     12       920               3141.0                NaN   
25466  2024      3        94               6909.0             4724.0   
25467  2024      3        96                752.0              405.0   

       est_sun_ridership  
25443             3463.0  
25468             1016.0  
25469             8378.0  
25470             1820.0  


In [277]:
json_df_most_recent.info()
json_df_most_recent.sort_values(by='est_wkday_ridership', ascending=False).head(30)
json_df_most_recent = json_df_most_recent.drop(columns=['est_sat_ridership', 'est_sun_ridership', 'year', 'month'])

<class 'pandas.core.frame.DataFrame'>
Index: 209 entries, 25443 to 25467
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year                 209 non-null    int64  
 1   month                209 non-null    int64  
 2   line_name            209 non-null    object 
 3   est_wkday_ridership  208 non-null    float64
 4   est_sat_ridership    155 non-null    float64
 5   est_sun_ridership    152 non-null    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 11.4+ KB


In [278]:
merged_gdf = merged_gdf.merge(json_df_most_recent, left_on='route_short_name', right_on='line_name', how='left')

In [279]:
merged_gdf.head()

Unnamed: 0,stop_id,stop_name,stop_sequence,route_id,route_short_name,direction_id,p50_mph_midday,miles_from_last,geometry,stop_route_dir_id,p50_mph_pm,p50_mph_am,stop_route_id,p20_mph_var,p80_mph_var,diff_from_avg_midday,diff_from_avg_pm,diff_from_avg_am,line_name,est_wkday_ridership
0,7120,Victory / Canoga,3.0,169-13172,169,0.0,8.7,0.3,"POLYGON ((-118.59655 34.19092, -118.59646 34.1...",7120_169-13172_0.0,7.3,10.3,7120_169-13172,8.1,8.8,0.1,1.5,-1.5,169,1831.0
1,15495,Victory / Owensmouth,4.0,169-13172,169,0.0,8.7,0.2,"POLYGON ((-118.60154 34.18788, -118.60175 34.1...",15495_169-13172_0.0,8.8,8.7,15495_169-13172,8.2,16.4,7.7,7.6,7.7,169,1831.0
2,6598,Topanga Canyon / Victory,5.0,169-13172,169,0.0,7.0,0.3,"POLYGON ((-118.60604 34.18699, -118.60604 34.1...",6598_169-13172_0.0,6.6,6.7,6598_169-13172,6.9,7.5,0.5,0.9,0.8,169,1831.0
3,14946,Topanga Canyon / Erwin,6.0,169-13172,169,0.0,9.7,0.3,"POLYGON ((-118.60637 34.18667, -118.60621 34.1...",14946_169-13172_0.0,9.7,18.2,14946_169-13172,9.6,10.6,0.9,0.9,-7.6,169,1831.0
4,4190,Topanga Canyon / Oxnard,7.0,169-13172,169,0.0,18.5,0.3,"POLYGON ((-118.60587 34.17989, -118.60548 34.1...",4190_169-13172_0.0,13.1,15.0,4190_169-13172,16.0,19.7,1.2,6.6,4.7,169,1831.0


In [280]:
merged_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13743 entries, 0 to 13742
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   stop_id               13743 non-null  object  
 1   stop_name             13743 non-null  object  
 2   stop_sequence         13743 non-null  float64 
 3   route_id              13743 non-null  object  
 4   route_short_name      13465 non-null  object  
 5   direction_id          13743 non-null  float64 
 6   p50_mph_midday        13743 non-null  float64 
 7   miles_from_last       13743 non-null  float64 
 8   geometry              13743 non-null  geometry
 9   stop_route_dir_id     13743 non-null  object  
 10  p50_mph_pm            13743 non-null  float64 
 11  p50_mph_am            13743 non-null  float64 
 12  stop_route_id         13743 non-null  object  
 13  p20_mph_var           13743 non-null  float64 
 14  p80_mph_var           13743 non-null  float64 

In [281]:
ridership_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 22204 entries, 0 to 22203
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   OBJECTID   22204 non-null  int64   
 1   Ons        22204 non-null  int64   
 2   STOP_ID    22204 non-null  int64   
 3   STOP_NAME  22204 non-null  object  
 4   geometry   22204 non-null  geometry
dtypes: geometry(1), int64(3), object(1)
memory usage: 867.5+ KB


In [282]:

# Initialize a dictionary to store boardings for each route at each stop_id
boardings_per_route_stop = defaultdict(lambda: defaultdict(float))

# Ensure line_name is now of int type
json_df_most_recent['line_name'] = json_df_most_recent['line_name'].astype(int)


# Get total ridership per route from json_df_most_recent
total_ridership_per_route = json_df_most_recent.set_index('line_name')['est_wkday_ridership'].to_dict()

# Iterate over the ridership GeoDataFrame
for _, row in ridership_data.iterrows():
    stop_id = row['STOP_ID']
    ons = row['Ons']

    # Get routes serving this stop
    if stop_id in stop_to_routes:
        routes = stop_to_routes[stop_id]
        print(routes)
        
        # Calculate total ridership for normalization
        total_ridership = sum(total_ridership_per_route.get(route, 0) for route in routes)

        # Distribute boardings across routes proportionally
        for route in routes:
            route_ridership = total_ridership_per_route.get(route, 0)
            if total_ridership > 0:
                boarding_per_route = ons * (route_ridership / total_ridership)
                boardings_per_route_stop[stop_id][route] += boarding_per_route


[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[66, 35]
[51, 35]
[35]
[102, 35]
[102, 35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[35]
[105, 35, 217]
[105, 35, 217]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[40]
[102, 232, 40, 111, 117]
[232, 40, 117]
[40, 117]
[40, 117]
[40]
[40, 120]
[40, 120]
[40, 120]
[40, 120]
[40, 120]
[40, 120]
[212, 40]
[212, 40]
[40]
[40]
[40]
[40]
[40, 111]
[40, 111]
[40, 111]
[40, 111]
[40, 111]
[212, 40, 111]
[212, 40, 111]
[40, 111]
[40, 111]
[40, 111]
[40, 111]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[40, 210]
[105]
[105, 40, 210]
[105, 40, 210]
[40]
[40]
[40]
[40]
[102, 

In [283]:
print(boardings_per_route_stop)
# Print the results to inspect
for stop_id, routes in boardings_per_route_stop.items():
    if len(routes)>1:
        print(f"Stop ID {stop_id}:")
        for route, boardings in routes.items():
            print(f"  Route {route}: {boardings:.2f} boardings")

defaultdict(<function <lambda> at 0x2aa99caf0>, {16470: defaultdict(<class 'float'>, {35: 117.0}), 16514: defaultdict(<class 'float'>, {35: 224.0}), 16448: defaultdict(<class 'float'>, {35: 53.0}), 16471: defaultdict(<class 'float'>, {35: 387.0}), 16451: defaultdict(<class 'float'>, {35: 24.0}), 16466: defaultdict(<class 'float'>, {35: 114.0}), 16460: defaultdict(<class 'float'>, {35: 26.0}), 8194: defaultdict(<class 'float'>, {35: 368.0}), 16458: defaultdict(<class 'float'>, {35: 37.0}), 16442: defaultdict(<class 'float'>, {35: 73.0}), 16440: defaultdict(<class 'float'>, {35: 37.0}), 16441: defaultdict(<class 'float'>, {35: 21.0}), 16520: defaultdict(<class 'float'>, {35: 81.0}), 16444: defaultdict(<class 'float'>, {35: 161.0}), 16523: defaultdict(<class 'float'>, {35: 23.0}), 16474: defaultdict(<class 'float'>, {35: 30.0}), 16469: defaultdict(<class 'float'>, {35: 25.0}), 16463: defaultdict(<class 'float'>, {35: 50.0}), 16467: defaultdict(<class 'float'>, {35: 9.0}), 16468: defaultdi

In [284]:
# Load the average trip lengths from Excel
trip_length_df = pd.read_excel(path_pref + "Ridership Report - Monthly Line Level (NTD).xlsx")
trip_length_df

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,Year,Month No,Month,Day Type,Line,Riders,Passenger Miles,Day Count,Total Riders,Total Passenger Miles,Group,Avg Trip Length
0,2009,1.0,Jan,DX,2.0,21816.0,89989.0,21.0,458136.0,1889769.0,Directly Operated,4.124908
1,2009,1.0,Jan,DX,4.0,20393.0,71490.0,21.0,428253.0,1501290.0,Directly Operated,3.505615
2,2009,1.0,Jan,DX,10.0,13337.0,43302.0,21.0,280077.0,909342.0,Directly Operated,3.246757
3,2009,1.0,Jan,DX,14.0,16026.0,51177.0,21.0,336546.0,1074717.0,Directly Operated,3.193373
4,2009,1.0,Jan,DX,16.0,26706.0,71782.0,21.0,560826.0,1507422.0,Directly Operated,2.687860
...,...,...,...,...,...,...,...,...,...,...,...,...
68523,2024,7.0,Jul,SU,266.0,3314.0,14470.0,5.0,16570.0,72350.0,Purchased Transportation,4.366325
68524,2024,7.0,Jul,SU,603.0,4772.0,15370.0,5.0,23860.0,76850.0,Purchased Transportation,3.220872
68525,2024,7.0,Jul,SU,605.0,1032.0,1610.0,5.0,5160.0,8050.0,Purchased Transportation,1.560078
68526,,,,,,,,,,,,


In [285]:
# Drop columns other than 'Year', 'Month No', 'Line', and 'Avg Trip Length'
trip_length_df = trip_length_df[['Year', 'Month No', 'Line', 'Avg Trip Length']]

# Filter for the year 2023
trip_length_df_2023 = trip_length_df[trip_length_df['Year'] == 2023]

# Group by 'Line' and average the 'Avg Trip Length'
average_trip_length = trip_length_df_2023.groupby('Line')['Avg Trip Length'].mean().to_dict()

# Print the result
print(len(average_trip_length), average_trip_length)

119 {2.0: 2.633591731884332, 4.0: 3.930482562075087, 10.0: 2.969281518272646, 14.0: 3.0637714695243274, 16.0: 2.5540719670578413, 18.0: 2.2170926018360313, 20.0: 3.51331717214222, 28.0: 3.06948129531576, 30.0: 1.8738710520870727, 33.0: 4.458596442983013, 35.0: 2.5090965484833934, 40.0: 3.636341395355311, 45.0: 2.665667108189143, 51.0: 2.7093607606183214, 53.0: 2.7979831747550863, 55.0: 2.7498428209105583, 60.0: 3.514861198728278, 62.0: 5.782368044031641, 66.0: 2.446785531874463, 70.0: 3.4512942359450345, 76.0: 4.007975886025587, 78.0: 3.8699650206789897, 81.0: 3.3243150963822785, 90.0: 5.266077229360777, 92.0: 4.7921951940896435, 94.0: 4.86361351803826, 96.0: 4.8092156658397505, 102.0: 3.8407534426033476, 105.0: 2.7605728549510125, 106.0: 3.448466870266894, 108.0: 3.585683939041926, 110.0: 3.5538103863782746, 111.0: 2.617707565182164, 115.0: 3.3150945591961563, 117.0: 2.9666513271415518, 120.0: 4.452371778457443, 125.0: 3.2583493598797766, 127.0: 2.966450287754428, 128.0: 3.76639148402

In [286]:
# Ensure route_short_name is treated as a string for initial checks
merged_gdf['route_short_name'] = merged_gdf['route_short_name'].astype(str)

# Extract numeric part of route_short_name if needed
# For this example, let's assume route_short_name has a numeric part that we want to convert
# If route_short_name is already numeric, you can skip this step
merged_gdf['route_short_name_numeric'] = merged_gdf['route_short_name'].str.extract('(\d+)').astype(float)

# Convert to integer, handling any potential issues
merged_gdf['route_short_name_numeric'] = merged_gdf['route_short_name_numeric'].fillna(0).astype(int)

merged_gdf.head()

Unnamed: 0,stop_id,stop_name,stop_sequence,route_id,route_short_name,direction_id,p50_mph_midday,miles_from_last,geometry,stop_route_dir_id,...,p50_mph_am,stop_route_id,p20_mph_var,p80_mph_var,diff_from_avg_midday,diff_from_avg_pm,diff_from_avg_am,line_name,est_wkday_ridership,route_short_name_numeric
0,7120,Victory / Canoga,3.0,169-13172,169,0.0,8.7,0.3,"POLYGON ((-118.59655 34.19092, -118.59646 34.1...",7120_169-13172_0.0,...,10.3,7120_169-13172,8.1,8.8,0.1,1.5,-1.5,169,1831.0,169
1,15495,Victory / Owensmouth,4.0,169-13172,169,0.0,8.7,0.2,"POLYGON ((-118.60154 34.18788, -118.60175 34.1...",15495_169-13172_0.0,...,8.7,15495_169-13172,8.2,16.4,7.7,7.6,7.7,169,1831.0,169
2,6598,Topanga Canyon / Victory,5.0,169-13172,169,0.0,7.0,0.3,"POLYGON ((-118.60604 34.18699, -118.60604 34.1...",6598_169-13172_0.0,...,6.7,6598_169-13172,6.9,7.5,0.5,0.9,0.8,169,1831.0,169
3,14946,Topanga Canyon / Erwin,6.0,169-13172,169,0.0,9.7,0.3,"POLYGON ((-118.60637 34.18667, -118.60621 34.1...",14946_169-13172_0.0,...,18.2,14946_169-13172,9.6,10.6,0.9,0.9,-7.6,169,1831.0,169
4,4190,Topanga Canyon / Oxnard,7.0,169-13172,169,0.0,18.5,0.3,"POLYGON ((-118.60587 34.17989, -118.60548 34.1...",4190_169-13172_0.0,...,15.0,4190_169-13172,16.0,19.7,1.2,6.6,4.7,169,1831.0,169


In [287]:
merged_gdf['Avg Trip Length'] = merged_gdf['route_short_name_numeric'].map(average_trip_length)
merged_gdf

Unnamed: 0,stop_id,stop_name,stop_sequence,route_id,route_short_name,direction_id,p50_mph_midday,miles_from_last,geometry,stop_route_dir_id,...,stop_route_id,p20_mph_var,p80_mph_var,diff_from_avg_midday,diff_from_avg_pm,diff_from_avg_am,line_name,est_wkday_ridership,route_short_name_numeric,Avg Trip Length
0,7120,Victory / Canoga,3.0,169-13172,169,0.0,8.7,0.3,"POLYGON ((-118.59655 34.19092, -118.59646 34.1...",7120_169-13172_0.0,...,7120_169-13172,8.1,8.8,0.1,1.5,-1.5,169,1831.0,169,4.289633
1,15495,Victory / Owensmouth,4.0,169-13172,169,0.0,8.7,0.2,"POLYGON ((-118.60154 34.18788, -118.60175 34.1...",15495_169-13172_0.0,...,15495_169-13172,8.2,16.4,7.7,7.6,7.7,169,1831.0,169,4.289633
2,6598,Topanga Canyon / Victory,5.0,169-13172,169,0.0,7.0,0.3,"POLYGON ((-118.60604 34.18699, -118.60604 34.1...",6598_169-13172_0.0,...,6598_169-13172,6.9,7.5,0.5,0.9,0.8,169,1831.0,169,4.289633
3,14946,Topanga Canyon / Erwin,6.0,169-13172,169,0.0,9.7,0.3,"POLYGON ((-118.60637 34.18667, -118.60621 34.1...",14946_169-13172_0.0,...,14946_169-13172,9.6,10.6,0.9,0.9,-7.6,169,1831.0,169,4.289633
4,4190,Topanga Canyon / Oxnard,7.0,169-13172,169,0.0,18.5,0.3,"POLYGON ((-118.60587 34.17989, -118.60548 34.1...",4190_169-13172_0.0,...,4190_169-13172,16.0,19.7,1.2,6.6,4.7,169,1831.0,169,4.289633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13738,2089,Franklin / Bronson,59.0,207-13172,207,0.0,15.3,0.2,"POLYGON ((-118.32142 34.10472, -118.32108 34.1...",2089_207-13172_0.0,...,2089_207-13172,9.0,18.0,2.7,5.0,0.2,207,25334.0,207,2.228640
13739,7724,Imperial / Crenshaw,3.0,207-13172,207,0.0,9.1,0.1,"POLYGON ((-118.32585 33.92908, -118.32617 33.9...",7724_207-13172_0.0,...,7724_207-13172,4.9,15.0,5.9,10.5,-3.0,207,25334.0,207,2.228640
13740,7716,Imperial / Ardath,4.0,207-13172,207,0.0,10.8,0.1,"POLYGON ((-118.32615 33.93044, -118.32580 33.9...",7716_207-13172_0.0,...,7716_207-13172,6.3,14.0,3.2,5.3,-1.5,207,25334.0,207,2.228640
13741,7730,Imperial / Van Ness,5.0,207-13172,207,0.0,14.5,0.4,"POLYGON ((-118.32405 33.93044, -118.32371 33.9...",7730_207-13172_0.0,...,7730_207-13172,10.3,20.1,5.6,4.1,5.2,207,25334.0,207,2.228640


In [288]:
merged_gdf.head()


Unnamed: 0,stop_id,stop_name,stop_sequence,route_id,route_short_name,direction_id,p50_mph_midday,miles_from_last,geometry,stop_route_dir_id,...,stop_route_id,p20_mph_var,p80_mph_var,diff_from_avg_midday,diff_from_avg_pm,diff_from_avg_am,line_name,est_wkday_ridership,route_short_name_numeric,Avg Trip Length
0,7120,Victory / Canoga,3.0,169-13172,169,0.0,8.7,0.3,"POLYGON ((-118.59655 34.19092, -118.59646 34.1...",7120_169-13172_0.0,...,7120_169-13172,8.1,8.8,0.1,1.5,-1.5,169,1831.0,169,4.289633
1,15495,Victory / Owensmouth,4.0,169-13172,169,0.0,8.7,0.2,"POLYGON ((-118.60154 34.18788, -118.60175 34.1...",15495_169-13172_0.0,...,15495_169-13172,8.2,16.4,7.7,7.6,7.7,169,1831.0,169,4.289633
2,6598,Topanga Canyon / Victory,5.0,169-13172,169,0.0,7.0,0.3,"POLYGON ((-118.60604 34.18699, -118.60604 34.1...",6598_169-13172_0.0,...,6598_169-13172,6.9,7.5,0.5,0.9,0.8,169,1831.0,169,4.289633
3,14946,Topanga Canyon / Erwin,6.0,169-13172,169,0.0,9.7,0.3,"POLYGON ((-118.60637 34.18667, -118.60621 34.1...",14946_169-13172_0.0,...,14946_169-13172,9.6,10.6,0.9,0.9,-7.6,169,1831.0,169,4.289633
4,4190,Topanga Canyon / Oxnard,7.0,169-13172,169,0.0,18.5,0.3,"POLYGON ((-118.60587 34.17989, -118.60548 34.1...",4190_169-13172_0.0,...,4190_169-13172,16.0,19.7,1.2,6.6,4.7,169,1831.0,169,4.289633


In [289]:
merged_gdf.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13743 entries, 0 to 13742
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   stop_id                   13743 non-null  object  
 1   stop_name                 13743 non-null  object  
 2   stop_sequence             13743 non-null  float64 
 3   route_id                  13743 non-null  object  
 4   route_short_name          13743 non-null  object  
 5   direction_id              13743 non-null  float64 
 6   p50_mph_midday            13743 non-null  float64 
 7   miles_from_last           13743 non-null  float64 
 8   geometry                  13743 non-null  geometry
 9   stop_route_dir_id         13743 non-null  object  
 10  p50_mph_pm                13743 non-null  float64 
 11  p50_mph_am                13743 non-null  float64 
 12  stop_route_id             13743 non-null  object  
 13  p20_mph_var               13743 non-nu

In [290]:

def print_type_and_sample(name, data):
    print(f"\n{name}:")
    print(f"Type: {type(data)}")
    if isinstance(data, pd.DataFrame):
        print(data.dtypes)
        print("\nSample data:")
        print(data.head())
    elif isinstance(data, dict):
        print("\nSample data:")
        print(dict(list(data.items())[:5]))
    else:
        print("\nSample data:")
        print(data[:5] if hasattr(data, '__len__') else data)

# Check merged_gdf
print_type_and_sample("merged_gdf", merged_gdf)

# Check specific columns in merged_gdf
columns_to_check = ['stop_id', 'route_short_name', 'stop_sequence', 'miles_from_last', 'est_wkday_ridership']
for col in columns_to_check:
    print_type_and_sample(f"merged_gdf['{col}']", merged_gdf[col])

# Check boardings_per_route_stop
print_type_and_sample("boardings_per_route_stop", boardings_per_route_stop)

# Check a sample of boardings for a specific stop
sample_stop_id = list(boardings_per_route_stop.keys())[0]
print_type_and_sample(f"boardings_per_route_stop[{sample_stop_id}]", boardings_per_route_stop[sample_stop_id])

# Check average_trip_length
print_type_and_sample("average_trip_length", average_trip_length)

# Check for zero values
def check_zero_values(data, name):
    if isinstance(data, pd.DataFrame):
        zero_counts = (data == 0).sum()
        print(f"\nZero value counts in {name}:")
        print(zero_counts[zero_counts > 0])
    elif isinstance(data, dict):
        zero_count = sum(1 for v in data.values() if v == 0)
        print(f"\nNumber of zero values in {name}: {zero_count}")

check_zero_values(merged_gdf, "merged_gdf")
check_zero_values(boardings_per_route_stop, "boardings_per_route_stop")
check_zero_values(average_trip_length, "average_trip_length")

# Check for NaN values
def check_nan_values(data, name):
    if isinstance(data, pd.DataFrame):
        nan_counts = data.isna().sum()
        print(f"\nNaN value counts in {name}:")
        print(nan_counts[nan_counts > 0])
    elif isinstance(data, dict):
        nan_count = sum(1 for v in data.values() if pd.isna(v))
        print(f"\nNumber of NaN values in {name}: {nan_count}")

check_nan_values(merged_gdf, "merged_gdf")
check_nan_values(boardings_per_route_stop, "boardings_per_route_stop")
check_nan_values(average_trip_length, "average_trip_length")

print("\nDiagnostic complete. Please review the output for any inconsistencies or unexpected values.")


merged_gdf:
Type: <class 'geopandas.geodataframe.GeoDataFrame'>
stop_id                       object
stop_name                     object
stop_sequence                float64
route_id                      object
route_short_name              object
direction_id                 float64
p50_mph_midday               float64
miles_from_last              float64
geometry                    geometry
stop_route_dir_id             object
p50_mph_pm                   float64
p50_mph_am                   float64
stop_route_id                 object
p20_mph_var                  float64
p80_mph_var                  float64
diff_from_avg_midday         float64
diff_from_avg_pm             float64
diff_from_avg_am             float64
line_name                     object
est_wkday_ridership          float64
route_short_name_numeric       int64
Avg Trip Length              float64
dtype: object

Sample data:
  stop_id                 stop_name  stop_sequence   route_id  \
0    7120          Victory /

In [291]:
def simulate_single_route(merged_gdf, boardings_per_route_stop, test_route, test_direction):
    
    test_route = int(test_route)

    # Filter the dataframe for the specific route and direction
    route_gdf = merged_gdf[
        (merged_gdf['route_short_name'] == str(test_route)) & 
        (merged_gdf['direction_id'] == test_direction)
    ].sort_values('stop_sequence')

    # Create a dictionary to store segment ridership
    segment_ridership = defaultdict(float)
    
    # Initialize riders on the bus
    riders_on_bus = 0
    # Keep track of where each rider boarded and the distance they’ve traveled
    rider_origins = []
    rider_distances = []
    
    print(f"\nSimulating route {test_route}, direction {test_direction}")
    
    cumulative_distance = 0  # Track cumulative distance for each stop
    
    for _, row in route_gdf.iterrows():
        stop_id = int(row['stop_id'])
        segment_length = row['miles_from_last']
        cumulative_distance += segment_length  # Update cumulative distance
        stop_route_dir_id = row['stop_route_dir_id']
        
        # Simulate riders getting off
        avg_trip_length = row['Avg Trip Length']  # Default to 5 miles if not found
        riders_getting_off = 0
        
        print(f"Stop {stop_id} Segment Length: {segment_length}")  # Debugging segment length
        
        if avg_trip_length > 0:
            riders_getting_off = sum(1 for dist in rider_distances if dist >= avg_trip_length)
            riders_on_bus -= riders_getting_off
            print(f"Riders getting off at stop {stop_id}: {riders_getting_off}")  # Debugging riders getting off
            # Remove riders who got off and update distances for remaining riders
            rider_origins = [origin for origin, dist in zip(rider_origins, rider_distances) if dist < avg_trip_length]
            rider_distances = [dist for dist in rider_distances if dist < avg_trip_length]
        
        # Simulate riders getting on
        riders_getting_on = 0
        if stop_id in boardings_per_route_stop and test_route in boardings_per_route_stop[stop_id]:
            riders_getting_on = boardings_per_route_stop[stop_id][test_route]
            riders_on_bus += riders_getting_on
            print(f"Riders getting on at stop {stop_id}: {riders_getting_on}")  # Debugging riders getting on
            # Add these riders’ starting distances (i.e., zero since they’re boarding now)
            rider_origins.extend([row['stop_sequence']] * int(riders_getting_on))
            rider_distances.extend([0] * int(riders_getting_on))
        
        # Update distances traveled for remaining riders
        rider_distances = [dist + segment_length for dist in rider_distances]
        
        # Store the ridership for this segment (riders on bus * segment length)
        segment_ridership[stop_route_dir_id] = riders_on_bus
        print(f"Riders on bus after stop {stop_id}: {riders_on_bus}")  # Debugging riders on bus

    return segment_ridership

# Specify the route and direction you want to test
test_route = '4'  # Replace with the route number you want to test
test_direction = 1  # Specify the direction you want to test (e.g., 0 or 1)

# Run the simulation for the single route and direction
segment_ridership_182 = simulate_single_route(merged_gdf, boardings_per_route_stop, test_route, test_direction)



Simulating route 4, direction 1
Stop 4756 Segment Length: 0.2
Riders getting off at stop 4756: 0
Riders getting on at stop 4756: 602.6492022079051
Riders on bus after stop 4756: 602.6492022079051
Stop 4819 Segment Length: 0.2
Riders getting off at stop 4819: 0
Riders getting on at stop 4819: 82.48708107096049
Riders on bus after stop 4819: 685.1362832788656
Stop 15598 Segment Length: 0.1
Riders getting off at stop 15598: 0
Riders getting on at stop 15598: 75.0
Riders on bus after stop 15598: 760.1362832788656
Stop 4818 Segment Length: 0.3
Riders getting off at stop 4818: 0
Riders getting on at stop 4818: 140.88329673011225
Riders on bus after stop 4818: 901.0195800089778
Stop 1248 Segment Length: 0.1
Riders getting off at stop 1248: 0
Riders getting on at stop 1248: 248.26662301165044
Riders on bus after stop 1248: 1149.2862030206281
Stop 16947 Segment Length: 0.2
Riders getting off at stop 16947: 0
Riders getting on at stop 16947: 926.443043846007
Riders on bus after stop 16947: 2075

In [292]:
def simulate_all_routes(merged_gdf, boardings_per_route_stop):
    all_segment_ridership = {}
    
    # Get unique combinations of route_short_name and direction_id
    route_directions = merged_gdf[['route_short_name_numeric', 'direction_id']].drop_duplicates()
    
    for _, row in route_directions.iterrows():
        route = row['route_short_name_numeric']
        direction = row['direction_id']
        
        print(f"Simulating route {route}, direction {direction}")
        segment_ridership = simulate_single_route(merged_gdf, boardings_per_route_stop, route, direction)
        all_segment_ridership.update(segment_ridership)
    
    return all_segment_ridership

# Run the simulation for all routes and directions
all_routes_ridership = simulate_all_routes(merged_gdf, boardings_per_route_stop)


Simulating route 169.0, direction 0.0

Simulating route 169, direction 0.0
Stop 7120 Segment Length: 0.3
Riders getting off at stop 7120: 0
Riders getting on at stop 7120: 10.89481874771146
Riders on bus after stop 7120: 10.89481874771146
Stop 15495 Segment Length: 0.2
Riders getting off at stop 15495: 0
Riders getting on at stop 15495: 16.25842182350787
Riders on bus after stop 15495: 27.153240571219328
Stop 6598 Segment Length: 0.3
Riders getting off at stop 6598: 0
Riders getting on at stop 6598: 24.423353170531502
Riders on bus after stop 6598: 51.57659374175083
Stop 14946 Segment Length: 0.3
Riders getting off at stop 14946: 0
Riders getting on at stop 14946: 18.787194746562694
Riders on bus after stop 14946: 70.36378848831352
Stop 4190 Segment Length: 0.3
Riders getting off at stop 4190: 0
Riders getting on at stop 4190: 15.405499692181408
Riders on bus after stop 4190: 85.76928818049493
Stop 6580 Segment Length: 0.3
Riders getting off at stop 6580: 0
Riders getting on at stop 65

In [293]:
all_routes_ridership

{'7120_169-13172_0.0': 10.89481874771146,
 '15495_169-13172_0.0': 27.153240571219328,
 '6598_169-13172_0.0': 51.57659374175083,
 '14946_169-13172_0.0': 70.36378848831352,
 '4190_169-13172_0.0': 85.76928818049493,
 '6580_169-13172_0.0': 95.16288555377628,
 '6568_169-13172_0.0': 101.5505317676076,
 '14967_169-13172_0.0': 120.5505317676076,
 '14941_169-13172_0.0': 121.5505317676076,
 '14950_169-13172_0.0': 121.5505317676076,
 '14943_169-13172_0.0': 125.5505317676076,
 '14969_169-13172_0.0': 125.5505317676076,
 '20010_169-13172_0.0': 125.5505317676076,
 '3321_169-13172_0.0': 209.5505317676076,
 '11804_169-13172_0.0': 210.5505317676076,
 '11809_169-13172_0.0': 210.5505317676076,
 '11805_169-13172_0.0': 189.5505317676076,
 '20219_169-13172_0.0': 168.5505317676076,
 '8891_169-13172_0.0': 158.5505317676076,
 '15099_169-13172_0.0': 146.5505317676076,
 '15096_169-13172_0.0': 224.5505317676076,
 '6742_169-13172_0.0': 211.5505317676076,
 '15107_169-13172_0.0': 214.5505317676076,
 '15102_169-13172_

In [294]:
merged_gdf['simulated_segment_ridership'] = merged_gdf['stop_route_dir_id'].map(all_routes_ridership)
merged_gdf.columns

Index(['stop_id', 'stop_name', 'stop_sequence', 'route_id', 'route_short_name',
       'direction_id', 'p50_mph_midday', 'miles_from_last', 'geometry',
       'stop_route_dir_id', 'p50_mph_pm', 'p50_mph_am', 'stop_route_id',
       'p20_mph_var', 'p80_mph_var', 'diff_from_avg_midday',
       'diff_from_avg_pm', 'diff_from_avg_am', 'line_name',
       'est_wkday_ridership', 'route_short_name_numeric', 'Avg Trip Length',
       'simulated_segment_ridership'],
      dtype='object')

In [295]:
# Filter for midday
midday_filtered = merged_gdf[merged_gdf['diff_from_avg_midday'] > 0].copy()

# Filter for AM peak
am_filtered = merged_gdf[merged_gdf['diff_from_avg_am'] > 0].copy()

# Filter for PM peak
pm_filtered = merged_gdf[merged_gdf['diff_from_avg_pm'] > 0].copy()


In [296]:
# Time lost for midday, pm, and am
midday_filtered['time_lost_midday'] = (midday_filtered['miles_from_last'] / midday_filtered['diff_from_avg_midday']) * 60
pm_filtered['time_lost_pm'] = (pm_filtered['miles_from_last'] / pm_filtered['diff_from_avg_pm']) * 60
am_filtered['time_lost_am'] = (am_filtered['miles_from_last'] / am_filtered['diff_from_avg_am']) * 60


In [311]:

midday_filtered['ridership_minutes_lost_midday'] = midday_filtered['time_lost_midday'] * midday_filtered['simulated_segment_ridership']
pm_filtered['ridership_minutes_lost_pm'] = pm_filtered['time_lost_pm'] * pm_filtered['simulated_segment_ridership']
am_filtered['ridership_minutes_lost_am'] = am_filtered['time_lost_am'] * am_filtered['simulated_segment_ridership']

In [312]:
# Group by stop_route_dir_id and sum the ridership minutes lost for each time period
midday_sum = midday_filtered.groupby('stop_route_dir_id')['ridership_minutes_lost_midday'].sum().reset_index()
pm_sum = pm_filtered.groupby('stop_route_dir_id')['ridership_minutes_lost_pm'].sum().reset_index()
am_sum = am_filtered.groupby('stop_route_dir_id')['ridership_minutes_lost_am'].sum().reset_index()

# Merge the summed data back into the merged_gdf
merged_gdf = merged_gdf.merge(midday_sum, on='stop_route_dir_id', how='left')
merged_gdf = merged_gdf.merge(pm_sum, on='stop_route_dir_id', how='left')
merged_gdf = merged_gdf.merge(am_sum, on='stop_route_dir_id', how='left')

# Fill NaN values with 0 (for segments that didn't have any time lost)
merged_gdf['ridership_minutes_lost_midday'] = merged_gdf['ridership_minutes_lost_midday'].fillna(0)
merged_gdf['ridership_minutes_lost_pm'] = merged_gdf['ridership_minutes_lost_pm'].fillna(0)
merged_gdf['ridership_minutes_lost_am'] = merged_gdf['ridership_minutes_lost_am'].fillna(0)

# Calculate total ridership minutes lost across all time periods
merged_gdf['total_ridership_minutes_lost'] = (
    merged_gdf['ridership_minutes_lost_midday'] +
    merged_gdf['ridership_minutes_lost_pm'] +
    merged_gdf['ridership_minutes_lost_am']
)

# Display summary statistics
print("Summary statistics for total ridership minutes lost:")
print(merged_gdf['total_ridership_minutes_lost'].describe())



Summary statistics for total ridership minutes lost:
count    1.374300e+04
mean     2.588731e+04
std      1.081524e+05
min      0.000000e+00
25%      2.111344e+03
50%      7.862883e+03
75%      2.316225e+04
max      9.247961e+06
Name: total_ridership_minutes_lost, dtype: float64

Top 10 segments with highest total ridership minutes lost:


Unnamed: 0,stop_route_dir_id,route_short_name,stop_name,total_ridership_minutes_lost,ridership_minutes_lost_midday,ridership_minutes_lost_pm,ridership_minutes_lost_am
12960,16769_720-13172_1.0,720,Wilshire / Vermont,9247961.0,1206256.0,2010426.0,6031279.0
12963,16691_720-13172_1.0,720,Wilshire / Crenshaw,3563342.0,187544.3,750177.2,2625620.0
12962,8467_720-13172_1.0,720,Wilshire / Western,3017780.0,192624.3,706288.9,2118867.0
12964,16711_720-13172_1.0,720,Wilshire / Cloverdale,1508094.0,237729.1,141151.7,1129213.0
3075,10640_460-13172_0.0,460,Fullerton Park and Ride,1507882.0,115990.9,0.0,1391891.0
13095,14152_207-13172_1.0,207,Western / Olympic,1448659.0,90541.16,0.0,1358117.0
13065,1210_720-13172_0.0,720,6th / Central,1366339.0,285736.2,742914.2,337688.3
13062,7368_720-13172_0.0,720,6th / Witmer,1283484.0,314816.9,181625.1,787042.2
13402,7272_16-13172_0.0,16,3rd / Bonnie Brae,1195715.0,96249.54,40720.96,1058745.0
12661,6818_233-13172_1.0,233,Van Nuys / Vanowen,1070720.0,27931.82,782090.9,260697.0


In [316]:
merged_gdf['passenger_hour_per_mi'] = (merged_gdf['total_ridership_minutes_lost'] / 60) / merged_gdf['miles_from_last']

In [318]:
top_10 = merged_gdf.nlargest(50, 'passenger_hour_per_mi')
top_10[['stop_route_dir_id', 'route_short_name', 'stop_name', 'passenger_hour_per_mi', 'total_ridership_minutes_lost',
              'ridership_minutes_lost_midday', 'ridership_minutes_lost_pm', 'ridership_minutes_lost_am', 'miles_from_last']]


Unnamed: 0,stop_route_dir_id,route_short_name,stop_name,passenger_hour_per_mi,total_ridership_minutes_lost,ridership_minutes_lost_midday,ridership_minutes_lost_pm,ridership_minutes_lost_am,miles_from_last
12960,16769_720-13172_1.0,720,Wilshire / Vermont,154132.677051,9247961.0,1206256.0,2010426.0,6031279.0,1.0
12962,8467_720-13172_1.0,720,Wilshire / Western,100592.665488,3017780.0,192624.3,706288.9,2118867.0,0.5
13402,7272_16-13172_0.0,16,3rd / Bonnie Brae,99642.949834,1195715.0,96249.54,40720.96,1058745.0,0.2
12963,16691_720-13172_1.0,720,Wilshire / Crenshaw,98981.711643,3563342.0,187544.3,750177.2,2625620.0,0.6
13231,6935_33-13172_0.0,33,Venice / Motor,84176.470588,1010118.0,38117.65,324000.0,648000.0,0.2
13095,14152_207-13172_1.0,207,Western / Olympic,80481.033872,1448659.0,90541.16,0.0,1358117.0,0.3
13296,9761_51-13172_1.0,51,7th / Main,79199.898795,475199.4,178199.8,118799.8,178199.8,0.1
12857,15653_16-13172_1.0,16,3rd / Rossmore,75926.563437,911118.8,0.0,911118.8,0.0,0.2
11872,1641_108-13172_0.0,108,Slauson / Hooper,74984.180884,899810.2,112476.3,674857.6,112476.3,0.2
11875,2513_108-13172_0.0,108,Slauson / Holmes,63622.497807,763470.0,49455.54,21636.8,692377.6,0.2


In [315]:
# Display top 10 segments with highest total ridership minutes lost
print("\nTop segments with highest total ridership minutes lost:")
top_10 = merged_gdf.nlargest(50, 'total_ridership_minutes_lost')
top_10[['stop_route_dir_id', 'route_short_name', 'stop_name', 'total_ridership_minutes_lost', 
              'ridership_minutes_lost_midday', 'ridership_minutes_lost_pm', 'ridership_minutes_lost_am', 'miles_from_last']]



Top segments with highest total ridership minutes lost:


Unnamed: 0,stop_route_dir_id,route_short_name,stop_name,total_ridership_minutes_lost,ridership_minutes_lost_midday,ridership_minutes_lost_pm,ridership_minutes_lost_am,miles_from_last
12960,16769_720-13172_1.0,720,Wilshire / Vermont,9247961.0,1206256.0,2010426.0,6031279.0,1.0
12963,16691_720-13172_1.0,720,Wilshire / Crenshaw,3563342.0,187544.3,750177.2,2625620.0,0.6
12962,8467_720-13172_1.0,720,Wilshire / Western,3017780.0,192624.3,706288.9,2118867.0,0.5
12964,16711_720-13172_1.0,720,Wilshire / Cloverdale,1508094.0,237729.1,141151.7,1129213.0,0.5
3075,10640_460-13172_0.0,460,Fullerton Park and Ride,1507882.0,115990.9,0.0,1391891.0,0.7
13095,14152_207-13172_1.0,207,Western / Olympic,1448659.0,90541.16,0.0,1358117.0,0.3
13065,1210_720-13172_0.0,720,6th / Central,1366339.0,285736.2,742914.2,337688.3,0.6
13062,7368_720-13172_0.0,720,6th / Witmer,1283484.0,314816.9,181625.1,787042.2,0.7
13402,7272_16-13172_0.0,16,3rd / Bonnie Brae,1195715.0,96249.54,40720.96,1058745.0,0.2
12661,6818_233-13172_1.0,233,Van Nuys / Vanowen,1070720.0,27931.82,782090.9,260697.0,0.3


In [319]:
# Display results
print("Top 10 segments with highest ridership_minutes_lost_am for AM peak:")
am_filtered.sort_values('ridership_minutes_lost_am', ascending=False)[['route_short_name', 'stop_name', 'ridership_minutes_lost_am', 'miles_from_last']].head(30)

Top 10 segments with highest ridership_minutes_lost_am for AM peak:


Unnamed: 0,route_short_name,stop_name,ridership_minutes_lost_am,miles_from_last
12960,720,Wilshire / Vermont,6031279.0,1.0
12963,720,Wilshire / Crenshaw,2625620.0,0.6
12962,720,Wilshire / Western,2118867.0,0.5
3075,460,Fullerton Park and Ride,1391891.0,0.7
13095,207,Western / Olympic,1358117.0,0.3
12964,720,Wilshire / Cloverdale,1129213.0,0.5
13402,16,3rd / Bonnie Brae,1058745.0,0.2
10465,60,Long Beach / Kansas,871023.0,0.3
11469,204,Vermont / 24th,809840.8,0.4
13062,720,6th / Witmer,787042.2,0.7


In [322]:
merged_gdf.head()

Unnamed: 0,stop_id,stop_name,stop_sequence,route_id,route_short_name,direction_id,p50_mph_midday,miles_from_last,geometry,stop_route_dir_id,...,ridership_minutes_lost_pm_x,ridership_minutes_lost_am_x,total_ridership_minutes_lost,ridership_minutes_lost_midday_y,ridership_minutes_lost_pm_y,ridership_minutes_lost_am_y,ridership_minutes_lost_midday,ridership_minutes_lost_pm,ridership_minutes_lost_am,passenger_hour_per_mi
0,7120,Victory / Canoga,3.0,169-13172,169,0.0,8.7,0.3,"POLYGON ((-118.59655 34.19092, -118.59646 34.1...",7120_169-13172_0.0,...,130.737825,0.0,2091.8052,1961.067375,130.737825,,1961.067375,130.737825,0.0,116.2114
1,15495,Victory / Owensmouth,4.0,169-13172,169,0.0,8.7,0.2,"POLYGON ((-118.60154 34.18788, -118.60175 34.1...",15495_169-13172_0.0,...,42.873538,42.316739,127.507015,42.316739,42.873538,42.316739,42.316739,42.873538,42.316739,10.625585
2,6598,Topanga Canyon / Victory,5.0,169-13172,169,0.0,7.0,0.3,"POLYGON ((-118.60604 34.18699, -118.60604 34.1...",6598_169-13172_0.0,...,1031.531875,1160.473359,4048.762609,1856.757375,1031.531875,1160.473359,1856.757375,1031.531875,1160.473359,224.931256
3,14946,Topanga Canyon / Erwin,6.0,169-13172,169,0.0,9.7,0.3,"POLYGON ((-118.60637 34.18667, -118.60621 34.1...",14946_169-13172_0.0,...,1407.27577,0.0,2814.55154,1407.27577,1407.27577,,1407.27577,1407.27577,0.0,156.363974
4,4190,Topanga Canyon / Oxnard,7.0,169-13172,169,0.0,18.5,0.3,"POLYGON ((-118.60587 34.17989, -118.60548 34.1...",4190_169-13172_0.0,...,233.91624,328.478125,1848.933688,1286.539323,233.91624,328.478125,1286.539323,233.91624,328.478125,102.718538


In [323]:
merged_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13743 entries, 0 to 13742
Data columns (total 34 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   stop_id                          13743 non-null  object  
 1   stop_name                        13743 non-null  object  
 2   stop_sequence                    13743 non-null  float64 
 3   route_id                         13743 non-null  object  
 4   route_short_name                 13743 non-null  object  
 5   direction_id                     13743 non-null  float64 
 6   p50_mph_midday                   13743 non-null  float64 
 7   miles_from_last                  13743 non-null  float64 
 8   geometry                         13743 non-null  geometry
 9   stop_route_dir_id                13743 non-null  object  
 10  p50_mph_pm                       13743 non-null  float64 
 11  p50_mph_am                       13743 non-null  float64 
 

In [324]:
import json

# Convert GeoDataFrame to a GeoJSON-like Python dictionary
geojson_dict = merged_gdf.__geo_interface__

# Save as GeoJSON
with open("bus_segments.geojson", "w") as f:
    json.dump(geojson_dict, f)

print("Data saved successfully to bus_segments.geojson")

Data saved successfully to bus_segments.geojson
