In [None]:
import geopandas as gpd
import pandas as pd
from collections import defaultdict
import numpy as np
import json
import pyogrio
import fiona
from shapely.ops import unary_union
import geopandas as gpd
from shapely.geometry import LineString, Polygon
import matplotlib.pyplot as plt
import folium
from branca.colormap import LinearColormap
from scipy.sparse.csgraph import connected_components
from scipy.sparse import csr_matrix
import shapely.speedups
if shapely.speedups.available:
    shapely.speedups.enable()


In [2]:
# Directory that holds every input file read by this notebook.
# Set the BUS_DATA_DIR environment variable to point somewhere else; the
# default is the original hard-coded location. os.path.join(..., "") guarantees
# a trailing separator because later cells build paths with `path_pref + name`.
import os

path_pref = os.path.join(os.environ.get("BUS_DATA_DIR", "/Users/joeyshoyer/Downloads/"), "")

In [None]:
# Load the average-weekday ridership layer and display it.
# NOTE(review): `ridership_data` is not referenced again in the cells visible here.
ridership_data = gpd.read_file(path_pref + "Average_weekday_ridership.geojson")
ridership_data

In [None]:
# Load the three time-of-day speed layers (each file is read exactly once)
midday_gdf = gpd.read_file(path_pref + "182_Midday_speeds.geojson")
pm_peak_gdf = gpd.read_file(path_pref + "182_PM_Peak_speeds.geojson")
am_peak_gdf = gpd.read_file(path_pref + "182_AM_Peak_speeds.geojson")

# Quick size check on the midday layer
print(len(midday_gdf))

In [5]:
# segments_data = gpd.read_file("segments.geojson")
# segments_data = segments_data.drop(columns=['segment_id', 'row_id', 'trip_id', 'traversals', 'traversal_time', 'speed', 'stop_id2'])
# segments_data = segments_data.rename(columns={"stop_id1": "stop_id"})

# def split_before_dash(df, source_column, new_column):
#     df[new_column] = pd.to_numeric(df[source_column].str.split('-').str[0], errors='coerce')
#     return df

# segments_data = split_before_dash(segments_data, 'route_id', 'route_short_name_numeric')
# segments_data

In [6]:
# Columns removed from the midday frame only.
midday_drop_cols = ['id', 'fast_slow_ratio', 'trips_per_hour', 'time_formatted', 'organization_name']

# Both peak frames shed the same, longer list (including geometry), so the
# list is written once and applied to each of them.
peak_drop_cols = [
    "geometry", 'id', 'direction_id', 'fast_slow_ratio', 'trips_per_hour',
    'miles_from_last', 'time_formatted', 'organization_name', 'route_short_name',
    'route_id', 'stop_name', 'p20_mph', 'p80_mph', 'stop_id',
]

midday_gdf = midday_gdf.drop(columns=midday_drop_cols)
pm_peak_gdf = pm_peak_gdf.drop(columns=peak_drop_cols)
am_peak_gdf = am_peak_gdf.drop(columns=peak_drop_cols)

In [7]:
# Build the join key shared by the three time-of-day frames.
# The '_' separator keeps the key unambiguous: without it, shape "A1" + stop 12
# and shape "A11" + stop 2 would both produce "A112".
midday_gdf['uid'] = midday_gdf['shape_id'] + '_' + midday_gdf['stop_sequence'].astype(str)
pm_peak_gdf['uid'] = pm_peak_gdf['shape_id'] + '_' + pm_peak_gdf['stop_sequence'].astype(str)
am_peak_gdf['uid'] = am_peak_gdf['shape_id'] + '_' + am_peak_gdf['stop_sequence'].astype(str)

In [8]:
# Inner-join midday and PM peak rows on the shape/stop key `uid`.
# Columns present in both frames get the '_midday' / '_pm' suffixes.
merged_gdf = midday_gdf.merge(pm_peak_gdf, on="uid", how="inner", suffixes=('_midday', '_pm'))

In [9]:
# Inner-join the AM peak rows on `uid`. Left-hand columns keep their names;
# any AM column that clashes with an existing one gets the '_am' suffix.
# NOTE(review): AM columns that do not clash (presumably p50_mph, shape_id and
# stop_sequence, since the earlier merge already suffixed them) arrive
# unsuffixed — confirm against merged_gdf.info().
merged_gdf = merged_gdf.merge(am_peak_gdf, on="uid", how="inner", suffixes=('', '_am'))


In [10]:
# Give the unsuffixed p50 column an explicit '_am' name so it lines up with
# p50_mph_midday and p50_mph_pm.
merged_gdf = merged_gdf.rename(columns={'p50_mph': 'p50_mph_am'})

In [11]:
# Drop duplicated key columns left over from the merges.
# 'stop_sequence' is not in this list and is used by later cells.
merged_gdf = merged_gdf.drop(columns=['shape_id_midday', 'stop_sequence_midday', 'shape_id', 'stop_sequence_pm'])

In [None]:
# Inspect the columns, dtypes and non-null counts of the merged frame.
merged_gdf.info()

In [13]:
# Exclude routes whose route_id starts with '8' or '9'.
# na=False makes rows with a missing route_id evaluate to False (i.e. they are
# kept) instead of yielding NaN, which would make the `~` negation fail.
merged_gdf = merged_gdf[~merged_gdf['route_id'].str.startswith('8', na=False)]
merged_gdf = merged_gdf[~merged_gdf['route_id'].str.startswith('9', na=False)]

In [14]:
# Filter for midday
# midday_filtered = merged_gdf[merged_gdf['diff_from_avg_midday'] > 0].copy()

# # Filter for AM peak
# am_filtered = merged_gdf[merged_gdf['diff_from_avg_am'] > 0].copy()

# # Filter for PM peak
# pm_filtered = merged_gdf[merged_gdf['diff_from_avg_pm'] > 0].copy()


In [15]:
def process_route_name(value):
    """
    Normalise a single route_short_name value.

    Null values are returned untouched. Anything else is converted to a string,
    cut at the first '/', and stripped of surrounding whitespace. A message is
    printed when the value is null, empty after cleaning, or not convertible to
    an integer; the cleaned string is returned in every non-null case.
    """
    if pd.isna(value):
        print("Encountered a null value in route_short_name.")
        return value

    # str() copes with float inputs; partition keeps only the text before the
    # first '/', or the whole string when there is no slash.
    cleaned = str(value).partition('/')[0].strip()

    if not cleaned:
        print("Encountered an empty string in route_short_name after processing.")
        return cleaned

    # Diagnostic only: the value is returned whether or not it parses.
    try:
        int(cleaned)
    except ValueError:
        print(f"Value '{cleaned}' in route_short_name is not convertible to an integer.")

    return cleaned

# Apply the function to the 'route_short_name' column: each non-null value is
# cut at the first '/' and trimmed; messages are printed for null, empty or
# non-integer names.
merged_gdf['route_short_name'] = merged_gdf['route_short_name'].apply(process_route_name)

In [None]:
# Ensure route_short_name is treated as a string for the regex extraction below
# (astype(str) also turns missing values into strings such as 'nan').
merged_gdf['route_short_name'] = merged_gdf['route_short_name'].astype(str)

# Extract the first run of digits from each name.
# The pattern is a raw string so that \d reaches the regex engine unchanged
# (a plain '\d' is an invalid string escape and warns on recent Python);
# expand=False returns a Series rather than a one-column DataFrame.
merged_gdf['route_short_name_numeric'] = (
    merged_gdf['route_short_name'].str.extract(r'(\d+)', expand=False).astype(float)
)

# Names without any digits have no match; store 0 for them and convert to int
merged_gdf['route_short_name_numeric'] = merged_gdf['route_short_name_numeric'].fillna(0).astype(int)

merged_gdf.head()

In [None]:
# Build a per-row key "<stop_id>_<route_id>_<direction_id>" used below to
# de-duplicate rows. Every part goes through astype(str), so a missing value
# becomes part of the key text rather than leaving the key null.
merged_gdf['stop_route_dir_id'] = merged_gdf['stop_id'].astype(str) + '_' + merged_gdf['route_id'].astype(str) + '_' + merged_gdf['direction_id'].astype(str)
merged_gdf

In [18]:
# Gap between the 80th-percentile speed and the midday median speed; used as
# the tie-breaker when de-duplicating rows in the next cell.
merged_gdf['speed_diff'] = merged_gdf['p80_mph'] - merged_gdf['p50_mph_midday']

In [None]:
# Removing duplicates, keep the one with the highest difference in speed

# Separate rows with missing 'stop_id' (they bypass de-duplication entirely)
missing_stop_id = merged_gdf[merged_gdf['stop_id'].isna()]
print(len(missing_stop_id))


# Keep only rows that do have a 'stop_id'
non_missing_gdf = merged_gdf[merged_gdf['stop_id'].notnull()]
print(len(non_missing_gdf))

# Group by 'stop_route_dir_id' and get the index label of the max 'speed_diff'
# NOTE(review): a group whose 'speed_diff' values are all NaN has no valid
# idxmax label, which would break the .loc lookup below — confirm none exist.
max_speed_diff_indices = non_missing_gdf.groupby('stop_route_dir_id')['speed_diff'].idxmax()

# Create a new GeoDataFrame with the rows corresponding to the max indices
cleaned_non_missing_gdf = non_missing_gdf.loc[max_speed_diff_indices].reset_index(drop=True)

# Combine the de-duplicated rows with the rows that had a missing 'stop_id'
cleaned_gdf = pd.concat([cleaned_non_missing_gdf, missing_stop_id], ignore_index=True)

# Print the cleaned GeoDataFrame
print("Cleaned GeoDataFrame (duplicates removed, missing stop IDs preserved):")
cleaned_gdf


In [20]:
# From here on `merged_gdf` refers to the de-duplicated frame (same object as
# `cleaned_gdf`, not a copy).
merged_gdf = cleaned_gdf

In [None]:
# Sanity check: list every row whose 'stop_route_dir_id' still occurs more than
# once (keep=False marks all members of a duplicated group).
duplicates = cleaned_gdf[cleaned_gdf.duplicated(subset=['stop_route_dir_id'], keep=False)]

# Display the duplicates
print("Duplicates based on 'stop_route_dir_id':")
duplicates

In [None]:
# Read the '2024-03' sheet of the stop-level bus ridership workbook
ridership_full = pd.read_excel(path_pref + 'Bus Ridership CPRA 24-1647.xlsx', sheet_name='2024-03')

# Display the first few rows of the data
print(ridership_full.head())

In [23]:
# Keep only rows whose DAY TYPE is 'DX'.
# The explicit .copy() makes the result an independent frame, so the column
# assignments made on it in later cells do not raise SettingWithCopyWarning.
ridership_full = ridership_full[ridership_full['DAY TYPE'] == 'DX'].copy()

In [None]:
# List the distinct DIRECTION labels and how often each occurs; the mapping to
# 0/1 in the next cell only covers North/South/East/West.
ridership_full['DIRECTION'].value_counts()

In [25]:

# Cast the key columns to strings so they can be concatenated below
ridership_full['YEAR MONTH'] = ridership_full['YEAR MONTH'].astype(str)
ridership_full['LINE'] = ridership_full['LINE'].astype(str)
# Map compass directions to 0/1; any other DIRECTION value becomes NaN.
# NOTE(review): assumes North/East correspond to direction_id 0 and South/West
# to 1 on every line — confirm against the speed data.
ridership_full['DIRECTION_bin'] = ridership_full['DIRECTION'].map({'North': 0, 'South': 1, 'East': 0, 'West': 1})
ridership_full['STOP ID'] = ridership_full['STOP ID'].astype(str)

# Create a unique identifier for joining in ridership_full:
# "<STOP ID>_<LINE>_<DIRECTION_bin>"
ridership_full['stop_route_short_dir_id'] = (
    ridership_full['STOP ID'] + '_' + 
    ridership_full['LINE'] + '_' + 
    ridership_full['DIRECTION_bin'].astype(str)
)

# Create the same identifier in merged_gdf:
# "<stop_id>_<route_short_name>_<direction_id>"
# NOTE(review): the two keys only match when both sides render the stop id and
# the direction identically as text (e.g. '0' vs '0.0') — verify the dtypes.
merged_gdf['stop_route_short_dir_id'] = (
    merged_gdf['stop_id'].astype(str) + '_' + 
    merged_gdf['route_short_name'].astype(str) + '_' + 
    merged_gdf['direction_id'].astype(str)
)


In [None]:
# Display the filtered ridership table with its new key columns.
ridership_full

In [None]:

# Aggregate ridership data to handle duplicates: one mean Ons/Offs per key
ridership_agg = ridership_full.groupby('stop_route_short_dir_id').agg({
    'Avg Ons': 'mean',
    'Avg Offs': 'mean'
}).reset_index()

# Create a dictionary for efficient mapping: key -> {'Avg Ons': ..., 'Avg Offs': ...}
ridership_dict = ridership_agg.set_index('stop_route_short_dir_id')[['Avg Ons', 'Avg Offs']].to_dict(orient='index')

# Record the row count BEFORE adding columns so the check below compares two
# different measurements (previously both prints used the same post-hoc value).
rows_before = len(merged_gdf)

# Use map to add Avg Ons and Avg Offs to merged_gdf; keys without a ridership
# record get a missing value
merged_gdf['Avg Ons'] = merged_gdf['stop_route_short_dir_id'].map(lambda x: ridership_dict.get(x, {}).get('Avg Ons'))
merged_gdf['Avg Offs'] = merged_gdf['stop_route_short_dir_id'].map(lambda x: ridership_dict.get(x, {}).get('Avg Offs'))


# Check if the number of rows remained the same
print(f"Original merged_gdf rows: {rows_before}")
print(f"Final merged data rows: {len(merged_gdf)}")

# Check for any null values in the new columns
print(f"Null values in Avg Ons: {merged_gdf['Avg Ons'].isnull().sum()}")
print(f"Null values in Avg Offs: {merged_gdf['Avg Offs'].isnull().sum()}")

# Check for unique values in stop_route_short_dir_id
print(f"Unique stop_route_short_dir_id in merged_gdf: {merged_gdf['stop_route_short_dir_id'].nunique()}")
print(f"Unique stop_route_short_dir_id in ridership_full: {ridership_full['stop_route_short_dir_id'].nunique()}")

In [None]:
def _display_stop_id(raw_stop_id):
    """Return a printable stop id: an int when it parses, 0 when missing/empty, else the raw value."""
    if pd.isna(raw_stop_id) or not raw_stop_id:
        return 0
    try:
        return int(raw_stop_id)
    except (TypeError, ValueError):
        return raw_stop_id


def calculate_route_ridership(data, route, direction, verbose=True):
    """
    Accumulate the on-board rider count along one route and direction.

    Rows of `data` matching `route` (compared as a string against
    'route_short_name') and `direction` (compared against 'direction_id') are
    visited in 'stop_sequence' order. At each stop the running load is increased
    by 'Avg Ons' and decreased by 'Avg Offs' (missing values count as 0) and is
    never allowed to drop below zero.

    Parameters:
    data (DataFrame): needs the columns route_short_name, direction_id,
        stop_sequence, stop_id, stop_name, stop_route_short_dir_id,
        'Avg Ons' and 'Avg Offs'
    route: route short name; converted with str() before comparison
    direction: value compared for equality with 'direction_id'
    verbose (bool): print progress lines for every stop (default True)

    Returns:
    dict: stop_route_short_dir_id -> riders on the bus after that stop. When a
        key occurs on several rows, the last visited row wins.
    """
    # Filter the dataframe for the specific route and direction
    route_df = data[
        (data['route_short_name'] == str(route)) &
        (data['direction_id'] == direction)
    ].sort_values('stop_sequence')

    riders_on_bus = 0
    segment_ridership = {}

    if verbose:
        print(f"\nCalculating ridership for route {route}, direction {direction}")

    for _, row in route_df.iterrows():
        riders_getting_on = row['Avg Ons'] if pd.notna(row['Avg Ons']) else 0
        riders_getting_off = row['Avg Offs'] if pd.notna(row['Avg Offs']) else 0

        # Update riders on bus; clamp so the load never goes negative
        riders_on_bus += riders_getting_on - riders_getting_off
        riders_on_bus = max(0, riders_on_bus)

        segment_ridership[row['stop_route_short_dir_id']] = riders_on_bus

        if verbose:
            # The stop id is only used for display, so a missing or non-numeric
            # id must not abort the calculation.
            stop_id = _display_stop_id(row['stop_id'])
            print(row['stop_sequence'])
            print(f"Stop {stop_id} - {row['stop_name']}: On: {riders_getting_on:.2f}, Off: {riders_getting_off:.2f}, On Bus: {riders_on_bus:.2f}")

    return segment_ridership

# Test the function with route "720" in direction 0
test_route = "720"
test_direction = 0

# merged_gdf carries the ridership and stop columns the function reads
result = calculate_route_ridership(merged_gdf, test_route, test_direction)

In [29]:
def calculate_all_routes_ridership(merged_data):
    """
    Run calculate_route_ridership for every distinct (route_short_name,
    direction_id) pair in `merged_data` and merge the per-route results.

    Returns:
    dict: stop_route_short_dir_id -> riders on the bus after that stop. Pairs
        are processed in order of first appearance; a key produced by a later
        pair overwrites an earlier one.
    """
    pairs = merged_data[['route_short_name', 'direction_id']].drop_duplicates()

    combined = {}
    for route, direction in zip(pairs['route_short_name'], pairs['direction_id']):
        print(f"Calculating ridership for route {route}, direction {direction}")
        combined.update(calculate_route_ridership(merged_data, route, direction))

    return combined


In [None]:
# Calculate ridership for all routes (prints progress for every stop)
all_routes_ridership = calculate_all_routes_ridership(merged_gdf)

# Add the calculated ridership to the original DataFrame; rows whose key is
# absent from the result get a missing value
merged_gdf['calculated_segment_ridership'] = merged_gdf['stop_route_short_dir_id'].map(all_routes_ridership)

In [None]:
import folium
import geopandas as gpd
import numpy as np
from branca.colormap import LinearColormap

# Interactive map of the calculated on-board ridership per segment.

# Create a map centered on Los Angeles
m = folium.Map(location=[34.0522, -118.2437], zoom_start=10)

# Create a color map spanning the observed ridership range (min/max skip NaN)
vmin = merged_gdf['calculated_segment_ridership'].min()
vmax = merged_gdf['calculated_segment_ridership'].max()
color_map = LinearColormap(colors=['blue', 'yellow', 'red'], vmin=vmin, vmax=vmax)

# Function to style the features
def style_function(feature):
    """Colour a feature by its ridership; gray when the value is missing."""
    ridership = feature['properties']['calculated_segment_ridership']
    if ridership is None or np.isnan(ridership):
        color = 'gray'  # Use gray for null values
    else:
        color = color_map(ridership)
    return {
        'color': color,
        'weight': 2,
        'fillOpacity': 0.7
    }

# Add the GeoDataFrame to the map.
# The popup is a GeoJsonPopup because folium.GeoJson only attaches Popup /
# GeoJsonPopup objects; a plain function passed as `popup=` is not attached.
folium.GeoJson(
    merged_gdf,
    style_function=style_function,
    popup=folium.GeoJsonPopup(
        fields=['stop_name', 'route_short_name', 'calculated_segment_ridership'],
        aliases=['Stop Name:', 'Route:', 'Ridership:'],
        localize=True,
        labels=True,
    ),
    tooltip=folium.GeoJsonTooltip(fields=['stop_name', 'route_short_name', 'calculated_segment_ridership'],
                                  aliases=['Stop Name:', 'Route:', 'Ridership:'],
                                  localize=True,
                                  sticky=False,
                                  labels=True,
                                  style="""
                                      background-color: #F0EFEF;
                                      border: 2px solid black;
                                      border-radius: 3px;
                                      box-shadow: 3px;
                                  """,
                                  max_width=800,)
).add_to(m)

# Add the color map legend to the map
color_map.add_to(m)

# Add a title to the map
title_html = '''
             <h3 align="center" style="font-size:16px"><b>Los Angeles Bus Segments</b></h3>
             '''
m.get_root().html.add_child(folium.Element(title_html))

# Save the map
m.save('los_angeles_bus_ridership_map.html')

# Display the map (if you're in a Jupyter notebook)
m

In [None]:
# Convert GeoDataFrame to a GeoJSON-like Python dictionary
# Convert GeoDataFrame to a GeoJSON-like Python dictionary
geojson_dict = merged_gdf.__geo_interface__

# Save as GeoJSON; the path is held in one variable so the success message
# always names the file that was actually written.
output_path = "bus_segments_calculated_ridership.geojson"
with open(output_path, "w") as f:
    json.dump(geojson_dict, f)

print(f"Data saved successfully to {output_path}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def compare_speed_calculations(gdf):
    """
    Compare two ways of combining the three p50 speed columns and plot them.

    'min_method' is the row-wise minimum of p50_mph_midday, p50_mph_pm and
    p50_mph_am; 'avg_method' is their arithmetic mean.

    Parameters:
    gdf (DataFrame): must contain p50_mph_midday, p50_mph_pm and p50_mph_am

    Returns:
    tuple: (summary_stats, additional_stats, difference_stats, comparison_df, fig)
        - summary_stats: describe() of both methods side by side
        - additional_stats: skewness, kurtosis and median of both methods
        - difference_stats: describe() of (min_method - avg_method)
        - comparison_df: per-row values of both methods and their difference
        - fig: 2x2 matplotlib figure (histogram, box plot, two Q-Q plots)
    """
    # Create a new DataFrame for comparison
    comparison_df = pd.DataFrame()

    # Calculate both methods
    comparison_df['min_method'] = gdf[['p50_mph_midday', 'p50_mph_pm', 'p50_mph_am']].min(axis=1)
    comparison_df['avg_method'] = (gdf['p50_mph_midday'] + gdf['p50_mph_pm'] + gdf['p50_mph_am']) / 3

    # Calculate summary statistics
    summary_stats = pd.DataFrame({
        'Minimum Method': comparison_df['min_method'].describe(),
        'Average Method': comparison_df['avg_method'].describe()
    })

    # Calculate additional statistics
    additional_stats = pd.DataFrame({
        'Metric': ['Skewness', 'Kurtosis', 'Median'],
        'Minimum Method': [
            comparison_df['min_method'].skew(),
            comparison_df['min_method'].kurtosis(),
            comparison_df['min_method'].median()
        ],
        'Average Method': [
            comparison_df['avg_method'].skew(),
            comparison_df['avg_method'].kurtosis(),
            comparison_df['avg_method'].median()
        ]
    })

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Histogram (both methods overlaid on the same axes)
    sns.histplot(data=comparison_df, x='min_method', ax=axes[0,0], label='Minimum Method', alpha=0.5)
    sns.histplot(data=comparison_df, x='avg_method', ax=axes[0,0], label='Average Method', alpha=0.5)
    axes[0,0].set_title('Distribution of Speed Calculations')
    axes[0,0].set_xlabel('Speed (mph)')
    axes[0,0].legend()

    # Box plot; melted before the 'difference' column is added so only the two
    # methods are shown
    comparison_df_melted = pd.melt(comparison_df)
    sns.boxplot(data=comparison_df_melted, x='variable', y='value', ax=axes[0,1])
    axes[0,1].set_title('Box Plot of Speed Calculations')
    axes[0,1].set_xlabel('Method')
    axes[0,1].set_ylabel('Speed (mph)')

    # Q-Q plots. Missing speeds are dropped first: probplot does not skip NaN,
    # so a single missing value would spoil the fitted line.
    stats.probplot(comparison_df['min_method'].dropna(), dist="norm", plot=axes[1,0])
    axes[1,0].set_title('Q-Q Plot - Minimum Method')

    stats.probplot(comparison_df['avg_method'].dropna(), dist="norm", plot=axes[1,1])
    axes[1,1].set_title('Q-Q Plot - Average Method')

    plt.tight_layout()

    # Calculate the difference between methods
    comparison_df['difference'] = comparison_df['min_method'] - comparison_df['avg_method']
    difference_stats = comparison_df['difference'].describe()

    return summary_stats, additional_stats, difference_stats, comparison_df, fig

# Run the comparison
summary_stats, additional_stats, difference_stats, comparison_df, fig = compare_speed_calculations(merged_gdf)

# Print statistics
print("Summary Statistics:")
print(summary_stats)
print("\nAdditional Statistics:")
print(additional_stats)
print("\nDifference Statistics (Minimum - Average):")
print(difference_stats)

# Display the plot (if in a notebook)
plt.show()

In [34]:
def calculate_delay_effects(gdf):
    """
    Add delay metrics to `gdf` in place and return the same object.

    Columns written, in this order:
      p50_mph_avg   - mean of p50_mph_midday, p50_mph_pm and p50_mph_am
      diff_from_avg - p80_mph minus p50_mph_avg
      actual_time   - miles_from_last / p50_mph_avg (hours)
      best_time     - miles_from_last / p80_mph (hours)
      delay_hours   - actual_time minus best_time, floored at 0
      time_lost     - delay_hours in minutes
      total_ridership_minutes_lost - time_lost * calculated_segment_ridership
    """
    distance = gdf['miles_from_last']
    best_speed = gdf['p80_mph']

    # Unweighted mean of the three period medians
    mean_speed = gdf['p50_mph_midday'].add(gdf['p50_mph_pm']).add(gdf['p50_mph_am']).div(3)
    gdf['p50_mph_avg'] = mean_speed
    gdf['diff_from_avg'] = best_speed - mean_speed

    # Travel time over the segment at the typical vs. the 80th-percentile speed
    typical_hours = distance / mean_speed
    fastest_hours = distance / best_speed
    gdf['actual_time'] = typical_hours
    gdf['best_time'] = fastest_hours

    # A segment never earns "negative delay"
    lost_hours = np.maximum(0, typical_hours - fastest_hours)
    gdf['delay_hours'] = lost_hours
    gdf['time_lost'] = lost_hours * 60

    # Scale per-trip minutes by the riders on board
    gdf['total_ridership_minutes_lost'] = gdf['time_lost'] * gdf['calculated_segment_ridership']
    return gdf

In [35]:
# Add the delay columns; the function writes them onto merged_gdf itself and
# returns the same object.
merged_gdf = calculate_delay_effects(merged_gdf)

In [None]:
# Display the frame with the newly added delay columns.
merged_gdf

In [37]:
# Keep only the columns needed for segment merging and mapping.
# .copy() detaches the selection from merged_gdf so the column conversions
# applied to gdf_to_merge in a later cell do not raise SettingWithCopyWarning.
gdf_to_merge = merged_gdf[['stop_id', 'stop_name', 'route_id', 'route_short_name', 'direction_id',
        'miles_from_last', 'geometry', 'uid',
       'stop_sequence',
       'stop_route_short_dir_id',
       'calculated_segment_ridership', 'p50_mph_avg', 'p80_mph', 'diff_from_avg',
       'time_lost', 'total_ridership_minutes_lost']].copy()

In [38]:
# segments_data = gpd.read_file("segments.geojson")
# segments_data = segments_data.drop(columns=['segment_id', 'row_id', 'trip_id', 'traversals', 'traversal_time', 'speed', 'stop_id2'])
# segments_data = segments_data.rename(columns={"stop_id1": "stop_id"})

# def split_before_dash(df, source_column, new_column):
#     df[new_column] = pd.to_numeric(df[source_column].str.split('-').str[0], errors='coerce')
#     return df

# segments_data = split_before_dash(segments_data, 'route_id', 'route_short_name_numeric')
# segments_data

In [39]:
#combined = merged_consolidated_gdf.merge(segments_data, how='left', on=['route_short_name_numeric', 'stop_id', 'direction_id'])
# combined = merged_consolidated_gdf
# combined

In [40]:
import pandas as pd
import numpy as np

def merge_without_expansion(merged_consolidated_gdf, segments_data):
    """
    Left-join `segments_data` onto `merged_consolidated_gdf` without adding rows.

    Rows with a null route_short_name are removed from the left frame, both
    frames are reduced to the first row per (route_short_name_numeric, stop_id,
    direction_id) key, and the left frame is then left-merged with the right
    one on that key. Duplicate rows and the match counts are printed along the
    way. The input frames are not modified.

    Returns:
    DataFrame: one row per remaining left-hand key, with the matching
        right-hand columns where a match exists.
    """
    key_cols = ['route_short_name_numeric', 'stop_id', 'direction_id']

    # Remove rows where route_short_name is None or NaN
    left = merged_consolidated_gdf.dropna(subset=['route_short_name'])
    right = segments_data
    print(f"Rows in merged_gdf after removing None/NaN route_short_name: {len(left)}")

    # Report every row that shares its key with another row
    print("\nChecking for duplicates in merged_gdf:")
    left_dupes = left.duplicated(subset=key_cols, keep=False)
    print(left[left_dupes])

    print("\nChecking for duplicates in segments:")
    right_dupes = right.duplicated(subset=key_cols, keep=False)
    print(right[right_dupes])

    # Keep the first occurrence per key on both sides so the join is 1:1
    left = left.drop_duplicates(subset=key_cols, keep='first')
    right = right.drop_duplicates(subset=key_cols, keep='first')

    # indicator=True adds a '_merge' column telling which rows found a partner
    combined = left.merge(right, how='left', on=key_cols, indicator=True)

    print("\nMerge results:")
    print(combined['_merge'].value_counts())

    # The indicator was only needed for the report above
    combined = combined.drop(columns=['_merge'])

    print(f"\nFinal row count: {len(combined)}")
    return combined

# try:
#     combined = merge_without_expansion(merged_consolidated_gdf, segments_data)
# except pd.errors.MergeError as e:
#     print(f"MergeError occurred: {e}")

In [None]:
# Store these columns as text. merge_geom_ov_fast sums every numeric column
# when it merges overlapping segments, whereas text columns are joined with
# ', ' instead.
for text_col in ('direction_id', 'stop_sequence', 'p50_mph_avg', 'p80_mph', 'diff_from_avg'):
    gdf_to_merge[text_col] = gdf_to_merge[text_col].astype(str)

# Keep a text copy of the ridership next to the numeric column
gdf_to_merge['calculated_segment_ridership_str'] = gdf_to_merge['calculated_segment_ridership'].astype(str)
gdf_to_merge.info()

In [None]:
# Display the reduced frame after the string conversions.
gdf_to_merge

In [None]:
# Split by direction. direction_id was converted to text in an earlier cell,
# so the comparison values are the strings '0.0' and '1.0'.
gdf_to_merge_dir_zero = gdf_to_merge[gdf_to_merge['direction_id'] == '0.0']
gdf_to_merge_dir_one = gdf_to_merge[gdf_to_merge['direction_id'] == '1.0']

# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in print() (which would add a stray "None" line).
gdf_to_merge_dir_zero.info()
gdf_to_merge_dir_one.info()

In [None]:
# CRS84 is equivalent to EPSG:4326 with longitude/latitude ordering
# set_crs only (re)labels the coordinates; allow_override=True permits
# replacing an existing CRS label without reprojecting.
whole_crs = gdf_to_merge.set_crs('EPSG:4326', allow_override=True)
gdf_to_merge_dir_zero_crs = gdf_to_merge_dir_zero.set_crs('EPSG:4326', allow_override=True)
# NOTE(review): gdf_to_merge_dir_one_crs is not used by the later cells visible
# here (the merge step is given gdf_to_merge_dir_one instead) — confirm.
gdf_to_merge_dir_one_crs = gdf_to_merge_dir_one.set_crs('EPSG:4326', allow_override=True)

# Check lengths in decimal degrees
print("Lengths in decimal degrees:")
print(gdf_to_merge_dir_zero_crs.geometry.length.describe())

# Convert to UTM Zone 11N (appropriate for LA area) and check lengths.
# From here on gdf_to_merge_dir_zero_crs is in EPSG:32611, not EPSG:4326.
gdf_to_merge_dir_zero_crs = gdf_to_merge_dir_zero_crs.to_crs('EPSG:32611')
print("\nLengths in meters:")
print(gdf_to_merge_dir_zero_crs.geometry.length.describe())

In [46]:
def merge_geom_ov_fast(gdf, overlap_threshold=0.8, debug=True):
    """
    Merge polygons that substantially overlap into single features.

    Two polygons are linked when the area of their intersection divided by the
    area of the smaller polygon is at least `overlap_threshold`. Links are
    followed transitively (connected components of the overlap graph) and every
    group of linked polygons is collapsed into one row.

    Aggregation for a merged group:
      - geometry: union of the member geometries
      - 'miles_from_last': maximum
      - other numeric columns: sum
      - all other columns: unique non-null values joined with ', '
    Rows that overlap with nothing are passed through unchanged.

    Parameters:
    gdf (GeoDataFrame): Input GeoDataFrame with polygon geometries; geometries
        that are not shapely Polygons are never linked to anything
    overlap_threshold (float): Minimum overlap ratio required for merging (0.0 to 1.0)
    debug (bool): Whether to print progress messages

    Returns:
    GeoDataFrame: Processed GeoDataFrame with selectively merged segments, or
        None if an exception occurred (the error is printed).
    """
    def log_debug(message):
        if debug:
            print(message)

    try:
        log_debug("Starting optimized segment merging...")

        n = len(gdf)
        if n == 0:
            # Nothing to merge: hand back an empty copy instead of failing
            # while building a frame from zero rows.
            log_debug("Found 0 distinct groups")
            log_debug("Final segment count: 0")
            return gdf.copy()

        # Materialise the geometries once; looking them up through
        # gdf.geometry.iloc[...] inside the pair loop repeats that work for
        # every candidate pair.
        geoms = list(gdf.geometry)
        spatial_index = gdf.sindex

        # Pre-compute areas for all polygons
        areas = np.array([geom.area for geom in geoms])

        def check_overlap(idx1, idx2):
            """True when the two polygons overlap by at least the threshold."""
            if idx1 == idx2:
                return False

            geom1 = geoms[idx1]
            geom2 = geoms[idx2]

            if not (isinstance(geom1, Polygon) and isinstance(geom2, Polygon)):
                return False

            min_area = min(areas[idx1], areas[idx2])
            if min_area == 0:
                return False

            intersection_area = geom1.intersection(geom2).area
            return (intersection_area / min_area) >= overlap_threshold

        # Edges of the (symmetric) overlap graph in COO form
        rows = []
        cols = []
        data = []

        for idx in range(n):
            # Bounding-box candidates from the spatial index
            potential_matches_idx = list(spatial_index.intersection(geoms[idx].bounds))

            for match_idx in potential_matches_idx:
                if match_idx <= idx:  # Only check each pair once
                    continue

                if check_overlap(idx, match_idx):
                    rows.extend([idx, match_idx])
                    cols.extend([match_idx, idx])
                    data.extend([1, 1])

        adjacency_matrix = csr_matrix((data, (rows, cols)), shape=(n, n))

        # Each connected component is one group of mutually linked segments
        n_components, labels = connected_components(adjacency_matrix, directed=False)

        log_debug(f"Found {n_components} distinct groups")

        new_rows = []

        for component_id in range(n_components):
            component_indices = np.where(labels == component_id)[0]

            if len(component_indices) == 1:
                # Single segment, no merging needed
                new_rows.append(gdf.iloc[component_indices[0]])
                continue

            group_data = gdf.iloc[component_indices]

            # shapely.ops.unary_union (imported at the top of the notebook)
            # is used instead of the GeoSeries.unary_union property, which
            # newer geopandas releases deprecate.
            merged_geom = unary_union(list(group_data.geometry))

            # Aggregate non-geometric data
            agg_data = {}
            for col in group_data.columns:
                if col == 'geometry':
                    continue
                elif col == 'miles_from_last':
                    agg_data[col] = group_data[col].max()
                elif pd.api.types.is_numeric_dtype(group_data[col]):
                    agg_data[col] = group_data[col].sum()
                else:
                    values = group_data[col].dropna().unique()
                    agg_data[col] = ', '.join(str(v) for v in values)

            agg_data['geometry'] = merged_geom
            new_rows.append(pd.Series(agg_data))

        # Create final GeoDataFrame
        merged_gdf = gpd.GeoDataFrame(new_rows, crs=gdf.crs)
        log_debug(f"Final segment count: {len(merged_gdf)}")

        return merged_gdf

    except Exception as e:
        # Best-effort contract: callers test for None. The message is printed
        # even when debug is False so a failure is never completely silent.
        print(f"Error during merging: {str(e)}")
        return None

In [None]:
# Merge overlapping segments separately for each direction.
# merge_geom_ov_fast returns None on failure, so BOTH results are checked
# before later cells use them.
# NOTE(review): the direction-0 input was reprojected to EPSG:32611 earlier,
# while the direction-1 input is the unprojected frame — confirm this
# asymmetry is intended.
try:
    processed = merge_geom_ov_fast(gdf_to_merge_dir_zero_crs)
    processed_one = merge_geom_ov_fast(gdf_to_merge_dir_one)

    if processed is None or processed_one is None:
        print("Processing failed - check the error messages above")
except Exception as e:
    print(f"Unexpected error: {e}")

In [None]:

# Merge overlapping segments across both directions at once.
try:
    whole_crs_proc = merge_geom_ov_fast(whole_crs)

    # Check the result of THIS call (the original tested `processed`, which
    # belongs to the per-direction cell).
    if whole_crs_proc is None:
        print("Processing failed - check the error messages above")
except Exception as e:
    print(f"Unexpected error: {e}")

In [49]:
# Normalise the lost rider-minutes by segment length.
# NOTE(review): a 'miles_from_last' of 0 would make this inf or NaN — confirm
# whether zero-length segments occur.
processed['minutes_lost_per_mile'] = processed['total_ridership_minutes_lost'] / processed['miles_from_last']
processed_one['minutes_lost_per_mile'] = processed_one['total_ridership_minutes_lost'] / processed_one['miles_from_last']
whole_crs_proc['minutes_lost_per_mile'] = whole_crs_proc['total_ridership_minutes_lost'] / whole_crs_proc['miles_from_last']

In [None]:
import folium
import geopandas as gpd
import numpy as np
from branca.colormap import LinearColormap

# Interactive map of total rider-minutes lost per merged segment, both directions.

# Create a map centered on Los Angeles
m = folium.Map(location=[34.0522, -118.2437], zoom_start=10)

# Create a color map whose range covers BOTH layers drawn below; using only
# `processed` would clamp direction-1 values that fall outside its range.
minutes_lost_all = pd.concat([
    processed['total_ridership_minutes_lost'],
    processed_one['total_ridership_minutes_lost'],
])
vmin = minutes_lost_all.min()
vmax = minutes_lost_all.max()
color_map = LinearColormap(colors=['blue', 'yellow', 'red'], vmin=vmin, vmax=vmax)

# Function to style the features
def style_function(feature):
    """Colour a feature by its minutes lost; gray when the value is missing."""
    ridership = feature['properties']['total_ridership_minutes_lost']
    if ridership is None or np.isnan(ridership):
        color = 'gray'  # Use gray for null values
    else:
        color = color_map(ridership)
    return {
        'color': color,
        'weight': 2,
        'fillOpacity': 0.7
    }

# Define fields and aliases for tooltip (excluding 'geometry')
tooltip_fields = [
    'stop_id', 'stop_name', 'route_id', 'direction_id',
    'miles_from_last', 'calculated_segment_ridership_str', 'calculated_segment_ridership',  'p50_mph_avg',
    'diff_from_avg', 'time_lost', 'total_ridership_minutes_lost', 'uid'
]

tooltip_aliases = [
    'Stop ID:', 'Stop Name:', 'Route ID:', 'Direction:',
    'Miles From Last:', 'Segment Ridership:', 'Segment Ridership Aggregated:', 'Average Speed (mph):',
    'Speed Difference:', 'Time Lost (min):', 'Total Minutes Lost:', 'uid'
]

tooltip_style = """
            background-color: #F0EFEF;
            border: 2px solid black;
            border-radius: 3px;
            box-shadow: 3px;
            padding: 10px;
            font-size: 12px;
        """

# Add one layer per direction. A fresh tooltip/popup object is created for
# each layer. The popup is a GeoJsonPopup because folium.GeoJson only attaches
# Popup / GeoJsonPopup objects; a plain function passed as `popup=` is not
# attached.
for layer in (processed, processed_one):
    folium.GeoJson(
        layer,
        style_function=style_function,
        popup=folium.GeoJsonPopup(
            fields=['stop_name', 'route_id', 'total_ridership_minutes_lost'],
            aliases=['Stop Name:', 'Route:', 'Minutes Lost:'],
            localize=True,
            labels=True,
        ),
        tooltip=folium.GeoJsonTooltip(
            fields=tooltip_fields,
            aliases=tooltip_aliases,
            localize=True,
            sticky=False,
            labels=True,
            style=tooltip_style,
            max_width=800,
        )
    ).add_to(m)

# Add the color map legend to the map
color_map.add_to(m)

# Add a title to the map
title_html = '''
    <h3 align="center" style="font-size:16px"><b>Los Angeles Bus Segments</b></h3>
'''
m.get_root().html.add_child(folium.Element(title_html))

# Save the map
# m.save('los_angeles_bus_map.html')

# Display the map (if you're in a Jupyter notebook)
m

In [None]:
import folium
import geopandas as gpd
import numpy as np
from branca.colormap import LinearColormap

# Interactive map of rider-minutes lost per mile for each merged segment, both directions.

# Create a map centered on Los Angeles
m = folium.Map(location=[34.0522, -118.2437], zoom_start=10)

# Create a color map whose range covers BOTH layers drawn below. Non-finite
# values (division by a zero segment length gives inf) are excluded so they
# cannot stretch the scale to infinity.
per_mile_all = pd.to_numeric(
    pd.concat([processed['minutes_lost_per_mile'], processed_one['minutes_lost_per_mile']]),
    errors='coerce',
).replace([np.inf, -np.inf], np.nan)
vmin = per_mile_all.min()
vmax = per_mile_all.max()
color_map = LinearColormap(colors=['blue', 'yellow', 'red'], vmin=vmin, vmax=vmax)

# Function to style the features
def style_function(feature):
    """Colour a feature by minutes lost per mile; gray when missing or non-finite."""
    per_mile = feature['properties']['minutes_lost_per_mile']
    if per_mile is None or not np.isfinite(per_mile):
        color = 'gray'  # Use gray for null / infinite values
    else:
        color = color_map(per_mile)
    return {
        'color': color,
        'weight': 2,
        'fillOpacity': 0.7
    }

# Define fields and aliases for tooltip (excluding 'geometry')
tooltip_fields = [
    'stop_id', 'stop_name', 'route_id', 'direction_id',
    'miles_from_last', 'calculated_segment_ridership_str', 'calculated_segment_ridership',  'p50_mph_avg',
    'diff_from_avg', 'time_lost', 'total_ridership_minutes_lost', 'minutes_lost_per_mile'
]

tooltip_aliases = [
    'Stop ID:', 'Stop Name:', 'Route ID:', 'Direction:',
    'Miles From Last:', 'Segment Ridership:', 'Segment Ridership Aggregated:', 'Average Speed (mph):',
    'Speed Difference:', 'Time Lost (min):', 'Total Minutes Lost:', 'minutes_lost_per_mile'
]

tooltip_style = """
            background-color: #F0EFEF;
            border: 2px solid black;
            border-radius: 3px;
            box-shadow: 3px;
            padding: 10px;
            font-size: 12px;
        """

# Add one layer per direction. A fresh tooltip/popup object is created for
# each layer. The popup is a GeoJsonPopup because folium.GeoJson only attaches
# Popup / GeoJsonPopup objects; a plain function passed as `popup=` is not
# attached.
for layer in (processed, processed_one):
    folium.GeoJson(
        layer,
        style_function=style_function,
        popup=folium.GeoJsonPopup(
            fields=['stop_name', 'route_id', 'minutes_lost_per_mile'],
            aliases=['Stop Name:', 'Route:', 'Minutes Lost Per Mile:'],
            localize=True,
            labels=True,
        ),
        tooltip=folium.GeoJsonTooltip(
            fields=tooltip_fields,
            aliases=tooltip_aliases,
            localize=True,
            sticky=False,
            labels=True,
            style=tooltip_style,
            max_width=800,
        )
    ).add_to(m)

# Add the color map legend to the map
color_map.add_to(m)

# Add a title to the map
title_html = '''
    <h3 align="center" style="font-size:16px"><b>Los Angeles Bus Segments</b></h3>
'''
m.get_root().html.add_child(folium.Element(title_html))

# Save the map
#m.save('los_angeles_bus_map.html')

# Display the map (if you're in a Jupyter notebook)
m

In [None]:
# Ten segments from `processed_one` with the most rider-minutes lost.
processed_one.sort_values('total_ridership_minutes_lost', ascending=False).head(10)


In [None]:
# Calculate total ridership minutes lost across all segments (both directions)
total_minutes = processed['total_ridership_minutes_lost'].sum() + processed_one['total_ridership_minutes_lost'].sum()

# Convert to hours for better readability
total_hours = total_minutes / 60

# NOTE(review): 770000 is an unexplained constant — presumably the number of
# passengers (boardings) the total is spread over; confirm the source and
# consider naming it.
per_pass = total_minutes / 770000

print(f"Total passenger minutes lost: {total_minutes:,.2f}")
print(f"Total passenger hours lost: {total_hours:,.2f}")
print(f"Total minutes per passenger lost: {per_pass:,.2f}")

In [54]:
# Convert both to WGS 84 so the two frames share one CRS before they are
# stacked (to_crs requires each frame to already carry a CRS)
processed = processed.to_crs("EPSG:4326")
processed_one = processed_one.to_crs("EPSG:4326")

# Now concatenate the two directions into a single frame
recombined = pd.concat([processed, processed_one], ignore_index=True)

# Export to GeoJSON
recombined.to_file("combined_routes.geojson", driver='GeoJSON')