In [21]:
import pandas as pd
try:
    import osmsnx
except:
    %pip install osmnx
    import osmnx
import os
import geopandas as gpd
try:
    import plotly.express as px
except:
    %pip install plotly.express
    import plotly.express as px
from shapely.geometry import Point

Note: you may need to restart the kernel to use updated packages.


In [7]:
bus_delays = gpd.read_file('data/bus_delay_2025.csv')

In [31]:
bus_delays

Unnamed: 0,_id,Date,Line,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Vehicle
0,1,2025-01-01T00:00:00,102 MARKHAM ROAD,02:15,Wednesday,WARDEN STATION,MFESA,20,40,N,3442
1,2,2025-01-01T00:00:00,65 PARLIAMENT,02:15,Wednesday,KIPLING STATION,MFUS,0,0,,0
2,3,2025-01-01T00:00:00,64 MAIN,02:40,Wednesday,BROADVIEW STATION,MFUI,0,0,,8546
3,4,2025-01-01T00:00:00,100 FLEMINGDON PARK,02:43,Wednesday,OVERLEA AND THORNCLIFF,MFSAN,17,32,N,8693
4,5,2025-01-01T00:00:00,34 EGLINTON EAST,03:05,Wednesday,EGLINTON AND DON MILLS,MFUI,20,40,W,8801
...,...,...,...,...,...,...,...,...,...,...,...
46286,46287,2025-09-30T00:00:00,131 NUGGET,23:42,Tuesday,NUGGET AND SHORTING,MFDV,21,42,W,1283
46287,46288,2025-09-30T00:00:00,129 MCCOWAN NORTH,00:12,Tuesday,MCCOWAND AND TRAIL RID,EFD,30,30,S,3212
46288,46289,2025-09-30T00:00:00,40 JUNCTION-DUNDAS WES,00:29,Tuesday,DUNDAS AND PACIFIC,MFSAN,30,60,E,8123
46289,46290,2025-09-30T00:00:00,72 PAPE,00:54,Tuesday,PAPE STATION,MFUI,10,20,N,8410


In [41]:
stops_locations = gpd.read_file('data/stops.csv')
stops_locations

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,662,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1
1,929,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1
2,940,940,Davenport Rd at Dupont St,,43.675511,-79.401938,,,,,,2
3,1871,1871,Davisville Ave at Cleveland St,,43.702088,-79.378112,,,,,,1
4,11700,11700,Disco Rd at Attwell Dr,,43.701362,-79.594843,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9301,16806,16806,Kingston Rd at Rylander Blvd,,43.797281,-79.148975,,,,,,1
9302,16807,16807,Port Union Rd at Tilley Dr,,43.788615,-79.140347,,,,,,1
9303,16808,16808,Port Union Rd at Conference Blvd,,43.786023,-79.139246,,,,,,1
9304,16809,16809,Lawrence Ave East at Port Union Rd,,43.779626,-79.136112,,,,,,1


In [26]:
#converting separate longitude and latitude comments into a point geometry
stops_locations['geometry'] = stops_locations.apply(
    lambda row: Point(row['stop_lon'], row['stop_lat']), 
    axis=1
)
gdf = gpd.GeoDataFrame(stops_locations, geometry='geometry', crs="EPSG:4326")
gdf.explore()

In [82]:
#just a vibe check
gdf.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,geometry
0,662,662,DANFORTH AT KENNEDY,,43.714379,-79.260939,,,,,,1,POINT (-79.26094 43.71438)
1,929,929,DAVENPORT AT BEDFORD,,43.674448,-79.399659,,,,,,1,POINT (-79.39966 43.67445)
2,940,940,DAVENPORT AT DUPONT,,43.675511,-79.401938,,,,,,2,POINT (-79.40194 43.67551)
3,1871,1871,DAVISVILLE AT CLEVELAND,,43.702088,-79.378112,,,,,,1,POINT (-79.37811 43.70209)
4,11700,11700,DISCO AT ATTWELL,,43.701362,-79.594843,,,,,,1,POINT (-79.59484 43.70136)


In [42]:
# converting data from object to numeric + filling N/A sections
bus_delays_clean = bus_delays.copy()
bus_delays_clean['Min Delay'] = pd.to_numeric(bus_delays_clean['Min Delay'], errors='coerce')
bus_delays_clean['Bound'] = bus_delays_clean['Bound'].fillna('Unknown')

#summarizing delay data by stop, summing instances + minutes
delay_summary = (bus_delays_clean
                 .groupby(['Line', 'Station', 'Bound'])
                 .agg(delay_count=('Min Delay', 'count'), 
                      total_delay_minutes=('Min Delay', 'sum'))
                 .reset_index())
                 #.sort_values('delay_count', ascending=False))

# After creating delay_summary, filtering out rows with 0 total_delay_minutes, cus i think they don't make sense?
delay_summary = delay_summary[delay_summary['total_delay_minutes'] > 0]
delay_summary

Unnamed: 0,Line,Station,Bound,delay_count,total_delay_minutes
0,08,BROADVIEW STATION,S,1,22
1,09 BELLAMY,BELLAMY AND NELSON,,1,92
4,1 SHUTTLE,EGLINTON STATION,,1,7
10,10 VAN HORNE,BRIAN AND VAN HORNE,N,1,30
11,10 VAN HORNE,DON MILLS AND GOODVIEW,S,1,30
...,...,...,...,...,...
21404,SHUTTLE BUS,UNIVERSITY AND KING,S,1,1
21405,SHUTTLE BUS,WARDEN STATION,N,1,2
21407,SHUTTLE BUS,WOODBINE STATION,E,1,2
21409,SHUTTLE BUS,YONGE AND GERRARD,S,1,1


In [48]:
#this was done with the hape of deepseek guys
import re 
def clean_stop_name(name):
    """Clean stop names by removing street types and standardizing formats"""
    if pd.isna(name):
        return ""
    
    name = str(name).upper().strip()
    
    # Remove common street types
    street_types = [' RD', ' BLVD',' DR',' AVE', ' ST', ' CRES',
                   ' LN', ' PL', ' CT']
    
    for street_type in street_types:
        name = re.sub(r'\b' + re.escape(street_type) + r'\b', '', name)
    
    # Standardize intersection connectors
    name = re.sub(r'\s+AND\s+', ' AT ', name)
    
    # Clean up extra spaces
    name = re.sub(r'\s+', ' ', name).strip()
    
    return name

In [49]:
# Modify stop_name column IN PLACE in gdf
gdf['stop_name'] = gdf['stop_name'].apply(clean_stop_name)

# Modify Station column IN PLACE in delay_summary  
delay_summary['Station'] = delay_summary['Station'].apply(clean_stop_name)  # Using same function

print("Modified gdf stop_name column:")
print(gdf['stop_name'].head())
print("\nModified delay_summary Station column:")
print(delay_summary['Station'].head())

Modified gdf stop_name column:
0        DANFORTH AT KENNEDY
1       DAVENPORT AT BEDFORD
2        DAVENPORT AT DUPONT
3    DAVISVILLE AT CLEVELAND
4           DISCO AT ATTWELL
Name: stop_name, dtype: object

Modified delay_summary Station column:
0         BROADVIEW STATION
1         BELLAMY AT NELSON
4          EGLINTON STATION
10       BRIAN AT VAN HORNE
11    DON MILLS AT GOODVIEW
Name: Station, dtype: object


In [65]:
# Now joining stop data coords to delay data using the modified columns
delay_with_coords = delay_summary.merge(
    gdf[['stop_name', 'geometry']],  
    left_on='Station', 
    right_on='stop_name',
    how='left'  # Keep all delay records even if no match found
)

print(f"Joined dataset shape: {delay_with_coords.shape}")
print(f"Successfully matched: {delay_with_coords['geometry'].notna().sum()} out of {len(delay_with_coords)}")

Joined dataset shape: (26584, 7)
Successfully matched: 12517 out of 26584


In [78]:
delay_with_coords.sort_values('delay_count', ascending=False)
#seems like there is a bunch of duplicates
#there is different coords for finch west but the delay count and delay minutes are the same...???

Unnamed: 0,Line,Station,Bound,delay_count,total_delay_minutes,stop_name,geometry
10935,36 FINCH WEST,FINCH WEST STATION,,80,580,FINCH WEST STATION,POINT (-79.49095 43.76536)
10937,36 FINCH WEST,FINCH WEST STATION,,80,580,FINCH WEST STATION,POINT (-79.49086 43.76492)
10938,36 FINCH WEST,FINCH WEST STATION,,80,580,FINCH WEST STATION,POINT (-79.49081 43.76472)
10936,36 FINCH WEST,FINCH WEST STATION,,80,580,FINCH WEST STATION,POINT (-79.4909 43.76513)
9375,32 EGLINTON WEST,EGLINTON STATION,,79,732,EGLINTON STATION,POINT (-79.39946 43.70557)
...,...,...,...,...,...,...,...
11159,37 ISLINGTON,ISLINGTON AT BIRMINGH,S,1,30,,
11156,37 ISLINGTON,ISLINGTON AT BERGAMOT,N,1,20,ISLINGTON AT BERGAMOT,POINT (-79.5562 43.71712)
11155,37 ISLINGTON,ISLINGTON AT BERGAMOT,N,1,20,ISLINGTON AT BERGAMOT,POINT (-79.55574 43.71675)
11154,37 ISLINGTON,ISLINGTON AT BERGAMON,S,1,15,,


In [75]:
# Removing duplicates based on specific key columns (most common approach)
initial_count = len(delay_with_coords)
delay_clean = delay_with_coords.drop_duplicates(
    subset=['Line', 'Station', 'Bound', 'delay_count', 'total_delay_minutes', 'stop_name']  # Adjust columns as needed
)
final_count = len(delay_clean)

print(f"Removed {initial_count - final_count} duplicate rows")
print(f"Final count: {final_count} rows")

Removed 7915 duplicate rows
Final count: 18669 rows


In [79]:
delay_clean.sort_values('delay_count', ascending=False)

Unnamed: 0,Line,Station,Bound,delay_count,total_delay_minutes,stop_name,geometry
10935,36 FINCH WEST,FINCH WEST STATION,,80,580,FINCH WEST STATION,POINT (-79.49095 43.76536)
9369,32 EGLINTON WEST,EGLINTON STATION,,79,732,EGLINTON STATION,POINT (-79.39831 43.70431)
595,102 MARKHAM ROAD,WARDEN STATION,N,72,1007,WARDEN STATION,POINT (-79.28001 43.70982)
2291,116 MORNINGSIDE,KENNEDY STATION,,68,494,,
7816,24 VICTORIA PARK,VICTORIA PARK STATION,N,61,845,VICTORIA PARK STATION,POINT (-79.28846 43.69398)
...,...,...,...,...,...,...,...
10571,35 JANE,KEELE STATION,N,1,10,KEELE STATION,POINT (-79.45957 43.65565)
10576,35 JANE,MOUNT DENNIS,N,1,13,,
10577,35 JANE,MOUNT DENNIS GARAGE,N,1,12,MOUNT DENNIS GARAGE,POINT (-79.49545 43.69349)
10579,35 JANE,MT DENNIS GARAGE,,1,5,,


In [81]:
print(f"Successfully matched: {delay_clean['geometry'].notna().sum()} out of {len(delay_clean)}")

Successfully matched: 4613 out of 18669
