In [1]:
try:
    import Levenshtein # python library containing functions using Levenshtein distances, to help string-matching
except:
    %pip install Levenshtein
    import Levenshtein

# Found out about it here: https://www.geeksforgeeks.org/python/introduction-to-python-levenshtein-module/

In [2]:
import geopandas as gpd # import geopandas to read in data files

In [3]:
# read in 2025 bus delay data
bus_2025_stops_delays = gpd.read_file('bus_delay_station_count_2025.csv')

In [4]:
# read in the official stop names from gtfs for matching year
gtfs_stops_data = gpd.read_file('./oct_2025_stops/stops.shp')

# visualize the data
gtfs_stops_data['stop_name'].values[:20]

array(['Danforth Rd at Kennedy Rd', 'Davenport Rd at Bedford Rd',
       'Davenport Rd at Dupont St', 'Davisville Ave at Cleveland St',
       'Disco Rd at Attwell Dr', 'Disco Rd at Attwell Dr',
       'Disco Rd at Carlingview Dr East Side',
       'Disco Rd at Carlingview Dr', 'Don Mills Rd at Eglinton Ave East',
       'Don Mills Rd at Eglinton Ave East North Side',
       'Don Mills Rd at Lawrence Ave East South Side',
       'Don Mills Rd at Leith Hill Rd',
       'Don Mills Rd at Steeles Ave East',
       'Don Mills Rd at Van Horne Ave', 'Don Mills Rd at Wynford Dr',
       'Doncaster Ave at Main St West Side', 'Doncaster Ave at Main St',
       'Doncaster Ave at Main St', 'Drewry Ave at Gardenia Crt',
       'Drewry Ave at Grantbrook St'], dtype=object)

In [34]:
# subset to work with only relevant data for now
names_points = gtfs_stops_data[['stop_name', 'geometry']]

import string # allows for neater string cleaning

def clean_name(name: str) -> str:
    '''
    Takes in a name from either input file. Cleans them to be consistently formatted.
    
    '''
    # things to be removed from the file that contains them to assist in matching
    removers = ['RD', 'CRT',
                 'DR', 'ST', 'BLVD', 'AVE', 'GR',
                 'LN', 'SIDE', 'PKWY', 'PL']
    
    directions = ['NORTH', 'SOUTH', 'EAST', 'WEST']
    
    # possible connectors for intersections
    special_cases = ['AT', 'AND', '&', '/']
    
    # tokenize name
    split_form = name[0].split(' ')

    # accumulator list for cleaned name
    corrected_form = []

    for item in split_form:
        # make it uppercase
        item = item.upper() # make uppercase but do not clean punctuation out YET because it might be valuable
        
        if item in special_cases: # changing all ATs to ANDs for consistency
            corrected_form.append('&')

        item = item.translate(str.maketrans('', '', string.punctuation)) # now we can remove punctuation

        if item == 'STN': # changing all instances of STN to STATION for consistency
            corrected_form.append('STATION')
        
        elif item in directions: # relace directions with letters for consistency and to minimize Levenshtein distance when the direction is only in one string
            corrected_form.append(item[0])

        elif (item not in removers) and (item not in special_cases): # don't add any tokens that we do not want
            corrected_form.append(item.upper())
        
    # put all the tokens back together
    joined_form = " ". join(corrected_form)
    
    # return cleaned string
    return joined_form




In [38]:
# create new column storing cleaned versions of names from gtfs data
names_points['clean_name'] = names_points.apply(lambda row: clean_name([row['stop_name']]), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [37]:
# create analagous column in the delay bus names dataset
bus_2025_stops_delays['clean_name'] = bus_2025_stops_delays.apply(lambda row: clean_name([row['Location']]), axis = 1)

In [39]:
subset = list(set(bus_2025_stops_delays['clean_name'].values))[:30] # created a subset for testing out the match technique without repeats
dictionary = list(set(names_points['clean_name'].values)) # dictionary for testing, no repeats

for name in subset:
    # get the dictionary item with the shortest Levenshtein distance (minimum number of insertions, deletions, modifications to turn one string into another)
    closest_match = min(dictionary, key = lambda x: Levenshtein.distance(name, x))
    print(f"Closest match for '{name}': {closest_match}")


Closest match for '': 300 BAY
Closest match for 'MSNS': 400 EVANS
Closest match for 'SHEPPARD & BRENYON WAY': SHEPPARD E & BRENYON WAY W
Closest match for 'S  KODIAK & SHEPPARD': OAKDALE & SHEPPARD W
Closest match for 'CHERRY &  VILLARS': CHERRY & MILL S
Closest match for 'MILADY & ISLINGTON': EVANS & ISLINGTON
Closest match for 'MILLIKEN & KENNEDY': CLAIR E & KENNEDY
Closest match for 'BARMACSTEELE': 24 MABELLE
Closest match for 'KEELE &  GULLIVER': KEELE & GULLIVER
Closest match for 'VESTA & BATHURST': FLEET & BATHURST
Closest match for 'PJPS': 47 ORFUS
Closest match for 'SUNNYBROOK HOSPITAL HEES WING': GRANTBROOK & FINCH W N
Closest match for 'MANSE & HAINFORD': MANSE & HAINFORD
Closest match for 'CONSUMERS & VICTORIA PARK': CONSUMERS & VICTORIA PARK W
Closest match for 'BROADOAKS & KEELE': ROGERS & KEELE
Closest match for 'ROYWOOD': TORONTO ZOO
Closest match for 'EMMETT & EGLINTON': EMMETT & EGLINTON W N
Closest match for 'EGLINTON & MARLEE': EGLINTON W & MARLEE
Closest match for '

In [None]:
# try a better method that only assigns matches for good quality matches
bad_matches = []
good_matches = []
mid_matches = []
subset = list(set(bus_2025_stops_delays['clean_name'].values)) # make a set so no repeats
print(len(subset))

for name in subset:
    closest_match = min(dictionary, key = lambda x: Levenshtein.distance(name, x))
    if Levenshtein.ratio(closest_match, name) > 0.85: # Closest match has high ratio = good match
        good_matches.append(f"'{name}': {closest_match}")
    elif Levenshtein.ratio(closest_match, name) > 0.8 # Closest match has medium ratio = mid match
        mid_matches.append(f"'{name}': {closest_match}")
    else:
        bad_matches.append(f"'{name}': {closest_match}") # ratio below 8 is a bad match


2387


In [43]:
#print(good_matches)
print(len(subset))
print("NUMBER OF BAD MATCHES: ", len(bad_matches))
print(bad_matches[:15])
print("NUMBER OF MID MATCHES:", len(mid_matches))
print(mid_matches[:15])
print("NUMBER OF GOOD MATCHES: ", len(good_matches))
print(good_matches[:15])

2387
NUMBER OF BAD MATCHES:  1074
["'': 300 BAY", "'MSNS': 400 EVANS", "'S  KODIAK & SHEPPARD': OAKDALE & SHEPPARD W", "'MILADY & ISLINGTON': EVANS & ISLINGTON", "'MILLIKEN & KENNEDY': CLAIR E & KENNEDY", "'BARMACSTEELE': 24 MABELLE", "'PJPS': 47 ORFUS", "'SUNNYBROOK HOSPITAL HEES WING': GRANTBROOK & FINCH W N", "'BROADOAKS & KEELE': ROGERS & KEELE", "'ROYWOOD': TORONTO ZOO", "'91A PARKVIEW HILLS ROUTE': PARKVIEW HILL CRES & ALDER", "'COXWELL &  MORTIMER': COXWELL & ROBBINS", "'FOREMAN & EGLINTON': WARDEN & ELLINGTON", "'STCLAIR LOOP': STEELES E LOOP", "'KEELE RTE': KEELE & TORO"]
NUMBER OF MID MATCHES: 187
["'CHERRY &  VILLARS': CHERRY & MILL S", "'VESTA & BATHURST': FLEET & BATHURST", "'SHEPPARD & BEAMAN': SHEPPARD E & REAN", "'MAPLE LEAF & CULFORD': MAPLE LEAF & BOURDON", "'FOUNDER & IAN MACDONALD': THE POND & IAN MACDONALD", "'WESTON RDOAK': WESTON & OAK", "'KIPLING STATION TO  T3': KIPLING STATION", "'YOUNG &  FINCH': YONGE & FINCH E", "'ATHOL & ISLINGTON': ALBION & ISLINGTON", "'

In [None]:
print('Best match rate observed: ', len(good_matches)/len(subset) * 100

Best match rate observed:  0.47172182656053624
