In [56]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import unicodedata
import re
import os

In [57]:
from sentence_transformers import SentenceTransformer, util
from sklearn.neighbors import NearestNeighbors

# Initialize BERT model
model = SentenceTransformer('paraphrase-mpnet-base-v2')

## Load data

In [58]:
df_cra = pd.read_csv("../cra-data/cra_2021_combined_filtered.csv")
df_cra = df_cra.drop(columns=["Province", "Country"])
df_cra = df_cra.rename(columns={
    "BN": "CRA_BN_ID",
    "Category": "CRA_Category",
    "Sub Category": "CRA_SubCategory",
    "Designation": "CRA_Designation",
    "Legal Name": "CRA_LegalName",
    "Account Name": "CRA_Name",
    "Address Line 1": "CRA_Address1",
    "Address Line 2": "CRA_Address2",
    "City": "CRA_City",
    "Postal Code": "CRA_PostalCode",
    "Full Address": "CRA_FullAddress",
    "4050": "CRA_4050",
    "4155": "CRA_4155",
    "4850": "CRA_4850",
})

# Replace semicolons with " - " in all string columns
df_cra = df_cra.apply(lambda x: x.str.replace(';', ' - ') if x.dtype == 'object' else x)
# df_cra["Name"] = df_cra["Name"].str.split(pat=" - ", n=1).str[0]

df_cra = df_cra.drop_duplicates(subset=["CRA_Name"]).reset_index(drop=True)

In [59]:
# Clean and rename 211 data
df_211 = pd.read_csv("../211-data/2021_211_PeelYorkTO.csv", encoding='latin-1')
df_211 = df_211.drop(columns=["TaxonomyTerms"])
df_211 = df_211.rename(columns={
    "PublicName": "211_PublicName",
    "ParentAgency": "211_Name",
    "Address1":"211_Address1",
    "Address2":"211_Address2",
    "City":"211_City",
    "County":"211_County",
    "Province":"211_Province",
    "PostalCode":"211_PostalCode",
    "Latitude": "211_Latitude",
    "Longitude": "211_Longitude",
})

# Replace semicolons with " - " in all string columns
df_211 = df_211.apply(lambda x: x.str.replace(';', ' - ') if x.dtype == 'object' else x)

df_211 = df_211.drop_duplicates(subset=["211_PublicName", "211_Latitude", "211_Longitude"]).reset_index(drop=True)

## Algorithmic matching

In [62]:
# Clean text functions
def clean_text(text):
    if pd.isna(text):
        return ''
    text = unicodedata.normalize('NFKD', str(text)).encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'[^a-zA-Z0-9]', '', text).lower()
    return text

def clean_text_with_spaces(text):
    if pd.isna(text):
        return ''
    text = unicodedata.normalize('NFKD', str(text)).encode('ascii', 'ignore').decode('ascii')
    # Keep spaces but remove other special characters
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text).lower()
    # Collapse multiple spaces into one
    text = re.sub(r' +', ' ', text).strip()
    return text

# Prepare CRA data
df_cra_prep = df_cra.copy()
df_cra_prep['clean_name_CRA'] = df_cra['CRA_Name'].apply(clean_text)
df_cra_prep['clean_name_space_CRA'] = df_cra['CRA_Name'].apply(clean_text_with_spaces)
df_cra_prep['PostalCode_clean_CRA'] = df_cra['CRA_PostalCode'].str.replace(r'\s+', '', regex=True)

# Prepare 211 data
df_211_prep = df_211.copy()
df_211_prep['clean_name_211'] = df_211['211_Name'].apply(clean_text)
df_211_prep['clean_name_space_211'] = df_211['211_Name'].apply(clean_text_with_spaces)
df_211_prep['PostalCode_clean_211'] = df_211['211_PostalCode'].str.replace(r'\s+', '', regex=True)

print("Data preparation complete")
print(f"CRA records: {len(df_cra_prep)}")
print(f"211 records: {len(df_211_prep)}")

Data preparation complete
CRA records: 6519
211 records: 6405


In [63]:
# STEP 1: Input manual matches we've already found
df_manual_map = pd.read_csv('../joined-data/manual_matches.csv')

# Merge with df_cra_prep on CRA_name <-> Name
df_cra_joined = df_manual_map.merge(df_cra_prep, left_on='CRA_name', right_on='CRA_Name', how='left', suffixes=('', '_cra'))

# Merge with df_211_prep on 211_name <-> Name
df_full_joined = df_cra_joined.merge(df_211_prep, left_on='211_name', right_on='211_Name', how='left', suffixes=('', '_211'))
df_full_joined = df_full_joined.drop(columns=['CRA_name', '211_name'])
df_full_joined['Match_Method'] = 'Manual'

remaining_cra = df_cra_prep[~df_cra_prep['clean_name_CRA'].isin(df_full_joined['clean_name_CRA'].unique())]
remaining_211 = df_211_prep[~df_211_prep['clean_name_211'].isin(df_full_joined['clean_name_211'].unique())]

df_full_joined = df_full_joined.drop(columns=['clean_name_CRA', 'clean_name_space_CRA', 'PostalCode_clean_CRA', 'clean_name_211', 'clean_name_space_211', 'PostalCode_clean_211'])

# df_full_joined.to_csv('../joined-data/manual_matches_joined.csv', index=False)

print(f"Step 1 Complete: {len(df_full_joined)} matched rows")
print(f"Unique CRA orgs matched: {df_full_joined['CRA_BN_ID'].nunique()}")
print(f"Unique 211 branches matched: {df_full_joined['211_PublicName'].nunique()}")
print(f"Remaining CRA records: {len(remaining_cra)}")
print(f"Remaining 211 branches: {len(remaining_211)}")

Step 1 Complete: 288 matched rows
Unique CRA orgs matched: 94
Unique 211 branches matched: 285
Remaining CRA records: 6425
Remaining 211 branches: 6117


In [70]:
# STEP 2: Exact name matching
exact_matches = pd.merge(
    df_cra_prep,
    df_211_prep,
    left_on='clean_name_CRA',
    right_on='clean_name_211',
    how='inner',
    suffixes=('_cra', '_211')
)

if not exact_matches.empty:
    matched_orgs = exact_matches['clean_name_CRA'].unique()
    all_branches = df_211_prep[df_211_prep['clean_name_211'].isin(matched_orgs)]
    
    exact_match_results = pd.merge(
        df_cra_prep[df_cra_prep['clean_name_CRA'].isin(matched_orgs)],
        all_branches,
        left_on='clean_name_CRA',
        right_on='clean_name_211',
        how='left',
        suffixes=('_cra', '_211')
    )
    exact_match_results['Match_Method'] = 'Exact Name'
else:
    exact_match_results = pd.DataFrame()

# remaining_cra = df_cra_prep[~df_cra_prep['clean_name'].isin(exact_match_results['clean_name'].unique())]
# remaining_211 = df_211_prep[~df_211_prep['clean_name'].isin(exact_match_results['clean_name'].unique())]

remaining_cra = remaining_cra[~remaining_cra['clean_name_CRA'].isin(exact_match_results['clean_name_CRA'].unique())]
remaining_211 = remaining_211[~remaining_211['clean_name_211'].isin(exact_match_results['clean_name_211'].unique())]

exact_match_results = exact_match_results.drop(columns=['clean_name_CRA', 'clean_name_space_CRA', 'PostalCode_clean_CRA', 'clean_name_211', 'clean_name_space_211', 'PostalCode_clean_211'])

print(f"Step 1 Complete: {len(exact_match_results)} matched rows")
print(f"Unique CRA orgs matched: {exact_match_results['CRA_BN_ID'].nunique()}")
print(f"Unique 211 branches matched: {exact_match_results['211_PublicName'].nunique()}")
print(f"Remaining CRA records: {len(remaining_cra)}")
print(f"Remaining 211 branches: {len(remaining_211)}")

Step 1 Complete: 1094 matched rows
Unique CRA orgs matched: 393
Unique 211 branches matched: 1073
Remaining CRA records: 6032
Remaining 211 branches: 5023


In [72]:
# STEP 3: Postal code + fuzzy name matching using clean_name_space
common_postal_codes = set(remaining_cra['PostalCode_clean_CRA']).intersection(set(remaining_211['PostalCode_clean_211']))
postal_matched_results = []
postal_bad_matches = []

for postal_code in tqdm(common_postal_codes):
    cra_subset = remaining_cra[remaining_cra['PostalCode_clean_CRA'] == postal_code]
    two11_subset = remaining_211[remaining_211['PostalCode_clean_211'] == postal_code]
    
    if len(cra_subset) == 0 or len(two11_subset) == 0:
        continue
        
    cra_embeddings = model.encode(cra_subset['clean_name_space_CRA'].tolist())
    two11_embeddings = model.encode(two11_subset['clean_name_space_211'].tolist())
    
    nn = NearestNeighbors(n_neighbors=1, metric='cosine').fit(two11_embeddings)
    distances, indices = nn.kneighbors(cra_embeddings, return_distance=True)
    
    for i, (distance, index) in enumerate(zip(distances, indices)):
        similarity = 1 - distance[0]
        if similarity >= 0.80:
            matched_org_name = two11_subset.iloc[index[0]]['clean_name_space_211']
            matched_branches = remaining_211[remaining_211['clean_name_space_211'] == matched_org_name]
            
            for _, branch_row in matched_branches.iterrows():
                merged_row = cra_subset.iloc[i].to_dict()
                merged_row.update({f'{k}': v for k, v in branch_row.to_dict().items()})  # formerly f'{k}_211' -- we may lose clean_name etc for 211?
                merged_row['Match_Method'] = 'Postal Code + Fuzzy Name'
                merged_row['Similarity_Score'] = similarity
                postal_matched_results.append(merged_row)
        else:
            bad_row = {
                'CRA_Name': cra_subset.iloc[i]['CRA_Name'],
                '211_Name': two11_subset.iloc[index[0]]['211_Name'],
                'Postal_Code': postal_code,
                'Similarity_Score': similarity,
                'Match_Status': 'Below Threshold'
            }
            postal_bad_matches.append(bad_row)

postal_fuzzy_results = pd.DataFrame(postal_matched_results) if postal_matched_results else pd.DataFrame()
postal_bad_matches_df = pd.DataFrame(postal_bad_matches) if postal_bad_matches else pd.DataFrame()

# remaining_cra = df_cra_prep[~df_cra_prep['CRA_BN_ID'].isin(postal_fuzzy_results['CRA_BN_ID'])]
# remaining_211 = df_211_prep[~df_211_prep['clean_name_space_211'].isin(postal_fuzzy_results['clean_name_space_211'])]

remaining_cra = remaining_cra[~remaining_cra['CRA_BN_ID'].isin(postal_fuzzy_results['CRA_BN_ID'])]
remaining_211 = remaining_211[~remaining_211['clean_name_space_211'].isin(postal_fuzzy_results['clean_name_space_211'])]

postal_fuzzy_results = postal_fuzzy_results.drop(columns=['clean_name_CRA', 'clean_name_space_CRA', 'PostalCode_clean_CRA', 'clean_name_211', 'clean_name_space_211', 'PostalCode_clean_211'])

print(f"\nStep 3 Complete: {len(postal_fuzzy_results)} matched rows")
print(f"Unique CRA orgs matched: {postal_fuzzy_results['CRA_BN_ID'].nunique()}")
print(f"Unique 211 branches matched: {postal_fuzzy_results['211_PublicName'].nunique()}")
print(f"Remaining CRA records: {len(remaining_cra)}")
print(f"Remaining 211 branches: {len(remaining_211)}")


Step 3 Complete: 504 matched rows
Unique CRA orgs matched: 209
Unique 211 branches matched: 469
Remaining CRA records: 5823
Remaining 211 branches: 4552


In [78]:
# STEP 4: General fuzzy matching using clean_name_space
general_matched_results = []
general_bad_matches = []

cra_embeddings = model.encode(remaining_cra['clean_name_space_CRA'].tolist())
two11_embeddings = model.encode(remaining_211['clean_name_space_211'].tolist())
print('Embeddings generated')

nn = NearestNeighbors(n_neighbors=1, metric='cosine').fit(two11_embeddings)
distances, indices = nn.kneighbors(cra_embeddings, return_distance=True)
print('Nearest neighbours computed')

for i in tqdm(range(len(remaining_cra))):
    similarity = 1 - distances[i][0]
    if similarity >= 0.80:
        matched_org_name = remaining_211.iloc[indices[i][0]]['clean_name_space_211']
        matched_branches = remaining_211[remaining_211['clean_name_space_211'] == matched_org_name]
        
        for _, branch_row in matched_branches.iterrows():
            merged_row = remaining_cra.iloc[i].to_dict()
            merged_row.update({f'{k}': v for k, v in branch_row.to_dict().items()})
            merged_row['Match_Method'] = 'General Fuzzy Match'
            merged_row['Similarity_Score'] = similarity
            general_matched_results.append(merged_row)
    else:
        bad_row = {
            'CRA_Name': remaining_cra.iloc[i]['CRA_Name'],
            '211_Name': remaining_211.iloc[indices[i][0]]['211_Name'],
            'Similarity_Score': similarity,
            'Match_Status': 'Below Threshold'
        }
        general_bad_matches.append(bad_row)

general_fuzzy_results = pd.DataFrame(general_matched_results) if general_matched_results else pd.DataFrame()
general_bad_matches_df = pd.DataFrame(general_bad_matches) if general_bad_matches else pd.DataFrame()

# remaining_cra = df_cra_prep[~df_cra_prep['BN'].isin(general_fuzzy_results['BN'])]
# remaining_211 = df_211_prep[~df_211_prep['clean_name_space_211'].isin(general_fuzzy_results['clean_name_space_211'])]

remaining_cra = remaining_cra[~remaining_cra['CRA_BN_ID'].isin(general_fuzzy_results['CRA_BN_ID'])]
remaining_211 = remaining_211[~remaining_211['clean_name_space_211'].isin(general_fuzzy_results['clean_name_space_211'])]

general_fuzzy_results = general_fuzzy_results.drop(columns=['clean_name_CRA', 'clean_name_space_CRA', 'PostalCode_clean_CRA', 'clean_name_211', 'clean_name_space_211', 'PostalCode_clean_211'])

print(f"\nStep 4 Complete: {len(general_fuzzy_results)} matched rows")
print(f"Unique CRA orgs matched: {general_fuzzy_results['CRA_BN_ID'].nunique()}")
print(f"Unique 211 branches matched: {general_fuzzy_results['211_PublicName'].nunique()}")
print(f"Final Unmatched CRA records: {len(remaining_cra)}")
print(f"Final Unmatched 211 branches: {len(remaining_211)}")

Embeddings generated
Nearest neighbours computed


100%|█████████████████████████████████████| 5823/5823 [00:00<00:00, 8419.55it/s]



Step 4 Complete: 461 matched rows
Unique CRA orgs matched: 245
Unique 211 branches matched: 301
Final Unmatched CRA records: 5578
Final Unmatched 211 branches: 4250


## Clean-up, combine, save

In [86]:
# Manually identified mismatches for removal
bad_orgs_211 = [
    "Toronto Catholic District School Board",
    "York Region Children's Aid Society",
    "Toronto Mental Health and Addictions Access Point",
    "Toronto Public Health",
    "Canadian Cultural Society of the Deaf",
    "Canadian Environmental Law Association",
    "Friends of Jesus Christ",
    "Toronto. Shelter, Support and Housing Administration. Homelessness Initiatives and Prevention Services",
    "Salvation Army of Georgina Community Church",
    "Matthew House Refugee Reception Services, Toronto",
    "Salvation Army of Georgina Community Church",
    "International Schizophrenia Foundation",
    "Ontario Addiction Treatment Centres",
    "St Peter's Seniors' Residence",
    "Ukrainian Canadian Social Services (Toronto)",
    "Baycrest Child Care Centre",
    "Stewardship Ontario",
    "Canada. Health Canada",
    "Brampton Life Centre",
    "Chabad Lubavitch Of York Mills",
    "Registered Nurses' Association of Ontario",
    "Ontario. Ministry of Children, Community and Social Services",
    "Canadian Cultural Society of the Deaf",
    "Redeemed Christian Church of God",
    "St. James the Apostle Anglican Church",
    "Redeemed Christian Church of God",
    "Royal Day Care Centre",
    "Peel Region",
    "Society of St Vincent de Paul.[Newmarket / East Gwillimbury Region]",
    "Cypriot Community of Toronto",
    "Seventh-Day Adventist Church",
    "Ontario Association on Developmental Disabilities (The)",
    "Eye Bank of Canada",
    "Canadian Training Institute",
    "Maple Hill Baptist Church",
    "Friends of Jesus Christ",
    "Epilepsy South Central Ontario",
    "Toronto. Children's Services",
    "Toronto Community Housing",
    "Unionville Montessori School Church",
    "My House",
    "Chabad Lubavitch Of York Mills",
    "Canada. Health Canada",
    "Arab Community Centre of Toronto",
    "Toronto. Employment and Social Services",
    "Arab Community Centre of Toronto",
    "Mississauga, City of",
    "Chinese Canadian National Council, Toronto Chapter",
    "Ontario. Ministry of Labour, Training and Skills Development. Employment Practices Branch",
    "St Bartholomew's Anglican Church, Regent Park Community Services",
    "Vietnamese Community Centre of Mississauga",
    "Somali Canadian Association of Etobicoke",
    "Willowdale Manor",
    "Father Henri Nouwen",
    "Registered Nurses' Association of Ontario",
    "Epilepsy South Central Ontario",
    "Bereavement Authority of Ontario",
    "Autism in Mind",
    "North American Muslim Foundation",
    "Lighthouse (The)",
    "Our Saviour Lutheran Church Food Bank",
    "National Council of Jewish Women of Canada",
    "Open Arms Preschool",
    "VIVA Thornhill Woods",
    "Chabad Lubavitch Of York Mills",
    "Children's Aid Foundation of Canada",
    "Women's Health Matters",
    "Childhood Cancer Canada",
    "Somali Canadian Association of Etobicoke",
    "Mississauga Food Bank",
    "Armenian Relief Society Toronto Roubina Chapter",
]

In [87]:
# Combine all successful matches
all_matches = pd.concat([
    df_full_joined,
    exact_match_results,
    postal_fuzzy_results,
    general_fuzzy_results
], ignore_index=True)

incorrect_matches = all_matches[all_matches['211_Name'].isin(bad_orgs_211)]
incorrect_matches.loc[:, 'Match_Method'] = incorrect_matches['Match_Method'] + " - incorrect"
all_matches = all_matches[~all_matches['211_Name'].isin(bad_orgs_211)]

In [90]:
# Create output directory
os.makedirs('../joined-data', exist_ok=True)

# Create non-faith matches
remaining_cra_nonfaith = remaining_cra[~((remaining_cra['CRA_Category'] >= 30) & (remaining_cra['CRA_Category'] <= 90))] 

# Save all results
all_matches.to_csv('../joined-data/successful_matches.csv', index=False)
remaining_cra.to_csv('../joined-data/unmatched_cra_records.csv', index=False)
remaining_cra_nonfaith.to_csv('../joined-data/unmatched_cra_nonfaith_records.csv', index=False)
remaining_211.to_csv('../joined-data/unmatched_211_records.csv', index=False)
pd.concat([postal_bad_matches_df, general_bad_matches_df, incorrect_matches]).to_csv('../joined-data/failed_fuzzy_matches.csv', index=False)

# Create simplified version
simplified_matches = all_matches[[
    'CRA_BN_ID', 'CRA_Name', '211_Name', '211_PublicName', 
    '211_Address1', '211_PostalCode', '211_Longitude', '211_Latitude', 'Match_Method'
]].rename(columns={
    'CRA_BN_ID': 'CRA_BN_ID',
    'CRA_Name': 'CRA_Organization_Name',
    '211_Name': '211_Organization_Name',
    '211_PublicName': '211_Location_Name',
    '211_Address1': '211_Address',
    '211_PostalCode': '211_Postal_Code',
    '211_Longitude': 'X_Coordinate',
    '211_Latitude': 'Y_Coordinate'
})

simplified_matches.to_csv('../joined-data/simplified_matches.csv', index=False)

print("\nFinal Results:")
print(f"- Successful matches: {len(all_matches)} records")
print(f"- Matched CRA organizations: {all_matches['CRA_Name'].nunique()} records")
print(f"- Unmatched CRA organizations: {len(remaining_cra)} records")
print(f"- Unmatched CRA (non-faith) organizations: {len(remaining_cra_nonfaith)} records")
print(f"- Matched 211 branches: {all_matches['211_PublicName'].nunique()} records")
print(f"- Unmatched 211 branches: {len(remaining_211)} records")
print(f"- Simplified matches saved with {len(simplified_matches)} records")


Final Results:
- Successful matches: 2035 records
- Matched CRA organizations: 762 records
- Unmatched CRA organizations: 5578 records
- Unmatched CRA (non-faith) organizations: 1214 records
- Matched 211 branches: 1965 records
- Unmatched 211 branches: 4250 records
- Simplified matches saved with 2035 records
