In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import unicodedata
import re
import os

In [2]:
from sentence_transformers import SentenceTransformer, util
from sklearn.neighbors import NearestNeighbors

# Load the pretrained sentence-embedding model once for the whole notebook.
# The identifier names an MPNet-based paraphrase checkpoint (not plain BERT);
# `model.encode` is used further down to embed cleaned organization names.
model = SentenceTransformer('paraphrase-mpnet-base-v2')

## Load data

In [3]:
# Read the CRA extract, expose "Account Name" under the shared "Name" column,
# and keep only the first row seen for each distinct name.
cra_raw = pd.read_csv("../cra-data/cra_2021_combined_filtered.csv")
df_cra = (
    cra_raw
    .rename(columns={"Account Name": "Name"})
    .drop_duplicates(subset=["Name"])
    .reset_index(drop=True)
)
# Disabled option: strip branch information before de-duplicating,
# e.g. "Community Center - North Branch" --> "Community Center":
# df_cra["Name"] = df_cra["Name"].str.split(pat=" - ", n=1).str[0]

In [4]:
# Load and tidy the 211 extract:
#   - the file is read as Latin-1,
#   - the taxonomy column is discarded,
#   - "ParentAgency" becomes "Name" and the address fields get a "211_" prefix,
#   - rows repeating the same (PublicName, Latitude, Longitude) are collapsed.
COLUMN_RENAMES_211 = {
    "ParentAgency": "Name",
    "Address1": "211_Address1",
    "Address2": "211_Address2",
    "City": "211_City",
    "County": "211_County",
    "Province": "211_Province",
    "PostalCode": "211_PostalCode",
}

two11_raw = pd.read_csv("../211-data/2021_211_PeelYorkTO.csv", encoding='latin-1')
df_211 = (
    two11_raw
    .drop(columns=["TaxonomyTerms"])
    .rename(columns=COLUMN_RENAMES_211)
    .drop_duplicates(subset=["PublicName", "Latitude", "Longitude"])
    .reset_index(drop=True)
)

## Algorithmic matching

In [6]:
# Clean text functions
def clean_text(text):
    """Reduce a value to a compact lowercase ASCII key.

    Missing values (as judged by ``pd.isna``) give ``''``. Anything else is
    converted with ``str``, NFKD-normalized, stripped of characters that have
    no ASCII form, and then stripped of everything except letters and digits
    (spaces included) before being lower-cased.
    """
    if pd.isna(text):
        return ''
    decomposed = unicodedata.normalize('NFKD', str(text))
    # Accented letters decompose into base letter + combining mark; the mark
    # is non-ASCII and is dropped by the 'ignore' error handler.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    alphanumeric = re.sub(r'[^a-zA-Z0-9]', '', ascii_only)
    return alphanumeric.lower()

def clean_text_with_spaces(text):
    """Reduce a value to lowercase ASCII words separated by single spaces.

    Missing values (as judged by ``pd.isna``) give ``''``. Otherwise the value
    is converted with ``str``, NFKD-normalized and reduced to ASCII; every
    character other than a letter, digit or space is deleted (not replaced),
    the text is lower-cased, runs of spaces are collapsed to one, and leading
    and trailing spaces are removed.
    """
    if pd.isna(text):
        return ''
    ascii_only = (
        unicodedata.normalize('NFKD', str(text))
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    # Punctuation is removed outright, so "St. John's" becomes "st johns".
    lowered = re.sub(r'[^a-zA-Z0-9 ]', '', ascii_only).lower()
    return re.sub(r' +', ' ', lowered).strip()

# Build the matching keys on copies of the loaded frames:
#   clean_name        - alphanumeric-only lowercase name (key for exact joins)
#   clean_name_space  - lowercase name with single spaces (input to the embedding model)
#   PostalCode_clean  - postal code with whitespace removed and upper-cased
def _normalize_postal_code(postal_codes):
    """Return the postal-code Series with all whitespace removed and letters upper-cased.

    Upper-casing makes the key case-insensitive, so "m5v 2t6" and "M5V2T6"
    compare equal. Missing values stay missing.
    """
    return postal_codes.str.replace(r'\s+', '', regex=True).str.upper()


# Prepare CRA data
df_cra_prep = df_cra.copy()
df_cra_prep['clean_name'] = df_cra_prep['Name'].apply(clean_text)
df_cra_prep['clean_name_space'] = df_cra_prep['Name'].apply(clean_text_with_spaces)
df_cra_prep['PostalCode_clean'] = _normalize_postal_code(df_cra_prep['Postal Code'])

# Prepare 211 data
df_211_prep = df_211.copy()
df_211_prep['clean_name'] = df_211_prep['Name'].apply(clean_text)
df_211_prep['clean_name_space'] = df_211_prep['Name'].apply(clean_text_with_spaces)
df_211_prep['PostalCode_clean'] = _normalize_postal_code(df_211_prep['211_PostalCode'])

print("Data preparation complete")
print(f"CRA records: {len(df_cra_prep)}")
print(f"211 records: {len(df_211_prep)}")

Data preparation complete
CRA records: 6519
211 records: 6405


In [7]:
# Exact name matching
# Join CRA organizations to every 211 branch whose alphanumeric-only name key is
# identical. One CRA row can therefore produce several output rows (one per branch).

# A blank key comes from a missing name or a name with no ASCII letters/digits.
# Joining on it would pair unrelated organizations, so blank keys are excluded
# from the join; those records stay in the "remaining" pools below.
cra_with_key = df_cra_prep[df_cra_prep['clean_name'] != '']
two11_with_key = df_211_prep[df_211_prep['clean_name'] != '']

# An inner join already returns every (CRA org, 211 branch) pair for each shared
# key. When nothing matches it is an empty frame that still carries all columns,
# so the lookups below and in later cells keep working.
exact_match_results = pd.merge(
    cra_with_key,
    two11_with_key,
    on='clean_name',
    how='inner',
    suffixes=('_cra', '_211')
)
exact_match_results['Match_Method'] = 'Exact Name'

# Everything whose key was not matched moves on to the fuzzy steps.
matched_orgs = exact_match_results['clean_name'].unique()
remaining_cra = df_cra_prep[~df_cra_prep['clean_name'].isin(matched_orgs)]
remaining_211 = df_211_prep[~df_211_prep['clean_name'].isin(matched_orgs)]

print(f"Step 1 Complete: {len(exact_match_results)} matched rows")
print(f"Unique CRA orgs matched: {exact_match_results['BN'].nunique()}")
print(f"Unique 211 orgs matched: {exact_match_results['clean_name'].nunique()}")
print(f"Remaining CRA records: {len(remaining_cra)}")
print(f"Remaining 211 branches: {len(remaining_211)}")

Step 1 Complete: 1094 matched rows
Unique CRA orgs matched: 393
Unique 211 orgs matched: 393
Remaining CRA records: 6126
Remaining 211 branches: 5311


In [8]:
# Postal code + fuzzy name matching using clean_name_space
# For every postal code present in both pools, each CRA name is compared with the
# 211 names at that postal code; the nearest 211 name is accepted when its cosine
# similarity reaches the threshold, and all remaining 211 branches carrying that
# name are attached to the CRA record.

# Minimum cosine similarity for a same-postal-code pair to count as a match.
POSTAL_SIMILARITY_THRESHOLD = 0.80

# Records with a blank cleaned name give the embedding model nothing to compare
# (two blank strings would presumably embed identically), so they are kept out
# of the search and stay in the unmatched pools.
cra_candidates = remaining_cra[remaining_cra['clean_name_space'] != '']
two11_candidates = remaining_211[remaining_211['clean_name_space'] != '']

# Missing or blank postal codes do not identify a shared location.
cra_postal_codes = set(cra_candidates['PostalCode_clean'].dropna()) - {''}
two11_postal_codes = set(two11_candidates['PostalCode_clean'].dropna()) - {''}
# Sorted so the output row order does not depend on set iteration order.
common_postal_codes = sorted(cra_postal_codes & two11_postal_codes)
postal_matched_results = []
postal_bad_matches = []

for postal_code in tqdm(common_postal_codes):
    # Both subsets are non-empty: the code was taken from the intersection above.
    cra_subset = cra_candidates[cra_candidates['PostalCode_clean'] == postal_code]
    two11_subset = two11_candidates[two11_candidates['PostalCode_clean'] == postal_code]

    cra_embeddings = model.encode(cra_subset['clean_name_space'].tolist())
    two11_embeddings = model.encode(two11_subset['clean_name_space'].tolist())

    # Nearest 211 name (cosine distance) for each CRA name at this postal code.
    nn = NearestNeighbors(n_neighbors=1, metric='cosine').fit(two11_embeddings)
    distances, indices = nn.kneighbors(cra_embeddings, return_distance=True)

    for i, (distance, index) in enumerate(zip(distances, indices)):
        similarity = 1 - distance[0]  # cosine distance -> cosine similarity
        cra_row = cra_subset.iloc[i]
        nearest_211_row = two11_subset.iloc[index[0]]

        if similarity >= POSTAL_SIMILARITY_THRESHOLD:
            # Attach every remaining branch that shares the matched name,
            # not only the one located at this postal code.
            matched_org_name = nearest_211_row['clean_name_space']
            matched_branches = remaining_211[remaining_211['clean_name_space'] == matched_org_name]

            cra_record = cra_row.to_dict()  # built once per CRA row, copied per branch
            for _, branch_row in matched_branches.iterrows():
                merged_row = dict(cra_record)
                merged_row.update({f'{k}_211': v for k, v in branch_row.to_dict().items()})
                merged_row['Match_Method'] = 'Postal Code + Fuzzy Name'
                merged_row['Similarity_Score'] = similarity
                postal_matched_results.append(merged_row)
        else:
            bad_row = {
                'CRA_Name': cra_row['Name'],
                '211_Name': nearest_211_row['Name'],
                'Postal_Code': postal_code,
                'Similarity_Score': similarity,
                'Match_Status': 'Below Threshold'
            }
            postal_bad_matches.append(bad_row)

# When nothing matched, build empty frames that still have the expected columns;
# a column-less frame would raise KeyError on ['BN'] below and on the
# .drop(columns=...) calls in the following cells.
if postal_matched_results:
    postal_fuzzy_results = pd.DataFrame(postal_matched_results)
else:
    postal_fuzzy_results = pd.DataFrame(
        columns=(
            list(remaining_cra.columns)
            + [f'{col}_211' for col in remaining_211.columns]
            + ['Match_Method', 'Similarity_Score']
        )
    )

if postal_bad_matches:
    postal_bad_matches_df = pd.DataFrame(postal_bad_matches)
else:
    postal_bad_matches_df = pd.DataFrame(
        columns=['CRA_Name', '211_Name', 'Postal_Code', 'Similarity_Score', 'Match_Status']
    )

# Matched CRA orgs (by BN) and matched 211 names leave the pools for the next step.
remaining_cra = remaining_cra[~remaining_cra['BN'].isin(postal_fuzzy_results['BN'])]
remaining_211 = remaining_211[~remaining_211['clean_name_space'].isin(postal_fuzzy_results['clean_name_space_211'])]

print(f"\nStep 2 Complete: {len(postal_fuzzy_results)} matched rows")
print(f"Unique CRA orgs matched: {postal_fuzzy_results['BN'].nunique()}")
print(f"Unique 211 orgs matched: {postal_fuzzy_results['clean_name_space_211'].nunique()}")
print(f"Remaining CRA records: {len(remaining_cra)}")
print(f"Remaining 211 branches: {len(remaining_211)}")

100%|█████████████████████████████████████████| 852/852 [01:25<00:00,  9.94it/s]



Step 2 Complete: 504 matched rows
Unique CRA orgs matched: 209
Unique 211 orgs matched: 206
Remaining CRA records: 5917
Remaining 211 branches: 4840


In [9]:
# General fuzzy matching using clean_name_space
# Every still-unmatched CRA name is compared with all still-unmatched 211 names,
# regardless of location; the nearest 211 name is accepted when its cosine
# similarity reaches the threshold, and all remaining 211 branches carrying that
# name are attached to the CRA record.

# Minimum cosine similarity for a location-independent pair to count as a match.
GENERAL_SIMILARITY_THRESHOLD = 0.75

general_matched_results = []
general_bad_matches = []

# Records with a blank cleaned name give the embedding model nothing to compare
# (two blank strings would presumably embed identically), so they are kept out
# of the search and stay in the unmatched pools.
cra_candidates = remaining_cra[remaining_cra['clean_name_space'] != '']
two11_candidates = remaining_211[remaining_211['clean_name_space'] != '']

# Encoding or fitting on an empty list would fail, so the search only runs when
# both pools still have candidates.
if len(cra_candidates) > 0 and len(two11_candidates) > 0:
    cra_embeddings = model.encode(cra_candidates['clean_name_space'].tolist())
    two11_embeddings = model.encode(two11_candidates['clean_name_space'].tolist())
    print('Embeddings generated')

    nn = NearestNeighbors(n_neighbors=1, metric='cosine').fit(two11_embeddings)
    distances, indices = nn.kneighbors(cra_embeddings, return_distance=True)
    print('Nearest neighbours computed')

    for i in tqdm(range(len(cra_candidates))):
        similarity = 1 - distances[i][0]  # cosine distance -> cosine similarity
        cra_row = cra_candidates.iloc[i]
        nearest_211_row = two11_candidates.iloc[indices[i][0]]

        if similarity >= GENERAL_SIMILARITY_THRESHOLD:
            matched_org_name = nearest_211_row['clean_name_space']
            matched_branches = remaining_211[remaining_211['clean_name_space'] == matched_org_name]

            cra_record = cra_row.to_dict()  # built once per CRA row, copied per branch
            for _, branch_row in matched_branches.iterrows():
                merged_row = dict(cra_record)
                merged_row.update({f'{k}_211': v for k, v in branch_row.to_dict().items()})
                merged_row['Match_Method'] = 'General Fuzzy Match'
                merged_row['Similarity_Score'] = similarity
                general_matched_results.append(merged_row)
        else:
            bad_row = {
                'CRA_Name': cra_row['Name'],
                '211_Name': nearest_211_row['Name'],
                'Similarity_Score': similarity,
                'Match_Status': 'Below Threshold'
            }
            general_bad_matches.append(bad_row)

# When nothing matched, build empty frames that still have the expected columns;
# a column-less frame would raise KeyError on ['BN'] below and on the
# .drop(columns=...) calls in the following cell.
if general_matched_results:
    general_fuzzy_results = pd.DataFrame(general_matched_results)
else:
    general_fuzzy_results = pd.DataFrame(
        columns=(
            list(remaining_cra.columns)
            + [f'{col}_211' for col in remaining_211.columns]
            + ['Match_Method', 'Similarity_Score']
        )
    )

if general_bad_matches:
    general_bad_matches_df = pd.DataFrame(general_bad_matches)
else:
    general_bad_matches_df = pd.DataFrame(
        columns=['CRA_Name', '211_Name', 'Similarity_Score', 'Match_Status']
    )

# Matched CRA orgs (by BN) and matched 211 names leave the pools; what is left
# is the final unmatched set.
remaining_cra = remaining_cra[~remaining_cra['BN'].isin(general_fuzzy_results['BN'])]
remaining_211 = remaining_211[~remaining_211['clean_name_space'].isin(general_fuzzy_results['clean_name_space_211'])]

print(f"\nStep 3 Complete: {len(general_fuzzy_results)} matched rows")
print(f"Unique CRA orgs matched: {general_fuzzy_results['BN'].nunique()}")
print(f"Unique 211 orgs matched: {general_fuzzy_results['clean_name_space_211'].nunique()}")
print(f"Final Unmatched CRA records: {len(remaining_cra)}")
print(f"Final Unmatched 211 branches: {len(remaining_211)}")

Embeddings generated
Nearest neighbours computed


100%|█████████████████████████████████████| 5917/5917 [00:00<00:00, 6127.90it/s]


Step 3 Complete: 1096 matched rows
Unique CRA orgs matched: 662
Unique 211 orgs matched: 237
Final Unmatched CRA records: 5255
Final Unmatched 211 branches: 4317





In [10]:
# Remove the helper matching keys from each result set and rename the
# 211-sourced columns so all three frames use the same "211_"-prefixed names.

# The exact-match frame came from pd.merge, so only columns present on both
# sides carry the _cra/_211 suffixes.
exact_helper_columns = [
    'clean_name',
    'clean_name_space_cra',
    'clean_name_space_211',
    'PostalCode_clean_cra',
    'PostalCode_clean_211',
]
exact_column_renames = {
    'PublicName': '211_PublicName',
    'Name_211': '211_Name',
    'Latitude': '211_Latitude',
    'Longitude': '211_Longitude',
}
exact_match_updated = (
    exact_match_results
    .drop(columns=exact_helper_columns)
    .rename(columns=exact_column_renames)
)

# Both fuzzy frames were assembled row by row with every 211 field suffixed
# "_211", so they share one drop list and one rename map.
fuzzy_helper_columns = [
    'clean_name',
    'clean_name_space',
    'clean_name_211',
    'clean_name_space_211',
    'PostalCode_clean',
    'PostalCode_clean_211',
]
fuzzy_column_renames = {
    'Name': 'Name_cra',
    'PublicName_211': '211_PublicName',
    'Name_211': '211_Name',
    '211_Address1_211': '211_Address1',
    '211_Address2_211': '211_Address2',
    '211_City_211': '211_City',
    '211_County_211': '211_County',
    '211_Province_211': '211_Province',
    '211_PostalCode_211': '211_PostalCode',
    'Latitude_211': '211_Latitude',
    'Longitude_211': '211_Longitude',
}

postal_fuzzy_updated = (
    postal_fuzzy_results
    .drop(columns=fuzzy_helper_columns)
    .rename(columns=fuzzy_column_renames)
)

general_fuzzy_updated = (
    general_fuzzy_results
    .drop(columns=fuzzy_helper_columns)
    .rename(columns=fuzzy_column_renames)
)

In [11]:
# Stack the three match sets (exact, postal + fuzzy, general fuzzy) into one table.
match_frames = [exact_match_updated, postal_fuzzy_updated, general_fuzzy_updated]
all_matches = pd.concat(match_frames, ignore_index=True)

# Make sure the output folder exists before writing anything.
os.makedirs('../joined-data', exist_ok=True)

# Full outputs: matches, leftovers from both sources, and below-threshold pairs.
all_matches.to_csv('../joined-data/successful_matches.csv', index=False)
remaining_cra.to_csv('../joined-data/unmatched_cra_records.csv', index=False)
remaining_211.to_csv('../joined-data/unmatched_211_records.csv', index=False)
failed_fuzzy_matches = pd.concat([postal_bad_matches_df, general_bad_matches_df])
failed_fuzzy_matches.to_csv('../joined-data/failed_fuzzy_matches.csv', index=False)

# Simplified view: the keys give the columns to keep (in output order) and the
# values give their published names; 'Match_Method' keeps its name.
simplified_columns = {
    'BN': 'CRA_BN_ID',
    'Name_cra': 'CRA_Organization_Name',
    '211_Name': '211_Organization_Name',
    '211_PublicName': '211_Location_Name',
    '211_Address1': '211_Address',
    '211_PostalCode': '211_Postal_Code',
    '211_Longitude': 'X_Coordinate',
    '211_Latitude': 'Y_Coordinate',
    'Match_Method': 'Match_Method',
}
simplified_matches = all_matches[list(simplified_columns)].rename(columns=simplified_columns)

simplified_matches.to_csv('../joined-data/simplified_matches.csv', index=False)

print("\nFinal Results:")
print(f"- Successful matches: {len(all_matches)} records")
print(f"- Unmatched CRA organizations: {len(remaining_cra)}")
print(f"- Unmatched 211 branches: {len(remaining_211)}")
print(f"- Simplified matches saved with {len(simplified_matches)} records")


Final Results:
- Successful matches: 2694 records
- Unmatched CRA organizations: 5255
- Unmatched 211 branches: 4317
- Simplified matches saved with 2694 records
