In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import unicodedata
import re
import os

In [11]:
from sentence_transformers import SentenceTransformer, util
from sklearn.neighbors import NearestNeighbors

# Load the pretrained sentence-transformers checkpoint whose embeddings drive the
# fuzzy name matching below (the checkpoint name indicates an MPNet-based model).
# On first use the model files are downloaded, as the progress bars in this
# cell's output show.
model = SentenceTransformer('paraphrase-mpnet-base-v2')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Load data

In [12]:
# Load the CRA extract and key it on organisation name: "Account Name" becomes
# "Name", and only the first row for each distinct name is kept.
# Branch stripping is currently disabled; re-enabling the line below would turn
# "Community Center - North Branch" into "Community Center" before de-duplicating:
#   df_cra["Name"] = df_cra["Name"].str.split(pat=" - ", n=1).str[0]
df_cra = (
    pd.read_csv("../cra-data/cra_2021_combined_filtered.csv")
    .rename(columns={"Account Name": "Name"})
    .drop_duplicates(subset=["Name"])
    .reset_index(drop=True)
)

In [13]:
# Clean and rename 211 data
# "ParentAgency" becomes the shared "Name" key; the address fields get a "211_"
# prefix so their source stays obvious after joining with the CRA data.
column_renames_211 = {
    "ParentAgency": "Name",
    "Address1": "211_Address1",
    "Address2": "211_Address2",
    "City": "211_City",
    "County": "211_County",
    "Province": "211_Province",
    "PostalCode": "211_PostalCode",
}

# One row per distinct (PublicName, Latitude, Longitude) combination is kept.
df_211 = (
    pd.read_csv("../211-data/2021_211_PeelYorkTO.csv", encoding='latin-1')
    .drop(columns=["TaxonomyTerms"])
    .rename(columns=column_renames_211)
    .drop_duplicates(subset=["PublicName", "Latitude", "Longitude"])
    .reset_index(drop=True)
)

## Algorithmic matching

In [14]:
# Clean text function
def clean_text(text):
    """Reduce a name to a lowercase ASCII alphanumeric matching key.

    Missing values (anything ``pd.isna`` reports as NA) become ''. Any other
    value is stringified, accented characters are decomposed (NFKD) and their
    non-ASCII parts dropped, every character outside a-z, A-Z and 0-9 is
    removed (spaces and punctuation included), and the result is lower-cased.
    """
    if pd.isna(text):
        return ''
    decomposed = unicodedata.normalize('NFKD', str(text))
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', ascii_only)
    return alnum_only.lower()

# Build the working copies used by every matching step. Each gets two helper
# columns: clean_name (the normalised name key produced by clean_text) and
# PostalCode_clean (the postal code with all whitespace removed).
# NOTE(review): postal codes are not case-normalised here -- confirm both
# sources use the same letter case.

# Prepare CRA data
df_cra_prep = df_cra.assign(
    clean_name=df_cra['Name'].apply(clean_text),
    PostalCode_clean=df_cra['Postal Code'].str.replace(r'\s+', '', regex=True),
)

# Prepare 211 data
df_211_prep = df_211.assign(
    clean_name=df_211['Name'].apply(clean_text),
    PostalCode_clean=df_211['211_PostalCode'].str.replace(r'\s+', '', regex=True),
)

print("Data preparation complete")
print(f"CRA records: {len(df_cra_prep)}")
print(f"211 records: {len(df_211_prep)}")

Data preparation complete
CRA records: 6519
211 records: 6405


In [15]:
# Exact name matching
# An empty clean_name means the name was missing or contained no ASCII
# alphanumerics (clean_text returns '' for those). It is not a usable join key:
# every such CRA row would pair with every such 211 row, so keep them out of
# the join. They stay in the "remaining" frames below.
cra_named = df_cra_prep[df_cra_prep['clean_name'] != '']
two11_named = df_211_prep[df_211_prep['clean_name'] != '']

# One row per (CRA organisation, 211 branch) pair whose cleaned names are equal.
# Overlapping column names get the _cra / _211 suffixes; columns that exist on
# only one side keep their original names.
exact_matches = pd.merge(
    cra_named,
    two11_named,
    on='clean_name',
    how='inner',
    suffixes=('_cra', '_211')
)

# The inner join already contains every branch of every matched organisation,
# so no second merge is needed. Unlike a bare pd.DataFrame(), an empty merge
# result still carries all columns, so the lookups below work with zero matches.
exact_match_results = exact_matches.assign(Match_Method='Exact Name')
matched_orgs = exact_match_results['clean_name'].unique()

# Everything whose cleaned name was not matched moves on to the fuzzy steps.
remaining_cra = df_cra_prep[~df_cra_prep['clean_name'].isin(matched_orgs)]
remaining_211 = df_211_prep[~df_211_prep['clean_name'].isin(matched_orgs)]

print(f"Step 1 Complete: {len(exact_match_results)} matched rows")
print(f"Unique CRA orgs matched: {exact_match_results['BN'].nunique()}")
print(f"Unique 211 orgs matched: {exact_match_results['clean_name'].nunique()}")
print(f"Remaining CRA records: {len(remaining_cra)}")
print(f"Remaining 211 branches: {len(remaining_211)}")

Step 1 Complete: 1094 matched rows
Unique CRA orgs matched: 393
Unique 211 orgs matched: 393
Remaining CRA records: 6126
Remaining 211 branches: 5311


In [18]:
# Postal code + fuzzy name matching
# For every postal code present in both sources, embed the cleaned names on each
# side and pair each CRA record with its nearest 211 name (cosine distance).
# A pair at or above the threshold is a match, and the CRA record is then joined
# to every remaining 211 branch carrying that organisation name.
# NOTE: this cell shrinks remaining_cra / remaining_211 in place, so re-running
# it on its own finds fewer (possibly zero) matches than the first run.

# Minimum cosine similarity for a same-postal-code pair to count as a match.
POSTAL_SIMILARITY_THRESHOLD = 0.80

postal_matched_results = []
postal_bad_matches = []

# Column layout of a matched row: CRA columns as-is, every 211 column with a
# "_211" suffix, then the two bookkeeping columns. Captured before the
# remaining_* frames are reassigned, and passed to the DataFrame constructor so
# that a run with zero matches still yields a frame with 'BN' and
# 'clean_name_211' instead of raising KeyError below.
matched_columns = list(dict.fromkeys(
    list(remaining_cra.columns)
    + [f'{col}_211' for col in remaining_211.columns]
    + ['Match_Method', 'Similarity_Score']
))

# Records with an empty clean_name carry no name signal: all of them embed to the
# same vector and would "match" each other. Leave them unmatched.
cra_candidates = remaining_cra[remaining_cra['clean_name'] != '']
two11_candidates = remaining_211[remaining_211['clean_name'] != '']

# Split each side by postal code once instead of re-scanning both frames for
# every postal code. groupby skips missing postal codes, which could never
# produce a pair anyway.
cra_by_postal = {code: rows for code, rows in cra_candidates.groupby('PostalCode_clean')}
two11_by_postal = {code: rows for code, rows in two11_candidates.groupby('PostalCode_clean')}
common_postal_codes = set(cra_by_postal).intersection(two11_by_postal)

# Sorted so the order of the output rows is reproducible between runs.
for postal_code in tqdm(sorted(common_postal_codes)):
    cra_subset = cra_by_postal[postal_code]
    two11_subset = two11_by_postal[postal_code]

    cra_embeddings = model.encode(cra_subset['clean_name'].tolist())
    two11_embeddings = model.encode(two11_subset['clean_name'].tolist())

    nn = NearestNeighbors(n_neighbors=1, metric='cosine').fit(two11_embeddings)
    distances, indices = nn.kneighbors(cra_embeddings, return_distance=True)

    for i, (distance, index) in enumerate(zip(distances, indices)):
        similarity = 1 - distance[0]  # cosine distance -> cosine similarity
        cra_row = cra_subset.iloc[i]
        nearest_211 = two11_subset.iloc[index[0]]

        if similarity >= POSTAL_SIMILARITY_THRESHOLD:
            # Join to all remaining branches of the matched organisation, not
            # only the one that shares this postal code.
            matched_org_name = nearest_211['clean_name']
            matched_branches = remaining_211[remaining_211['clean_name'] == matched_org_name]

            for _, branch_row in matched_branches.iterrows():
                merged_row = cra_row.to_dict()
                merged_row.update({f'{k}_211': v for k, v in branch_row.to_dict().items()})
                merged_row['Match_Method'] = 'Postal Code + Fuzzy Name'
                merged_row['Similarity_Score'] = similarity
                postal_matched_results.append(merged_row)
        else:
            # Keep the best rejected candidate for manual review.
            postal_bad_matches.append({
                'CRA_Name': cra_row['Name'],
                '211_Name': nearest_211['Name'],
                'Postal_Code': postal_code,
                'Similarity_Score': similarity,
                'Match_Status': 'Below Threshold'
            })

postal_fuzzy_results = pd.DataFrame(postal_matched_results, columns=matched_columns)
postal_bad_matches_df = pd.DataFrame(
    postal_bad_matches,
    columns=['CRA_Name', '211_Name', 'Postal_Code', 'Similarity_Score', 'Match_Status'],
)

# Matched CRA organisations (by BN) and matched 211 organisations (by cleaned
# name) are removed before the next step.
remaining_cra = remaining_cra[~remaining_cra['BN'].isin(postal_fuzzy_results['BN'])]
remaining_211 = remaining_211[~remaining_211['clean_name'].isin(postal_fuzzy_results['clean_name_211'])]

print(f"\nStep 2 Complete: {len(postal_fuzzy_results)} matched rows")
print(f"Unique CRA orgs matched: {postal_fuzzy_results['BN'].nunique()}")
print(f"Unique 211 orgs matched: {postal_fuzzy_results['clean_name_211'].nunique()}")
print(f"Remaining CRA records: {len(remaining_cra)}")
print(f"Remaining 211 branches: {len(remaining_211)}")

100%|█████████████████████████████████████████| 852/852 [01:59<00:00,  7.15it/s]


Step 2 Complete: 433 matched rows
Unique CRA orgs matched: 178
Unique 211 orgs matched: 175
Remaining CRA records: 5948
Remaining 211 branches: 4897





In [21]:
# General fuzzy matching
# Last resort: ignore location and pair every still-unmatched CRA record with its
# nearest still-unmatched 211 name in embedding space (cosine distance).
# NOTE: this cell shrinks remaining_cra / remaining_211 in place, so re-running
# it on its own finds fewer (possibly zero) matches than the first run.

# Minimum cosine similarity for a pair to count as a match.
GENERAL_SIMILARITY_THRESHOLD = 0.75

general_matched_results = []
general_bad_matches = []

# Column layout of a matched row: CRA columns as-is, every 211 column with a
# "_211" suffix, then the two bookkeeping columns. Captured before the
# remaining_* frames are reassigned, and passed to the DataFrame constructor so
# that a run with zero matches still yields a frame with 'BN' and
# 'clean_name_211' instead of raising KeyError below.
matched_columns = list(dict.fromkeys(
    list(remaining_cra.columns)
    + [f'{col}_211' for col in remaining_211.columns]
    + ['Match_Method', 'Similarity_Score']
))

# Records with an empty clean_name carry no name signal: all of them embed to the
# same vector and would "match" each other. Leave them unmatched.
cra_candidates = remaining_cra[remaining_cra['clean_name'] != '']
two11_candidates = remaining_211[remaining_211['clean_name'] != '']

# NearestNeighbors cannot be fitted on zero samples, so skip the search entirely
# when either side has nothing left to match.
if len(cra_candidates) > 0 and len(two11_candidates) > 0:
    cra_embeddings = model.encode(cra_candidates['clean_name'].tolist())
    two11_embeddings = model.encode(two11_candidates['clean_name'].tolist())

    nn = NearestNeighbors(n_neighbors=1, metric='cosine').fit(two11_embeddings)
    distances, indices = nn.kneighbors(cra_embeddings, return_distance=True)

    for i in tqdm(range(len(cra_candidates))):
        similarity = 1 - distances[i][0]  # cosine distance -> cosine similarity
        cra_row = cra_candidates.iloc[i]
        nearest_211 = two11_candidates.iloc[indices[i][0]]

        if similarity >= GENERAL_SIMILARITY_THRESHOLD:
            # Join to every remaining branch of the matched organisation.
            matched_org_name = nearest_211['clean_name']
            matched_branches = remaining_211[remaining_211['clean_name'] == matched_org_name]

            for _, branch_row in matched_branches.iterrows():
                merged_row = cra_row.to_dict()
                merged_row.update({f'{k}_211': v for k, v in branch_row.to_dict().items()})
                merged_row['Match_Method'] = 'General Fuzzy Match'
                merged_row['Similarity_Score'] = similarity
                general_matched_results.append(merged_row)
        else:
            # Keep the best rejected candidate for manual review.
            general_bad_matches.append({
                'CRA_Name': cra_row['Name'],
                '211_Name': nearest_211['Name'],
                'Similarity_Score': similarity,
                'Match_Status': 'Below Threshold'
            })

general_fuzzy_results = pd.DataFrame(general_matched_results, columns=matched_columns)
general_bad_matches_df = pd.DataFrame(
    general_bad_matches,
    columns=['CRA_Name', '211_Name', 'Similarity_Score', 'Match_Status'],
)

# Whatever is still unmatched after this step is final.
remaining_cra = remaining_cra[~remaining_cra['BN'].isin(general_fuzzy_results['BN'])]
remaining_211 = remaining_211[~remaining_211['clean_name'].isin(general_fuzzy_results['clean_name_211'])]

print(f"\nStep 3 Complete: {len(general_fuzzy_results)} matched rows")
print(f"Unique CRA orgs matched: {general_fuzzy_results['BN'].nunique()}")
print(f"Unique 211 orgs matched: {general_fuzzy_results['clean_name_211'].nunique()}")
print(f"Final Unmatched CRA records: {len(remaining_cra)}")
print(f"Final Unmatched 211 branches: {len(remaining_211)}")

100%|█████████████████████████████████████| 5948/5948 [00:01<00:00, 4603.91it/s]


Step 3 Complete: 1778 matched rows
Unique CRA orgs matched: 1015
Unique 211 orgs matched: 373
Final Unmatched CRA records: 4933
Final Unmatched 211 branches: 4085





In [22]:
def _coalesce(frame, *candidates):
    """Return, row by row, the first non-null value among the candidate columns.

    Candidate columns that do not exist in ``frame`` are skipped; if none exist
    the result is an all-NaN Series on ``frame``'s index.
    """
    present = [frame[name] for name in candidates if name in frame.columns]
    if not present:
        return pd.Series(float('nan'), index=frame.index)
    merged = present[0]
    for series in present[1:]:
        merged = merged.combine_first(series)
    return merged


# Combine all successful matches
all_matches = pd.concat([
    exact_match_results,
    postal_fuzzy_results,
    general_fuzzy_results
], ignore_index=True)

# Create output directory
os.makedirs('../joined-data', exist_ok=True)

# Save all results
all_matches.to_csv('../joined-data/successful_matches.csv', index=False)
remaining_cra.to_csv('../joined-data/unmatched_cra_records.csv', index=False)
remaining_211.to_csv('../joined-data/unmatched_211_records.csv', index=False)
pd.concat([postal_bad_matches_df, general_bad_matches_df]).to_csv('../joined-data/failed_fuzzy_matches.csv', index=False)

# Create simplified version
# The exact-match rows come from pd.merge, which suffixes only the overlapping
# columns (Name -> Name_cra / Name_211) and leaves 211-only columns bare
# (PublicName, 211_Address1, Longitude, ...). The fuzzy rows are built by hand:
# CRA columns stay bare (Name) and every 211 column gets a "_211" suffix
# (PublicName_211, 211_Address1_211, Longitude_211, ...). Selecting a single
# spelling therefore leaves half of the rows empty, so each output column takes
# the first non-null value across both spellings.
# NOTE(review): this assumes the CRA data has no columns of its own named
# PublicName / 211_Address1 / 211_PostalCode / Longitude / Latitude -- confirm.
simplified_matches = pd.DataFrame({
    'CRA_BN_ID': _coalesce(all_matches, 'BN'),
    'CRA_Organization_Name': _coalesce(all_matches, 'Name_cra', 'Name'),
    '211_Organization_Name': _coalesce(all_matches, 'Name_211'),
    '211_Location_Name': _coalesce(all_matches, 'PublicName_211', 'PublicName'),
    '211_Address': _coalesce(all_matches, '211_Address1_211', '211_Address1'),
    '211_Postal_Code': _coalesce(all_matches, '211_PostalCode_211', '211_PostalCode'),
    'X_Coordinate': _coalesce(all_matches, 'Longitude_211', 'Longitude'),
    'Y_Coordinate': _coalesce(all_matches, 'Latitude_211', 'Latitude'),
    'Match_Method': _coalesce(all_matches, 'Match_Method'),
})

simplified_matches.to_csv('../joined-data/simplified_matches.csv', index=False)

print("\nFinal Results:")
print(f"- Successful matches: {len(all_matches)} records")
print(f"- Unmatched CRA organizations: {len(remaining_cra)}")
print(f"- Unmatched 211 branches: {len(remaining_211)}")
print(f"- Simplified matches saved with {len(simplified_matches)} records")


Final Results:
- Successful matches: 3305 records
- Unmatched CRA organizations: 4933
- Unmatched 211 branches: 4085
- Simplified matches saved with 3305 records
