In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

import unicodedata
import re

## Examine the 2025 August UW data to detect possible missing organizations

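Find organizations in the 2025 UW program-locations list that have no match, after name normalization, in the existing "UW Orgs" sheet.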

In [2]:
# Clean text functions
# Articles and legal-form words stripped from organization names before comparison.
_ORG_STOPWORDS_RE = re.compile(
    r'\b(the|inc|incorporated|llc|corp|corporation|co|company)\b',
    flags=re.IGNORECASE,
)


def _strip_org_noise(text):
    """Apply the normalization steps shared by both cleaning functions.

    Steps, in order: fold accented characters to plain ASCII, drop everything
    from the first '/' onward, drop parenthesised segments, and remove the
    stop words matched by ``_ORG_STOPWORDS_RE``. Case and punctuation are left
    for the caller to handle.
    """
    text = unicodedata.normalize('NFKD', str(text)).encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'/.*$', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    return _ORG_STOPWORDS_RE.sub('', text)


def clean_text(text):
    """Return a lowercase, alphanumeric-only key for an organization name.

    Missing values (anything ``pd.isna`` reports as NA) become ''. All
    characters other than ASCII letters and digits, including spaces, are
    removed, so the result is suitable for exact-match comparison.
    """
    if pd.isna(text):
        return ''

    return re.sub(r'[^a-zA-Z0-9]', '', _strip_org_noise(text)).lower()


def clean_text_with_spaces(text):
    """Return a lowercase, human-readable version of an organization name.

    Same normalization as ``clean_text`` except that spaces are kept: runs of
    spaces are collapsed to one and leading/trailing spaces are trimmed.
    Missing values become ''.
    """
    if pd.isna(text):
        return ''

    text = re.sub(r'[^a-zA-Z0-9 ]', '', _strip_org_noise(text)).lower()
    return re.sub(r' +', ' ', text).strip()

# Load the dataframes
# df_uw_cra211 = pd.read_csv('../joined-data/uw_cra211.csv', encoding='latin')
df_uw_cra211 = pd.read_excel('../uw-data/cre-uw-checking 2.xlsx', sheet_name="UW Orgs")
df_uw3 = pd.read_csv('../uw-data/2024-25 Program Locations for Mapping(Sheet2).csv', encoding='latin')

# Create dummy columns for comparison:
#   'dummy'       - spaceless key used for matching
#   'dummy_space' - readable key (spaces kept) used for reporting
df_uw_cra211['dummy'] = df_uw_cra211['Funded Agency - CRA Name'].apply(clean_text)
df_uw3['dummy'] = df_uw3['Organization Name: | Legal Name'].apply(clean_text)
df_uw3['dummy_space'] = df_uw3['Organization Name: | Legal Name'].apply(clean_text_with_spaces)

# Get unique organization names from df_uw3
uw3_orgs = df_uw3['Organization Name: | Legal Name'].unique()
uw3_dummies = df_uw3['dummy'].unique()
uw3_dummies_space = df_uw3['dummy_space'].unique()

# Find which organizations don't have matches in df_uw_cra211.
# The comparison is done row by row on df_uw3 rather than by zipping the two
# .unique() arrays above: those arrays are deduplicated independently, so two
# names that differ only in spacing/punctuation collapse to one 'dummy' but two
# 'dummy_space' values, which would shift every later pair out of alignment.
is_missing = ~df_uw3['dummy'].isin(df_uw_cra211['dummy'])

# Readable names of the unmatched organizations, in order of first appearance
missing_orgs = df_uw3.loc[is_missing, 'dummy_space'].unique().tolist()

# Report the results
# if len(missing_orgs) > 0:
#     print("Organizations in df_uw3 that don't have matches in df_uw_cra211:")
#     for i, org in enumerate(missing_orgs, 1):
#         print(f"{i}. {org}")
# else:
#     print("All organizations in df_uw3 have matches in df_uw_cra211")

# Save the original (uncleaned) legal and public names of the unmatched organizations
df_uw_missing = df_uw3.loc[
    is_missing,
    ['Organization Name: | Legal Name', 'Organization Name: | Operating/Public Name'],
].drop_duplicates()
df_uw_missing.to_csv('../uw-data/2025-uw-missing.csv', index=False)

## Join together UW orgs with the CRA/211 data

In [3]:
# CRA/211 matches from the earlier matching step, minus the CRA category columns
df_matched_orgs = pd.read_csv('../joined-data/simplified_matches.csv').drop(
    columns=['CRA_Category', 'CRA_SubCategory']
)

# UW organization sheet; the BN column is renamed to CRA_BN_ID, the key used
# for the BN-based merge further down
df_uw = pd.read_excel(
    '../uw-data/cre-uw-checking 2.xlsx',
    sheet_name="UW Orgs",
).rename(columns={'CHARITABLE REGISTRATION NUMBER (BN)': 'CRA_BN_ID'})

In [4]:
# Manual UW match table (joined to 211 on '211_Organization_Name' below)
df_manual_uw_map = pd.read_csv('../uw-data/manual_matches_uw.csv')

# Load 211 data, select columns only relevant to simplified form, and rename them
df_211 = pd.read_csv("../211-data/2021_211_PeelYorkTO.csv", encoding='latin-1')
df_211 = df_211.drop(columns=["TaxonomyTerms", "Address2", "County", "Province"])

# Single rename straight to the final column names (no intermediate 211_* step)
df_211 = df_211.rename(columns={
    "ParentAgency": "211_Organization_Name",
    "PublicName": "211_Location_Name",
    "Address1": "211_Address",
    "City": "211_City",
    "PostalCode": "211_Postal_Code",
    "Longitude": "X_Coordinate",
    "Latitude": "Y_Coordinate",
})

# Replace ';' with ' - ' in every object-dtype column.
# regex=False makes the literal replacement explicit, independent of the
# pandas-version default for the `regex` argument.
df_211 = df_211.apply(
    lambda col: col.str.replace(';', ' - ', regex=False) if col.dtype == 'object' else col
)

Join the manual UW matches to the 211 location data.

In [5]:
# Left-join the manual map onto the 211 rows by organization name (manual rows
# with no 211 record are kept), then label the origin of these matches.
df_manual_orgs = df_manual_uw_map.merge(
    df_211,
    how='left',
    on='211_Organization_Name',
).assign(Match_Method='Manual UW')

In [6]:
# Inner join: keep only the UW organizations that have a manual match
df_uw_cra211_manual = df_uw.merge(
    df_manual_orgs,
    how='inner',
    on='Funded Agency - CRA Name',
)

For the remaining organizations (those without a manual match), join on the CRA business number (`CRA_BN_ID`).

In [7]:
# UW organizations not already covered by a manual match
df_uw2 = df_uw.loc[
    ~df_uw['Funded Agency - CRA Name'].isin(df_uw_cra211_manual['Funded Agency - CRA Name'])
]

# Left join on the BN so UW organizations without a CRA/211 match are still kept
df_uw_cra211_bnid = df_uw2.merge(
    df_matched_orgs,
    how='left',
    on='CRA_BN_ID',
)

Incorporate the August 2025 version of UW organizations (REDUNDANT: this step is disabled, and the cell below is kept only as commented-out code)

In [8]:
# df_uw25_map = pd.read_csv('../uw-data/2025-uw-missing-matches.csv').dropna().drop(columns=['UW 2025'])
# df_uw25_map = df_uw25_map[~df_uw25_map['Funded Agency - CRA Name'].isin(df_uw['Funded Agency - CRA Name'])]
# df_uw25_map[['CRA_BN_ID', 'Category', 'Sub Category', 'City', 'Reason', 'Region']] = np.nan

# df_uw25_manual = pd.merge(
#     left = df_uw25_map,
#     right = df_211,
#     on = '211_Organization_Name',
#     how = 'left'
# )
# df_uw25_manual['Match_Method'] = 'Manual UW 25'

Concatenate the manual matches and the BN-ID matches, then save the combined table.

In [9]:
# Stack the manual matches on top of the BN-ID matches, order by CRA name, and save
df_uw_cra211 = pd.concat(
    [df_uw_cra211_manual, df_uw_cra211_bnid]
).sort_values(by='CRA_Organization_Name')

df_uw_cra211.to_csv('../joined-data/uw_cra211.csv', index=False)

In [10]:
# Display the column index of the combined UW / CRA / 211 frame
df_uw_cra211.columns

Index(['Funded Agency - CRA Name', 'CRA_BN_ID', 'Category', 'Sub Category',
       'City', 'Reason', 'Region', 'CRA_Organization_Name',
       '211_Organization_Name', '211_Location_Name', '211_Address', '211_City',
       '211_Postal_Code', 'Y_Coordinate', 'X_Coordinate', 'Match_Method'],
      dtype='object')

In [11]:
# Distinct non-null CRA organization names present in the joined frame
num_uw_matches = df_uw_cra211['CRA_Organization_Name'].nunique()

# The first figure is the row count of the UW sheet; the second is the
# distinct-name count computed above.
print(f'Total UW orgs: {len(df_uw)}')
print(f'No. UW orgs with 211/CRA data: {num_uw_matches}')

Total UW orgs: 278
No. UW orgs with 211/CRA data: 226


## Extend and prune data to account for missing data and superfluous entries

In [12]:
# Re-read the full matches file. This rebinds df_matched_orgs to a frame that
# still carries the CRA_Category / CRA_SubCategory columns.
df_matched_orgs = pd.read_csv('../joined-data/simplified_matches.csv')

# Reshape the manual UW matches to the simplified-matches layout: drop the
# UW-only columns and rename the category columns to their CRA_* names.
manual_as_matches = (
    df_uw_cra211_manual
    .drop(columns=['Funded Agency - CRA Name', 'City', 'Reason', 'Region'])
    .rename(columns={'Category': 'CRA_Category', 'Sub Category': 'CRA_SubCategory'})
)

df_matched_extended = pd.concat(
    [df_matched_orgs, manual_as_matches]
).sort_values(by='CRA_Organization_Name')

In [13]:
# Remove organizations and locations that are no longer valid
removed_orgs = [
    "CANADIAN MENTAL HEALTH ASSOCIATION, SIMCOE COUNTY BRANCH",
    "LAKE SIMCOE REGION CONSERVATION AUTHORITY",
]
removed_address = "Tannery Mall - 465 Davis Dr"

df_filtered = df_matched_extended.copy()

# Drop every row belonging to one of the removed organizations
df_filtered = df_filtered[~df_filtered['CRA_Organization_Name'].isin(removed_orgs)]

# Drop the single removed location, whichever organization it belongs to
df_filtered = df_filtered[df_filtered['211_Address'].ne(removed_address)]

df_matched_extended = df_filtered.copy()

In [14]:
# Rename locations which have since changed names
def _rename_organization(df, old_211_name, new_211_name, new_cra_name):
    """Rename one organization in place and return the mask of affected rows.

    Rows are selected where '211_Organization_Name' equals ``old_211_name``.
    For those rows the 211 and CRA organization names are overwritten, and any
    literal occurrence of the old name inside '211_Location_Name' is replaced
    with the new one.
    """
    mask = df['211_Organization_Name'] == old_211_name
    df.loc[mask, '211_Organization_Name'] = new_211_name
    df.loc[mask, 'CRA_Organization_Name'] = new_cra_name
    df.loc[mask, '211_Location_Name'] = df.loc[mask, '211_Location_Name'].str.replace(
        old_211_name,
        new_211_name,
        regex=False
    )
    return mask


# First replacement: Brampton Multicultural Community Centre -> Building Multicultural Centre
mask_brampton = _rename_organization(
    df_matched_extended,
    'Brampton Multicultural Community Centre',
    'Building Multicultural Centre',
    'BUILDING MULTICULTURAL CENTRE',
)

# Second replacement: Malton Neighbourhood Services -> My Neighbourhood Services
mask_malton = _rename_organization(
    df_matched_extended,
    'Malton Neighbourhood Services',
    'My Neighbourhood Services',
    'MY NEIGHBOURHOOD SERVICES',
)

# Fix coordinates: every row of this organization gets the same point.
# Uses its own mask name so mask_malton keeps referring to the Malton rows.
mask_pentecostal = df_matched_extended['211_Organization_Name'] == 'Pentecostal Assemblies of Canada'
df_matched_extended.loc[mask_pentecostal, 'X_Coordinate'] = -79.756686
df_matched_extended.loc[mask_pentecostal, 'Y_Coordinate'] = 43.597961

In [15]:
# Write the extended match table (original matches plus the manual UW matches,
# which the initial matching did not include) to its own CSV, without the index.
df_matched_extended.to_csv(
    path_or_buf='../joined-data/simplified_matches_uw.csv',
    index=False,
)