In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np

Join together UW orgs with the CRA/211 data

In [9]:
# Existing CRA/211 matches; this working copy omits the two CRA category columns.
df_matched_orgs = (
    pd.read_csv('../joined-data/simplified_matches.csv')
    .drop(columns=['CRA_Category', 'CRA_SubCategory'])
)

# "UW Orgs" sheet, with the BN column renamed to CRA_BN_ID so it can be used
# as the merge key against the matched organizations.
df_uw = (
    pd.read_excel('../cra-data/cre-uw-checking 2.xlsx', sheet_name="UW Orgs")
    .rename(columns={'CHARITABLE REGISTRATION NUMBER (BN)': 'CRA_BN_ID'})
)

In [10]:
# Manual UW matches (joined to the 211 data on '211_Organization_Name' later on).
df_manual_uw_map = pd.read_csv('../joined-data/manual_matches_uw.csv')

# Load the 211 data, keep only the columns relevant to the simplified form, and
# rename them straight to their final names (one mapping instead of two chained
# renames that passed through throw-away intermediate names).
df_211 = (
    pd.read_csv("../211-data/2021_211_PeelYorkTO.csv", encoding='latin-1')
    .drop(columns=["TaxonomyTerms", "Address2", "City", "County", "Province"])
    .rename(columns={
        "ParentAgency": "211_Organization_Name",
        "PublicName": "211_Location_Name",
        "Address1": "211_Address",
        "PostalCode": "211_Postal_Code",
        "Longitude": "X_Coordinate",
        "Latitude": "Y_Coordinate",
    })
)

# Replace ';' with ' - ' in every text column.
# - is_string_dtype() is True for object columns *and* for pandas' dedicated
#   string dtypes, so text columns are not silently skipped when pandas infers
#   a string dtype instead of object (a bare `dtype == 'object'` check would).
# - regex=False makes the literal match explicit on every pandas version.
df_211 = df_211.apply(
    lambda col: col.str.replace(';', ' - ', regex=False)
    if pd.api.types.is_string_dtype(col.dtype)
    else col
)

In [11]:
# Attach the 211 records to every manual UW match (left join keeps manual
# entries that have no 211 record) and tag each row with how it was matched.
df_manual_orgs = (
    df_manual_uw_map
    .merge(df_211, on='211_Organization_Name', how='left')
    .assign(Match_Method='Manual UW')
)

In [12]:
# Inner join: keep only the UW orgs that appear in the manual matches.
df_uw_cra211_manual = df_uw.merge(
    df_manual_orgs,
    on='Funded Agency - CRA Name',
    how='inner',
)

In [13]:
# UW orgs that were NOT matched manually are matched on their CRA BN ID instead.
df_uw2 = df_uw[~df_uw['Funded Agency - CRA Name'].isin(df_uw_cra211_manual['Funded Agency - CRA Name'])]

# pandas matches null merge keys against each other, so a UW org without a BN
# would be joined to every matched row that also lacks one. Dropping null-keyed
# rows from the right side removes only those spurious pairings; the left join
# still keeps every UW org (unmatched ones get NaN in the right-hand columns).
df_uw_cra211_bnid = pd.merge(
    left = df_uw2,
    right = df_matched_orgs.dropna(subset=['CRA_BN_ID']),
    on = 'CRA_BN_ID',
    how = 'left'
)

In [14]:
# Stack the manual matches on top of the BN-ID matches, order the rows by CRA
# organization name, and write the combined table to disk.
df_uw_cra211 = (
    pd.concat([df_uw_cra211_manual, df_uw_cra211_bnid])
    .sort_values(by='CRA_Organization_Name')
)
df_uw_cra211.to_csv('../joined-data/uw_cra211.csv', index=False)

In [8]:
# Previous merge (all UW orgs joined on CRA_BN_ID only) -- kept for reference
# df_uw_cra211 = pd.merge(
#     left = df_uw,
#     right = df_matched_orgs,
#     on = 'CRA_BN_ID',
#     how = 'left'
# )
# df_uw_cra211 = df_uw_cra211.sort_values(by='CRA_Organization_Name')
# df_uw_cra211.to_csv('../joined-data/uw_cra211.csv', index=False)

In [15]:
# Headline counts: every UW org vs. the distinct CRA organization names that
# ended up in the joined table.
uw_total = len(df_uw)
print(f'Total UW orgs: {uw_total}')

# (disabled) restrict the count to UW orgs that have a CRA BN ID:
# df_uw = df_uw[df_uw['CRA_BN_ID'].notna()]
# print(f'Total UW orgs (with CRA BN ID): {len(df_uw)}')

# (disabled) earlier BN-based way of counting matches:
# num_uw_matches = len(set(df_uw['CRA_BN_ID']).intersection(set(df_orgs['CRA_BN_ID'])))
matched_names = df_uw_cra211['CRA_Organization_Name']
num_uw_matches = matched_names.nunique()  # nunique() ignores NaN, i.e. unmatched rows
print(f'No. UW orgs with 211/CRA data: {num_uw_matches}')

Total UW orgs: 278
No. UW orgs with 211/CRA data: 226


## Join tenure data

In [16]:
# Tenure lookup table keyed by (CRA_BN_ID, Y_Coordinate, X_Coordinate).
#
# Columns noted for the sheet by the original author:
# ['CRA_BN', 'CRA_LegalName', '211 Parent Agency Name', '211_Address1',
#  '211_PostalCode', 'Tenure', 'Latitude', 'Longitude']
# Only the BN, the coordinates and the tenure are kept.
#
# De-duplicate AFTER projecting to the kept columns: rows that differ only in a
# discarded column (name, address, ...) would otherwise survive as repeated
# merge keys and multiply rows in the left merges that use this table.
# NOTE(review): a key that maps to two different Tenure values is still kept
# twice -- confirm whether such conflicts exist in the sheet.
df_tenure_by_cra = (
    pd.read_excel('../../2024-work/SPRE_data/2021_CRE_DATA_Oct9.xlsx')
    [['CRA_BN', 'Latitude', 'Longitude', 'Tenure']]
    .rename(
        columns={
            'CRA_BN': 'CRA_BN_ID',
            'Latitude': 'Y_Coordinate',
            'Longitude': 'X_Coordinate',
        }
    )
    .drop_duplicates()
)

In [17]:
# Attach tenure to every UW row; rows without a tenure record keep NaN.
# NOTE(review): the join compares the float coordinates for exact equality, so
# any rounding difference between the two sources prevents a match -- confirm
# both files carry identically formatted coordinates.
df_uw_cra211_tenure = pd.merge(
    left = df_uw_cra211,
    right = df_tenure_by_cra,
    on = ['CRA_BN_ID', 'Y_Coordinate', 'X_Coordinate'],
    how = 'left'
)
df_uw_cra211_tenure.to_csv('../joined-data/uw_cra211_tenure.csv', index=False)

# Coverage summary. count() ignores NaN, so it is the number of rows with tenure.
num_tenure = df_uw_cra211_tenure['Tenure'].count()
num_locs = len(df_uw_cra211_tenure)
# Scale to percent BEFORE rounding: round(ratio, 4) * 100 can reintroduce float
# noise (e.g. 7.000000000000001). Also guard against an empty table.
pct_tenure = round(100 * num_tenure / num_locs, 2) if num_locs else 0.0
print(f'Number of locations with tenure data: {num_tenure}')
print(f'...which is {pct_tenure}% of all locations')
print(f'...so {num_locs - num_tenure} locations are missing tenure data')
print(f'...as there are {num_locs} locations in total')

Number of locations with tenure data: 430
...which is 43.57% of all locations
...so 557 locations are missing tenure data
...as there are 987 locations in total


## Join tenure data to simplified_matches and extend it

In [18]:
# Reload the full simplified matches. This rebinds df_matched_orgs to a frame
# that still has CRA_Category / CRA_SubCategory, which the extended table needs.
df_matched_orgs = pd.read_csv('../joined-data/simplified_matches.csv')

# Add the UW-only CRA organizations: drop the UW-specific columns and rename
# the UW category columns to the names used in simplified_matches.
uw_only_orgs = (
    df_uw_cra211_manual
    .drop(columns=['Funded Agency - CRA Name', 'City', 'Reason', 'Region'])
    .rename(columns={'Category': 'CRA_Category', 'Sub Category': 'CRA_SubCategory'})
)
df_matched_extended = pd.concat([
    df_matched_orgs,
    uw_only_orgs,
]).sort_values(by='CRA_Organization_Name')

# Add tenure information (rows without a tenure record keep NaN).
df_matched_extended = pd.merge(
    left = df_matched_extended,
    right = df_tenure_by_cra,
    on = ['CRA_BN_ID', 'Y_Coordinate', 'X_Coordinate'],
    how = 'left'
)

# Specify whether an organization is UW or not.
# isin() treats NaN as equal to NaN, and the UW table has NaN names for UW orgs
# that found no match; drop them so a row without a CRA name is never flagged.
uw_org_names = df_uw_cra211_tenure['CRA_Organization_Name'].dropna()
df_matched_extended['UW'] = df_matched_extended['CRA_Organization_Name'].isin(uw_org_names)

df_matched_extended.to_csv('../joined-data/simplified_matches_uw_tenure.csv', index=False)

# Coverage summary. count() ignores NaN, so it is the number of rows with tenure.
num_tenure = df_matched_extended['Tenure'].count()
num_locs = len(df_matched_extended)
# Scale to percent BEFORE rounding to avoid float noise in the printed value;
# guard against an empty table.
pct_tenure = round(100 * num_tenure / num_locs, 2) if num_locs else 0.0
print(f'Number of all locations with tenure data: {num_tenure}')
print(f'...which is {pct_tenure}% of all locations')
print(f'...so {num_locs - num_tenure} locations are missing tenure data')
print(f'...as there are {num_locs} locations in total')

Number of all locations with tenure data: 1042
...which is 46.29% of all locations
...so 1209 locations are missing tenure data
...as there are 2251 locations in total


## Save data as GeoJSON with updated columns

In [19]:
# Reload the extended matches from disk and list the available columns.
matches_csv = '../joined-data/simplified_matches_uw_tenure.csv'
df_matches_all = pd.read_csv(matches_csv)
df_matches_all.columns

Index(['CRA_BN_ID', 'CRA_Organization_Name', 'CRA_Category', 'CRA_SubCategory',
       '211_Organization_Name', '211_Location_Name', '211_Address',
       '211_Postal_Code', 'X_Coordinate', 'Y_Coordinate', 'Match_Method',
       'Tenure', 'UW'],
      dtype='object')

In [20]:
# Columns left out of the exported file.
unused_columns = [
    'CRA_BN_ID',
    'CRA_Organization_Name',
    '211_Location_Name',
    '211_Postal_Code',
    'Match_Method',
    'UW',
]
# Single-letter names for the remaining attribute columns
# (presumably to keep the exported GeoJSON small -- TODO confirm).
short_names = {
    '211_Organization_Name': 'N',
    '211_Address': 'A',
    'CRA_Category': 'C',
    'CRA_SubCategory': 'S',
    'Tenure': 'T',
}
df_matches_all = df_matches_all.drop(columns=unused_columns).rename(columns=short_names)

In [25]:
# Build point geometries from the coordinate columns (X, then Y) and export the
# table as GeoJSON in EPSG:4326.
# NOTE(review): rows with missing coordinates are passed through as-is --
# confirm how they should be represented in the exported file.
point_geometry = gpd.points_from_xy(
    df_matches_all['X_Coordinate'],
    df_matches_all['Y_Coordinate'],
)
gdf_matches = gpd.GeoDataFrame(df_matches_all, geometry=point_geometry, crs="EPSG:4326")
gdf_matches.to_file('../joined-data/simplified_matches_4326.geojson', driver='GeoJSON')