In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

import unicodedata
import re

## Add tenure data to UW locations, using old tenure information

In [2]:
df_tenure_by_cra = pd.read_excel('../../2024-work/SPRE_data/2021_CRE_DATA_Oct9.xlsx')
df_tenure_by_cra = df_tenure_by_cra.drop_duplicates()
# ['CRA_BN', 'CRA_LegalName', '211 Parent Agency Name', '211_Address1', '211_PostalCode', 'Tenure', 'Latitude', 'Longitude']
df_tenure_by_cra = df_tenure_by_cra[['CRA_BN', 'Latitude', 'Longitude', 'Tenure']].rename(
    columns={
        'CRA_BN': 'CRA_BN_ID',
        # 'CRA_LegalName': 'CRA_Organization_Name',
        # '211 Parent Agency Name': '211_Organization_Name',
        # '211_Address1': '211_Address',
        # '211_PostalCode': '211_Postal_Code',
        'Latitude': 'Y_Coordinate',
        'Longitude': 'X_Coordinate',
    }
)

# df_tenure_by_cra

In [3]:
df_uw_cra211 = pd.read_csv('../joined-data/uw_cra211.csv')
df_uw_cra211_tenure = pd.merge(
    left = df_uw_cra211,
    right = df_tenure_by_cra,
    on = ['CRA_BN_ID', 'Y_Coordinate', 'X_Coordinate'],
    how = 'left'
)
df_uw_cra211_tenure.to_csv('../joined-data/uw_cra211_tenure.csv', index=False)

num_tenure = df_uw_cra211_tenure['Tenure'].count()
num_locs = len(df_uw_cra211_tenure)
pct_tenure = round(num_tenure / num_locs, 4) * 100
print(f'Number of locations with tenure data: {num_tenure}')
print(f'...which is {pct_tenure}% of all locations')
print(f'...so {num_locs - num_tenure} locations are missing tenure data')
print(f'...as there are {num_locs} locations in total')

Number of locations with tenure data: 430
...which is 43.57% of all locations
...so 557 locations are missing tenure data
...as there are 987 locations in total


## Join tenure data to simplified_matches

In [4]:
df_matched_extended = pd.read_csv('../joined-data/simplified_matches_uw.csv')

# Add tenure information
df_matched_extended = pd.merge(
    left = df_matched_extended,
    right = df_tenure_by_cra,
    on = ['CRA_BN_ID', 'Y_Coordinate', 'X_Coordinate'],
    how = 'left'
)

# Specify with an organization is UW or not
df_matched_extended['UW'] = df_matched_extended['CRA_Organization_Name'].isin(df_uw_cra211_tenure['CRA_Organization_Name'])

df_matched_extended.to_csv('../joined-data/simplified_matches_uw_tenure.csv', index=False)

num_tenure = df_matched_extended['Tenure'].count()
num_locs = len(df_matched_extended)
pct_tenure = round(num_tenure / num_locs, 4) * 100
print(f'Number of all locations with tenure data: {num_tenure}')
print(f'...which is {pct_tenure}% of all locations')
print(f'...so {num_locs - num_tenure} locations are missing tenure data')
print(f'...as there are {num_locs} locations in total')

Number of all locations with tenure data: 1042
...which is 46.35% of all locations
...so 1206 locations are missing tenure data
...as there are 2248 locations in total


In [5]:
df_matched_extended = pd.read_csv('../joined-data/simplified_matches_uw_tenure.csv')

# Filter to only rows with no Tenure
df_matched_extended_nt = df_matched_extended[df_matched_extended['Tenure'].isna()]

# Count occurrences
name_counts = df_matched_extended_nt['211_Organization_Name'].value_counts()

# Split into duplicates and uniques
df_dupes = df_matched_extended_nt[df_matched_extended_nt['211_Organization_Name'].isin(name_counts[name_counts > 1].index)]
df_uniques = df_matched_extended_nt[df_matched_extended_nt['211_Organization_Name'].isin(name_counts[name_counts == 1].index)]

# Save the version with only repeated orgs
df_dupes.sort_values(by='211_City').to_csv('../tenure-data/simplified_matches_uw_no_tenure.csv', index=False)

# Save the version with only unique orgs (if needed)
df_uniques.sort_values(by='211_City').to_csv('../tenure-data/simplified_matches_uw_no_tenure_uniques_only.csv', index=False)

# Optionally, view counts
print(df_dupes['211_City'].value_counts())

211_City
Toronto                   772
Mississauga                45
Brampton                   37
Markham                    24
Richmond Hill              24
Newmarket                  23
Vaughan                    17
Aurora                     17
Georgina                    5
Whitchurch-Stouffville      2
King                        2
East Gwillimbury            2
Caledon                     1
Name: count, dtype: int64


## Join manually retrieved tenure data

Merge in Toronto data

In [6]:
df_matches_all = pd.read_csv('../joined-data/simplified_matches_uw_tenure.csv')
df_tenure_to = pd.read_csv('../tenure-data/Non-Profit UW Tenure Toronto - Sheet1.csv')

In [7]:
merge_cols = ['CRA_Organization_Name', '211_Organization_Name', '211_Location_Name', '211_Address', '211_Postal_Code', 'X_Coordinate', 'Y_Coordinate']

# Merge tenure info from df_tenure_to into df_matches_all
df_merged = pd.merge(
    df_matches_all,
    df_tenure_to[merge_cols + ['Tenure (Rent or Own or Unknown)']],
    on=merge_cols,
    how='left'
)

# Create new column 'Tenure_Public' initialized as empty string
df_merged['Tenure_Public'] = ''

# Function to update tenure values
def update_tenure(row):
    current_tenure = row['Tenure'] 
    new_tenure_info = row['Tenure (Rent or Own or Unknown)']
    
    # Only update if current tenure is 'Unknown' or 'Empty'
    if current_tenure == 'Unknown' or pd.isna(current_tenure):
        if pd.isna(new_tenure_info):
            return row['Tenure'], ''
        elif str(new_tenure_info).startswith('City of '):
            return 'Rent', new_tenure_info
        else:
            return new_tenure_info, ''
    else:
        return row['Tenure'], ''

# Apply function to update Tenure and Tenure_Public columns
df_merged[['Tenure', 'Tenure_Public']] = df_merged.apply(update_tenure, axis=1, result_type='expand')

# Drop helper column from df_tenure_to
df_matches_all_2 = df_merged.drop(columns=['Tenure (Rent or Own or Unknown)'])

Merge in non-Toronto GTA data

In [8]:
df_tenure_gta = pd.read_csv('../tenure-data/2021_Joinedv2_short - Copy(CRE2021).csv')

In [9]:
# Ensure the column is named 'Tenure_GTA' (skip renaming again if already done)
# If it’s not renamed yet, you can uncomment the next line:
df_tenure_gta = df_tenure_gta.rename(columns={'Tenure': 'Tenure_GTA'})

# Clean up df_tenure_gta['Tenure_GTA']: replace non 'Rent', 'Own', 'Unknown' with 'Unknown'
df_tenure_gta['Tenure_GTA'] = df_tenure_gta['Tenure_GTA'].str.strip()
df_tenure_gta['Tenure_GTA'] = df_tenure_gta['Tenure_GTA'].where(df_tenure_gta['Tenure_GTA'].isin(['Rent', 'Own', 'Unknown']), 'Unknown')

merge_cols = ['CRA_Organization_Name', '211_Organization_Name', '211_Location_Name', '211_Address', '211_Postal_Code', 'X_Coordinate', 'Y_Coordinate']

# Group by merge columns and resolve conflicts
def resolve_tenure(series):
    unique_tenures = series.dropna().unique()
    if len(unique_tenures) == 1:
        return unique_tenures[0]
    else:
        return 'Unknown'

df_tenure_gta_agg = df_tenure_gta.groupby(merge_cols)['Tenure_GTA'].apply(resolve_tenure).reset_index()

# Merge the resolved GTA tenure info into df_matches_all_2
df_merged_gta = pd.merge(
    df_matches_all_2,
    df_tenure_gta_agg,
    on=merge_cols,
    how='left'
)

# Update 'Tenure' where it is 'Unknown' or NaN, using 'Tenure_GTA' if available
def update_tenure_gta(row):
    current_tenure = str(row['Tenure']).strip().lower()
    gta_tenure = row['Tenure_GTA']
    
    if current_tenure == 'unknown' or pd.isna(row['Tenure']):
        if pd.notna(gta_tenure) and gta_tenure != 'Unknown':
            return gta_tenure
    return row['Tenure']

df_merged_gta['Tenure'] = df_merged_gta.apply(update_tenure_gta, axis=1)

# Drop helper column
df_matches_all_3 = df_merged_gta.drop(columns=['Tenure_GTA'])

Set single property values to 'Own' if we detect a property value

In [10]:
df_matches_all_3.columns

Index(['CRA_BN_ID', 'CRA_Organization_Name', 'CRA_Category', 'CRA_SubCategory',
       '211_Organization_Name', '211_Location_Name', '211_Address', '211_City',
       '211_Postal_Code', 'X_Coordinate', 'Y_Coordinate', 'Match_Method',
       'Tenure', 'UW', 'Tenure_Public'],
      dtype='object')

In [11]:
df_land_status = pd.read_csv('../tenure-data/2021_Joinedv2(CRE2021).csv', encoding='latin')
df_land_status = df_land_status[[
    'CRA_BN_ID', 'CRA_Organi', 'F211_Organ', 'F211_Locat', 'F211_Addre',
    'F211_Posta', 'X_Coordina', 'Y_Coordina', 'Match_Meth', 'Tenure', 'UW', 
    'Land_or_Bldg_owned_4050', 'land_bldg_4155',
]].rename(columns={
    'CRA_Organi': 'CRA_Organization_Name',
    'F211_Organ': '211_Organization_Name',
    'F211_Locat': '211_Location_Name', 
    'F211_Addre': '211_Address',
    'F211_Posta': '211_Postal_Code',
    'X_Coordina': 'X_Coordinate',
    'Y_Coordina': 'Y_Coordinate',
    'Match_Meth': 'Match_Method',
})
df_land_status.columns

Index(['CRA_BN_ID', 'CRA_Organization_Name', '211_Organization_Name',
       '211_Location_Name', '211_Address', '211_Postal_Code', 'X_Coordinate',
       'Y_Coordinate', 'Match_Method', 'Tenure', 'UW',
       'Land_or_Bldg_owned_4050', 'land_bldg_4155'],
      dtype='object')

In [12]:
# Step 1: Clean up whitespace in relevant columns
df_land_status['Land_or_Bldg_owned_4050'] = df_land_status['Land_or_Bldg_owned_4050'].str.strip()
df_land_status['land_bldg_4155'] = df_land_status['land_bldg_4155'].str.strip()

# Step 2: Convert 'land_bldg_4155' to integer (handle NaNs or empty strings)
df_land_status['land_bldg_4155'] = pd.to_numeric(df_land_status['land_bldg_4155'], errors='coerce').fillna(0).astype(int)

# Step 3: Keep only rows where '211_Organization_Name' is unique
unique_orgs = df_land_status['211_Organization_Name'].value_counts()
unique_org_names = unique_orgs[unique_orgs == 1].index
df_land_status_unique = df_land_status[df_land_status['211_Organization_Name'].isin(unique_org_names)].copy()

# Step 4: Update 'Tenure' based on 'Land_or_Bldg_owned_4050' and 'land_bldg_4155'
def infer_tenure(row):
    current_tenure = row['Tenure']
    owned_status = row['Land_or_Bldg_owned_4050']
    building_value = row['land_bldg_4155']
    
    if current_tenure in ['Rent', 'Own']:
        return current_tenure  # Leave as is
    if pd.notna(owned_status) and owned_status != 'N' and building_value > 0:
        return 'Own'
    else:
        return 'Rent'

df_land_status_unique['Tenure_LandStatus'] = df_land_status_unique.apply(infer_tenure, axis=1)

# Step 5: Prepare for merge by keeping necessary columns
merge_cols = ['CRA_Organization_Name', '211_Organization_Name', '211_Location_Name', '211_Address', '211_Postal_Code', 'X_Coordinate', 'Y_Coordinate']
df_land_status_merge = df_land_status_unique[merge_cols + ['Tenure_LandStatus']]

# Step 6: Merge into df_matches_all_3
df_merged_land = pd.merge(
    df_matches_all_3,
    df_land_status_merge,
    on=merge_cols,
    how='left'
)

# Step 7: Update 'Tenure' where it's 'Unknown' or NaN, using 'Tenure_LandStatus'
def update_tenure_land(row):
    current_tenure = str(row['Tenure']).strip().lower()
    land_tenure = row['Tenure_LandStatus']
    
    if current_tenure == 'unknown' or pd.isna(row['Tenure']):
        if pd.notna(land_tenure):
            return land_tenure
    return row['Tenure']

df_merged_land['Tenure'] = df_merged_land.apply(update_tenure_land, axis=1)

# Step 8: Drop helper column
df_matches_all_4 = df_merged_land.drop(columns=['Tenure_LandStatus'])

In [13]:
df_matches_all_4['Tenure'].value_counts()

Tenure
Rent       1317
Own         543
Unknown     173
Name: count, dtype: int64

Join Sarah's manual listing of tenure

In [14]:
df_sarah = pd.read_csv('../tenure-data/simplified_matches_uw_tenure_SCupdate(in).csv')

In [15]:
# Step 1: Define merge columns
merge_cols = [
    'CRA_Organization_Name', '211_Organization_Name', '211_Location_Name',
    '211_Address', '211_Postal_Code', 'X_Coordinate', 'Y_Coordinate'
]

# Step 2: Prepare df_sarah for merge (keep only necessary columns)
df_sarah_merge = df_sarah[merge_cols + ['Tenure']].rename(columns={'Tenure': 'Tenure_Sarah'})

# Step 3: Merge with df_matches_all_4
df_merged_sarah = pd.merge(
    df_matches_all_4,
    df_sarah_merge,
    on=merge_cols,
    how='left'
)

# Step 4: Update Tenure only if current is not 'Rent' or 'Own'
def update_tenure_sarah(row):
    current_tenure = str(row['Tenure']).strip().lower()
    sarah_tenure = row['Tenure_Sarah']
    
    if current_tenure not in ['rent', 'own'] and pd.notna(sarah_tenure):
        return sarah_tenure
    return row['Tenure']

df_merged_sarah['Tenure'] = df_merged_sarah.apply(update_tenure_sarah, axis=1)

# Step 5: Drop helper column
df_matches_all_5 = df_merged_sarah.drop(columns=['Tenure_Sarah'])

In [16]:
df_matches_all_5['Tenure'] = df_matches_all_5['Tenure'].fillna('Unknown')
df_matches_all_5['Tenure'].value_counts()

Tenure
Rent       1468
Own         560
Unknown     221
Name: count, dtype: int64

Concatenate the extra entries manually defined by Sarah

In [19]:
df_extra_sarah = pd.read_csv('../uw-data/uw_additional_entries_sc.csv')

In [25]:
df_matches_all_6 = pd.concat([df_matches_all_5, df_extra_sarah])
df_matches_all_6 = df_matches_all_6.sort_values(by='CRA_Organization_Name')
df_matches_all_6['Tenure'].value_counts()

Tenure
Rent       1470
Own         564
Unknown     221
Name: count, dtype: int64

In [26]:
df_matches_all_6.to_csv('../joined-data/simplified_matches_uw_tenure_full.csv', index=False)