In [1]:
import pandas as pd
import numpy as np

## Load in the data

In [19]:
# Load the CRA 2021 organization listing and narrow it to Ontario ("ON") rows.
df_cra21 = (
    pd.read_csv('../cra-data/cra_2021_original.csv', encoding='latin')
    .loc[lambda frame: frame["Province"] == "ON"]
    .iloc[:, :12]  # keep the first 12 columns; the remaining ones were noted as blank
    .reset_index(drop=True)
)

# Comma-joined "street,city,province" string for each organization.
# A missing street or city propagates to a missing Full Address.
df_cra21["Full Address"] = (
    df_cra21["Address Line 1"] + "," + df_cra21["City"] + "," + df_cra21["Province"]
)

  df_cra21 = pd.read_csv('../cra-data/cra_2021_original.csv', encoding='latin')


## Filter for Toronto, York, and Peel

In [20]:
# Three-character postal code prefixes used to select York organizations.
# NOTE(review): "L4M" and "L4N" look like Barrie prefixes rather than York - confirm.
york_postal_codes = [
    "L4M", "L4N", "L7B", "L4G", "L9N", "L3Y", "L3X", "L4E", "L4H", "L6A",
    "L4C", "L4S", "L4B", "L6C", "L6E", "L4A", "L6B", "L4L", "L4K", "L4J",
    "L3T", "L6G", "L3P", "L3R", "L3S",
]

# Three-character postal code prefixes used to select Peel organizations.
# The list previously contained "L4T" twice; the redundant entry is removed.
# NOTE(review): "L7G" looks like a Halton (Georgetown) prefix - confirm.
peel_postal_codes = [
    "L5N", "L5W", "L5T", "L5S", "L4T", "L5M", "L5V", "L5R", "L4Z", "L4W",
    "L5P", "L4V", "L5L", "L5C", "L5B", "L5A", "L4Y", "L4X", "L5K", "L5H",
    "L5G", "L5E", "L5J", "L0N", "L7K", "L7C", "L7E", "L6P", "L6R",
    "L6S", "L6T", "L6V", "L6W", "L6X", "L6Y", "L6Z", "L7A", "L7G"
]

# Remove spaces from the postal code, then take its first 3 characters.
# regex=False makes the replacement a literal one on every pandas version.
df_cra21['Postal Code'] = df_cra21['Postal Code'].str.replace(' ', '', regex=False)
df_cra21['Postal Prefix'] = df_cra21['Postal Code'].str[:3]

# Filter for Toronto (M), York, or Peel.
# na=False makes rows with a missing postal code an explicit False in the
# mask instead of NaN, so the boolean OR below never sees missing values.
df_cra_gta = df_cra21[
    (df_cra21['Postal Code'].str.startswith('M', na=False)) |  # Toronto
    (df_cra21['Postal Prefix'].isin(york_postal_codes)) |  # York
    (df_cra21['Postal Prefix'].isin(peel_postal_codes))  # Peel
]

# Drop the temporary Postal Prefix column from the result and reset its index
# (the column is still present on df_cra21 itself).
df_cra_gta = df_cra_gta.drop(columns=['Postal Prefix'])
df_cra_gta = df_cra_gta.reset_index(drop=True)

## Filter for valid organization type codes

In [24]:
# Read the "Categories-for-Filter" sheet of the organization-type workbook.
df_categories = pd.read_excel(
    io="../cra-data/Organization_Type.xlsx",
    sheet_name="Categories-for-Filter",
)

In [33]:
# Keep only organizations whose (Category, Sub Category) pair is listed in
# df_categories, using an inner join as the filter.
# The key pairs are de-duplicated first: a pair that appears more than once in
# df_categories would otherwise repeat every matching organization row.
df_cra_cat = pd.merge(
    df_cra_gta,
    df_categories[['Category Code', 'Sub-Category Code']].drop_duplicates(),  # only the key columns are needed
    left_on=['Category', 'Sub Category'],
    right_on=['Category Code', 'Sub-Category Code'],
    how='inner'  # Consider "right"?
)

# The right-hand key columns duplicate Category / Sub Category after the merge; remove them.
df_cra_cat = df_cra_cat.drop(columns=['Category Code', 'Sub-Category Code'], errors='ignore')

# Renumber rows 0..n-1
df_cra_cat = df_cra_cat.reset_index(drop=True)

## Add financial information codes

In [37]:
# Read the 2021 financial file and keep only the BN key plus
# the three line-item columns "4050", "4155" and "4850".
df_financial = pd.read_csv(
    "../cra-data/financial_d_and_schedule_6_2021.csv",
    encoding='latin-1',
)[["BN", "4050", "4155", "4850"]]

  df_financial = pd.read_csv("../cra-data/financial_d_and_schedule_6_2021.csv", encoding='latin-1')


In [40]:
# Attach the financial columns to each organization by matching on "BN".
# how="left" keeps every row of df_cra_cat; organizations with no financial
# record get missing values in the added columns.
# NOTE(review): if a BN occurs more than once in df_financial, the matching
# organization row is repeated - confirm BN is unique there.
financial_by_bn = df_financial.set_index("BN")
df_cra_fin = df_cra_cat.join(financial_by_bn, on="BN", how="left")
df_cra_fin = df_cra_fin.reset_index(drop=True)

In [42]:
# Write the combined, filtered table to disk without the row index.
df_cra_fin.to_csv(
    path_or_buf="../cra-data/cra_2021_combined_filtered.csv",
    index=False,
)