In [98]:
import math
import multiprocessing as mp
import re

import geopandas as gpd
import numpy as np
import pandas as pd
import rtree
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from shapely.ops import unary_union
from tqdm import tqdm

In [2]:
# Read in both CSV files
file1_path = "data/Parcel_Data_2021_Table_-2691831558175163259.csv"
file2_path = "data/TTC_Secured_Property_Taxes.csv"

# Read the CSV files
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Display basic info about the dataframes
print(f"Dataset 1 shape: {df1.shape}")
print(f"Dataset 2 shape: {df2.shape}")


  df1 = pd.read_csv(file1_path)
  df2 = pd.read_csv(file2_path)


Dataset 1 shape: (2424770, 51)
Dataset 2 shape: (2571799, 69)


In [3]:
df1[df1["Property Location"].str.startswith("1017 HYPERION AVE", na=False)]


Unnamed: 0,Zip Code,City Tax Rate Area,AIN,Roll Year,Tax Rate Area Code,Assessor ID,Property Location,Property Use Type,Property Use Code,Use Code 1st Digit,...,Address House Number Fraction,Direction,Street,Unit Number,City,Zip Code.1,Row ID,Location Latitude,Location Longitude,OBJECTID
1252963,90029-3109,LOS ANGELES,5427007013,2024,13,5427-007-013,1017 HYPERION AVE LOS ANGELES CA 90029,R-I,200,Residential,...,,,HYPERION AVE,,LOS ANGELES CA,90029.0,20245427007013,34.090412,-118.280831,56340440


In [4]:
df1["City Tax Rate Area"].value_counts()

City Tax Rate Area
LOS ANGELES       805738
unincorporated    326824
LONG BEACH        106381
SANTA CLARITA      71910
LANCASTER          50077
                   ...  
ROSEMEAD             269
MAYWOOD              213
PARAMOUNT             94
INDUSTRY              36
EL SEGUNDO             4
Name: count, Length: 123, dtype: int64

In [5]:
df2 = df2.loc[df2.groupby("PARCEL_NUMBER")["PARCEL_HISTORY_YRSEQ"].idxmax()]


In [6]:
df2[df2["PARCEL_NUMBER"] == 5427007013]


Unnamed: 0,RECORD_TYPE,PARCEL_YEAR_8900S,PARCEL_NUMBER,YEAR_ACTIVE_TABLE,MINING_RIGHTS_KEY,TAX_STATUS_KEY,YEAR_TAX_DEFAULTED,SENIOR_CITIZEN_YEAR,F4PAY,F500_ACCOUNT_NUMBER,...,HOMEOWNERS_EXEMPTION,MOBILE_OWNER_EXEMPTION,FIXTURES_EXEMPTION,PERSONAL_PROPERTY_EXEMPTION,TAX_TYPE_1_AMOUNT,TAXTYPE_2_AMOUNT,TAX_TYPE_3_AMOUNT,TAX_TYPE_4_AMOUNT,TAX_TYPE_8_AMOUNT,OBJECTID
1312298,1,,5427007013,111111111,,0,0,0,0,0,...,,,,,0.0,,0.0,0,0.0,1312299


In [7]:
# Merge the dataframes - df1 is the left dataframe, and we're matching AIN from df1
# with PARCEL_NUMBER from df2
merged_df = pd.merge(
    left=df1, right=df2, left_on="AIN", right_on="PARCEL_NUMBER", how="left"
)

# Display info about the merged dataframe
print(f"Merged dataset shape: {merged_df.shape}")

# Preview the merged dataframe
print(merged_df.head())

Merged dataset shape: (2424770, 120)
     Zip Code City Tax Rate Area         AIN  Roll Year  Tax Rate Area Code  \
0  91304-3327        LOS ANGELES  2004001003       2024                  16   
1  91304-3327        LOS ANGELES  2004001004       2024                  16   
2  91304-3327        LOS ANGELES  2004001005       2024                  16   
3  91304-3332        LOS ANGELES  2004001008       2024                  16   
4  91304-3332        LOS ANGELES  2004001009       2024                  16   

    Assessor ID                        Property Location Property Use Type  \
0  2004-001-003    8321 FAUST AVE  LOS ANGELES CA  91304               SFR   
1  2004-001-004    8313 FAUST AVE  LOS ANGELES CA  91304               SFR   
2  2004-001-005    8309 FAUST AVE  LOS ANGELES CA  91304               SFR   
3  2004-001-008  8325 MAYNARD AVE  LOS ANGELES CA  91304               SFR   
4  2004-001-009  8311 MAYNARD AVE  LOS ANGELES CA  91304               SFR   

  Property Use Code

In [8]:
merged_df = merged_df[merged_df["City Tax Rate Area"] == "LOS ANGELES"]
merged_df

Unnamed: 0,Zip Code,City Tax Rate Area,AIN,Roll Year,Tax Rate Area Code,Assessor ID,Property Location,Property Use Type,Property Use Code,Use Code 1st Digit,...,HOMEOWNERS_EXEMPTION,MOBILE_OWNER_EXEMPTION,FIXTURES_EXEMPTION,PERSONAL_PROPERTY_EXEMPTION,TAX_TYPE_1_AMOUNT,TAXTYPE_2_AMOUNT,TAX_TYPE_3_AMOUNT,TAX_TYPE_4_AMOUNT,TAX_TYPE_8_AMOUNT,OBJECTID_y
0,91304-3327,LOS ANGELES,2004001003,2024,16,2004-001-003,8321 FAUST AVE LOS ANGELES CA 91304,SFR,0101,Residential,...,,,,,0.0,,0.0,0.0,0.0,1.0
1,91304-3327,LOS ANGELES,2004001004,2024,16,2004-001-004,8313 FAUST AVE LOS ANGELES CA 91304,SFR,0101,Residential,...,,,,,0.0,,0.0,0.0,0.0,2.0
2,91304-3327,LOS ANGELES,2004001005,2024,16,2004-001-005,8309 FAUST AVE LOS ANGELES CA 91304,SFR,0100,Residential,...,,,,,0.0,,0.0,0.0,0.0,3.0
3,91304-3332,LOS ANGELES,2004001008,2024,16,2004-001-008,8325 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,Residential,...,7000.0,,,,0.0,,0.0,0.0,0.0,4.0
4,91304-3332,LOS ANGELES,2004001009,2024,16,2004-001-009,8311 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,Residential,...,7000.0,,,,0.0,,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016313,90732-4530,LOS ANGELES,7563037052,2024,20,7563-037-052,2193 WARMOUTH ST LOS ANGELES CA 90732,SFR,0100,Residential,...,7000.0,,,,0.0,,0.0,0.0,0.0,2110629.0
2016314,90732-4519,LOS ANGELES,7563037053,2024,20,7563-037-053,3810 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,Residential,...,,,,,0.0,,0.0,0.0,0.0,2110630.0
2016315,90732-4519,LOS ANGELES,7563037054,2024,20,7563-037-054,3806 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,Residential,...,7000.0,,,,0.0,,0.0,0.0,0.0,2110631.0
2016316,90732-4556,LOS ANGELES,7563037055,2024,20,7563-037-055,2197 W PASEO DEL MAR LOS ANGELES CA 90732,SFR,0100,Residential,...,7000.0,,,,0.0,,0.0,0.0,0.0,2110632.0


In [9]:
# Option 1: Set the display option to show all columns
pd.set_option("display.max_columns", None)

print(list(merged_df.columns))

['Zip Code', 'City Tax Rate Area', 'AIN', 'Roll Year', 'Tax Rate Area Code', 'Assessor ID', 'Property Location', 'Property Use Type', 'Property Use Code', 'Use Code 1st Digit', 'Use Code 2nd Digit', 'Use Code 3rd Digit', 'Use Code 4th Digit', 'Number of Buildings', 'Year Built', 'Effective Year', 'Square Footage', 'Number of Bedrooms', 'Number of Bathrooms', 'Number of Units', 'Recording Date', 'Land Value', 'Land Base Year', 'Improvement Value', 'Improvement Base Year', 'Total Value, Land + Improvement', 'Home Owners Exemption', 'Real Estate Exemption', 'Fixture Value', 'Fixture Exemption', 'Personal Property Value', 'Personal Property Exemption', 'Property taxable?', 'Total Value', 'Total Exemption', 'Taxable Value', 'Classification', 'Region Number', 'Cluster Code', 'Parcel Legal Description', 'Address House Number', 'Address House Number Fraction', 'Direction', 'Street', 'Unit Number', 'City', 'Zip Code.1', 'Row ID', 'Location Latitude', 'Location Longitude', 'OBJECTID_x', 'RECORD_

In [10]:
selected = merged_df[
    [
        "Zip Code",
        "City Tax Rate Area",
        "AIN",
        "Roll Year",
        "Tax Rate Area Code",
        "Property Location",
        "Property Use Type",
        "Property Use Code",
        "Number of Buildings",
        "Year Built",
        "Effective Year",
        "Square Footage",
        "Number of Bedrooms",
        "Number of Bathrooms",
        "Number of Units",
        "Land Value",
        "Land Base Year",
        "Improvement Value",
        "Improvement Base Year",
        "Total Value, Land + Improvement",
        "Home Owners Exemption",
        "Real Estate Exemption",
        "Fixture Value",
        "Fixture Exemption",
        "Personal Property Value",
        "Personal Property Exemption",
        "Property taxable?",
        "Total Value",
        "Total Exemption",
        "Taxable Value",
        "Classification",
        "Region Number",
        "Location Latitude",
        "Location Longitude",
        "F1ST_INSTALLMENT_TAX",
        "F1ST_INSTALLMENT_BALANCE_DUE",
        "F1ST_INSTALLMENT_PENALTY",
        "F1ST_INSTALLMENT_PENALTY_PAID",
        "F1ST_INSTALLMENT_PAID_DATE",
        "F2ND_INSTALLMENT_TAX",
        "F2ND_INSTALLMENT_BALANCE_DUE",
        "F2ND_INSTALLMENT_PENALTY",
        "F2ND_INSTALLMENT_PENALTY_PAID",
        "F2ND_INSTALLMENT_PAID_DATE",
        "COST_DUE",
        "COST_PAID",
        "NONSUFFICIENT_FUNDS_DUE",
        "NONSUFFICIENT_FUNDS_PAID",
        "TOTAL_TAX_DUE",
        "TAX_TYPE_1_AMOUNT",
        "TAXTYPE_2_AMOUNT",
        "TAX_TYPE_3_AMOUNT",
        "TAX_TYPE_4_AMOUNT",
        "TAX_TYPE_8_AMOUNT",
    ]
]

In [11]:
selected["Total_Taxes_Paid_Calc"] = (
    selected["F1ST_INSTALLMENT_TAX"] + selected["F2ND_INSTALLMENT_TAX"]
)
selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected["Total_Taxes_Paid_Calc"] = (


Unnamed: 0,Zip Code,City Tax Rate Area,AIN,Roll Year,Tax Rate Area Code,Property Location,Property Use Type,Property Use Code,Number of Buildings,Year Built,Effective Year,Square Footage,Number of Bedrooms,Number of Bathrooms,Number of Units,Land Value,Land Base Year,Improvement Value,Improvement Base Year,"Total Value, Land + Improvement",Home Owners Exemption,Real Estate Exemption,Fixture Value,Fixture Exemption,Personal Property Value,Personal Property Exemption,Property taxable?,Total Value,Total Exemption,Taxable Value,Classification,Region Number,Location Latitude,Location Longitude,F1ST_INSTALLMENT_TAX,F1ST_INSTALLMENT_BALANCE_DUE,F1ST_INSTALLMENT_PENALTY,F1ST_INSTALLMENT_PENALTY_PAID,F1ST_INSTALLMENT_PAID_DATE,F2ND_INSTALLMENT_TAX,F2ND_INSTALLMENT_BALANCE_DUE,F2ND_INSTALLMENT_PENALTY,F2ND_INSTALLMENT_PENALTY_PAID,F2ND_INSTALLMENT_PAID_DATE,COST_DUE,COST_PAID,NONSUFFICIENT_FUNDS_DUE,NONSUFFICIENT_FUNDS_PAID,TOTAL_TAX_DUE,TAX_TYPE_1_AMOUNT,TAXTYPE_2_AMOUNT,TAX_TYPE_3_AMOUNT,TAX_TYPE_4_AMOUNT,TAX_TYPE_8_AMOUNT,Total_Taxes_Paid_Calc
0,91304-3327,LOS ANGELES,2004001003,2024,16,8321 FAUST AVE LOS ANGELES CA 91304,SFR,0101,1,1973,1973,2090,4,3,1,711689,2006,301176,2006,1012865,0,0,0,0,0,0,Y,1012865,0,1012865,,2,34.220225,-118.620681,6326.89,0.0,0.0,0.0,11/05/2024,6326.87,0.00,0.0,0.0,02/03/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,12653.76
1,91304-3327,LOS ANGELES,2004001004,2024,16,8313 FAUST AVE LOS ANGELES CA 91304,SFR,0101,1,1973,1973,2479,5,3,1,370538,2010,255879,2010,626417,0,0,0,0,0,0,Y,626417,0,626417,,2,34.220044,-118.620681,4017.94,0.0,0.0,0.0,12/10/2024,4017.94,4017.94,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,8035.88
2,91304-3327,LOS ANGELES,2004001005,2024,16,8309 FAUST AVE LOS ANGELES CA 91304,SFR,0100,1,1973,1973,2057,4,2,1,526360,2018,198577,2018,724937,0,0,0,0,0,0,Y,724937,0,724937,,2,34.219862,-118.620688,4569.88,0.0,0.0,0.0,11/20/2024,4569.88,4569.88,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,9139.76
3,91304-3332,LOS ANGELES,2004001008,2024,16,8325 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,1,1978,1978,2423,4,3,1,128421,1980,221965,1980,350386,7000,0,0,0,0,0,Y,350386,7000,343386,,2,34.220339,-118.622718,2354.27,0.0,0.0,0.0,12/11/2024,2354.26,2354.26,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,4708.53
4,91304-3332,LOS ANGELES,2004001009,2024,16,8311 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,1,1978,1978,2226,4,3,1,139933,1984,210012,1984,349945,7000,0,0,0,0,0,Y,349945,7000,342945,,2,34.220327,-118.623062,2358.49,0.0,0.0,0.0,12/02/2024,2358.49,2358.49,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,4716.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016313,90732-4530,LOS ANGELES,7563037052,2024,20,2193 WARMOUTH ST LOS ANGELES CA 90732,SFR,0100,1,1959,1959,1933,3,3,1,485785,1995,263473,1995,749258,7000,0,0,0,0,0,Y,749258,7000,742258,,14,33.719514,-118.325376,4766.57,0.0,0.0,0.0,11/07/2024,4766.57,0.00,0.0,0.0,02/21/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,9533.14
2016314,90732-4519,LOS ANGELES,7563037053,2024,20,3810 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,1,1965,1965,2206,3,2,1,793428,2020,198356,2020,991784,0,0,0,0,0,0,Y,991784,0,991784,,14,33.719952,-118.325128,6241.31,0.0,0.0,0.0,11/27/2024,6241.31,0.00,0.0,0.0,01/16/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,12482.62
2016315,90732-4519,LOS ANGELES,7563037054,2024,20,3806 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,1,1959,1961,1862,3,2,1,703626,2008,175899,2008,879525,7000,0,0,0,0,0,Y,879525,7000,872525,,14,33.720119,-118.325081,5512.25,0.0,0.0,0.0,12/11/2024,5512.25,5512.25,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,11024.50
2016316,90732-4556,LOS ANGELES,7563037055,2024,20,2197 W PASEO DEL MAR LOS ANGELES CA 90732,SFR,0100,1,1962,1962,2153,4,2,1,47779,1975,92991,1975,140770,7000,0,0,0,0,0,Y,140770,7000,133770,,14,33.720279,-118.325034,1096.85,0.0,0.0,0.0,12/16/2024,1096.85,1096.85,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,2193.70


In [12]:
# selected.to_csv("LA_City_Property_Tax_Data.csv")

In [13]:
zoning = gpd.read_file("data/Zoning.geojson")
zoning = zoning[["ZONE_COMPLT", "geometry"]]
zoning

Unnamed: 0,ZONE_COMPLT,geometry
0,[LF1-WH1-5] [A1-1L],"POLYGON ((-118.24535 34.07054, -118.2455 34.07..."
1,[VF1-WH1-6] [I1-N],"POLYGON ((-118.2239 34.07033, -118.2239 34.070..."
2,[LN1-MU2-5] [P2-FA] [CPIO],"POLYGON ((-118.24139 34.06798, -118.24155 34.0..."
3,[LN1-SH2-5] [RX1-FA] [CPIO],"POLYGON ((-118.23419 34.06842, -118.23428 34.0..."
4,[LM2-MU1-5] [CX2-FA] [CPIO-O],"POLYGON ((-118.24305 34.06724, -118.24302 34.0..."
...,...,...
58883,[Q]C2-2D-CPIO,"POLYGON ((-118.32636 34.0872, -118.32641 34.08..."
58884,C2-2D-SN-CPIO,"POLYGON ((-118.33215 34.09785, -118.33215 34.0..."
58885,[Q]C2-2D-CPIO,"POLYGON ((-118.30238 34.09071, -118.30238 34.0..."
58886,[Q]C2-1XL-CPIO,"POLYGON ((-118.34137 34.08304, -118.34151 34.0..."


In [14]:
# Step 1: Convert the regular dataframe to a GeoDataFrame by creating Point geometries
# First, make a copy to avoid modifying the original
selected_gdf = selected.copy()

# Create a geometry column with Point objects from latitude and longitude
selected_gdf["geometry"] = selected_gdf.apply(
    lambda row: Point(row["Location Longitude"], row["Location Latitude"]), axis=1
)

# Convert to a GeoDataFrame
selected_gdf = gpd.GeoDataFrame(selected_gdf, geometry="geometry")

# Make sure both GeoDataFrames have the same CRS (Coordinate Reference System)
# If you know the CRS of your data, set it explicitly
# For example, if your coordinates are in WGS84:
selected_gdf.crs = "EPSG:4326"

# Ensure zoning has the same CRS, or reproject if needed
if zoning.crs != selected_gdf.crs:
    zoning = zoning.to_crs(selected_gdf.crs)

# Step 2: Perform spatial join - this will add zoning attributes to each point
joined_data = gpd.sjoin(selected_gdf, zoning, how="left", predicate="within")

# Step 3: If you want a regular dataframe with the original columns plus zoning info
# (You might want to drop the extra geometry column and index_right column)
result = pd.DataFrame(joined_data.drop(columns=["geometry", "index_right"]))

# Alternatively, if you want to add specific columns from zoning to your original dataframe:
# selected['zoning_type'] = joined_data['zoning_type']  # Replace with your actual column names

In [15]:
result["ZONE_PREFIX"] = result["ZONE_COMPLT"].str.split("-").str[0]
result

Unnamed: 0,Zip Code,City Tax Rate Area,AIN,Roll Year,Tax Rate Area Code,Property Location,Property Use Type,Property Use Code,Number of Buildings,Year Built,Effective Year,Square Footage,Number of Bedrooms,Number of Bathrooms,Number of Units,Land Value,Land Base Year,Improvement Value,Improvement Base Year,"Total Value, Land + Improvement",Home Owners Exemption,Real Estate Exemption,Fixture Value,Fixture Exemption,Personal Property Value,Personal Property Exemption,Property taxable?,Total Value,Total Exemption,Taxable Value,Classification,Region Number,Location Latitude,Location Longitude,F1ST_INSTALLMENT_TAX,F1ST_INSTALLMENT_BALANCE_DUE,F1ST_INSTALLMENT_PENALTY,F1ST_INSTALLMENT_PENALTY_PAID,F1ST_INSTALLMENT_PAID_DATE,F2ND_INSTALLMENT_TAX,F2ND_INSTALLMENT_BALANCE_DUE,F2ND_INSTALLMENT_PENALTY,F2ND_INSTALLMENT_PENALTY_PAID,F2ND_INSTALLMENT_PAID_DATE,COST_DUE,COST_PAID,NONSUFFICIENT_FUNDS_DUE,NONSUFFICIENT_FUNDS_PAID,TOTAL_TAX_DUE,TAX_TYPE_1_AMOUNT,TAXTYPE_2_AMOUNT,TAX_TYPE_3_AMOUNT,TAX_TYPE_4_AMOUNT,TAX_TYPE_8_AMOUNT,Total_Taxes_Paid_Calc,ZONE_COMPLT,ZONE_PREFIX
0,91304-3327,LOS ANGELES,2004001003,2024,16,8321 FAUST AVE LOS ANGELES CA 91304,SFR,0101,1,1973,1973,2090,4,3,1,711689,2006,301176,2006,1012865,0,0,0,0,0,0,Y,1012865,0,1012865,,2,34.220225,-118.620681,6326.89,0.0,0.0,0.0,11/05/2024,6326.87,0.00,0.0,0.0,02/03/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,12653.76,RE9-1,RE9
1,91304-3327,LOS ANGELES,2004001004,2024,16,8313 FAUST AVE LOS ANGELES CA 91304,SFR,0101,1,1973,1973,2479,5,3,1,370538,2010,255879,2010,626417,0,0,0,0,0,0,Y,626417,0,626417,,2,34.220044,-118.620681,4017.94,0.0,0.0,0.0,12/10/2024,4017.94,4017.94,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,8035.88,RE9-1,RE9
2,91304-3327,LOS ANGELES,2004001005,2024,16,8309 FAUST AVE LOS ANGELES CA 91304,SFR,0100,1,1973,1973,2057,4,2,1,526360,2018,198577,2018,724937,0,0,0,0,0,0,Y,724937,0,724937,,2,34.219862,-118.620688,4569.88,0.0,0.0,0.0,11/20/2024,4569.88,4569.88,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,9139.76,RE9-1,RE9
3,91304-3332,LOS ANGELES,2004001008,2024,16,8325 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,1,1978,1978,2423,4,3,1,128421,1980,221965,1980,350386,7000,0,0,0,0,0,Y,350386,7000,343386,,2,34.220339,-118.622718,2354.27,0.0,0.0,0.0,12/11/2024,2354.26,2354.26,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,4708.53,RE11-1,RE11
4,91304-3332,LOS ANGELES,2004001009,2024,16,8311 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,1,1978,1978,2226,4,3,1,139933,1984,210012,1984,349945,7000,0,0,0,0,0,Y,349945,7000,342945,,2,34.220327,-118.623062,2358.49,0.0,0.0,0.0,12/02/2024,2358.49,2358.49,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,4716.98,RE11-1,RE11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016313,90732-4530,LOS ANGELES,7563037052,2024,20,2193 WARMOUTH ST LOS ANGELES CA 90732,SFR,0100,1,1959,1959,1933,3,3,1,485785,1995,263473,1995,749258,7000,0,0,0,0,0,Y,749258,7000,742258,,14,33.719514,-118.325376,4766.57,0.0,0.0,0.0,11/07/2024,4766.57,0.00,0.0,0.0,02/21/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,9533.14,R1-1XL,R1
2016314,90732-4519,LOS ANGELES,7563037053,2024,20,3810 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,1,1965,1965,2206,3,2,1,793428,2020,198356,2020,991784,0,0,0,0,0,0,Y,991784,0,991784,,14,33.719952,-118.325128,6241.31,0.0,0.0,0.0,11/27/2024,6241.31,0.00,0.0,0.0,01/16/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,12482.62,R1-1XL,R1
2016315,90732-4519,LOS ANGELES,7563037054,2024,20,3806 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,1,1959,1961,1862,3,2,1,703626,2008,175899,2008,879525,7000,0,0,0,0,0,Y,879525,7000,872525,,14,33.720119,-118.325081,5512.25,0.0,0.0,0.0,12/11/2024,5512.25,5512.25,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,11024.50,R1-1XL,R1
2016316,90732-4556,LOS ANGELES,7563037055,2024,20,2197 W PASEO DEL MAR LOS ANGELES CA 90732,SFR,0100,1,1962,1962,2153,4,2,1,47779,1975,92991,1975,140770,7000,0,0,0,0,0,Y,140770,7000,133770,,14,33.720279,-118.325034,1096.85,0.0,0.0,0.0,12/16/2024,1096.85,1096.85,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,2193.70,R1-1XL,R1


In [16]:
result["ZONE_PREFIX"].value_counts()

ZONE_PREFIX
R1           251579
R3            61505
RS            55093
R2            51851
RD1.5         36204
              ...  
C1(PV)            1
[T]M2             1
M2(PV)            1
[Q]CM(GM)         1
[T]RE9            1
Name: count, Length: 325, dtype: int64

In [17]:
parcels = gpd.read_file("data/Parcels.gdb", columns=["AIN", "geometry"])

In [18]:
parcels["AIN"] = pd.to_numeric(parcels["AIN"], errors="coerce").astype(
    "Int64"
)  # Keeps NaNs


In [19]:
merged_results = pd.merge(
    result,
    parcels,
    on="AIN",  # Join on the AIN column that exists in both dataframes
    how="left",  # Keep all rows from 'result' even if no matching AIN in parcels
)

# If you want to convert the merged result back to a GeoDataFrame
# (in case you need to do more spatial operations later)
# Note: This will use the geometry from the parcels dataframe
merged_gdf = gpd.GeoDataFrame(merged_results, geometry="geometry")

In [20]:
projected_gdf = merged_gdf.to_crs("EPSG:2229")

# Calculate area in square feet first (State Plane uses US Survey Feet)
projected_gdf["area_sq_feet"] = projected_gdf.geometry.area

# Convert to acres (1 acre = 43,560 square feet)
projected_gdf["acreage"] = projected_gdf["area_sq_feet"] / 43560

# Drop the intermediate area calculation if you don't need it
projected_gdf = projected_gdf.drop(columns=["area_sq_feet"])

# If you want to round the acreage to a specific number of decimal places
projected_gdf["acreage"] = projected_gdf["acreage"].round(3)

# Convert back to original CRS if needed for further spatial operations
final_gdf = projected_gdf.to_crs(merged_gdf.crs)

In [21]:
final_gdf[final_gdf["Property Location"].str.startswith("1017 HYPERION AVE", na=False)]


Unnamed: 0,Zip Code,City Tax Rate Area,AIN,Roll Year,Tax Rate Area Code,Property Location,Property Use Type,Property Use Code,Number of Buildings,Year Built,Effective Year,Square Footage,Number of Bedrooms,Number of Bathrooms,Number of Units,Land Value,Land Base Year,Improvement Value,Improvement Base Year,"Total Value, Land + Improvement",Home Owners Exemption,Real Estate Exemption,Fixture Value,Fixture Exemption,Personal Property Value,Personal Property Exemption,Property taxable?,Total Value,Total Exemption,Taxable Value,Classification,Region Number,Location Latitude,Location Longitude,F1ST_INSTALLMENT_TAX,F1ST_INSTALLMENT_BALANCE_DUE,F1ST_INSTALLMENT_PENALTY,F1ST_INSTALLMENT_PENALTY_PAID,F1ST_INSTALLMENT_PAID_DATE,F2ND_INSTALLMENT_TAX,F2ND_INSTALLMENT_BALANCE_DUE,F2ND_INSTALLMENT_PENALTY,F2ND_INSTALLMENT_PENALTY_PAID,F2ND_INSTALLMENT_PAID_DATE,COST_DUE,COST_PAID,NONSUFFICIENT_FUNDS_DUE,NONSUFFICIENT_FUNDS_PAID,TOTAL_TAX_DUE,TAX_TYPE_1_AMOUNT,TAXTYPE_2_AMOUNT,TAX_TYPE_3_AMOUNT,TAX_TYPE_4_AMOUNT,TAX_TYPE_8_AMOUNT,Total_Taxes_Paid_Calc,ZONE_COMPLT,ZONE_PREFIX,geometry,acreage
618936,90029-3109,LOS ANGELES,5427007013,2024,13,1017 HYPERION AVE LOS ANGELES CA 90029,R-I,200,2,1946,1952,2184,6,2,2,1096500,2023,418200,2023,1514700,0,0,0,0,0,0,Y,1514700,0,1514700,,4,34.090412,-118.280831,9291.82,0.0,0.0,0.0,11/13/2024,9291.8,9291.8,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,18583.62,R3-1VL,R3,"MULTIPOLYGON (((6476718.72 1855390.06, 6476695...",0.168


In [22]:
final_gdf["Property_Tax_Value"] = final_gdf["Taxable Value"] * 0.01

In [23]:
final_gdf = final_gdf.dropna(subset=["acreage"])

In [24]:
final_gdf[final_gdf["acreage"].isna()]


Unnamed: 0,Zip Code,City Tax Rate Area,AIN,Roll Year,Tax Rate Area Code,Property Location,Property Use Type,Property Use Code,Number of Buildings,Year Built,Effective Year,Square Footage,Number of Bedrooms,Number of Bathrooms,Number of Units,Land Value,Land Base Year,Improvement Value,Improvement Base Year,"Total Value, Land + Improvement",Home Owners Exemption,Real Estate Exemption,Fixture Value,Fixture Exemption,Personal Property Value,Personal Property Exemption,Property taxable?,Total Value,Total Exemption,Taxable Value,Classification,Region Number,Location Latitude,Location Longitude,F1ST_INSTALLMENT_TAX,F1ST_INSTALLMENT_BALANCE_DUE,F1ST_INSTALLMENT_PENALTY,F1ST_INSTALLMENT_PENALTY_PAID,F1ST_INSTALLMENT_PAID_DATE,F2ND_INSTALLMENT_TAX,F2ND_INSTALLMENT_BALANCE_DUE,F2ND_INSTALLMENT_PENALTY,F2ND_INSTALLMENT_PENALTY_PAID,F2ND_INSTALLMENT_PAID_DATE,COST_DUE,COST_PAID,NONSUFFICIENT_FUNDS_DUE,NONSUFFICIENT_FUNDS_PAID,TOTAL_TAX_DUE,TAX_TYPE_1_AMOUNT,TAXTYPE_2_AMOUNT,TAX_TYPE_3_AMOUNT,TAX_TYPE_4_AMOUNT,TAX_TYPE_8_AMOUNT,Total_Taxes_Paid_Calc,ZONE_COMPLT,ZONE_PREFIX,geometry,acreage,Property_Tax_Value


In [25]:
final_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 804534 entries, 0 to 805747
Data columns (total 60 columns):
 #   Column                           Non-Null Count   Dtype   
---  ------                           --------------   -----   
 0   Zip Code                         804534 non-null  object  
 1   City Tax Rate Area               804534 non-null  object  
 2   AIN                              804534 non-null  int64   
 3   Roll Year                        804534 non-null  int64   
 4   Tax Rate Area Code               804534 non-null  int64   
 5   Property Location                766065 non-null  object  
 6   Property Use Type                804534 non-null  object  
 7   Property Use Code                804534 non-null  object  
 8   Number of Buildings              804534 non-null  int64   
 9   Year Built                       804534 non-null  int64   
 10  Effective Year                   804534 non-null  int64   
 11  Square Footage                   804534 non-null 

In [47]:
final_gdf["Property_Tax_Value"].sum()


np.float64(8078718255.569998)

In [27]:
# Function to extract first letter of zoning code
def extract_first_letter(zone_code):
    if pd.isna(zone_code):
        return None

    # Remove any brackets or parentheses and their contents
    # This regex looks for patterns like [...], (...), etc.
    cleaned_code = re.sub(r"[\[\(].*?[\]\)]", "", zone_code)

    # Remove any remaining special characters
    cleaned_code = re.sub(r"[^a-zA-Z0-9]", "", cleaned_code)

    # Take the first letter if there is one
    if cleaned_code and len(cleaned_code) > 0:
        return cleaned_code[0]
    else:
        return None


# Apply the function to create a new column
final_gdf["zone_type"] = final_gdf["ZONE_PREFIX"].apply(extract_first_letter)


In [28]:
final_gdf["zone_type"].value_counts()

zone_type
R    697050
C     53791
M     17263
P      7582
H      6522
D      4399
L      4069
Q      3891
O      3431
A      3273
U       726
N       353
V        51
T        48
G        30
F         8
S         2
Name: count, dtype: int64

In [29]:
# final_gdf.to_csv("LA_City_Property_Parcel_Acreage_Zoning_Tax_Data.csv")

In [30]:
# Group by ZONE_PREFIX and calculate aggregates
zone_summary = {}

# Get unique zone prefixes
zone_prefixes = final_gdf["ZONE_PREFIX"].unique()
print(zone_prefixes)
for zone in zone_prefixes:
    # Filter for just this zone prefix
    zone_data = final_gdf[final_gdf["ZONE_PREFIX"] == zone]

    # Skip if zone is None/NaN
    if pd.isna(zone):
        continue

    try:
        # Calculate metrics with error handling
        total_acreage = zone_data["acreage"].sum()
        total_property_tax = zone_data["Property_Tax_Value"].sum()
    except KeyError as e:
        print(f"Missing key {e} in DataFrame. Skipping zone: {zone}")
        continue  # Skip this zone if keys are missing

    # Avoid division by zero
    if total_acreage > 0:
        tax_per_acre = total_property_tax / total_acreage
    else:
        tax_per_acre = 0

    # Store in dictionary
    zone_summary[zone] = {
        "total_acreage": round(total_acreage, 2),
        "total_property_tax": round(total_property_tax, 2),
        "tax_per_acre": round(tax_per_acre, 2),
    }

# Now zone_summary is a dictionary with all the metrics by zone prefix
print(zone_summary)

['RE9' 'RE11' 'OS' '[Q]OS' '(Q)R1' '(Q)RE11' '(Q)RD3' '[T][Q]M1' 'A1' 'RS'
 'C2' '[Q]M1' 'R3' 'C4' 'P' '[Q]PF' '(T)(Q)C1.5' 'CR' nan '(T)(Q)R4'
 '(Q)R3' '(T)RS' 'R1' 'RA' '[Q]C1' '[Q]C1.5' '(Q)RD5' 'RE20' 'RE40' 'RE15'
 '(T)RE11' 'A2' 'QCR' '(T)(Q)CR' 'QC1' '(Q)RD2' 'QRD3' '[Q]CR' '[Q]C2'
 'C1' '(T)(Q)C2' 'R1P' 'C1.5' 'R2' '(Q)C1.5' '(WC)TOPANGA' '(T)(Q)RD2'
 'RD2' 'RD1.5' '(Q)RD1.5' '(Q)RD6' '(T)(Q)R1' '(Q)CR' '(T)(Q)RD3'
 '(T)(Q)RD5' '(Q)C1' '(Q)C4' '[Q]C4' 'QC2' '(Q)P' '(Q)C2' '(T)(Q)C1' 'PF'
 '(T)R1' '[Q]R3' '(T)R3' 'QRD5' 'R4' '(T)RE9' '(T)(Q)RD4' '(Q)RE9'
 '(T)(Q)R3' 'M2' 'MR1' '(Q)MR1' '[Q]CM' '[T][Q]C2' '(T)(Q)RZ3' '[Q]RD2'
 '[Q]P' '(Q)R3P' '(T)(Q)RD1.5' '(Q)M1' 'RD5' 'M1' 'RD3' '(T)(Q)M1' 'QR3'
 '(Q)RS' '[T]R3' 'QRD1.5' 'R3P' '(Q)R4' '(Q)RZ2.5' '(Q)R2' '[Q]R2' '[Q]R1'
 '[Q]RAS3' '(Q)RD4' '(T)[Q]C2' '[T]R1' '(T)[Q]R4' '(T)(Q)RAS4' '[Q]RD5'
 '(Q)RAS3' '(T)(Q)RAS3' '(T)[Q]RAS4' 'RMP' '[T]RD2' '[T][Q]RD1.5' 'QRD2'
 '(T)(Q)C4' '[Q]RD3' '(WC)RIVER' '(Q)RAS4' '[T][Q]MR1' '[Q]MR1'
 '(

In [31]:
# Group by ZONE_PREFIX and calculate aggregates
zone_summary_types = {}

# Get unique zone prefixes
zone_prefixes = final_gdf["zone_type"].unique()
print(zone_prefixes)
for zone in zone_prefixes:
    # Filter for just this zone prefix
    zone_data = final_gdf[final_gdf["zone_type"] == zone]

    # Skip if zone is None/NaN
    if pd.isna(zone):
        continue

    try:
        # Calculate metrics with error handling
        total_acreage = zone_data["acreage"].sum()
        total_property_tax = zone_data["Property_Tax_Value"].sum()
    except KeyError as e:
        print(f"Missing key {e} in DataFrame. Skipping zone: {zone}")
        continue  # Skip this zone if keys are missing

    # Avoid division by zero
    if total_acreage > 0:
        tax_per_acre = total_property_tax / total_acreage
    else:
        tax_per_acre = 0

    # Store in dictionary
    zone_summary_types[zone] = {
        "total_acreage": round(total_acreage, 2),
        "total_property_tax": round(total_property_tax, 2),
        "tax_per_acre": round(tax_per_acre, 2),
    }

# Now zone_summary is a dictionary with all the metrics by zone prefix
print(zone_summary_types)

['R' 'O' 'M' 'A' 'C' 'P' None 'Q' 'T' 'U' 'D' 'N' 'L' 'H' 'F' 'V' 'G' 'S']
{'R': {'total_acreage': np.float64(405531.82), 'total_property_tax': np.float64(5765060693.26), 'tax_per_acre': np.float64(14216.05)}, 'O': {'total_acreage': np.float64(39112.82), 'total_property_tax': np.float64(979096.22), 'tax_per_acre': np.float64(25.03)}, 'M': {'total_acreage': np.float64(21963.02), 'total_property_tax': np.float64(440195874.7), 'tax_per_acre': np.float64(20042.59)}, 'A': {'total_acreage': np.float64(22128.34), 'total_property_tax': np.float64(33530500.13), 'tax_per_acre': np.float64(1515.27)}, 'C': {'total_acreage': np.float64(32026.78), 'total_property_tax': np.float64(1230219587.52), 'tax_per_acre': np.float64(38412.22)}, 'P': {'total_acreage': np.float64(19584.89), 'total_property_tax': np.float64(47433507.91), 'tax_per_acre': np.float64(2421.94)}, 'Q': {'total_acreage': np.float64(15893.74), 'total_property_tax': np.float64(26300495.05), 'tax_per_acre': np.float64(1654.77)}, 'T': {'tot

In [32]:
zone_summary["R1"]

{'total_acreage': np.float64(41356.13),
 'total_property_tax': np.float64(1606606539.78),
 'tax_per_acre': np.float64(38848.09)}

In [33]:
zone_summary_types["R"]

{'total_acreage': np.float64(405531.82),
 'total_property_tax': np.float64(5765060693.26),
 'tax_per_acre': np.float64(14216.05)}

In [36]:
final_gdf

Unnamed: 0,Zip Code,City Tax Rate Area,AIN,Roll Year,Tax Rate Area Code,Property Location,Property Use Type,Property Use Code,Number of Buildings,Year Built,Effective Year,Square Footage,Number of Bedrooms,Number of Bathrooms,Number of Units,Land Value,Land Base Year,Improvement Value,Improvement Base Year,"Total Value, Land + Improvement",Home Owners Exemption,Real Estate Exemption,Fixture Value,Fixture Exemption,Personal Property Value,Personal Property Exemption,Property taxable?,Total Value,Total Exemption,Taxable Value,Classification,Region Number,Location Latitude,Location Longitude,F1ST_INSTALLMENT_TAX,F1ST_INSTALLMENT_BALANCE_DUE,F1ST_INSTALLMENT_PENALTY,F1ST_INSTALLMENT_PENALTY_PAID,F1ST_INSTALLMENT_PAID_DATE,F2ND_INSTALLMENT_TAX,F2ND_INSTALLMENT_BALANCE_DUE,F2ND_INSTALLMENT_PENALTY,F2ND_INSTALLMENT_PENALTY_PAID,F2ND_INSTALLMENT_PAID_DATE,COST_DUE,COST_PAID,NONSUFFICIENT_FUNDS_DUE,NONSUFFICIENT_FUNDS_PAID,TOTAL_TAX_DUE,TAX_TYPE_1_AMOUNT,TAXTYPE_2_AMOUNT,TAX_TYPE_3_AMOUNT,TAX_TYPE_4_AMOUNT,TAX_TYPE_8_AMOUNT,Total_Taxes_Paid_Calc,ZONE_COMPLT,ZONE_PREFIX,geometry,acreage,Property_Tax_Value,zone_type
0,91304-3327,LOS ANGELES,2004001003,2024,16,8321 FAUST AVE LOS ANGELES CA 91304,SFR,0101,1,1973,1973,2090,4,3,1,711689,2006,301176,2006,1012865,0,0,0,0,0,0,Y,1012865,0,1012865,,2,34.220225,-118.620681,6326.89,0.0,0.0,0.0,11/05/2024,6326.87,0.00,0.0,0.0,02/03/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,12653.76,RE9-1,RE9,"MULTIPOLYGON (((6374111.56 1903072, 6373962.55...",0.222,10128.65,R
1,91304-3327,LOS ANGELES,2004001004,2024,16,8313 FAUST AVE LOS ANGELES CA 91304,SFR,0101,1,1973,1973,2479,5,3,1,370538,2010,255879,2010,626417,0,0,0,0,0,0,Y,626417,0,626417,,2,34.220044,-118.620681,4017.94,0.0,0.0,0.0,12/10/2024,4017.94,4017.94,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,8035.88,RE9-1,RE9,"MULTIPOLYGON (((6374111.3 1903005.49, 6373962....",0.227,6264.17,R
2,91304-3327,LOS ANGELES,2004001005,2024,16,8309 FAUST AVE LOS ANGELES CA 91304,SFR,0100,1,1973,1973,2057,4,2,1,526360,2018,198577,2018,724937,0,0,0,0,0,0,Y,724937,0,724937,,2,34.219862,-118.620688,4569.88,0.0,0.0,0.0,11/20/2024,4569.88,4569.88,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,9139.76,RE9-1,RE9,"MULTIPOLYGON (((6374111.18 1902981.92, 6374110...",0.221,7249.37,R
3,91304-3332,LOS ANGELES,2004001008,2024,16,8325 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,1,1978,1978,2423,4,3,1,128421,1980,221965,1980,350386,7000,0,0,0,0,0,Y,350386,7000,343386,,2,34.220339,-118.622718,2354.27,0.0,0.0,0.0,12/11/2024,2354.26,2354.26,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,4708.53,RE11-1,RE11,"MULTIPOLYGON (((6373473.92 1903186.75, 6373474...",0.271,3433.86,R
4,91304-3332,LOS ANGELES,2004001009,2024,16,8311 MAYNARD AVE LOS ANGELES CA 91304,SFR,0101,1,1978,1978,2226,4,3,1,139933,1984,210012,1984,349945,7000,0,0,0,0,0,Y,349945,7000,342945,,2,34.220327,-118.623062,2358.49,0.0,0.0,0.0,12/02/2024,2358.49,2358.49,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,4716.98,RE11-1,RE11,"MULTIPOLYGON (((6373402.6 1903070.94, 6373399....",0.342,3429.45,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805743,90732-4530,LOS ANGELES,7563037052,2024,20,2193 WARMOUTH ST LOS ANGELES CA 90732,SFR,0100,1,1959,1959,1933,3,3,1,485785,1995,263473,1995,749258,7000,0,0,0,0,0,Y,749258,7000,742258,,14,33.719514,-118.325376,4766.57,0.0,0.0,0.0,11/07/2024,4766.57,0.00,0.0,0.0,02/21/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,9533.14,R1-1XL,R1,"MULTIPOLYGON (((6462789.35 1720524.86, 6462724...",0.261,7422.58,R
805744,90732-4519,LOS ANGELES,7563037053,2024,20,3810 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,1,1965,1965,2206,3,2,1,793428,2020,198356,2020,991784,0,0,0,0,0,0,Y,991784,0,991784,,14,33.719952,-118.325128,6241.31,0.0,0.0,0.0,11/27/2024,6241.31,0.00,0.0,0.0,01/16/2025,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,12482.62,R1-1XL,R1,"MULTIPOLYGON (((6462857.6 1720642.09, 6462845....",0.170,9917.84,R
805745,90732-4519,LOS ANGELES,7563037054,2024,20,3806 S ANCHOVY AVE LOS ANGELES CA 90732,SFR,0100,1,1959,1961,1862,3,2,1,703626,2008,175899,2008,879525,7000,0,0,0,0,0,Y,879525,7000,872525,,14,33.720119,-118.325081,5512.25,0.0,0.0,0.0,12/11/2024,5512.25,5512.25,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,11024.50,R1-1XL,R1,"MULTIPOLYGON (((6462871.53 1720700.47, 6462859...",0.158,8725.25,R
805746,90732-4556,LOS ANGELES,7563037055,2024,20,2197 W PASEO DEL MAR LOS ANGELES CA 90732,SFR,0100,1,1962,1962,2153,4,2,1,47779,1975,92991,1975,140770,7000,0,0,0,0,0,Y,140770,7000,133770,,14,33.720279,-118.325034,1096.85,0.0,0.0,0.0,12/16/2024,1096.85,1096.85,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,2193.70,R1-1XL,R1,"MULTIPOLYGON (((6462793.22 1720781.15, 6462885...",0.157,1337.70,R


In [55]:
final_geom = final_gdf[
    [
        "geometry",
        "AIN",
        "Property Location",
        "zone_type",
        "acreage",
        "Property_Tax_Value",
        "ZONE_COMPLT",
        "Total_Taxes_Paid_Calc",
        "Location Latitude",
        "Location Longitude",
        "Taxable Value",
        "Property taxable?",
    ]
]
final_geom.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 804534 entries, 0 to 805747
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   geometry               804534 non-null  geometry
 1   AIN                    804534 non-null  int64   
 2   Property Location      766065 non-null  object  
 3   zone_type              802489 non-null  object  
 4   acreage                804534 non-null  float64 
 5   Property_Tax_Value     804534 non-null  float64 
 6   ZONE_COMPLT            802489 non-null  object  
 7   Total_Taxes_Paid_Calc  787089 non-null  float64 
 8   Location Latitude      803239 non-null  float64 
 9   Location Longitude     803239 non-null  float64 
 10  Taxable Value          804534 non-null  int64   
 11  Property taxable?      804534 non-null  object  
dtypes: float64(5), geometry(1), int64(2), object(4)
memory usage: 79.8+ MB


In [49]:
# Basic statistics on your tax column
print("Tax value summary statistics:")
print(final_gdf["Property_Tax_Value"].describe())

# Check the largest values - they might be outliers
print("\nTop 10 largest tax values:")
print(
    final_gdf.nlargest(10, "Property_Tax_Value")[
        ["AIN", "ZONE_PREFIX", "Property_Tax_Value", "acreage"]
    ]
)

# Are there any negative or zero values?
print(
    f"\nCount of zero/negative tax values: {len(final_gdf[final_gdf['Property_Tax_Value'] <= 0])}"
)

# Check spatial distribution - are all properties truly in LA City?
if "geometry" in final_gdf.columns:
    # Calculate area of study
    bounds = final_gdf.total_bounds
    area_km2 = (
        (bounds[2] - bounds[0]) * (bounds[3] - bounds[1]) / 1000000
    )  # rough estimate
    print(f"\nApproximate geographic area covered: {area_km2:.2f} km²")

    # LA City is roughly 1,300 km² - if your area is much larger, you have data outside the city

# Calculate mean tax per property
mean_tax = final_gdf["Property_Tax_Value"].mean()
count = len(final_gdf)
print(f"\nMean tax per property: ${mean_tax:.2f}")
print(f"Number of properties: {count}")

Tax value summary statistics:
count    8.045340e+05
mean     1.004149e+04
std      5.519025e+04
min      0.000000e+00
25%      2.491460e+03
50%      4.984505e+03
75%      8.920680e+03
max      9.863504e+06
Name: Property_Tax_Value, dtype: float64

Top 10 largest tax values:
               AIN ZONE_PREFIX  Property_Tax_Value  acreage
439206  4334007008          C2          9863503.84    8.478
570057  5151023400        [HB5          8915393.84    4.195
287869  2673021041          M2          8389656.85   70.930
569856  5151014031        [HB5          7150390.55    4.216
533646  5089008031       (Q)C4          6868097.83    8.421
569859  5151015012        [HB5          6687086.19    2.708
364813  4211010118       M(PV)          5718954.91   13.754
558436  5138007089       LASED          5456448.68    9.166
423884  4319001015          C2          5292837.93    2.150
563638  5144023227        [HB5          5130600.00    0.942

Count of zero/negative tax values: 21191

Approximate geographic

In [54]:
final_gdf["acreage"].sum()

np.float64(582437.1199999999)

In [66]:
# final_gdf[final_gdf["Property Location"].str.startswith("1017 HYPERION AVE", na=False)]

condo = final_geom[
    final_geom["Property Location"].str.startswith("3722 HUGHES AVE", na=False)
]
condo

Unnamed: 0,geometry,AIN,Property Location,zone_type,acreage,Property_Tax_Value,ZONE_COMPLT,Total_Taxes_Paid_Calc,Location Latitude,Location Longitude,Taxable Value,Property taxable?
419180,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014040,"3722 HUGHES AVE, NO 1 LOS ANGELES CA 90034",R,0.493,9333.0,R3-1,11328.97,34.025028,-118.399706,933300,Y
419181,"MULTIPOLYGON (((6440656.24 1831704.45, 6440535...",4313014041,"3722 HUGHES AVE, NO 2 LOS ANGELES CA 90034",R,0.493,7580.02,R3-1,9224.99,34.025028,-118.399706,758002,Y
419182,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014042,"3722 HUGHES AVE, NO 3 LOS ANGELES CA 90034",R,0.493,5505.51,R3-1,6736.22,34.025028,-118.399706,550551,Y
419183,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014043,"3722 HUGHES AVE, NO 4 LOS ANGELES CA 90034",R,0.493,2396.41,R3-1,3006.26,34.025028,-118.399706,239641,Y
419184,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014044,"3722 HUGHES AVE, NO 5 LOS ANGELES CA 90034",R,0.493,1492.26,R3-1,1921.56,34.025028,-118.399706,149226,Y
419185,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014045,"3722 HUGHES AVE, NO 6 LOS ANGELES CA 90034",R,0.493,3094.22,R3-1,3843.41,34.025028,-118.399706,309422,Y
419186,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014046,"3722 HUGHES AVE, NO 7 LOS ANGELES CA 90034",R,0.493,7167.32,R3-1,8729.87,34.025028,-118.399706,716732,Y
419187,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014047,"3722 HUGHES AVE, NO 8 LOS ANGELES CA 90034",R,0.493,2549.15,R3-1,3189.97,34.025028,-118.399706,254915,Y
419188,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014048,"3722 HUGHES AVE, UNIT 9 LOS ANGELES CA 90034",R,0.493,2781.31,R3-1,3468.97,34.025028,-118.399706,278131,Y
419189,"MULTIPOLYGON (((6440598.01 1831795.96, 6440624...",4313014049,"3722 HUGHES AVE, NO 10 LOS ANGELES CA 90034",R,0.493,3691.61,R3-1,4560.57,34.025028,-118.399706,369161,Y


In [86]:
# Define chunk processing function at module level so it can be pickled
def _process_chunk(chunk_data):
    """
    Process a chunk of polygons to find overlaps.

    Parameters:
    chunk_data: Tuple containing (chunk_idx, gdf, areas, chunk_size, n, overlap_threshold, debug)

    Returns:
    Tuple of (rows, cols) with indices of overlapping polygons
    """
    chunk_idx, gdf, areas, chunk_size, n, overlap_threshold, debug = chunk_data

    start_idx = chunk_idx * chunk_size
    end_idx = min(start_idx + chunk_size, n)
    chunk_indices = range(start_idx, end_idx)

    # Create R-tree spatial index for this chunk
    idx = rtree.index.Index()
    for i, idx_val in enumerate(chunk_indices):
        idx.insert(i, gdf.geometry.iloc[idx_val].bounds)

    # Find overlaps
    rows = []
    cols = []

    for i, idx1 in enumerate(chunk_indices):
        geom1 = gdf.geometry.iloc[idx1]
        area1 = areas[idx1]

        # Skip zero area geometries
        if area1 <= 0:
            continue

        # Find potential overlaps using spatial index
        bounds = geom1.bounds
        for j in idx.intersection(bounds):
            idx2 = start_idx + j

            # Only check forward to avoid duplicates
            if idx2 <= idx1:
                continue

            # Get geometry and area
            geom2 = gdf.geometry.iloc[idx2]
            area2 = areas[idx2]

            # Skip zero area geometries
            if area2 <= 0:
                continue

            # Quick intersection check
            if not geom1.intersects(geom2):
                continue

            try:
                # Calculate intersection
                intersection = geom1.intersection(geom2)
                intersection_area = intersection.area

                # Calculate overlap ratios
                ratio1 = intersection_area / area1
                ratio2 = intersection_area / area2

                # Check if either ratio exceeds threshold
                if ratio1 >= overlap_threshold or ratio2 >= overlap_threshold:
                    rows.append(idx1)
                    cols.append(idx2)
            except Exception as e:
                if debug:
                    print(f"Error checking overlap between {idx1} and {idx2}: {str(e)}")

    return rows, cols


# Define component processing function at module level
def _process_component(component_data):
    """
    Process a component (group of overlapping polygons) for merging.

    Parameters:
    component_data: Tuple containing (component_id, gdf, labels)

    Returns:
    Tuple of (merged_data_dict, merge_count)
    """
    component_id, gdf, labels = component_data

    # Get indices for this component
    component_indices = np.where(labels == component_id)[0]

    if len(component_indices) == 1:
        # Single property, no merging needed
        return gdf.iloc[component_indices[0]].to_dict(), 0
    else:
        # Get the properties to merge
        group_data = gdf.iloc[component_indices]

        # Merge geometries
        merged_geom = unary_union(group_data.geometry)
        if not merged_geom.is_valid:
            merged_geom = merged_geom.buffer(0)

        # Prepare aggregated data
        agg_data = {}

        # For all numeric columns, sum them
        for col in group_data.select_dtypes(include=np.number).columns:
            if (
                col != "Location Latitude"
                and col != "Location Longitude"
                and col != "_area"
            ):
                agg_data[col] = group_data[col].sum()

        # For special handling of certain columns
        if "acreage" in group_data.columns:
            agg_data["acreage"] = (
                merged_geom.area / 43560
            )  # Recalculate acreage from merged geometry

        # For lat/long, take median values
        if "Location Latitude" in group_data.columns:
            agg_data["Location Latitude"] = group_data["Location Latitude"].median()
        if "Location Longitude" in group_data.columns:
            agg_data["Location Longitude"] = group_data["Location Longitude"].median()

        # For string columns, join unique values
        for col in group_data.select_dtypes(include=["object"]).columns:
            unique_values = group_data[col].dropna().astype(str).unique()
            agg_data[col] = ", ".join(unique_values) if len(unique_values) > 0 else None

        # For boolean columns, use logical OR
        for col in group_data.select_dtypes(include=["bool"]).columns:
            agg_data[col] = any(group_data[col])

        # Set the geometry
        agg_data["geometry"] = merged_geom

        return agg_data, len(component_indices)


def merge_overlapping_properties_optimized(
    gdf, overlap_threshold=0.8, chunk_size=10000, n_processes=None, debug=True
):
    """
    Optimized version of merge_overlapping_properties for large datasets.
    Uses chunking, parallel processing, and more efficient spatial indexing.

    Parameters:
    gdf (GeoDataFrame): Input GeoDataFrame with property polygons
    overlap_threshold (float): Minimum overlap ratio required for merging (0.0 to 1.0)
    chunk_size (int): Number of polygons to process in each chunk
    n_processes (int): Number of processes to use (defaults to CPU count - 1)
    debug (bool): Whether to print debug messages

    Returns:
    GeoDataFrame: Processed GeoDataFrame with merged properties
    """

    def log_debug(message):
        if debug:
            print(message)

    try:
        # Use all available cores minus 1 by default
        if n_processes is None:
            n_processes = max(1, mp.cpu_count() - 1)

        log_debug(
            f"Starting optimized property polygon merging with {n_processes} processes..."
        )

        # Make a copy to avoid modifying the original
        gdf = gdf.copy()
        original_crs = gdf.crs

        # Convert to UTM Zone 11N for accurate spatial calculations
        projected_crs = "EPSG:32611"  # UTM Zone 11N (meters)
        log_debug(f"Converting from {original_crs} to {projected_crs} for processing")

        # Convert to projected CRS
        gdf = gdf.to_crs(projected_crs)

        # Fix any invalid geometries
        log_debug("Fixing invalid geometries...")
        gdf["geometry"] = gdf.geometry.apply(
            lambda geom: geom.buffer(0) if not geom.is_valid else geom
        )

        # Pre-compute areas for all polygons
        log_debug("Pre-computing areas...")
        gdf["_area"] = gdf.geometry.area
        areas = gdf["_area"].values

        if debug:
            log_debug(
                f"Area statistics: Min={min(areas):.2f}, Max={max(areas):.2f}, Mean={np.mean(areas):.2f}"
            )

        # Calculate number of chunks
        n = len(gdf)
        n_chunks = math.ceil(n / chunk_size)
        log_debug(f"Processing {n} properties in {n_chunks} chunks of {chunk_size}...")

        # Process all chunks in parallel
        all_rows = []
        all_cols = []

        # Prepare chunk data
        chunk_data_list = [
            (i, gdf, areas, chunk_size, n, overlap_threshold, debug)
            for i in range(n_chunks)
        ]

        log_debug("Processing chunks in parallel...")

        # Use smaller chunksize for imap to improve responsiveness
        imap_chunksize = max(1, min(10, n_chunks // (n_processes * 4)))

        with mp.Pool(processes=n_processes) as pool:
            results = list(
                tqdm(
                    pool.imap(
                        _process_chunk, chunk_data_list, chunksize=imap_chunksize
                    ),
                    total=n_chunks,
                    disable=not debug,
                )
            )

        for rows, cols in results:
            all_rows.extend(rows)
            all_cols.extend(cols)

        # Check if any overlaps were found
        if not all_rows:
            log_debug("No overlapping properties detected with the current threshold.")
            log_debug(
                f"Try a lower overlap_threshold value (current: {overlap_threshold})"
            )
            return gdf.drop(columns=["_area"]).to_crs(original_crs)

        # Create sparse matrix for the entire dataset
        log_debug("Creating adjacency matrix...")
        data = [1] * len(all_rows)

        # Add symmetric relationships for undirected graph
        all_rows_copy = all_rows.copy()
        all_cols_copy = all_cols.copy()
        all_rows.extend(all_cols_copy)
        all_cols.extend(all_rows_copy)
        data = [1] * len(all_rows)

        adjacency_matrix = csr_matrix((data, (all_rows, all_cols)), shape=(n, n))

        # Find connected components (groups of overlapping properties)
        log_debug("Finding connected components...")
        n_components, labels = connected_components(adjacency_matrix, directed=False)

        log_debug(f"Found {n_components} distinct property groups")

        # Check if any merging actually happened
        if n_components == n:
            log_debug("No properties were merged.")
            return gdf.drop(columns=["_area"]).to_crs(original_crs)

        # Process components in parallel
        component_data_list = [(i, gdf, labels) for i in range(n_components)]

        # Create final GeoDataFrame
        log_debug("Processing and merging components...")

        # Use thread pool for I/O bound operations
        with mp.Pool(processes=n_processes) as pool:
            results = list(
                tqdm(
                    pool.imap(
                        _process_component,
                        component_data_list,
                        chunksize=max(1, n_components // (n_processes * 4)),
                    ),
                    total=n_components,
                    disable=not debug,
                )
            )

        new_rows = [row for row, _ in results]
        merged_count = sum(count for _, count in results)

        log_debug("Creating final GeoDataFrame...")
        merged_gdf = gpd.GeoDataFrame(new_rows, crs=projected_crs)

        log_debug(
            f"Original properties: {len(gdf)}, Merged properties: {len(merged_gdf)}"
        )
        log_debug(f"Properties merged into groups: {merged_count}")

        if "Property_Tax_Value" in merged_gdf.columns:
            log_debug(
                f"Total tax before merging: ${gdf['Property_Tax_Value'].sum():,.2f}"
            )
            log_debug(
                f"Total tax after merging: ${merged_gdf['Property_Tax_Value'].sum():,.2f}"
            )

        # Convert back to original CRS
        merged_gdf = merged_gdf.to_crs(original_crs)

        return merged_gdf

    except Exception as e:
        log_debug(f"Error during merging: {str(e)}")
        import traceback

        traceback.print_exc()
        return gdf.drop(columns=["_area"] if "_area" in gdf.columns else []).to_crs(
            original_crs
        )


# Batch processing function for extremely large datasets
def batch_process_properties(
    gdf_path,
    output_path,
    overlap_threshold=0.8,
    batch_size=50000,
    chunk_size=10000,
    n_processes=None,
):
    """
    Process a very large GeoDataFrame by loading and processing in batches.

    Parameters:
    gdf_path (str): Path to input GeoDataFrame (.gpkg, .shp, etc.)
    output_path (str): Path to save output GeoDataFrame
    overlap_threshold (float): Overlap threshold for merging
    batch_size (int): Number of rows to load and process at once
    chunk_size (int): Chunk size for processing within each batch
    n_processes (int): Number of processes to use
    """
    import os

    print(f"Starting batch processing of {gdf_path}")

    # Get total number of features
    gdf_sample = gpd.read_file(gdf_path, rows=1)
    crs = gdf_sample.crs

    # Process in batches
    batch_number = 0
    merged_properties = []

    while True:
        # Read the next batch
        print(
            f"Processing batch {batch_number}, rows {batch_number * batch_size} to {(batch_number + 1) * batch_size}"
        )

        # Skip_rows doesn't work well with all geodata formats, so we use rows parameter instead
        # which is available in geopandas 0.7.0+
        gdf_batch = gpd.read_file(
            gdf_path,
            rows=slice(batch_number * batch_size, (batch_number + 1) * batch_size),
        )

        if len(gdf_batch) == 0:
            # No more data to process
            break

        # Process this batch
        processed_batch = merge_overlapping_properties_optimized(
            gdf_batch,
            overlap_threshold=overlap_threshold,
            chunk_size=chunk_size,
            n_processes=n_processes,
        )

        # Store results
        merged_properties.append(processed_batch)

        # Move to next batch
        batch_number += 1

        # Save incremental results
        temp_result = pd.concat(merged_properties)
        temp_result.to_file(f"{output_path}.temp", driver="GPKG")

    # Combine all processed batches
    if merged_properties:
        final_result = pd.concat(merged_properties)
        print(f"Final dataset has {len(final_result)} properties")

        # Save final result
        final_result.to_file(output_path, driver="GPKG")

        # Remove temp file
        if os.path.exists(f"{output_path}.temp"):
            os.remove(f"{output_path}.temp")
    else:
        print("No data processed")


# Alternative approach: serial processing with memory-efficient chunks
def merge_overlapping_properties_serial(
    gdf, overlap_threshold=0.8, chunk_size=10000, debug=True
):
    """
    Memory-efficient version that processes chunks sequentially instead of in parallel.
    Use this if you're experiencing memory issues with the parallel version.

    Parameters:
    gdf (GeoDataFrame): Input GeoDataFrame with property polygons
    overlap_threshold (float): Minimum overlap ratio required for merging (0.0 to 1.0)
    chunk_size (int): Number of polygons to process in each chunk
    debug (bool): Whether to print debug messages

    Returns:
    GeoDataFrame: Processed GeoDataFrame with merged properties
    """

    def log_debug(message):
        if debug:
            print(message)

    try:
        log_debug("Starting memory-efficient property polygon merging...")

        # Make a copy to avoid modifying the original
        gdf = gdf.copy()
        original_crs = gdf.crs

        # Convert to UTM Zone 11N for accurate spatial calculations
        projected_crs = "EPSG:32611"  # UTM Zone 11N (meters)
        log_debug(f"Converting from {original_crs} to {projected_crs} for processing")

        # Convert to projected CRS
        gdf = gdf.to_crs(projected_crs)

        # Fix any invalid geometries
        log_debug("Fixing invalid geometries...")
        gdf["geometry"] = gdf.geometry.apply(
            lambda geom: geom.buffer(0) if not geom.is_valid else geom
        )

        # Pre-compute areas for all polygons
        log_debug("Pre-computing areas...")
        gdf["_area"] = gdf.geometry.area
        areas = gdf["_area"].values

        if debug:
            log_debug(
                f"Area statistics: Min={min(areas):.2f}, Max={max(areas):.2f}, Mean={np.mean(areas):.2f}"
            )

        # Calculate number of chunks
        n = len(gdf)
        n_chunks = math.ceil(n / chunk_size)
        log_debug(f"Processing {n} properties in {n_chunks} chunks of {chunk_size}...")

        # Process all chunks serially to save memory
        all_rows = []
        all_cols = []

        for chunk_idx in tqdm(range(n_chunks), disable=not debug):
            rows, cols = _process_chunk(
                (chunk_idx, gdf, areas, chunk_size, n, overlap_threshold, debug)
            )
            all_rows.extend(rows)
            all_cols.extend(cols)

            # Free memory
            if chunk_idx % 10 == 0:
                import gc

                gc.collect()

        # Check if any overlaps were found
        if not all_rows:
            log_debug("No overlapping properties detected with the current threshold.")
            log_debug(
                f"Try a lower overlap_threshold value (current: {overlap_threshold})"
            )
            return gdf.drop(columns=["_area"]).to_crs(original_crs)

        # Create sparse matrix for the entire dataset
        log_debug("Creating adjacency matrix...")
        data = [1] * len(all_rows)

        # Add symmetric relationships
        all_rows_copy = all_rows.copy()
        all_cols_copy = all_cols.copy()
        all_rows.extend(all_cols_copy)
        all_cols.extend(all_rows_copy)
        data = [1] * len(all_rows)

        adjacency_matrix = csr_matrix((data, (all_rows, all_cols)), shape=(n, n))

        # Find connected components (groups of overlapping properties)
        log_debug("Finding connected components...")
        n_components, labels = connected_components(adjacency_matrix, directed=False)

        log_debug(f"Found {n_components} distinct property groups")

        # Check if any merging actually happened
        if n_components == n:
            log_debug("No properties were merged.")
            return gdf.drop(columns=["_area"]).to_crs(original_crs)

        # Process components sequentially to save memory
        new_rows = []
        merged_count = 0

        log_debug("Processing and merging components...")
        for component_id in tqdm(range(n_components), disable=not debug):
            row_dict, count = _process_component((component_id, gdf, labels))
            new_rows.append(row_dict)
            merged_count += count

            # Free memory periodically
            if component_id % 1000 == 0:
                import gc

                gc.collect()

        log_debug("Creating final GeoDataFrame...")
        merged_gdf = gpd.GeoDataFrame(new_rows, crs=projected_crs)

        log_debug(
            f"Original properties: {len(gdf)}, Merged properties: {len(merged_gdf)}"
        )
        log_debug(f"Properties merged into groups: {merged_count}")

        if "Property_Tax_Value" in merged_gdf.columns:
            log_debug(
                f"Total tax before merging: ${gdf['Property_Tax_Value'].sum():,.2f}"
            )
            log_debug(
                f"Total tax after merging: ${merged_gdf['Property_Tax_Value'].sum():,.2f}"
            )

        # Convert back to original CRS
        merged_gdf = merged_gdf.to_crs(original_crs)

        return merged_gdf

    except Exception as e:
        log_debug(f"Error during merging: {str(e)}")
        import traceback

        traceback.print_exc()
        return gdf.drop(columns=["_area"] if "_area" in gdf.columns else []).to_crs(
            original_crs
        )

In [87]:
# For parallel processing (faster but more memory-intensive)
result = merge_overlapping_properties_optimized(
    final_geom,
    overlap_threshold=0.8,
    chunk_size=10000,
    n_processes=4,  # Try reducing this if memory is an issue
)

Starting optimized property polygon merging with 4 processes...
Converting from EPSG:2229 to EPSG:32611 for processing
Fixing invalid geometries...
Pre-computing areas...
Area statistics: Min=0.00, Max=7132366.79, Mean=2928.50
Processing 804534 properties in 81 chunks of 10000...
Processing chunks in parallel...


  0%|          | 0/81 [00:00<?, ?it/s]Process SpawnPoolWorker-11:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/queues.py", line 387, in get
    return _ForkingPickler.loads(res)
           ~~~~~~~~~~~~~~~~~~~~~^^^^^
AttributeError: Can't get attribute '_process_chunk' on <module '__main__'

KeyboardInterrupt: 

In [88]:
result = merge_overlapping_properties_serial(
    final_geom, overlap_threshold=0.8, chunk_size=10000
)

Starting memory-efficient property polygon merging...
Converting from EPSG:2229 to EPSG:32611 for processing
Fixing invalid geometries...
Pre-computing areas...
Area statistics: Min=0.00, Max=7132366.79, Mean=2928.50
Processing 804534 properties in 81 chunks of 10000...


100%|██████████| 81/81 [47:36<00:00, 35.27s/it]   


Creating adjacency matrix...
Finding connected components...
Found 671581 distinct property groups
Processing and merging components...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Creating final GeoDataFrame...
Original properties: 804534, Merged properties: 671581
Properties merged into groups: 138980
Total tax before merging: $8,078,718,255.57
Total tax after merging: $8,078,718,255.57


In [89]:
result["acreage"].sum()

np.float64(237859.696818805)

In [90]:
result.to_file("output.shp")

  result.to_file("output.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [92]:
result

Unnamed: 0,geometry,AIN,Property Location,zone_type,acreage,Property_Tax_Value,ZONE_COMPLT,Total_Taxes_Paid_Calc,Location Latitude,Location Longitude,Taxable Value,Property taxable?,_area
0,"MULTIPOLYGON (((6374111.56 1903072, 6373962.55...",2004001003,8321 FAUST AVE LOS ANGELES CA 91304,R,0.222,10128.65,RE9-1,12653.76,34.220225,-118.620681,1012865,Y,899.661603
1,"MULTIPOLYGON (((6374111.3 1903005.49, 6373962....",2004001004,8313 FAUST AVE LOS ANGELES CA 91304,R,0.227,6264.17,RE9-1,8035.88,34.220044,-118.620681,626417,Y,920.496991
2,"MULTIPOLYGON (((6374111.18 1902981.92, 6374110...",2004001005,8309 FAUST AVE LOS ANGELES CA 91304,R,0.221,7249.37,RE9-1,9139.76,34.219862,-118.620688,724937,Y,895.786848
3,"MULTIPOLYGON (((6373473.92 1903186.75, 6373474...",2004001008,8325 MAYNARD AVE LOS ANGELES CA 91304,R,0.271,3433.86,RE11-1,4708.53,34.220339,-118.622718,343386,Y,1098.238748
4,"MULTIPOLYGON (((6373402.6 1903070.94, 6373399....",2004001009,8311 MAYNARD AVE LOS ANGELES CA 91304,R,0.342,3429.45,RE11-1,4716.98,34.220327,-118.623062,342945,Y,1382.129193
...,...,...,...,...,...,...,...,...,...,...,...,...,...
671576,"MULTIPOLYGON (((6462789.35 1720524.86, 6462724...",7563037052,2193 WARMOUTH ST LOS ANGELES CA 90732,R,0.261,7422.58,R1-1XL,9533.14,33.719514,-118.325376,742258,Y,1053.669791
671577,"MULTIPOLYGON (((6462857.6 1720642.09, 6462845....",7563037053,3810 S ANCHOVY AVE LOS ANGELES CA 90732,R,0.170,9917.84,R1-1XL,12482.62,33.719952,-118.325128,991784,Y,689.459918
671578,"MULTIPOLYGON (((6462871.53 1720700.47, 6462859...",7563037054,3806 S ANCHOVY AVE LOS ANGELES CA 90732,R,0.158,8725.25,R1-1XL,11024.50,33.720119,-118.325081,872525,Y,640.641221
671579,"MULTIPOLYGON (((6462793.22 1720781.15, 6462885...",7563037055,2197 W PASEO DEL MAR LOS ANGELES CA 90732,R,0.157,1337.70,R1-1XL,2193.70,33.720279,-118.325034,133770,Y,634.574146


In [95]:
result["ZONE_PREFIX"] = result["ZONE_COMPLT"].str.split("-").str[0]

In [96]:
# Group by ZONE_PREFIX and calculate aggregates
zone_summary = {}

# Get unique zone prefixes
zone_prefixes = result["ZONE_PREFIX"].unique()
print(zone_prefixes)
for zone in zone_prefixes:
    # Filter for just this zone prefix
    zone_data = result[result["ZONE_PREFIX"] == zone]

    # Skip if zone is None/NaN
    if pd.isna(zone):
        continue

    try:
        # Calculate metrics with error handling
        total_acreage = zone_data["acreage"].sum()
        total_property_tax = zone_data["Property_Tax_Value"].sum()
    except KeyError as e:
        print(f"Missing key {e} in DataFrame. Skipping zone: {zone}")
        continue  # Skip this zone if keys are missing

    # Avoid division by zero
    if total_acreage > 0:
        tax_per_acre = total_property_tax / total_acreage
    else:
        tax_per_acre = 0

    # Store in dictionary
    zone_summary[zone] = {
        "total_acreage": round(total_acreage, 2),
        "total_property_tax": round(total_property_tax, 2),
        "tax_per_acre": round(tax_per_acre, 2),
    }

# Now zone_summary is a dictionary with all the metrics by zone prefix
print(zone_summary)

['RE9' 'RE11' 'OS' '[Q]OS' '(Q)R1' '(Q)RE11' '(Q)RD3' '[T][Q]M1' 'A1' 'RS'
 'C2' '[Q]M1' 'R3' 'C4' 'P' '[Q]PF' '(T)(Q)C1.5' 'CR' None '(T)(Q)R4'
 '(Q)R3' nan '(T)RS' 'R1' 'RA' '[Q]C1' '[Q]C1.5' '(Q)RD5' 'RE20' 'RE40'
 'RE15' '(T)RE11' 'A2' 'QCR' '(T)(Q)CR' 'QC1' '(Q)RD2' 'QRD3' '[Q]CR'
 '[Q]C2' 'C1' '(T)(Q)C2' 'R1P' 'C1.5' 'R2' '(Q)C1.5' '(WC)TOPANGA'
 '(T)(Q)RD2' 'RD2' 'RD1.5' '(Q)RD1.5' '(Q)RD6' '(T)(Q)R1' '(Q)CR'
 '(T)(Q)RD3' '(T)(Q)RD5' '(Q)C1' '(Q)C4' '[Q]C4' 'QC2' '(Q)P' '(Q)C2'
 '(T)(Q)C1' 'PF' '(T)R1' '[Q]R3' '(T)R3' 'QRD5' 'R4' '(T)RE9' '(T)(Q)RD4'
 '(Q)RE9' '(T)(Q)R3' 'M2' 'MR1' '(Q)MR1' '[Q]CM' '[T][Q]C2' '(T)(Q)RZ3'
 '[Q]RD2' '[Q]P' '(Q)R3P' '(T)(Q)RD1.5' '(Q)M1' 'RD5' 'M1' 'RD3'
 '(T)(Q)M1' 'QR3' '(Q)RS' '[T]R3' 'QRD1.5' 'R3P' '(Q)R4' '(Q)RZ2.5'
 '(Q)R2' '[Q]R2' '[Q]R1' '[Q]RAS3' '(Q)RD4' '(T)[Q]C2' '[T]R1' '(T)[Q]R4'
 '(T)(Q)RAS4' '[Q]RD5' '(Q)RAS3' '(T)(Q)RAS3' '(T)[Q]RAS4' 'RMP' '[T]RD2'
 '[T][Q]RD1.5' 'QRD2' '(T)(Q)C4' '[Q]RD3' '(WC)RIVER' '(Q)RAS4'
 '[T][Q]MR1' '[Q]MR

In [93]:
# Group by ZONE_PREFIX and calculate aggregates
zone_summary_types = {}

# Get unique zone prefixes
zone_prefixes = result["zone_type"].unique()
print(zone_prefixes)
for zone in zone_prefixes:
    # Filter for just this zone prefix
    zone_data = result[result["zone_type"] == zone]

    # Skip if zone is None/NaN
    if pd.isna(zone):
        continue

    try:
        # Calculate metrics with error handling
        total_acreage = zone_data["acreage"].sum()
        total_property_tax = zone_data["Property_Tax_Value"].sum()
    except KeyError as e:
        print(f"Missing key {e} in DataFrame. Skipping zone: {zone}")
        continue  # Skip this zone if keys are missing

    # Avoid division by zero
    if total_acreage > 0:
        tax_per_acre = total_property_tax / total_acreage
    else:
        tax_per_acre = 0

    # Store in dictionary
    zone_summary_types[zone] = {
        "total_acreage": round(total_acreage, 2),
        "total_property_tax": round(total_property_tax, 2),
        "tax_per_acre": round(tax_per_acre, 2),
    }

# Now zone_summary is a dictionary with all the metrics by zone prefix
print(zone_summary_types)

['R' 'O' 'M' 'A' 'C' 'P' None 'Q' 'T' 'U' 'D' 'N' 'L' 'H' 'R, O' 'F'
 'H, V' 'V' 'G' 'C, R' 'S']
{'R': {'total_acreage': np.float64(130910.2), 'total_property_tax': np.float64(5765033681.21), 'tax_per_acre': np.float64(44038.08)}, 'O': {'total_acreage': np.float64(38906.5), 'total_property_tax': np.float64(979096.22), 'tax_per_acre': np.float64(25.17)}, 'M': {'total_acreage': np.float64(18413.3), 'total_property_tax': np.float64(440203869.32), 'tax_per_acre': np.float64(23906.84)}, 'A': {'total_acreage': np.float64(16774.58), 'total_property_tax': np.float64(33689271.99), 'tax_per_acre': np.float64(2008.35)}, 'C': {'total_acreage': np.float64(13247.94), 'total_property_tax': np.float64(1230158259.07), 'tax_per_acre': np.float64(92856.58)}, 'P': {'total_acreage': np.float64(11593.26), 'total_property_tax': np.float64(47433507.91), 'tax_per_acre': np.float64(4091.47)}, 'Q': {'total_acreage': np.float64(341.0), 'total_property_tax': np.float64(26300495.05), 'tax_per_acre': np.float64(7712

In [94]:
zone_summary_types["R"]

{'total_acreage': np.float64(130910.2),
 'total_property_tax': np.float64(5765033681.21),
 'tax_per_acre': np.float64(44038.08)}

In [97]:
zone_summary["R1"]

{'total_acreage': np.float64(40367.76),
 'total_property_tax': np.float64(1606606539.78),
 'tax_per_acre': np.float64(39799.25)}