In [1]:
import sys, os
from pathlib import Path
from joblib import Memory

# Resolve the repo root once and use it for both the import path and all data
# paths, so the two can never point at different directories. Inside a notebook
# kernel `__file__` is undefined, so fall back to the parent of the working
# directory (i.e. the notebook is expected to run from a first-level subfolder).
REPO_ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parents[0]

# Make the repo's top-level modules importable. The membership guard keeps
# sys.path free of duplicate entries when this cell is re-run.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

from config import load_config
from gee_utils import init_gee, export_image_to_gee, load_image_collection
from data_utils import load_gdb
from mappluto import perform_mappluto_eda
from dependencies import get_nyc_geometry
from naip import calculate_spectral_indices

In [2]:
# Project settings are read from the YAML file one level above the notebook's
# working directory.
CONFIG_PATH = "../config.yaml"
CONFIG = load_config(CONFIG_PATH)

# joblib store used to memoize expensive calls; entries are written to the
# relative "cache" directory, with joblib's own logging silenced.
memory = Memory(Path("cache"), verbose=0)

01:45:34 | INFO | vacant_lots | YAML loaded


In [3]:
# Initialize Google Earth Engine from the EARTH_ENGINE section of the config
# (the logged output shows ADC credentials and the configured project in use).
init_gee(CONFIG.EARTH_ENGINE)

01:45:35 | INFO | vacant_lots | Initializing GEE with ADC credentials and project:vacant-lot-detection
01:45:36 | INFO | vacant_lots | GEE sucessfully initialized


# Load Data
- MapPluto 22 v3 (fall time)

In [4]:
# Geodatabase location: <repo root>/<configured data dir>/<configured MapPLUTO file>.
mappluto_22_path = Path(REPO_ROOT, CONFIG.DATA.DIR, CONFIG.DATA.NYC_MAPPLUTO)
mappluto_22_gdb = load_gdb(mappluto_22_path, layer='MapPLUTO_22v3_clipped')

# Perimeter of every lot geometry, expressed in the units of the layer's CRS.
mappluto_22_gdb["geom_perimeter"] = mappluto_22_gdb.geometry.length

# Show the CRS so the units of the derived lengths/areas are explicit.
print(mappluto_22_gdb.crs)

EPSG:2263


## MapPluto Features
- CRS: EPSG:2263 (in feet for NYC -- reproject for clustering and NAIP eventually)
### Categorical Features
- Borough: BX, BK, MN, QN, SI
- LotType:
    - 0: Unknown
    - 1: Block Assemblage - A tax lot which encompasses an entire block.
    - 2: Waterfront - A tax lot bordering on a body of water. Waterfront lots may contain a small amount of submerged land.
    - 3: Corner - A tax lot bordering on two intersecting streets
    - 4: Through - A tax lot which connects two streets, with frontage on both streets. Note that a lot with two frontages is not necessarily a through lot. For example, an L-shaped lot with two frontages would be coded as an Inside Lot (5).
    - 5: Inside - A tax lot with frontage on only one street. This code is only assigned in PLUTO if CAMA has no other lot types for the tax lot.
    - 6: Interior Lot - A tax lot that has no street frontage.
    - 7: Island Lot - A tax lot that is entirely surrounded by water.
    - 8: Alley Lot - A tax lot that is too narrow to accommodate a building. The lot is usually 12 feet or less in width.
    - 9: Submerged Land Lot - A tax lot that is totally or almost completely submerged.
- LandUse: assigns each BUILDING CLASS (BldgClass?) to a land use category -- so not truly vacant land? Can we trust the labels?
    - 01: One & Two Family Buildings
    - 02: Multi-Family Walk-Up Buildings
    - 03: Multi-Family Elevator Buildings
    - 04: Mixed Residential and Commercial Buildings
    - 05: Commercial and Office Buildings
    - 06: Industrial and Manufacturing
    - 07: Transportation and Utility
    - 08: Public Facilities and Institutions
    - 09: Open Space and Outdoor Recreation
    - 10: Parking Facilities
    - 11: Vacant Land
- BldgClass: Too many 
- OwnerType: from COLP (City Owned and Leased Properties), if not PTS (Property Tax System)
    - C (city), M (mixed C + P), O (other state, feds), P (private), X (tax exempt), blank (unknown usually private)
- ZoneDist1: R (res), C (commercial), M (manufac), BPC (battery park city), PARK, M/R (e.g. M1-1/R5) (Mixed man + res)
    - greatest % of tax lot if SPLIT BOUNDARY INDICATOR is divided

### Numerical Features
- Shape_Area: maybe better est -- generated automatically by ESRI (use instead of LotArea)
    - LotArea: sq feet to nearest int, if irregular shape DOF from 
- BldgArea: Building Floor Area (see AreaSource)
- BuiltFAR: Total building floor area divided by area of tax lot 
    - See ResFar, CommFar, etc 
- NumBldgs: Num buildings on tax lot
- YearBuilt: Correct within decade, some have 0 
- AssessLand: assessed land value
- AssessTot: Total value (multiply by tax class)
- ExemptVal: dollar amount of lot receiving exemption
- NumFloors: Num Floors (can be 2.5)
- UnitsRes: Sum of residential units 
- UnitsTotal: Sum of res and non res units (not updated prob)
- LotFront (feet)
- LotDepth (feet)
- LtdHeight: limited height in district 
- Latitude
- Longitude
- geom_perimeter (computed from the lot geometry)
    - Shape_Leng: Perimeter (nulls)

### Notes
- Tax Lots can have multiple buildings
- There are lots underwater which we will not be handling --- but cool nonetheless
- Is latitude/long in the center/centroid?
    - no should recompute 



In [5]:
# Numeric MapPLUTO columns to summarize and plot.
numerical_features = [
    'Shape_Area',
    'geom_perimeter',
    'BldgArea',
    'BuiltFAR',
    'NumBldgs',
    'YearBuilt',
    'AssessLand',
    'AssessTot',
    'NumFloors',
    'UnitsRes',
    'LotFront',
    'LotDepth',
]

# Categorical MapPLUTO columns to summarize.
categorical_features = [
    'LandUse',
    'Borough',
    'BldgClass',
    'OwnerType',
    'ZoneDist1',
]

# Directory that receives the EDA outputs
eda_output_dir = Path("outputs/mappluto_22")

# Hand everything to the EDA orchestrator and keep its return value for later inspection.
eda_results = perform_mappluto_eda(
    gdf=mappluto_22_gdb,
    output_dir=eda_output_dir,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    top_n_categories=10,  # optional — controls how many categories to show in plots
)

20:14:27 | INFO | vacant_lots | üöÄ Starting MapPLUTO EDA pipeline
20:14:27 | INFO | vacant_lots | üìà Summarizing numerical features...
20:14:28 | INFO | vacant_lots | üßÆ Computed summary statistics for 12 numerical features.
20:14:28 | INFO | vacant_lots | Saved numerical summary to outputs/mappluto_22/numerical_summary.csv
20:14:28 | INFO | vacant_lots | üìä Plotting numerical distributions...
20:14:28 | INFO | vacant_lots | Clipping data to 0.95 percentile for plotting
20:14:53 | INFO | vacant_lots | üìä Saved numerical distributions to outputs/mappluto_22/mappluto_numerical_distributions.png
20:14:53 | INFO | vacant_lots | üìÇ Summarizing categorical features...
20:14:53 | INFO | vacant_lots | üìä Computed top 10 categories for 'LandUse'.
20:14:53 | INFO | vacant_lots | üìä Computed top 10 categories for 'Borough'.
20:14:53 | INFO | vacant_lots | üìä Computed top 10 categories for 'BldgClass'.
20:14:53 | INFO | vacant_lots | üìä Computed top 10 categories for 'OwnerType

In [None]:
# Side-by-side view of Shape_Area and LotArea. They are close for many lots,
# but the saved output also contains lots where they differ by a large factor
# (e.g. index 3 and 4), so treat them as only roughly comparable.
mappluto_22_gdb.loc[:, ["Shape_Area", "LotArea"]]

Unnamed: 0,Shape_Area,LotArea
0,5.018973e+05,541886.0
1,7.478663e+06,7665016.0
2,6.445095e+04,64450.0
3,8.724425e+04,191502.0
4,9.690237e+04,510025.0
...,...,...
856993,5.078837e+03,7199.0
856994,1.318642e+04,14136.0
856995,1.247200e+04,10664.0
856996,5.976783e+03,6630.0


## visualize some mappluto spatially 
plot_mappluto spatial
## create subset for clustering 
create_subset_for_clustering

## Extract NAIP features using GEE


In [None]:
# --- 6. Extract NAIP Features using Google Earth Engine ---
# NOTE(review): this cell looks like the body of a standalone script pasted into
# the notebook. It was indented under a column-0 comment (an IndentationError in
# a cell) and has been dedented here. It still relies on names that none of the
# visible cells define or import (pd, np, initialize_gee, batch_extract_features,
# subset_gdf, GEE_PROJECT_ID, NAIP_YEAR, OUTPUT_DIR and the clustering/plotting
# helpers) -- define or import them before running.
print("\n--- Step 6: Extract NAIP Features using GEE ---")
try:
    # Uncomment the following line if you need to authenticate
    # ee.Authenticate()
    initialize_gee(project_id=GEE_PROJECT_ID)

    naip_features_df = batch_extract_features(
        parcels_gdf=subset_gdf,
        year=NAIP_YEAR,
        batch_size=500,  # Adjust based on GEE limits
        scale=1  # 1m resolution for NAIP
    )
    naip_features_df.to_csv(f'{OUTPUT_DIR}/naip_features.csv', index=False)
    print(f"\nSaved NAIP features to {OUTPUT_DIR}/naip_features.csv")
    print(naip_features_df.head())
except Exception as e:
    # Deliberate best-effort fallback: report the failure, then continue with
    # random placeholder features so the remaining steps can still be exercised.
    # Anything computed downstream from these values is NOT a real result.
    print(f"Could not extract GEE features. Please ensure GEE is authenticated. Error: {e}")
    # For demonstration, create a dummy naip_features_df
    print("Creating dummy NAIP features for demonstration.")
    n_parcels = len(subset_gdf)
    dummy_columns = [
        'R_mean', 'G_mean', 'B_mean', 'N_mean',
        'NDVI_mean', 'SAVI_mean', 'Brightness_mean', 'BareSoilProxy_mean',
        'GLCM_N_ASM_mean', 'GLCM_N_Contrast_mean', 'GLCM_N_Entropy_mean',
    ]
    naip_features_df = pd.DataFrame({
        'BBL': subset_gdf['BBL'],
        **{column: np.random.rand(n_parcels) for column in dummy_columns},
    })

# --- 7. Prepare Features for Clustering ---
print("\n--- Step 7: Prepare Features for Clustering ---")
feature_list = [
    'LotArea', 'BldgArea', 'BuiltFAR', 'NumBldgs',
    'R_mean', 'G_mean', 'B_mean', 'N_mean',
    'NDVI_mean', 'SAVI_mean', 'Brightness_mean', 'BareSoilProxy_mean',
    'GLCM_N_ASM_mean', 'GLCM_N_Contrast_mean', 'GLCM_N_Entropy_mean'
]
features_df, feature_names = prepare_features_for_clustering(
    gdf=subset_gdf,
    naip_features_df=naip_features_df,
    feature_list=feature_list
)
print(f"\nFeatures prepared. Shape: {features_df.shape}")
print(features_df.head())

# --- 8. Find Optimal Number of Clusters ---
print("\n--- Step 8: Find Optimal Number of Clusters ---")
optimal_k = find_optimal_clusters(
    features_df=features_df,
    feature_names=feature_names,
    k_range=range(2, 11),
    output_dir=OUTPUT_DIR
)
N_CLUSTERS = optimal_k

# --- 9. Perform Clustering ---
print("\n--- Step 9: Perform Clustering ---")
cluster_labels, scaler, kmeans_model = perform_clustering(
    features_df=features_df,
    feature_names=feature_names,
    n_clusters=N_CLUSTERS,
    method='kmeans'
)
subset_gdf['cluster'] = cluster_labels
print("\nCluster distribution:")
print(subset_gdf['cluster'].value_counts().sort_index())

# --- 10. Analyze Clusters ---
print("\n--- Step 10: Analyze Clusters ---")
cluster_summary = analyze_clusters(
    features_df=features_df,
    feature_names=feature_names,
    cluster_labels=cluster_labels,
    output_dir=OUTPUT_DIR
)
print(cluster_summary)

# --- 11. Visualize Clusters ---
print("\n--- Step 11: Visualize Clusters ---")
plot_clusters_spatial(
    gdf=subset_gdf,
    cluster_column='cluster',
    title='Parcel Clusters',
    figsize=(15, 15),
    save_path=f'{OUTPUT_DIR}/clusters_spatial_map.png'
)
plot_feature_distributions_by_cluster(
    df=features_df.merge(subset_gdf[['BBL', 'cluster']], on='BBL'),
    features=['NDVI_mean', 'BuiltFAR', 'LotArea', 'NumBldgs', 'BareSoilProxy_mean'],
    cluster_column='cluster',
    figsize=(20, 12),
    save_path=f'{OUTPUT_DIR}/feature_distributions_by_cluster.png'
)

# --- 12. Identify Vacant Lot Candidates ---
print("\n--- Step 12: Identify Vacant Lot Candidates ---")
# This is a manual step - review cluster_summary and identify the cluster(s)
# that best match vacant lot characteristics (e.g., low BuiltFAR, low NumBldgs).
# As a starting point, take the single cluster with the lowest BuiltFAR.
vacant_cluster_ids = cluster_summary.sort_values(by='BuiltFAR').index[:1].tolist()
print(f"Identified vacant lot cluster(s): {vacant_cluster_ids}")

plot_vacant_lot_candidates(
    gdf=subset_gdf,
    cluster_column='cluster',
    vacant_cluster_ids=vacant_cluster_ids,
    title='Potential Vacant Lot Candidates',
    figsize=(15, 15),
    save_path=f'{OUTPUT_DIR}/vacant_lot_candidates_map.png'
)

# --- 13. Create Interactive Map ---
print("\n--- Step 13: Create Interactive Map ---")
create_interactive_map(
    gdf=subset_gdf,
    column='LandUse',
    cluster_column='cluster',
    zoom_start=11,
    save_path=f'{OUTPUT_DIR}/interactive_clusters_map.html'
)

# --- 14. Export Results ---
print("\n--- Step 14: Export Results ---")
subset_gdf.to_file(f'{OUTPUT_DIR}/clustered_parcels.geojson', driver='GeoJSON')
print(f"Saved clustered parcels to {OUTPUT_DIR}/clustered_parcels.geojson")

vacant_lots = subset_gdf[subset_gdf['cluster'].isin(vacant_cluster_ids)]
# Fixed: the path previously used the undefined name OUTPUT_dir (NameError).
vacant_lots.to_file(f'{OUTPUT_DIR}/vacant_lot_candidates.geojson', driver='GeoJSON')
print(f"Saved {len(vacant_lots)} vacant lot candidates to {OUTPUT_DIR}/vacant_lot_candidates.geojson")

print("\nWorkflow complete.")

In [21]:
# Find the lots where at least one key building indicator is missing.
building_indicator_cols = ['NumBldgs', 'BldgArea', 'BuiltFAR']
null_mask = mappluto_22_gdb[building_indicator_cols].isna().any(axis=1)

# Keep the indicators plus LandUse for those lots; order by NumBldgs descending,
# then by the remaining columns ascending.
inspect_cols = building_indicator_cols + ['LandUse']
null_lots = mappluto_22_gdb.loc[null_mask, inspect_cols].sort_values(
    by=inspect_cols,
    ascending=[False, True, True, True],
)

# Report the count, then display the affected rows.
print(f"Nulls in building indicators: {len(null_lots)} / {len(mappluto_22_gdb)} lots")
null_lots

Nulls in building indicators: 428 / 856998 lots


Unnamed: 0,NumBldgs,BldgArea,BuiltFAR,LandUse
33128,2.0,,,
11755,1.0,,,
11929,1.0,,,
18173,1.0,,,
23475,1.0,,,
...,...,...,...,...
831770,,,,
834318,,,,
845490,,,,
853708,,,,


In [22]:
# "Vacant-like" lots: the building count is either missing or exactly zero.
num_bldgs = mappluto_22_gdb['NumBldgs']
no_building_mask = num_bldgs.isna() | num_bldgs.eq(0.0)
vacant_like = mappluto_22_gdb.loc[
    no_building_mask,
    ['NumBldgs', 'BldgArea', 'BuiltFAR', 'LandUse'],
]

# NumBldgs ascending, then LandUse descending within each NumBldgs value.
vacant_like_sorted = vacant_like.sort_values(by=['NumBldgs', 'LandUse'], ascending=[True, False])

print(f"Lots with NumBldgs == 0 or null: {len(vacant_like_sorted)} / {len(mappluto_22_gdb)}")
vacant_like_sorted


Lots with NumBldgs == 0 or null: 33704 / 856998


Unnamed: 0,NumBldgs,BldgArea,BuiltFAR,LandUse
50,0.0,0.0,0.0,11
74,0.0,0.0,0.0,11
164,0.0,0.0,0.0,11
210,0.0,0.0,0.0,11
239,0.0,0.0,0.0,11
...,...,...,...,...
831770,,,,
834318,,,,
845490,,,,
853708,,,,


# NAIP

In [8]:
# NYC geometry; used below as the region argument of the NAIP query.
nyc_geom = get_nyc_geometry()

@memory.cache
def get_naip_2022_nyc():
    """Load the 2022 NAIP imagery for NYC.

    Calls ``load_image_collection`` on the ``USDA/NAIP/DOQQ`` collection with
    start_date 2022-01-01, end_date 2022-12-31, ``region=nyc_geom`` and
    ``mosaic=True``, and returns whatever that helper returns.

    NOTE(review): the result is memoized via ``memory.cache``, but the function
    takes no arguments and reads the module-level ``nyc_geom``. Presumably a
    change to ``nyc_geom`` will not invalidate the cached value -- confirm, and
    clear the cache if the geometry changes.
    """
    return load_image_collection(
        collection_id="USDA/NAIP/DOQQ",
        start_date="2022-01-01",
        end_date="2022-12-31",
        region=nyc_geom,
        mosaic=True
    )

import geemap

# Load (or fetch from the joblib cache) the 2022 NAIP image for NYC.
naip_2022_nyc = get_naip_2022_nyc()

# Create an interactive map centred on NYC
m = geemap.Map(center=[40.7128, -74.0060], zoom=10)

# Add the NAIP image as a true-colour layer. The layer was previously labelled
# 'NYC Boundary' and styled with {'color': 'red'}, which is a geometry style and
# not an image visualization; the image has 8-bit R/G/B/N bands (0-255), so
# render R, G, B over that range and label the layer for what it is.
naip_rgb_vis = {'bands': ['R', 'G', 'B'], 'min': 0, 'max': 255}
m.addLayer(naip_2022_nyc, naip_rgb_vis, 'NAIP 2022 (RGB)')

# Show
m

Map(center=[40.7128, -74.006], controls=(WidgetControl(options=['position', 'transparent_bg'], position='topri‚Ä¶

In [9]:
# Run calculate_spectral_indices on the NAIP image with the project config and
# keep the result under the same name (presumably the image with index bands
# added -- confirm against naip.calculate_spectral_indices).
# NOTE(review): this rebinds `naip_2022_nyc` to its own output, so re-running the
# cell feeds the already-processed image back in. Re-run the loading cell first
# if this cell needs to be executed again.
naip_2022_nyc = calculate_spectral_indices(naip_2022_nyc, CONFIG)
# Display the band names of the resulting image.
naip_2022_nyc.bandNames()

01:47:50 | INFO | vacant_lots | Brightness appears normalized (max ‚â§ 1)


In [18]:
from gee_utils import calculate_brightness

# Brightness image built from the R, G and B bands, using the NAIP sensor
# normalization settings from the config.
naip_normalization = CONFIG.SENSOR_NORMALIZATION.NAIP
brightness = calculate_brightness(naip_2022_nyc, 'R', 'G', 'B', naip_normalization)

# Fetch the image metadata once, then read the declared minimum of the first
# band's pixel type.
brightness_info = brightness.getInfo()
first_band_type = brightness_info.get("bands")[0].get("data_type")
first_band_type.get("min")

0

In [10]:
# Display the metadata Earth Engine reports for the "Brightness" band
# (pixel type, value range, CRS) of the brightness image.
brightness.select("Brightness").getInfo()

{'type': 'Image',
 'bands': [{'id': 'Brightness',
   'data_type': {'type': 'PixelType',
    'precision': 'float',
    'min': 0,
    'max': 1},
   'dimensions': [48, 10],
   'origin': [-121, 32],
   'crs': 'EPSG:4326',
   'crs_transform': [1, 0, 0, 0, 1, 0]}]}

In [6]:
# Destination asset inside the configured Earth Engine project.
export_asset_id = f"projects/{CONFIG.EARTH_ENGINE.PROJECT_ID}/assets/test"

# Start the export of the NAIP image at scale 1 and keep the handle returned by
# the helper in `task`.
task = export_image_to_gee(
    image=naip_2022_nyc,
    description="export naip 2022 nyc image scale 1",
    asset_id=export_asset_id,
    scale=1,
)

21:47:39 | INFO | vacant_lots |  Starting export to GEE Asset: projects/vacant-lot-detection/assets/test
21:47:39 | INFO | vacant_lots | Export task started. Monitor progress in Earth Engine Code Editor Tasks tab


In [9]:
# Display the full metadata of the NAIP image (per-band pixel types, CRS and
# image properties) as reported by Earth Engine.
naip_2022_nyc.getInfo()

{'type': 'Image',
 'bands': [{'id': 'R',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 255},
   'dimensions': [48, 10],
   'origin': [-121, 32],
   'crs': 'EPSG:4326',
   'crs_transform': [1, 0, 0, 0, 1, 0]},
  {'id': 'G',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 255},
   'dimensions': [48, 10],
   'origin': [-121, 32],
   'crs': 'EPSG:4326',
   'crs_transform': [1, 0, 0, 0, 1, 0]},
  {'id': 'B',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 255},
   'dimensions': [48, 10],
   'origin': [-121, 32],
   'crs': 'EPSG:4326',
   'crs_transform': [1, 0, 0, 0, 1, 0]},
  {'id': 'N',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 255},
   'dimensions': [48, 10],
   'origin': [-121, 32],
   'crs': 'EPSG:4326',
   'crs_transform': [1, 0, 0, 0, 1, 0]}],
 'properties': {'system:footprint': {'type': 'GeometryCollection',
   'geometrie

In [None]:
#calculate spectral indices