In [1]:
import sys, os
from pathlib import Path

# Resolve the repo root once and reuse it for both imports and data paths.
# Notebooks do not define __file__, so fall back to the parent of the current
# working directory (i.e. the kernel is expected to run one level below the root).
REPO_ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parents[0]

# Make the project-local modules importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

from config import load_config
from gee_utils import init_gee
from data_utils import load_gdb
from mappluto import perform_mappluto_eda

In [2]:
# Locate config.yaml through REPO_ROOT instead of a separate "../" string, so
# the config, the data paths and the import path all derive from one root.
# Kept as a str because load_config was previously given a str path.
CONFIG_PATH = str(REPO_ROOT / "config.yaml")
CONFIG = load_config(CONFIG_PATH)

00:28:23 | INFO | vacant_lots | YAML loaded


In [4]:
# Initialise Google Earth Engine from the EARTH_ENGINE section of the config.
gee_settings = CONFIG.EARTH_ENGINE
init_gee(gee_settings)

00:28:28 | INFO | vacant_lots | Initializing GEE with ADC credentials and project:vacant-lot-detection
00:28:28 | INFO | vacant_lots | GEE sucessfully initialized


# Load Data
- MapPluto 22 v3 (fall time)

In [5]:
# Build the geodatabase location from the repo root plus the configured
# data directory and MapPLUTO file name.
data_dir = Path(CONFIG.DATA.DIR)
mappluto_22_path = REPO_ROOT.joinpath(data_dir, CONFIG.DATA.NYC_MAPPLUTO)

# Read the clipped MapPLUTO layer.
mappluto_22_gdb = load_gdb(mappluto_22_path, layer='MapPLUTO_22v3_clipped')

# Derive the lot perimeter from the geometry itself; the value is expressed
# in the units of the layer's CRS, which is printed below.
lot_perimeter = mappluto_22_gdb.geometry.length
mappluto_22_gdb["geom_perimeter"] = lot_perimeter

print(mappluto_22_gdb.crs)
mappluto_22_gdb.head()

00:28:29 | INFO | vacant_lots | üìÇ Loading GDB from: /Users/joyadebi/repos/Vacant_Lot_Detection/data/nyc_mappluto_22v3_arc_fgdb/MapPLUTO22v3.gdb
00:29:10 | INFO | vacant_lots | ‚úÖ Loaded 856998 features from layer 'MapPLUTO_22v3_clipped'.
EPSG:2263


Unnamed: 0,Borough,Block,Lot,CD,BCT2020,BCTCB2020,CT2010,CB2010,SchoolDist,Council,...,PFIRM15_FLAG,Version,DCPEdited,Latitude,Longitude,Notes,Shape_Leng,Shape_Area,geometry,geom_perimeter
0,MN,1,101,101.0,1000100,,1,1001,2,1.0,...,1,22v3,,40.68992,-74.045337,,0.0,501897.3,"MULTIPOLYGON (((972409.69 190685.56, 972428.82...",3940.841176
1,MN,1,10,101.0,1000500,10005000003.0,5,1000,2,1.0,...,1,22v3,,40.688766,-74.018682,,0.0,7478663.0,"MULTIPOLYGON (((980783.787 191526.763, 980898....",13432.373392
2,MN,1,111,101.0,1000500,10005001003.0,5,1000,2,1.0,...,1,22v3,,40.692922,-74.017637,,0.0,64450.95,"MULTIPOLYGON (((979185.332 191646.482, 979184....",1076.168021
3,MN,2,2,101.0,1000900,10009001022.0,9,1025,2,1.0,...,1,22v3,,40.70055,-74.011588,,0.0,87244.25,"MULTIPOLYGON (((980915.002 194319.141, 980795....",2411.869934
4,MN,2,23,101.0,1000900,10009001022.0,9,1025,2,1.0,...,1,22v3,t,40.701171,-74.009243,,0.0,96902.37,"MULTIPOLYGON (((981182.221 194851.167, 981181....",2949.778901


## MapPluto Features
- CRS: EPSG:2263 (units are feet, for NYC -- reproject for clustering and NAIP eventually)
### Categorical Features
- Borough: BX, BK, MN, QN, SI
- LotType:
    - 0: Unknown
    - 1: Block Assemblage - A tax lot which encompasses an entire block.
    - 2: Waterfront - A tax lot bordering on a body of water. Waterfront lots may contain a small amount of submerged land.
    - 3: Corner - A tax lot bordering on two intersecting streets
    - 4: Through - A tax lot which connects two streets, with frontage on both streets. Note that a lot with two frontages is not necessarily a through lot. For example, an L-shaped lot with two frontages would be coded as an Inside Lot (5).
    - 5: Inside - A tax lot with frontage on only one street. This code is not in CAMA; it is only assigned in PLUTO if CAMA has no other lot types for the tax lot.
    - 6: Interior Lot - A tax lot that has no street frontage.
    - 7: Island Lot - A tax lot that is entirely surrounded by water.
    - 8: Alley Lot - A tax lot that is too narrow to accommodate a building. The lot is usually 12 feet or less in width.
    - 9: Submerged Land Lot - A tax lot that is totally or almost completely submerged.
- LandUse: assigns each BUILDING CLASS (BldgClass?) to a land-use category -- so a lot labeled vacant may not be truly vacant land? Can we trust the labels?
    - 01: One & Two Family Buildings
    - 02: Multi-Family Walk-Up Buildings
    - 03: Multi - Family Elevator Buildings
    - 04: Mixed Residential and Commercial Buildings
    - 05: Commercial and Office Buildings
    - 06: Industrial and Manufacturing
    - 07: Transportation and Utility
    - 08: Public Facilities and Institutions
    - 09: Open Space and Outdoor Recreation
    - 10: Parking Facilities
    - 11: Vacant Land
- BldgClass: Too many distinct values to list here
- OwnerType: from COLP (City Owned and Leased Properties), if not PTS (Property Tax System)
    - C (city), M (mixed C + P), O (other state, feds), P (private), X (tax exempt), blank (unknown usually private)
- ZoneDist1: R (res), C (commercial), M (manufac), BPC (battery park city), PARK, M/R (e.g. M1-1/R5) (Mixed man + res)
    - greatest % of tax lot if SPLIT BOUNDARY INDICATOR is divided

### Numerical Features
- Shape_Area: maybe better est -- generated automatically by ESRI (use instead of LotArea)
    - LotArea: sq feet to nearest int; if the lot is irregularly shaped, DOF derives the area (TODO: confirm the exact source in the PLUTO data dictionary)
- BldgArea: Building Floor Area (see AreaSource)
- BuiltFAR: Total building floor area divided by area of tax lot 
    - See ResFar, CommFar, etc 
- NumBldgs: Num buildings on tax lot
- YearBuilt: Correct within decade, some have 0 
- AssessLand: assessed land value
- AssessTot: Total value (multiply by tax class)
- ExemptVal: dollar amount of the lot's value receiving an exemption
- NumFloors: Num Floors (can be 2.5)
- UnitsRes: Sum of residential units 
- UnitsTotal: Sum of res and non res units (not updated prob)
- LotFront (feet)
- LotDepth (feet)
- LtdHeight: limited height in district 
- Latitude
- Longitude
- geom_perimeter (computed from the geometry with `.length`)
    - Shape_Leng: Perimeter (not populated in this layer -- values are 0.0/null)

### Notes
- Tax Lots can have multiple buildings
- There are lots underwater which we will not be handling --- but cool nonetheless
- Is Latitude/Longitude located at the lot's center/centroid?
    - No -- should recompute it from the geometry



In [6]:
# Numerical features of interest
numerical_features = [
    'Shape_Area',
    'geom_perimeter',
    'BldgArea',
    'BuiltFAR',
    'NumBldgs',
    'YearBuilt',
    'AssessLand',
    'AssessTot',
    'NumFloors',
    'UnitsRes',
    'LotFront',
    'LotDepth',
]

# Categorical features of interest
categorical_features = [
    'LandUse',
    'Borough',
    'BldgClass',
    'OwnerType',
    'ZoneDist1',
]

In [7]:
# Define the EDA output directory (can be relative or absolute).
# A relative path is resolved against the kernel's working directory.
eda_output_dir = Path("outputs/mappluto_22")

# Run the EDA orchestrator. The feature lists defined earlier in this notebook
# are passed through directly instead of being retyped here, so there is a
# single source of truth and the two copies cannot drift apart.
eda_results = perform_mappluto_eda(
    gdf=mappluto_22_gdb,
    output_dir=eda_output_dir,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    top_n_categories=10,  # optional — controls how many categories to show in plots
)

00:29:16 | INFO | vacant_lots | üöÄ Starting MapPLUTO EDA pipeline
00:29:16 | INFO | vacant_lots | üìà Summarizing numerical features...
00:29:17 | INFO | vacant_lots | üßÆ Computed summary statistics for 12 numerical features.
00:29:17 | INFO | vacant_lots | Saved numerical summary to outputs/mappluto_22/numerical_summary.csv
00:29:17 | INFO | vacant_lots | üìä Plotting numerical distributions...
00:29:17 | INFO | vacant_lots | Clipping data to 0.95 percentile for plotting
00:29:48 | INFO | vacant_lots | üìä Saved numerical distributions to outputs/mappluto_22/mappluto_numerical_distributions.png
00:29:48 | INFO | vacant_lots | üìÇ Summarizing categorical features...
00:29:48 | INFO | vacant_lots | üìä Computed top 10 categories for 'LandUse'.
00:29:48 | INFO | vacant_lots | üìä Computed top 10 categories for 'Borough'.
00:29:48 | INFO | vacant_lots | üìä Computed top 10 categories for 'BldgClass'.
00:29:48 | INFO | vacant_lots | üìä Computed top 10 categories for 'OwnerType

In [16]:
# Compare the perimeter derived from the geometry with the Shape_Leng
# column that ships with the layer.
computed_perimeter_mean = mappluto_22_gdb.geometry.length.mean()
shipped_perimeter_mean = mappluto_22_gdb["Shape_Leng"].mean()

print(computed_perimeter_mean)
print(shipped_perimeter_mean)

299.43301071578645
0.0


In [17]:
# Show the two area columns side by side for comparison.
area_columns = ["Shape_Area", "LotArea"]
mappluto_22_gdb[area_columns]

Unnamed: 0,Shape_Area,LotArea
0,5.018973e+05,541886.0
1,7.478663e+06,7665016.0
2,6.445095e+04,64450.0
3,8.724425e+04,191502.0
4,9.690237e+04,510025.0
...,...,...
856993,5.078837e+03,7199.0
856994,1.318642e+04,14136.0
856995,1.247200e+04,10664.0
856996,5.976783e+03,6630.0


- Shape_Area is better than LotArea
- Shape_Leng is not populated in this layer (its mean is 0.0) -- would it need to be fetched from the REST service? -- so compute the perimeter with `.length` instead

In [21]:
# Check where key building indicators are null
null_mask = (
    mappluto_22_gdb['NumBldgs'].isna()
    | mappluto_22_gdb['BldgArea'].isna()
    | mappluto_22_gdb['BuiltFAR'].isna()
)

null_lots = (
    mappluto_22_gdb.loc[null_mask, ['NumBldgs', 'BldgArea', 'BuiltFAR', 'LandUse']].sort_values(by=['NumBldgs', 'BldgArea', 'BuiltFAR', 'LandUse'], ascending=[False, True, True, True])
)

# View summary + sample
print(f"Nulls in building indicators: {len(null_lots)} / {len(mappluto_22_gdb)} lots")
null_lots

Nulls in building indicators: 428 / 856998 lots


Unnamed: 0,NumBldgs,BldgArea,BuiltFAR,LandUse
33128,2.0,,,
11755,1.0,,,
11929,1.0,,,
18173,1.0,,,
23475,1.0,,,
...,...,...,...,...
831770,,,,
834318,,,,
845490,,,,
853708,,,,


In [22]:
# Select lots whose building count is either missing or exactly zero
num_bldgs = mappluto_22_gdb['NumBldgs']
no_building_mask = num_bldgs.isna() | num_bldgs.eq(0.0)
vacant_like = mappluto_22_gdb.loc[
    no_building_mask, ['NumBldgs', 'BldgArea', 'BuiltFAR', 'LandUse']
]

# Order by NumBldgs ascending, then LandUse descending
vacant_like_sorted = vacant_like.sort_values(by=['NumBldgs', 'LandUse'], ascending=[True, False])

print(f"Lots with NumBldgs == 0 or null: {len(vacant_like_sorted)} / {len(mappluto_22_gdb)}")
vacant_like_sorted


Lots with NumBldgs == 0 or null: 33704 / 856998


Unnamed: 0,NumBldgs,BldgArea,BuiltFAR,LandUse
50,0.0,0.0,0.0,11
74,0.0,0.0,0.0,11
164,0.0,0.0,0.0,11
210,0.0,0.0,0.0,11
239,0.0,0.0,0.0,11
...,...,...,...,...
831770,,,,
834318,,,,
845490,,,,
853708,,,,
