In [11]:
import os
import pandas as pd
import geopandas as gpd

CRS_NYC = "EPSG:2263"

DATA_RAW = os.path.join("..", "data", "raw")
DATA_PROCESSED = os.path.join("..", "data", "processed")

PATH_NTA_ZIP = os.path.join(DATA_RAW, "nta", "nyc_nta_2020.zip")
PATH_BUILDING_CURRENT_ZIP = os.path.join(DATA_RAW, "building_current", "nyc_building_footprints_current.zip")


In [12]:
nta = gpd.read_file(f"zip://{PATH_NTA_ZIP}")

print("NTA rows & cols:", nta.shape)
print("NTA CRS:", nta.crs)
print("NTA columns:", list(nta.columns))

nta.head()

NTA rows & cols: (262, 12)
NTA CRS: EPSG:4326
NTA columns: ['borocode', 'boroname', 'countyfips', 'nta2020', 'ntaname', 'ntaabbrev', 'ntatype', 'cdta2020', 'cdtaname', 'shape_leng', 'shape_area', 'geometry']


Unnamed: 0,borocode,boroname,countyfips,nta2020,ntaname,ntaabbrev,ntatype,cdta2020,cdtaname,shape_leng,shape_area,geometry
0,3.0,Brooklyn,47,BK0101,Greenpoint,Grnpt,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28919.561151,35321810.0,"POLYGON ((-73.93213 40.72816, -73.93238 40.727..."
1,3.0,Brooklyn,47,BK0102,Williamsburg,Wllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28134.082661,28852850.0,"POLYGON ((-73.95814 40.7244, -73.95772 40.7242..."
2,3.0,Brooklyn,47,BK0103,South Williamsburg,SWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),18250.280091,15208960.0,"POLYGON ((-73.95024 40.70547, -73.94984 40.705..."
3,3.0,Brooklyn,47,BK0104,East Williamsburg,EWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),43184.800376,52267410.0,"POLYGON ((-73.92406 40.71411, -73.92404 40.714..."
4,3.0,Brooklyn,47,BK0201,Brooklyn Heights,BkHts,0,BK02,BK02 Downtown Brooklyn-Fort Greene (CD 2 Appro...,14312.192285,9982023.0,"POLYGON ((-73.99236 40.68969, -73.99436 40.690..."


In [13]:
# standardize names
nta = nta.rename(columns={"nta2020": "NTACode", "ntaname": "NTAName"})

nta = nta[["NTACode", "NTAName", "boroname", "geometry"]]

# reproject to 2263 to match other data sources and allow square footage to match other datasets
nta_2263 = nta.to_crs(CRS_NYC)

# compute square footage
nta_2263["nta_area_sqft"] = nta_2263.geometry.area

print(nta_2263.crs)
nta_2263.head()


EPSG:2263


Unnamed: 0,NTACode,NTAName,boroname,geometry,nta_area_sqft
0,BK0101,Greenpoint,Brooklyn,"POLYGON ((1003059.997 204572.025, 1002991.367 ...",35321740.0
1,BK0102,Williamsburg,Brooklyn,"POLYGON ((995851.916 203199.332, 995969.193 20...",28852800.0
2,BK0103,South Williamsburg,Brooklyn,"POLYGON ((998047.21 196303.325, 998157.901 196...",15208960.0
3,BK0104,East Williamsburg,Brooklyn,"POLYGON ((1005302.497 199455.73, 1005307.792 1...",52267470.0
4,BK0201,Brooklyn Heights,Brooklyn,"POLYGON ((986367.736 190549.239, 985813.836 19...",9982088.0


In [14]:
buildings = gpd.read_file(f"zip://{PATH_BUILDING_CURRENT_ZIP}")

print("Buildings rows & cols:", buildings.shape)
print("Buildings CRS:", buildings.crs)
print("Buildings columns:", list(buildings.columns))

buildings.head()

Buildings rows & cols: (1082999, 17)
Buildings CRS: EPSG:4326
Buildings columns: ['name', 'bin', 'doitt_id', 'shape_area', 'base_bbl', 'objectid', 'constructi', 'feature_co', 'geom_sourc', 'ground_ele', 'height_roo', 'date_last_', 'time_last_', 'last_statu', 'mappluto_b', 'shape_leng', 'geometry']


Unnamed: 0,name,bin,doitt_id,shape_area,base_bbl,objectid,constructi,feature_co,geom_sourc,ground_ele,height_roo,date_last_,time_last_,last_statu,mappluto_b,shape_leng,geometry
0,,4451699.0,321944.0,177.746094,4075320028,507357.0,1950.0,2100.0,Other (Manual),93.0,27.0,2017-08-22,19:18:38.000,Constructed,4075327501,59.004939,"POLYGON ((-73.75416 40.7542, -73.75402 40.7542..."
1,,4558952.0,255026.0,34.742188,4105630045,137879.0,1930.0,5110.0,Photogrammetric,72.0,13.06,2017-08-17,16:20:43.000,Constructed,4105630045,24.548387,"POLYGON ((-73.75283 40.71895, -73.75289 40.718..."
2,,3176483.0,759005.0,180.890625,3066450044,982953.0,1915.0,2100.0,Photogrammetric,18.0,36.761589,2017-08-22,15:37:34.000,Constructed,3066450044,61.475641,"POLYGON ((-73.98372 40.60334, -73.98381 40.603..."
3,,3393369.0,949392.0,106.035156,3038010128,244121.0,1997.0,2100.0,Photogrammetric,32.0,21.95,2017-08-22,15:31:10.000,Constructed,3038010128,43.732572,"POLYGON ((-73.89828 40.66547, -73.89836 40.665..."
4,,2019299.0,353927.0,217.175781,2033800084,229537.0,1910.0,2100.0,Photogrammetric,197.0,33.49,2017-08-22,18:57:18.000,Constructed,2033800084,63.26829,"POLYGON ((-73.86996 40.90031, -73.86995 40.900..."


In [19]:
buildings.crs

# Below shows the need for reprojecting to 2263
# area would be measured in degrees which is not useful

buildings_test = buildings

buildings_test["area_test"] = buildings_test.geometry.area

buildings_test["area_test"].head()




  buildings_test["area_test"] = buildings_test.geometry.area


0    1.086543e-08
1    2.125210e-09
2    1.108269e-08
3    6.490474e-09
4    1.324628e-08
Name: area_test, dtype: float64

In [5]:
# crs to 2263 for same reasoning as above

buildings_2263 = buildings.to_crs(CRS_NYC)
buildings_2263["footprint_sqft"] = buildings_2263.geometry.area

print(buildings_2263.crs)
buildings_2263[["shape_area", "footprint_sqft"]].head()


EPSG:2263


Unnamed: 0,shape_area,footprint_sqft
0,177.746094,1096.759439
1,34.742188,214.631781
2,180.890625,1121.202926
3,106.035156,656.015069
4,217.175781,1334.18395


In [None]:
# spatial join to assign NTAs to buildings and notna used to determine how many were assigned (accuracy check)

buildings_nta = gpd.sjoin(
    buildings_2263,
    nta_2263[["NTACode", "NTAName", "boroname", "geometry"]],
    how="left",
    predicate="intersects",
).drop(columns=["index_right"])

print("Assigned NTAs (full):", buildings_nta["NTACode"].notna().mean())

buildings_nta[["NTACode", "NTAName", "boroname", "footprint_sqft"]].head()
# inspect and sort by 

Assigned NTAs (full): 0.9999723037466568


Unnamed: 0,NTACode,NTAName,boroname,footprint_sqft
0,QN1102,Bayside,Queens,1096.759439
1,QN1303,Queens Village,Queens,214.631781
2,BK1103,Gravesend (West),Brooklyn,1121.202926
3,BK0503,East New York-New Lots,Brooklyn,656.015069
4,BX1203,Wakefield-Woodlawn,Bronx,1334.18395


In [None]:
# inspect and sort by total footprint sqft per NTA, staten island at top makes sense due to larger NTA areas
built_agg = (
    buildings_nta
    .groupby(["NTACode", "NTAName", "boroname"])
    .agg(
        building_count=("footprint_sqft", "size"),
        total_footprint_sqft=("footprint_sqft", "sum"),
        median_building_sqft=("footprint_sqft", "median"),
    )
    .reset_index()
)

built_agg.sort_values("total_footprint_sqft", ascending=False).head(10)


Unnamed: 0,NTACode,NTAName,boroname,building_count,total_footprint_sqft,median_building_sqft
248,SI0204,New Springville-Willowbrook-Bulls Head-Travis,Staten Island,11285,21555420.0,844.023205
251,SI0302,Great Kills-Eltingville,Staten Island,17630,19742340.0,980.039549
174,QN0501,Maspeth,Queens,11584,19501540.0,830.502944
205,QN1001,South Ozone Park,Queens,23688,19363700.0,738.544785
20,BK0601,Carroll Gardens-Cobble Hill-Gowanus-Red Hook,Brooklyn,7462,18800170.0,1019.357148
253,SI0304,Annadale-Huguenot-Prince's Bay-Woodrow,Staten Island,12519,18143800.0,1312.394611
35,BK1101,Bensonhurst,Brooklyn,15186,17899310.0,1041.152814
31,BK1001,Bay Ridge,Brooklyn,13882,17555570.0,994.388946
3,BK0104,East Williamsburg,Brooklyn,4939,17450020.0,1354.539352
61,BK1803,Canarsie,Brooklyn,14365,17237750.0,906.686818


In [None]:
# merge back to nta to compute built density metrics and calculate ratios

nta_built = nta_2263.merge(built_agg, on=["NTACode", "NTAName", "boroname"], how="left")

nta_built[["building_count", "total_footprint_sqft", "median_building_sqft"]] = (
    nta_built[["building_count", "total_footprint_sqft", "median_building_sqft"]]
)

SQFT_PER_SQMI = 27878400
nta_built["nta_area_sqmi"] = nta_built["nta_area_sqft"] / SQFT_PER_SQMI

nta_built["built_sqft_per_sqmi"] = nta_built["total_footprint_sqft"] / nta_built["nta_area_sqmi"]
nta_built["built_area_ratio"] = nta_built["total_footprint_sqft"] / nta_built["nta_area_sqft"]

nta_built.sort_values("built_area_ratio", ascending=False).head(10)


Unnamed: 0,NTACode,NTAName,boroname,geometry,nta_area_sqft,building_count,total_footprint_sqft,median_building_sqft,nta_area_sqmi,built_sqft_per_sqmi,built_area_ratio
130,MN0502,Midtown-Times Square,Manhattan,"POLYGON ((991725.244 217725.299, 992169.505 21...",24552540.0,1444.0,13479900.0,5031.327741,0.880701,15305870.0,0.549023
129,MN0501,Midtown South-Flatiron-Union Square,Manhattan,"POLYGON ((989555.974 211702.854, 989420.216 21...",14879020.0,1306.0,7998146.0,3405.174031,0.533711,14985900.0,0.537545
121,MN0201,SoHo-Little Italy-Hudson Square,Manhattan,"POLYGON ((983469.159 204638.902, 983496.09 204...",12916760.0,1662.0,6500400.0,2274.76968,0.463325,14029890.0,0.503253
134,MN0604,East Midtown-Turtle Bay,Manhattan,"POLYGON ((995598.36 215579.024, 995626.716 215...",13138090.0,1101.0,6393684.0,2142.036303,0.471264,13567090.0,0.486652
128,MN0402,Hell's Kitchen,Manhattan,"POLYGON ((985324.423 221001.9, 985817.17 22072...",18382320.0,1666.0,8732165.0,1972.715628,0.659375,13243100.0,0.475031
140,MN0802,Upper East Side-Carnegie Hill,Manhattan,"POLYGON ((998281.392 225347.968, 998145.231 22...",20065360.0,2679.0,9498898.0,1908.545717,0.719746,13197570.0,0.473398
122,MN0202,Greenwich Village,Manhattan,"POLYGON ((987048.27 206851.147, 987057.852 206...",10600460.0,1224.0,4996409.0,2088.616231,0.380239,13140180.0,0.471339
132,MN0602,Gramercy,Manhattan,"POLYGON ((990196.892 207745.371, 990187.613 20...",7526000.0,818.0,3486676.0,1996.561202,0.269958,12915620.0,0.463284
127,MN0401,Chelsea-Hudson Yards,Manhattan,"POLYGON ((983754.619 217001.185, 983857.657 21...",29671750.0,2075.0,13187830.0,2007.850335,1.064328,12390760.0,0.444458
119,MN0102,Tribeca-Civic Center,Manhattan,"POLYGON ((984440.604 200699.422, 984402.913 20...",13578250.0,902.0,5706854.0,2960.023311,0.487053,11717110.0,0.420294


In [None]:
# save to parquet for future analysis and efficient file types

out_path = os.path.join(DATA_PROCESSED, "nta_built_baseline.parquet")
nta_built.to_parquet(out_path, index=False)

In [10]:
nta_built["built_area_ratio"].describe()


count    259.000000
mean       0.227445
std        0.133138
min        0.000189
25%        0.149094
50%        0.247417
75%        0.324738
max        0.549023
Name: built_area_ratio, dtype: float64