In [1]:
import os
import pandas as pd
import geopandas as gpd

# Display options
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)


In [2]:
DATA_PROCESSED = os.path.join("..", "data", "processed")

building_current_2263 = gpd.read_parquet(
    os.path.join(DATA_PROCESSED, "building_current_2263.parquet")
)

building_historic_2263 = gpd.read_parquet(
    os.path.join(DATA_PROCESSED, "building_historic_2263.parquet")
)

print(building_current_2263.shape, building_historic_2263.shape)


(1082999, 18) (49059, 19)


In [3]:
PATH_NTA = os.path.join("..", "data", "raw", "nta", "nyc_nta_2020.zip")

nta_gdf = gpd.read_file(f"zip://{PATH_NTA}")

print(nta_gdf.shape)
nta_gdf.head()


(262, 12)


Unnamed: 0,borocode,boroname,countyfips,nta2020,ntaname,ntaabbrev,ntatype,cdta2020,cdtaname,shape_leng,shape_area,geometry
0,3.0,Brooklyn,47,BK0101,Greenpoint,Grnpt,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28919.561151,35321810.0,"POLYGON ((-73.93213 40.72816, -73.93238 40.727..."
1,3.0,Brooklyn,47,BK0102,Williamsburg,Wllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28134.082661,28852850.0,"POLYGON ((-73.95814 40.7244, -73.95772 40.7242..."
2,3.0,Brooklyn,47,BK0103,South Williamsburg,SWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),18250.280091,15208960.0,"POLYGON ((-73.95024 40.70547, -73.94984 40.705..."
3,3.0,Brooklyn,47,BK0104,East Williamsburg,EWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),43184.800376,52267410.0,"POLYGON ((-73.92406 40.71411, -73.92404 40.714..."
4,3.0,Brooklyn,47,BK0201,Brooklyn Heights,BkHts,0,BK02,BK02 Downtown Brooklyn-Fort Greene (CD 2 Appro...,14312.192285,9982023.0,"POLYGON ((-73.99236 40.68969, -73.99436 40.690..."


In [10]:
nta_2263 = nta_gdf.to_crs("EPSG:2263")

print("NTA CRS after reprojection:", nta_2263.crs)



NTA CRS after reprojection: EPSG:2263


In [7]:
print("NTA columns:")
for c in nta_gdf.columns:
    print(" ", c)

print("\nDtypes:")
print(nta_gdf.dtypes)

nta_gdf.head(3)

print("NTA CRS:", nta_gdf.crs)


NTA columns:
  borocode
  boroname
  countyfips
  nta2020
  ntaname
  ntaabbrev
  ntatype
  cdta2020
  cdtaname
  shape_leng
  shape_area
  geometry

Dtypes:
borocode       float64
boroname        object
countyfips      object
nta2020         object
ntaname         object
ntaabbrev       object
ntatype         object
cdta2020        object
cdtaname        object
shape_leng     float64
shape_area     float64
geometry      geometry
dtype: object
NTA CRS: EPSG:4326


In [8]:
nta_gdf = nta_gdf.rename(columns={
    "nta2020": "NTACode",
    "ntaname": "NTAName"
})

nta_gdf[["NTACode", "NTAName"]].head()


Unnamed: 0,NTACode,NTAName
0,BK0101,Greenpoint
1,BK0102,Williamsburg
2,BK0103,South Williamsburg
3,BK0104,East Williamsburg
4,BK0201,Brooklyn Heights


In [11]:
building_current_nta = gpd.sjoin(
    building_current_2263,
    nta_2263[["NTACode", "NTAName", "geometry"]],
    how="left",
    predicate="intersects",
)

building_historic_nta = gpd.sjoin(
    building_historic_2263,
    nta_2263[["NTACode", "NTAName", "geometry"]],
    how="left",
    predicate="intersects",
)


In [12]:
current_nta_area = (
    building_current_nta
    .groupby(["NTACode", "NTAName"])["geom_area_sqft"]
    .sum()
    .reset_index()
    .rename(columns={"geom_area_sqft": "current_area_sqft"})
)

historic_nta_area = (
    building_historic_nta
    .groupby(["NTACode", "NTAName"])["geom_area_sqft"]
    .sum()
    .reset_index()
    .rename(columns={"geom_area_sqft": "historic_area_sqft"})
)

nta_growth = current_nta_area.merge(historic_nta_area, on=["NTACode", "NTAName"], how="left")
nta_growth["historic_area_sqft"] = nta_growth["historic_area_sqft"].fillna(0)
nta_growth["growth_sqft"] = nta_growth["current_area_sqft"] - nta_growth["historic_area_sqft"]
nta_growth["growth_pct"] = (nta_growth["growth_sqft"] / nta_growth["historic_area_sqft"].replace(0, pd.NA)) * 100


In [13]:
nta_growth.sort_values("growth_sqft", ascending=False).head(20)


Unnamed: 0,NTACode,NTAName,current_area_sqft,historic_area_sqft,growth_sqft,growth_pct
248,SI0204,New Springville-Willowbrook-Bulls Head-Travis,21555420.0,972042.8,20583380.0,2117.538309
251,SI0302,Great Kills-Eltingville,19742340.0,740755.1,19001580.0,2565.163961
205,QN1001,South Ozone Park,19363700.0,531189.5,18832510.0,3545.346825
174,QN0501,Maspeth,19501540.0,1391238.0,18110300.0,1301.740555
35,BK1101,Bensonhurst,17899310.0,508943.4,17390370.0,3416.955042
31,BK1001,Bay Ridge,17555570.0,253649.2,17301920.0,6821.199682
253,SI0304,Annadale-Huguenot-Prince's Bay-Woodrow,18143800.0,1052359.0,17091450.0,1624.108125
61,BK1803,Canarsie,17237750.0,763445.1,16474310.0,2157.890403
20,BK0601,Carroll Gardens-Cobble Hill-Gowanus-Red Hook,18800170.0,2996632.0,15803540.0,527.376643
3,BK0104,East Williamsburg,17450020.0,1690818.0,15759200.0,932.046087


In [14]:
nta_growth.sort_values("growth_pct", ascending=False).head(20)


Unnamed: 0,NTACode,NTAName,current_area_sqft,historic_area_sqft,growth_sqft,growth_pct
257,SI9592,Miller Field,230894.2,354.442511,230539.8,65042.929899
222,QN1304,Cambria Heights,6472448.0,34045.296897,6438403.0,18911.284958
168,QN0271,Calvary & Mount Zion Cemeteries,355436.0,2186.234453,353249.7,16157.907243
130,MN0601,Stuyvesant Town-Peter Cooper Village,1146778.0,7441.864033,1139336.0,15309.815874
57,BK1704,East Flatbush-Remsen Village,6371341.0,42205.42227,6329136.0,14996.024676
13,BK0471,The Evergreens Cemetery,101780.4,775.290035,101005.1,13028.03975
116,BX2891,Pelham Bay Park,288338.5,2302.077713,286036.4,12425.141698
178,QN0571,Mount Olivet & All Faiths Cemeteries,56429.63,532.887433,55896.75,10489.409765
199,QN0891,Cunningham Park,89961.3,850.787407,89110.51,10473.886814
204,QN0905,Woodhaven,7805141.0,99692.946058,7705448.0,7729.180537


In [15]:
nta_growth.to_parquet(os.path.join(DATA_PROCESSED, "nta_building_growth.parquet"))


In [16]:
pct_current = building_current_nta["NTACode"].notna().mean()
pct_hist = building_historic_nta["NTACode"].notna().mean()

print("Current buildings assigned to NTA:", pct_current)
print("Historic buildings assigned to NTA:", pct_hist)

print("\nMissing NTA in current (sample rows):")
display(building_current_nta.loc[building_current_nta["NTACode"].isna(), ["base_bbl", "bin", "geom_area_sqft"]].head(10))


Current buildings assigned to NTA: 0.9999723037466568
Historic buildings assigned to NTA: 0.9994702742349537

Missing NTA in current (sample rows):


Unnamed: 0,base_bbl,bin,geom_area_sqft
34497,2059580065,2125133.0,24087.683158
69713,4043460075,4605702.0,42499.33735
97214,4163500002,4559530.0,893.539085
190611,4088100083,4617281.0,238.642943
200875,4141630037,4616016.0,405.645568
265913,4082680103,4613314.0,1076.089052
289292,4155100050,4616233.0,188.152446
391384,4082730019,4613763.0,295.110228
475941,3020230001,3395497.0,3050.70777
502781,4155100048,4516754.0,150.012258


In [17]:
print("Current last_statu counts:")
print(building_current_2263["last_statu"].value_counts(dropna=False).head(20))

print("\nHistoric last_statu counts:")
print(building_historic_2263["last_statu"].value_counts(dropna=False).head(20))


Current last_statu counts:
last_statu
Constructed                 1079925
Alteration                     1736
None                            383
Split                           359
Correction                      133
Merged                          123
Initialization                  109
Marked for Demolition            76
Marked for Construction          66
Investigate Construction         52
Geometry                         17
Demolition                       15
Investigate Demolition            5
Name: count, dtype: int64

Historic last_statu counts:
last_statu
Demolition                  46189
Alteration                   1637
Constructed                   393
Merged                        327
Correction                    267
Split                         155
Demollition                    34
None                           22
Demolitian                     16
Geometry                        7
Marked for Demolition           6
Investigate Demolition          2
Marked for Construct

In [19]:
current_nta_stats = (
    building_current_nta
    .groupby(["NTACode", "NTAName"])
    .agg(
        current_area_sqft=("geom_area_sqft", "sum"),
        current_building_count=("geom_area_sqft", "size"),
    )
    .reset_index()
)

historic_nta_stats = (
    building_historic_nta
    .groupby(["NTACode", "NTAName"])
    .agg(
        historic_area_sqft=("geom_area_sqft", "sum"),
        historic_building_count=("geom_area_sqft", "size"),
    )
    .reset_index()
)

nta_growth = current_nta_stats.merge(historic_nta_stats, on=["NTACode", "NTAName"], how="left")
nta_growth[["historic_area_sqft", "historic_building_count"]] = nta_growth[["historic_area_sqft", "historic_building_count"]].fillna(0)

nta_growth["growth_sqft"] = nta_growth["current_area_sqft"] - nta_growth["historic_area_sqft"]
nta_growth["growth_pct"] = (nta_growth["growth_sqft"] / nta_growth["historic_area_sqft"].replace(0, pd.NA)) * 100

nta_growth["building_count_delta"] = nta_growth["current_building_count"] - nta_growth["historic_building_count"]


nta_growth.sort_values("growth_sqft", ascending=False).head(20)



Unnamed: 0,NTACode,NTAName,current_area_sqft,current_building_count,historic_area_sqft,historic_building_count,growth_sqft,growth_pct,building_count_delta
248,SI0204,New Springville-Willowbrook-Bulls Head-Travis,21555420.0,11285,972042.8,350.0,20583380.0,2117.538309,10935.0
251,SI0302,Great Kills-Eltingville,19742340.0,17630,740755.1,640.0,19001580.0,2565.163961,16990.0
205,QN1001,South Ozone Park,19363700.0,23688,531189.5,528.0,18832510.0,3545.346825,23160.0
174,QN0501,Maspeth,19501540.0,11584,1391238.0,373.0,18110300.0,1301.740555,11211.0
35,BK1101,Bensonhurst,17899310.0,15186,508943.4,448.0,17390370.0,3416.955042,14738.0
31,BK1001,Bay Ridge,17555570.0,13882,253649.2,203.0,17301920.0,6821.199682,13679.0
253,SI0304,Annadale-Huguenot-Prince's Bay-Woodrow,18143800.0,12519,1052359.0,843.0,17091450.0,1624.108125,11676.0
61,BK1803,Canarsie,17237750.0,14365,763445.1,290.0,16474310.0,2157.890403,14075.0
20,BK0601,Carroll Gardens-Cobble Hill-Gowanus-Red Hook,18800170.0,7462,2996632.0,553.0,15803540.0,527.376643,6909.0
3,BK0104,East Williamsburg,17450020.0,4939,1690818.0,637.0,15759200.0,932.046087,4302.0


In [20]:
# Bring in NTA land area in sqft from NTA geometry
nta_land = nta_2263.copy()
nta_land["nta_land_area_sqft"] = nta_land.geometry.area
nta_land = nta_land[["NTACode", "nta_land_area_sqft"]]

nta_growth = nta_growth.merge(nta_land, on="NTACode", how="left")

nta_growth["current_built_area_ratio"] = nta_growth["current_area_sqft"] / nta_growth["nta_land_area_sqft"]
nta_growth["historic_built_area_ratio"] = nta_growth["historic_area_sqft"] / nta_growth["nta_land_area_sqft"]
nta_growth["built_area_ratio_delta"] = nta_growth["current_built_area_ratio"] - nta_growth["historic_built_area_ratio"]
