### Standard Python and R imports

In [1]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# My commonly used R imports

require('tidyverse')

R[write to console]: Loading required package: tidyverse



── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [4]:
pd.set_option('display.max_columns', None)

### Load the data

In [None]:
# URL: https://data.cityofnewyork.us/City-Government/Building-Footprints/5zhs-2jue/about_data

In [5]:
%%R
merged_data_new <- read.csv('merged_data_new.csv')

In [6]:
%%R
Building_Footprints <- read.csv('Building Footprints.csv')

In [7]:
%%R
merged_with_footprint <- full_join(merged_data_new, Building_Footprints, 
                               by = c("BIN.Number" = "BIN")) 

In [8]:
%%R
# Create a binary variable 'active_shed' based on GEOID
merged_with_footprint <- merged_with_footprint %>%
  mutate(active_shed = ifelse(is.na(GEOID), 0, 1))  # 0 for no active shed, 1 for active shed

In [9]:
%%R
# Save the merged data as a CSV file:
write.csv(merged_with_footprint, "merged_with_footprint.csv", row.names = FALSE)

In [10]:
merged_with_footprint = pd.read_csv("merged_with_footprint.csv")
merged_with_footprint.head(-30)

Unnamed: 0,GEOID,active_shed_licenses,Job.Number,Borough.Name,Count.Permits,First.Permit.Date,Current.Date,Age..in.years.,Permit.Expiration.Date,Sidewalk.Shed.Linear.Feet,Construction.Material,Current.Job.Status,BIN.Number,Community.Board,Latitude.Point,Longitude.Point,House.Number,Street.Name,Borough.Digit,Block,Lot,Applicant.Business.Name,ProCert,Source,activity,Commercial,STATE,COUNTY,TRACT,BLOCK,NAME.x,population_estimate,black_african_estimate,occupied_estimate,vacant_estimate,owner_occupied_estimate,renter_occupied_estimate,owner_income_estimate,renter_income_estimate,population_moe,black_african_moe,occupied_moe,vacant_moe,owner_occupied_moe,renter_occupied_moe,owner_income_moe,renter_income_moe,the_geom,NAME.y,CNSTRCT_YR,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,SHAPE_AREA,SHAPE_LEN,BASE_BBL,MPLUTO_BBL,GEOMSOURCE,GLOBALID,active_shed
0,3.600500e+10,1.0,X01112467-I1,Bronx,,2024-10-18,2025-04-19,0.498630,2025-06-02,35.0,,Permit Entire,2020305,209.0,40.81055,-73.85100,216,HUSSON AVENUE,2.0,3460.0,13.0,MSM ENGINEERING SERVICES PLLC,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,400.0,2002.0,"Census Tract 4, Bronx County, New York",6000.0,1572.0,2199.0,58.0,1505.0,694.0,99121.0,,903.0,462.0,312.0,47.0,191.0,347.0,30528.0,,MULTIPOLYGON (((-73.85099338682896 40.81058940...,,1901.0,08/22/2017,Constructed,682075.0,20.44,2100.0,11.0,0.0,0.0,2.034600e+09,2.034600e+09,Photogramm,{FE86AF37-43DA-4C27-8FF4-5F5A6584D885},1
1,3.600500e+10,3.0,X00974554-I1,Bronx,,2023-12-13,2025-04-19,1.347945,2025-12-02,342.0,,Permit Entire,2130803,209.0,40.82163,-73.85979,1847,SEWARD AVENUE,2.0,3600.0,30.0,ASHRAF CORP,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,1600.0,1000.0,"Census Tract 16, Bronx County, New York",6038.0,2593.0,2187.0,0.0,356.0,1831.0,100833.0,35774.0,665.0,531.0,260.0,18.0,119.0,260.0,44227.0,6704.0,,,,,,,,,,,,,,,,1
2,3.600500e+10,3.0,X00974555-I1,Bronx,,2023-12-13,2025-04-19,1.347945,2025-12-03,130.0,,Permit Entire,2130802,209.0,40.81942,-73.86132,1843,SEWARD AVENUE,2.0,3600.0,40.0,ASHRAF CORP,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,1600.0,1000.0,"Census Tract 16, Bronx County, New York",6038.0,2593.0,2187.0,0.0,356.0,1831.0,100833.0,35774.0,665.0,531.0,260.0,18.0,119.0,260.0,44227.0,6704.0,,,,,,,,,,,,,,,,1
3,3.600500e+10,3.0,X00974557-I1,Bronx,,2023-12-13,2025-04-19,1.347945,2025-12-02,350.0,,Permit Entire,2130800,209.0,40.81969,-73.86113,1841,SEWARD AVENUE,2.0,3600.0,50.0,..,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,1600.0,1000.0,"Census Tract 16, Bronx County, New York",6038.0,2593.0,2187.0,0.0,356.0,1831.0,100833.0,35774.0,665.0,531.0,260.0,18.0,119.0,260.0,44227.0,6704.0,,,,,,,,,,,,,,,,1
4,3.600500e+10,3.0,X00002036-I1,Bronx,,2017-11-11,2025-04-19,7.438356,2025-05-03,110.0,,Permit Entire,2000765,201.0,40.80814,-73.92984,9,BRUCKNER BOULEVARD,2.0,2317.0,19.0,JOEL PHAGOO P.E. PLLC,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,1901.0,1006.0,"Census Tract 19.01, Bronx County, New York",2168.0,904.0,885.0,46.0,0.0,885.0,,55924.0,263.0,246.0,108.0,33.0,13.0,108.0,,12028.0,MULTIPOLYGON (((-73.92983345477573 40.80829915...,,1931.0,08/22/2017,Constructed,166782.0,27.64,2100.0,9.0,0.0,0.0,2.023170e+09,2.023170e+09,Photogramm,{B2396520-C561-4F0B-97A4-F7F09F394335},1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083786,,,,,,,,,,,,,5117018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-74.16387443590938 40.58601992...,,1985.0,08/22/2017,Constructed,805065.0,35.00,2100.0,42.0,0.0,0.0,5.024010e+09,5.024018e+09,Other (Man,{8E548335-0EDF-48E3-AF74-0C50E7F97EAC},0
1083787,,,,,,,,,,,,,5117052,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-74.16381076494602 40.58552891...,,1985.0,08/22/2017,Constructed,35281.0,38.00,2100.0,44.0,0.0,0.0,5.024010e+09,5.024018e+09,Other (Man,{2B7074BD-79DA-4EE1-8961-69F02C75D984},0
1083788,,,,,,,,,,,,,5117049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-74.16367038393751 40.58567637...,,1985.0,08/22/2017,Constructed,845898.0,40.00,2100.0,43.0,0.0,0.0,5.024010e+09,5.024018e+09,Other (Man,{7E28860A-32BD-4669-9479-92D8568EBE92},0
1083789,,,,,,,,,,,,,5117016,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-74.16378171491016 40.58611706...,,1985.0,08/22/2017,Constructed,608894.0,34.00,2100.0,42.0,0.0,0.0,5.024010e+09,5.024018e+09,Other (Man,{DA0E9033-62A5-4F2C-A8BC-A3B0C4463C63},0


In [11]:
no_construction_year = merged_with_footprint[merged_with_footprint['CNSTRCT_YR'].isna()]
no_construction_year

Unnamed: 0,GEOID,active_shed_licenses,Job.Number,Borough.Name,Count.Permits,First.Permit.Date,Current.Date,Age..in.years.,Permit.Expiration.Date,Sidewalk.Shed.Linear.Feet,Construction.Material,Current.Job.Status,BIN.Number,Community.Board,Latitude.Point,Longitude.Point,House.Number,Street.Name,Borough.Digit,Block,Lot,Applicant.Business.Name,ProCert,Source,activity,Commercial,STATE,COUNTY,TRACT,BLOCK,NAME.x,population_estimate,black_african_estimate,occupied_estimate,vacant_estimate,owner_occupied_estimate,renter_occupied_estimate,owner_income_estimate,renter_income_estimate,population_moe,black_african_moe,occupied_moe,vacant_moe,owner_occupied_moe,renter_occupied_moe,owner_income_moe,renter_income_moe,the_geom,NAME.y,CNSTRCT_YR,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,SHAPE_AREA,SHAPE_LEN,BASE_BBL,MPLUTO_BBL,GEOMSOURCE,GLOBALID,active_shed
1,3.600500e+10,3.0,X00974554-I1,Bronx,,2023-12-13,2025-04-19,1.347945,2025-12-02,342.0,,Permit Entire,2130803,209.0,40.82163,-73.85979,1847,SEWARD AVENUE,2.0,3600.0,30.0,ASHRAF CORP,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,1600.0,1000.0,"Census Tract 16, Bronx County, New York",6038.0,2593.0,2187.0,0.0,356.0,1831.0,100833.0,35774.0,665.0,531.0,260.0,18.0,119.0,260.0,44227.0,6704.0,,,,,,,,,,,,,,,,1
2,3.600500e+10,3.0,X00974555-I1,Bronx,,2023-12-13,2025-04-19,1.347945,2025-12-03,130.0,,Permit Entire,2130802,209.0,40.81942,-73.86132,1843,SEWARD AVENUE,2.0,3600.0,40.0,ASHRAF CORP,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,1600.0,1000.0,"Census Tract 16, Bronx County, New York",6038.0,2593.0,2187.0,0.0,356.0,1831.0,100833.0,35774.0,665.0,531.0,260.0,18.0,119.0,260.0,44227.0,6704.0,,,,,,,,,,,,,,,,1
3,3.600500e+10,3.0,X00974557-I1,Bronx,,2023-12-13,2025-04-19,1.347945,2025-12-02,350.0,,Permit Entire,2130800,209.0,40.81969,-73.86113,1841,SEWARD AVENUE,2.0,3600.0,50.0,..,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,1600.0,1000.0,"Census Tract 16, Bronx County, New York",6038.0,2593.0,2187.0,0.0,356.0,1831.0,100833.0,35774.0,665.0,531.0,260.0,18.0,119.0,260.0,44227.0,6704.0,,,,,,,,,,,,,,,,1
75,3.600500e+10,3.0,X01079144-I1,Bronx,,2024-07-15,2025-04-19,0.758904,2025-06-26,270.0,,Permit Entire,2022155,209.0,40.81968,-73.86323,714,BEACH AVENUE,2.0,3598.0,12.0,PAUL G. JONES P.E.,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,3800.0,1001.0,"Census Tract 38, Bronx County, New York",1313.0,355.0,343.0,14.0,146.0,197.0,121000.0,,153.0,232.0,25.0,19.0,55.0,58.0,21236.0,,,,,,,,,,,,,,,,,1
94,3.600500e+10,8.0,X01199911-I1,Bronx,,2025-03-25,2025-04-19,0.065753,2025-10-06,27.0,,Permit Entire,2130939,201.0,40.81164,-73.91844,463,EAST 143 STREET,2.0,2288.0,70.0,..,1.0,DOB NOW,Construction or Maintenance,Other Zoning Districts,36.0,5.0,4100.0,2000.0,"Census Tract 41, Bronx County, New York",5798.0,2436.0,2318.0,86.0,190.0,2128.0,91316.0,17477.0,1391.0,945.0,339.0,99.0,155.0,348.0,30577.0,7204.0,,,,,,,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083244,,,,,,,,,,,,,1090511,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-73.94625319720076 40.84705412...,,,03/14/2025,Constructed,1283782.0,0.000000,2100.0,8.0,0.0,0.0,1.021400e+09,1.021400e+09,Other (Man,{09BDB8EB-B32D-402F-A6CD-1F1C2F82413F},0
1083300,,,,,,,,,,,,,3397444,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-73.99984919403173 40.68862382...,,,01/22/2019,Constructed,1114440.0,73.179002,2100.0,16.0,0.0,0.0,3.003040e+09,3.003048e+09,Photogramm,{9B137E5C-68B4-408A-B449-E5BF57F3B6E3},0
1083327,,,,,,,,,,,,,4461648,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-73.87578515464118 40.74229868...,,,04/07/2025,,1303308.0,10.000000,5110.0,50.0,0.0,0.0,4.015860e+09,4.015860e+09,,{813BF228-46B1-496A-9759-5A98F5A85EB3},0
1083772,,,,,,,,,,,,,3414135,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-73.99119497770594 40.68891604...,,,01/22/2019,Constructed,1113606.0,72.000000,2100.0,43.0,0.0,0.0,3.002780e+09,3.002788e+09,Other (Man,{FC13BA9F-ABC8-4B21-9006-01E4CF46ECC8},0


In [12]:
%%R 

# print count of merged_with_footprint
print(nrow(merged_with_footprint))

merged_with_footprint <- merged_with_footprint %>% 
  filter(!is.na(active_shed) & !is.na(CNSTRCT_YR))

print(nrow(merged_with_footprint))

[1] 1083821
[1] 1072949


In [13]:
%%R

library(DescTools) 

logistic_model <- glm(active_shed ~ CNSTRCT_YR + HEIGHTROOF + GROUNDELEV, 
                      data=merged_with_footprint,
                      family = binomial)

print(summary(logistic_model))


Call:
glm(formula = active_shed ~ CNSTRCT_YR + HEIGHTROOF + GROUNDELEV, 
    family = binomial, data = merged_with_footprint)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) 20.2962970  0.8001236  25.366  < 2e-16 ***
CNSTRCT_YR  -0.0135381  0.0004157 -32.566  < 2e-16 ***
HEIGHTROOF   0.0215634  0.0001932 111.593  < 2e-16 ***
GROUNDELEV   0.0015316  0.0002986   5.129 2.91e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 91200  on 1072546  degrees of freedom
Residual deviance: 77132  on 1072543  degrees of freedom
  (402 observations deleted due to missingness)
AIC: 77140

Number of Fisher Scoring iterations: 8



In [14]:
%%R 
print(PseudoR2(logistic_model, which="McFadden"))

# CNSTRCT_YR: 0.002195937
# GROUNDELEV: 1.339341e-06 
# HEIGHTROOF: 0.141834 

# CNSTRCT_YR + GROUNDELEV: 0.002251043 
# CNSTRCT_YR + HEIGHTROOF: 0.1539
# HEIGHTROOF + GROUNDELEV: 0.1421619 

# CNSTRCT_YR + HEIGHTROOF + GROUNDELEV: 0.1542526

 McFadden 
0.1542526 


In [15]:
%%R 

df_with_predictions <- merged_with_footprint %>% 
    mutate(
        prediction_proba = predict(logistic_model, newdata = merged_with_footprint, type = "response"),
        prediction = ifelse(prediction_proba > 0.0365, 1, 0)
    )

df_with_predictions

           GEOID active_shed_licenses   Job.Number Borough.Name Count.Permits
1    36005000400                    1 X01112467-I1        Bronx            NA
2    36005001901                    3 X00002036-I1        Bronx            NA
3    36005001901                    3 X00677889-I1        Bronx            NA
4    36005001901                    3 X01059624-I1        Bronx            NA
5    36005001902                    8 X00372376-I1        Bronx            NA
6    36005001902                    8 X00763478-I1        Bronx            NA
7    36005001902                    8 X01013799-I1        Bronx            NA
8    36005001902                    8 X01057734-I1        Bronx            NA
9    36005001902                    8 X01082169-I1        Bronx            NA
10   36005001902                    8 X01140314-I1        Bronx            NA
11   36005001902                    8 X01166927-I1        Bronx            NA
12   36005001902                    8 X08026975-I1        Bronx 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




369         2025-02-26   2025-04-19    0.139726027             2026-02-04
370         2025-03-05   2025-04-19    0.120547945             2026-02-04
371         2025-04-18   2025-04-19    0.000000000             2026-02-04
372         2022-11-15   2025-04-19    2.424657534             2026-02-28
373         2023-01-24   2025-04-19    2.232876712             2026-02-28
374         2023-01-23   2025-04-19    2.235616438             2026-02-28
375         2023-01-23   2025-04-19    2.235616438             2026-02-28
376         2023-01-24   2025-04-19    2.232876712             2026-02-28
377         2022-09-23   2025-04-19    2.569863014             2025-07-01
378         2020-06-01   2025-04-19    4.882191781             2025-08-01
379         2022-02-28   2025-04-19    3.136986301             2025-08-01
380         2024-06-10   2025-04-19    0.854794521             2025-06-10
381         2023-10-05   2025-04-19    1.536986301             2025-12-19
382         2024-03-19   2025-04-19  

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                     406.0                            Permit Entire
1263                     285.0                            Permit Entire
1264                     300.0                            Permit Entire
1265                     245.0                            Permit Entire
1266                     250.0                            Permit Entire
1267                     170.0                            Permit Entire
1268                     215.0                            Permit Entire
1269                     290.0                            Permit Entire
1270                     230.0                            Permit Entire
1271                     235.0                            Permit Entire
1272                     225.0                            Permit Entire
1273                     140.0                            Permit Entire
1274                     260.0                            Permit Entire
1275                     172.0                            Permit Ent

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



    1
119             BRUCKNER BOULEVARD             2  3717    1
120                MORRISON AVENUE             2  3744   34
121                   MANOR AVENUE             2  3743    5
122                   MANOR AVENUE             2  3717   32
123             BRUCKNER BOULEVARD             2  3716    1
124                 BOYNTON AVENUE             2  3715   11
125                 BOYNTON AVENUE             2  3714   50
126                 BOYNTON AVENUE             2  3714    1
127                 BOYNTON AVENUE             2  3714   42
128                 BOYNTON AVENUE             2  3714   46
129                 BOYNTON AVENUE             2  3741   30
130                 BOYNTON AVENUE             2  3741   22
131                 BOYNTON AVENUE             2  3741   30
132                 LINCOLN AVENUE             2  2318   22
133               EAST  138 STREET             2  2340    3
134              CANAL STREET WEST             2  2322   28
135                       3 AVENUE

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



       1 DOB NOW
997              JOEL PHAGOO P.E. PLLC       1 DOB NOW
998                                 PR       1 DOB NOW
999       PAUL PERDEK  PROF. ENG. PLLC       1 DOB NOW
1000                                PR       1 DOB NOW
1001                                ..       1 DOB NOW
1002        CRIBSTONE ENGINEERING PLLC       1 DOB NOW
1003    MSM ENGINEERING SERVICES  PLLC       1 DOB NOW
1004                                PR       1 DOB NOW
1005      PAUL PERDEK  PROF. ENG. PLLC       1 DOB NOW
1006            BITOPI CONSULTANTS LLC       1 DOB NOW
1007            BITOPI CONSULTANTS LLC       1 DOB NOW
1008               ARQUITECTURA VARELA       1 DOB NOW
1009               ARQUITECTURA VARELA       1 DOB NOW
1010               ARQUITECTURA VARELA       1 DOB NOW
1011            URBAN INTEGRATIONS LLC       1 DOB NOW
1012    MSM ENGINEERING SERVICES  PLLC       1 DOB NOW
1013         MJE PROFESSIONAL SERVICES       0 DOB NOW
1014            GEIGER E&A GROUP CORP.       1 D

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



      Other Zoning Districts    36     47  2300
1412                Local Law 11      Other Zoning Districts    36     47  2300
1413                Local Law 11      Other Zoning Districts    36     47  2300
1414                Local Law 11      Other Zoning Districts    36     47  2300
1415                Local Law 11 Commercial District/Overlay    36     47  2901
1416 Construction or Maintenance Commercial District/Overlay    36     47  3300
1417                Local Law 11      Other Zoning Districts    36     47  3300
1418 Construction or Maintenance      Other Zoning Districts    36     47  3300
1419                Local Law 11 Commercial District/Overlay    36     47  3300
1420 Construction or Maintenance      Other Zoning Districts    36     47  3300
1421                Local Law 11      Other Zoning Districts    36     47  3300
1422 Construction or Maintenance      Other Zoning Districts    36     47  3400
1423 Construction or Maintenance      Other Zoning Districts    36     4

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




873                    1295              1758              60
874                    1295              1758              60
875                    1295              1758              60
876                     853              1058              60
877                     853              1058              60
878                     853              1058              60
879                     853              1058              60
880                     853              1058              60
881                     853              1058              60
882                     853              1058              60
883                     414              1788              17
884                     414              1788              17
885                     414              1788              17
886                     414              1788              17
887                     414              1788              17
888                      78               480              31
889    

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                    357              1785             292
1439                    357              1785             292
1440                    357              1785             292
1441                    357              1785             292
1442                    357              1785             292
1443                    357              1785             292
1444                    408              1010             189
1445                    408              1010             189
1446                    408              1010             189
1447                    408              1010             189
1448                    408              1010             189
1449                    311              1312             234
1450                    311              1312             234
1451                    311              1312             234
1452                    311              1312             234
1453                    531              1805              69
1454        

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




460                   26875            885               504          545
461                   35000            616               549          172
462                   35000            616               549          172
463                   35000            616               549          172
464                   35000            616               549          172
465                   35000            616               549          172
466                   35000            616               549          172
467                   57159            637               546          360
468                   57159            637               546          360
469                   57159            637               546          360
470                   88587            747               152          145
471                   47609            713               543          230
472                   22781            962               643          205
473                   22781          

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



               454          201
898                  101786            228               137           84
899                  101786            228               137           84
900                  101786            228               137           84
901                  102865            811               418          281
902                  102865            811               418          281
903                  102865            811               418          281
904                  102865            811               418          281
905                   81056            621               312          225
906                   81056            621               312          225
907                   81056            621               312          225
908                   81056            621               312          225
909                   81056            621               312          225
910                   81056            621               312          225
911   

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



         38                 87                 150            38219
1014         56                 65                 165           101596
1015         56                 65                 165           101596
1016         56                 65                 165           101596
1017         56                 65                 165           101596
1018         56                 65                 165           101596
1019         56                 65                 165           101596
1020         56                 65                 165           101596
1021         45                 76                 152            73691
1022         45                 76                 152            73691
1023         45                 76                 152            73691
1024         45                 76                 152            73691
1025         45                 76                 152            73691
1026         45                 76                 152            73

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




247               9767
248              14087
249              14087
250              12912
251               4148
252               4148
253               4148
254               8280
255               8280
256               8280
257               8280
258               8280
259               8280
260               8280
261               8280
262               8280
263               8280
264              12227
265              12227
266              12227
267               6307
268               6307
269               6307
270              18575
271              11086
272              11086
273              11086
274              11086
275              11086
276              11086
277              11086
278              11086
279              11086
280              11086
281              11086
282              11086
283              11086
284              11086
285              24343
286              24343
287              24343
288               8024
289               8024
290       

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



       1912 09/08/2021
606                                                1928 08/22/2017
607                                                1922 08/22/2017
608                                                1922 08/22/2017
609                                                1926 08/22/2017
610                                                1927 08/22/2017
611                                                1930 08/22/2017
612                                                1939 08/22/2017
613                                                1930 08/22/2017
614                                                1939 08/22/2017
615                                                1939 08/22/2017
616                                                1939 08/22/2017
617                                                1939 08/22/2017
618                                                1939 08/22/2017
619                                                1939 08/22/2017
620                                    

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                                               1918 08/22/2017
1374                                               1905 08/22/2017
1375                                               1939 08/22/2017
1376                                               2005 08/22/2017
1377                                               2022 02/08/2024
1378                                               2022 02/08/2024
1379                                               1996 08/22/2017
1380                                               1930 08/22/2017
1381                                               1930 08/22/2017
1382                                               1990 08/22/2017
1383                                               2020 03/08/2021
1384                                               1920 08/22/2017
1385                                               1917 08/22/2017
1386                                               1917 08/22/2017
1387                                               1931 08/22/2017

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



    83689   64.32000      2100        204
1232              Constructed   623752   56.76000      2100        173
1233              Constructed  1104857   27.00000      2100        137
1234                   Merged  1292457    0.00000      2100        141
1235              Constructed   288308  129.84000      2100        101
1236              Constructed    21585  130.06000      2100        110
1237              Constructed   159962  127.99000      2100        108
1238              Constructed   143466  129.23079      2100        113
1239              Constructed   412252   32.43000      2100        114
1240              Constructed   546707  130.87000      2100        108
1241              Constructed    35342   34.60000      2100        109
1242              Constructed   632074  124.15000      2100        116
1243              Constructed   174089   34.93000      2100         99
1244              Constructed   185951   33.84000      2100         89
1245              Constructed   716

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 2023640010 Photogramm
219           0         0 2037920042 2037920042 Photogramm
220           0         0 2037940013 2037940013 Photogramm
221           0         0 2037680020 2037680020 Photogramm
222           0         0 2037930039 2037930039 Photogramm
223           0         0 2037670082 2037670082 Photogramm
224           0         0 2026230180 2026230180 Photogramm
225           0         0 2026230135 2026230135 Photogramm
226           0         0 2026160001 2026160001 Photogramm
227           0         0 2026230135 2026230135 Photogramm
228           0         0 2026230135 2026230135 Photogramm
229           0         0 2026230135 2026230135 Photogramm
230           0         0 2026230135 2026230135 Photogramm
231           0         0 2036090016 2036090016 Photogramm
232           0         0 2026280001 2026280001 Photogramm
233           0         0 2026280001 2026280001 Photogramm
234           0         0 2026280001 2026280001 Photogramm
235           0         0 2026280

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



     0.0042067408
202  {5B3A9128-6FA3-45C9-9527-FC58DBE028DF}           1     0.0138654014
203  {DDC107A0-BA42-4217-B062-A5D22C5BC4CF}           1     0.0332908939
204  {DC96B46D-9DD4-4796-ABD4-B2B3CB0DCDAF}           1     0.0317298298
205  {42D8E2EE-BE2B-40F2-92EF-6210D72A65CC}           1     0.0331893924
206  {33948D4E-8542-410C-8B7C-123D2D2AFBA0}           1     0.0255689954
207  {020F2826-E7EA-4CE3-B308-CFA01940B3D7}           1     0.0143632147
208  {E3B99A41-C7F8-4299-B1EF-84A14654820F}           1     0.0407401703
209  {2970B28D-94F2-4857-873B-CC36E960528E}           1     0.0129212680
210  {5F8356BD-3F30-4E56-9C74-28F36BA9541D}           1     0.0052265177
211  {5D1F47AF-4588-4D82-BDEB-2985CE7E0F94}           1     0.0138694963
212  {9AC0751D-B19F-493D-B4C8-BB99E7F10A3B}           1     0.0400521740
213  {9AC0751D-B19F-493D-B4C8-BB99E7F10A3B}           1     0.0400521740
214  {28D1FBAA-81E5-4F92-A22C-E7B9D368E693}           1     0.0081693487
215  {F876E6FF-534F-43FD-85D5-583

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



     0.0112235182
857  {F2AADBBC-4E06-4F50-8426-D20E2BF93968}           1     0.0097314662
858  {26FF4865-65EF-49F8-8520-EA21D4C6F4A6}           1     0.0092880155
859  {4A88C64D-42AD-4E23-9B69-07BC037BF694}           1     0.0127272826
860  {6C54CBEA-5E9D-4FFD-A1DA-88B943FDB862}           1     0.0097298357
861  {3251F353-3A90-426A-9632-6CD320FFB011}           1     0.0128037173
862  {4C4177F3-E49A-4FF7-B64A-F8525293BAA6}           1     0.0076245783
863  {0C967590-D1A8-4A47-9440-C5632114D5E6}           1     0.0039348931
864  {62AF6539-DFCC-449D-8582-3A42D9EDB033}           1     0.1071672871
865  {631FED1B-80C9-46B0-89CA-4DFD62B676F4}           1     0.0120585783
866  {8D59CB7B-169F-42C7-839B-26FE25FA6436}           1     0.0172940056
867  {B49E1698-05EC-4AE2-8593-EEB9C8FC24B3}           1     0.0077704856
868  {3251F353-3A90-426A-9632-6CD320FFB011}           1     0.0128037173
869  {3AE89CC7-D4E8-4E33-AD35-615630A75A63}           1     0.0149031950
870  {97513BC1-91DB-4EE8-BBFA-6FD

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [16]:
%%R 
colnames(df_with_predictions)

 [1] "GEOID"                     "active_shed_licenses"     
 [3] "Job.Number"                "Borough.Name"             
 [5] "Count.Permits"             "First.Permit.Date"        
 [7] "Current.Date"              "Age..in.years."           
 [9] "Permit.Expiration.Date"    "Sidewalk.Shed.Linear.Feet"
[11] "Construction.Material"     "Current.Job.Status"       
[13] "BIN.Number"                "Community.Board"          
[15] "Latitude.Point"            "Longitude.Point"          
[17] "House.Number"              "Street.Name"              
[19] "Borough.Digit"             "Block"                    
[21] "Lot"                       "Applicant.Business.Name"  
[23] "ProCert"                   "Source"                   
[25] "activity"                  "Commercial"               
[27] "STATE"                     "COUNTY"                   
[29] "TRACT"                     "BLOCK"                    
[31] "NAME.x"                    "population_estimate"      
[33] "black_african_esti

#### ? How to find the outlier buildings?

In [18]:
%%R 
# this is based on the logistics model
df_with_predictions %>%
  filter(prediction == 1, active_shed == 1) %>%       
  arrange(CNSTRCT_YR) %>%                      
  select(CNSTRCT_YR, GROUNDELEV, prediction_proba, Age..in.years., active_shed_licenses, Job.Number, House.Number, Street.Name, Borough.Name) %>%
  head(5) 

  CNSTRCT_YR GROUNDELEV prediction_proba Age..in.years. active_shed_licenses
1       1807          5       0.04257910      1.1753425                   23
2       1813         42       0.04280925      1.9589041                   21
3       1819          6       0.04470918      0.6164384                   23
4       1819          6       0.04473682      0.6136986                   23
5       1819          5       0.04149036      0.4136986                   23
    Job.Number House.Number    Street.Name Borough.Name
1 M00982341-I1          160   SOUTH STREET    Manhattan
2 M00865840-I1            6 BLEEKER STREET    Manhattan
3 M01101110-I1          112   SOUTH STREET    Manhattan
4 M01101124-I1          113   SOUTH STREET    Manhattan
5 M01143166-I1          107   SOUTH STREET    Manhattan


In [19]:
%%R 

df_with_predictions %>%
  filter(prediction == 1, active_shed == 1) %>%       
  arrange(-GROUNDELEV) %>%                      
  select(CNSTRCT_YR, GROUNDELEV, prediction_proba, Age..in.years., active_shed_licenses, Job.Number, House.Number, Street.Name, Borough.Name) %>%
  head(5) 

  CNSTRCT_YR GROUNDELEV prediction_proba Age..in.years. active_shed_licenses
1       1961        238       0.15960971      0.4986301                    4
2       1939        235       0.08337378      3.1534247                    8
3       1973        218       0.16116460      3.4739726                    6
4       1973        213       0.08975300      3.7315068                    6
5       1926        206       0.06657379      3.6520548                   12
    Job.Number House.Number       Street.Name Borough.Name
1 X01125006-I1         5800  ARLINGTON AVENUE        Bronx
2 M00680578-I1          200 CABRINI BOULEVARD    Manhattan
3 M00609734-I1          525    AUDUBON AVENUE    Manhattan
4 M00550796-I1          515    AUDUBON AVENUE    Manhattan
5 M00581621-I1         4117          BROADWAY    Manhattan


In [20]:
%%R 

df_with_predictions %>%
  filter(prediction == 1, active_shed == 1) %>%       
  arrange(-Age..in.years.) %>%                      
  select(CNSTRCT_YR, GROUNDELEV, prediction_proba, Age..in.years., active_shed_licenses, Job.Number, House.Number, Street.Name, Borough.Name) %>%
  head(5) 

  CNSTRCT_YR GROUNDELEV prediction_proba Age..in.years. active_shed_licenses
1       1914         26       0.04352438       13.33699                    8
2       1923         70       0.06144088       13.32877                    6
3       1931         18       0.03852408       13.13973                   15
4       1965         24       0.99728393       12.99452                   28
5       1915         76       0.07976522       11.61096                   29
  Job.Number House.Number      Street.Name Borough.Name
1  420516873        41-43        28 STREET       Queens
2  220161278          900  GRAND CONCOURSE        Bronx
3  120987236          444 WEST   21 STREET    Manhattan
4  121045127           41      MAIDEN LANE    Manhattan
5  140113339          571  WEST END AVENUE    Manhattan


In [21]:
%%R 

df_with_predictions %>%
  filter(prediction == 1, active_shed == 1) %>%       
  arrange(-active_shed_licenses) %>%                      
  select(CNSTRCT_YR, GROUNDELEV, prediction_proba, Age..in.years., active_shed_licenses, Job.Number, House.Number, Street.Name, Borough.Name) %>%
  head(5) 

  CNSTRCT_YR GROUNDELEV prediction_proba Age..in.years. active_shed_licenses
1       1915         31       0.08080598       8.016438                   46
2       1861         27       0.04003515       7.871233                   46
3       1948         19       0.18388883       4.356164                   46
4       1910         32       0.61775834       3.534247                   46
5       1860         31       0.04821838       3.556164                   46
    Job.Number House.Number     Street.Name Borough.Name
1    140629862          349        BROADWAY    Manhattan
2    140638139           71 FRANKLIN STREET    Manhattan
3 M00432484-I1          250   CHURCH STREET    Manhattan
4 M00554934-I1          291        BROADWAY    Manhattan
5 M00592917-I1           58    READE STREET    Manhattan


In [22]:
%%R 

df_with_predictions %>%
  filter(prediction == 1, active_shed == 1) %>%       
  arrange(-prediction_proba) %>%                      
  select(CNSTRCT_YR, GROUNDELEV, prediction_proba, Age..in.years., active_shed_licenses, Job.Number, House.Number, Street.Name, Borough.Name) %>%
  head(5) 

  CNSTRCT_YR GROUNDELEV prediction_proba Age..in.years. active_shed_licenses
1       2022         40        0.9999998      9.2657534                    8
2       2022         40        0.9999998      5.1095890                    8
3       1932         15        0.9999925      2.5205479                   23
4       1930         31        0.9999910      0.9123288                   23
5       1913         33        0.9999872      1.1753425                   28
    Job.Number House.Number    Street.Name Borough.Name
1    321298387            9 DE KALB AVENUE     Brooklyn
2 B00326128-I1            9  DEKALB AVENUE     Brooklyn
3 M08011299-I1           70    PINE STREET    Manhattan
4 M01053567-I1           40    WALL STREET    Manhattan
5 M01003639-I1          233       BROADWAY    Manhattan


In [23]:
%%R 

conf_mat <- table(df_with_predictions$prediction, df_with_predictions$active_shed)
print(conf_mat)

# Extract values from confusion matrix
TP <- conf_mat["1", "1"]
FP <- conf_mat["1", "0"]
FN <- conf_mat["0", "1"]

# Precision and recall calculations
precision <- TP / (TP + FP) * 100
recall <- TP / (TP + FN) * 100

# Print them
cat("Precision:", round(precision, 3), "\n")
cat("Recall:", round(recall, 3), "\n")

   
          0       1
  0 1059363    5677
  1    5501    2006
Precision: 26.722 
Recall: 26.11 


In [24]:
%%R 
f1 <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", round(f1, 3), "\n")

F1 Score: 26.412 
