# Merge Census Data

## Setup Python and R environment

In [1]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# My commonly used R imports

require('tidyverse')

R[write to console]: Loading required package: tidyverse



── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [4]:
# See all columns
pd.set_option('display.max_columns', None)

## Load & Clean Data

👉 Load the data along with the census connectors below (the output of the `connect-to-census.ipynb` notebook) and do any cleanup you'd like to do.

### Load Data:

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("Active_Sheds.csv")
df.head()

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,5468,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,5342,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,5329,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,5069,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,4810,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts


In [7]:
%pip install requests-cache

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import csv
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import requests_cache

# Initialize the cache for geocoding API to avoid hitting the API repeatedly
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

# Geocode function that retrieves census geography data based on latitude and longitude
def geocode(lat, lng):
    try:
        url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
        params = {
            "x": lng,
            "y": lat,
            "benchmark": "Public_AR_Census2020",
            "vintage": "Census2020_Census2020",
            "format": "json"
        }
        response = cache.get(url, params=params)
        response.raise_for_status()  # Raise error for bad status codes
        data = response.json()
        
        census_data = data['result']['geographies']['Census Blocks'][0]
        
        # Extract required census geography fields
        return {
            "SUFFIX": census_data.get("SUFFIX", None),
            "POP100": census_data.get("POP100", None),
            "GEOID": census_data.get("GEOID", None),
            "CENTLAT": census_data.get("CENTLAT", None),
            "BLOCK": census_data.get("BLOCK", None),
            "AREAWATER": census_data.get("AREAWATER", None),
            "STATE": census_data.get("STATE", None),
            "BASENAME": census_data.get("BASENAME", None),
            "OID": census_data.get("OID", None),
            "LSADC": census_data.get("LSADC", None),
            "INTPTLAT": census_data.get("INTPTLAT", None),
            "FUNCSTAT": census_data.get("FUNCSTAT", None),
            "NAME": census_data.get("NAME", None),
            "OBJECTID": census_data.get("OBJECTID", None),
            "TRACT": census_data.get("TRACT", None),
            "CENTLON": census_data.get("CENTLON", None),
            "BLKGRP": census_data.get("BLKGRP", None),
            "AREALAND": census_data.get("AREALAND", None),
            "HU100": census_data.get("HU100", None),
            "INTPTLON": census_data.get("INTPTLON", None),
            "MTFCC": census_data.get("MTFCC", None),
            "LWBLKTYP": census_data.get("LWBLKTYP", None),
            "UR": census_data.get("UR", None),
            "COUNTY": census_data.get("COUNTY", None),
        }
    except Exception as e:
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

# Function to geocode data in chunks (to avoid consuming too much RAM)
def geocode_in_chunks(df, chunk_size=100):
    header = ['SUFFIX', 'POP100', 'GEOID', 'CENTLAT', 'BLOCK', 'AREAWATER', 'STATE', 'BASENAME', 'OID', 
              'LSADC', 'INTPTLAT', 'FUNCSTAT', 'NAME', 'OBJECTID', 'TRACT', 'CENTLON', 'BLKGRP', 'AREALAND', 
              'HU100', 'INTPTLON', 'MTFCC', 'LWBLKTYP', 'UR', 'COUNTY']
    
    with open('censusgeos.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()

        # Process the dataframe in chunks
        for i in tqdm(range(0, len(df), chunk_size), desc="Processing chunks"):
            chunk = df.iloc[i:i + chunk_size]
            latitudes = chunk['Latitude Point']
            longitudes = chunk['Longitude Point']

            # Geocode the chunk in parallel
            with ThreadPoolExecutor() as executor:
                results = list(executor.map(geocode, latitudes, longitudes))

            # Write the results for this chunk to the CSV file
            for result in results:
                if result:
                    writer.writerow(result)

# Load your dataset
df = pd.read_csv("Active_Sheds.csv")

# Ensure 'Latitude Point' and 'Longitude Point' are numeric
df['Latitude Point'] = pd.to_numeric(df['Latitude Point'], errors='coerce')
df['Longitude Point'] = pd.to_numeric(df['Longitude Point'], errors='coerce')

# Handle any missing or invalid lat/long by dropping those rows
df = df.dropna(subset=['Latitude Point', 'Longitude Point'])

# Call the geocoding function to process the data in chunks
geocode_in_chunks(df, chunk_size=100)  # Adjust chunk_size as needed

# Optionally, read the results from the CSV after processing
census_geos_df = pd.read_csv('censusgeos.csv')

# Ensure that the numeric columns like STATE, COUNTY, and TRACT are treated as strings
census_geos_df['STATE'] = census_geos_df['STATE'].astype(str)
census_geos_df['COUNTY'] = census_geos_df['COUNTY'].astype(str)
census_geos_df['TRACT'] = census_geos_df['TRACT'].astype(str)
census_geos_df['BLOCK'] = census_geos_df['BLOCK'].astype(str)

# Select only the columns you want to keep
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df[to_keep]

# Concatenate the geocoded data with the original dataframe
df_with_geos = pd.concat(
    [ 
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Processing chunks:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,5468,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,5342,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,5329,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,5069,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,4810,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001


In [9]:
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df[to_keep]
census_geos_df

Unnamed: 0,GEOID,STATE,COUNTY,TRACT,BLOCK
0,360610154023000,36,61,15402,3000
1,360610050002000,36,61,5000,2000
2,360610028003001,36,61,2800,3001
3,360610115002002,36,61,11500,2002
4,360610089004001,36,61,8900,4001
...,...,...,...,...,...
8472,360610277001000,36,61,27700,1000
8473,360610161001000,36,61,16100,1000
8474,360610002021002,36,61,202,1002
8475,360470033001001,36,47,3300,1001


In [10]:
df_with_geos = pd.concat(
    [ 
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,5468,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,5342,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,5329,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,5069,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,4810,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001


### Clean Data: 

In [11]:
# Convert 'Age (in days)' to age in years (ignoring leap years)
df_with_geos['Age'] = df_with_geos['Age'] / 365
df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.600000,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8472,103652192,Manhattan,0.0,2019-09-19,2025-05-03,5.621918,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610277001000,36,61,27700,1000
8473,104213637,Manhattan,0.0,2020-08-28,2025-05-03,4.679452,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610161001000,36,61,16100,1000
8474,104213682,Manhattan,0.0,2020-10-01,2025-05-03,4.586301,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610002021002,36,61,202,1002
8475,302585815,Brooklyn,0.0,2019-02-22,2025-05-03,6.194521,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,360470033001001,36,47,3300,1001


In [12]:
# Rename the 'Age' column to 'Age (in years)'
df_with_geos = df_with_geos.rename(columns={'Age': 'Age (in years)'})

df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age (in years),Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.600000,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8472,103652192,Manhattan,0.0,2019-09-19,2025-05-03,5.621918,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610277001000,36,61,27700,1000
8473,104213637,Manhattan,0.0,2020-08-28,2025-05-03,4.679452,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610161001000,36,61,16100,1000
8474,104213682,Manhattan,0.0,2020-10-01,2025-05-03,4.586301,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610002021002,36,61,202,1002
8475,302585815,Brooklyn,0.0,2019-02-22,2025-05-03,6.194521,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,360470033001001,36,47,3300,1001


In [13]:
df_with_geos['GEOID'] = \
    df_with_geos['STATE'] + \
    df_with_geos['COUNTY'].str.pad(width=3, side='left', fillchar='0') + \
    df_with_geos['TRACT'].str.pad(width=6, side='left', fillchar='0')

df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age (in years),Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061015402,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061005000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.600000,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061002800,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061011500,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,36061008900,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8472,103652192,Manhattan,0.0,2019-09-19,2025-05-03,5.621918,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061027700,36,61,27700,1000
8473,104213637,Manhattan,0.0,2020-08-28,2025-05-03,4.679452,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061016100,36,61,16100,1000
8474,104213682,Manhattan,0.0,2020-10-01,2025-05-03,4.586301,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061000202,36,61,202,1002
8475,302585815,Brooklyn,0.0,2019-02-22,2025-05-03,6.194521,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,36047003300,36,47,3300,1001


In [14]:
df_with_geos['Borough Name'].value_counts()

Borough Name
MANHATTAN        3815
BROOKLYN         2029
BRONX            1434
QUEENS           1050
STATEN ISLAND      90
Manhattan          33
Brooklyn           13
Bronx               8
Queens              3
Staten Island       2
Name: count, dtype: int64

In [15]:
# Standardize the 'Borough Name' column to proper case (first letter capitalized)
df_with_geos['Borough Name'] = df_with_geos['Borough Name'].replace({
    "MANHATTAN": "Manhattan",
    "BROOKLYN": "Brooklyn",
    "BRONX": "Bronx",
    "QUEENS": "Queens",
    "STATEN ISLAND": "Staten Island"
})

In [16]:
df_with_geos['Borough Name'].value_counts()

Borough Name
Manhattan        3848
Brooklyn         2042
Bronx            1442
Queens           1053
Staten Island      92
Name: count, dtype: int64

In [17]:
# BIN Number: Building Identification Number
# BIS: Building Information System
# SCA: School Construction Authority
# Permit-Entire	Permit Issued - Entire Job/Work

#### Current Job Status: R = Permit-Entire = Permit Issued - Entire Job/Work, but they use diff symbols depending whether the source is BIS or DOB NOW
#### , based on https://www.nyc.gov/site/buildings/industry/permit-type-and-job-status-codes.page

#### ProCert: The Department offers a Professional Certification (Pro Cert) Program which enables Professional Engineers (PE) and Registered Architects (RA) to certify that the plans they file with the Department are in compliance with all applicable laws. This reduces the amount of time a builder normally has to wait for a DOB permit by eliminating the process of Department plan examination and approval.

# Q: Would 1 refer to with certificate and 0 refer to no certificate? 

## 👉 Grab Census Data

1. loading the Census API key

In [18]:
# pip install dotenv

In [19]:
from dotenv import load_dotenv
load_dotenv() # <- searches for a file named .env and loads the environment variables in it

False

In [20]:
%%R 

require('tidycensus')

# because it an environment variable, we don't have to 
# explicitly pass this string to R, it is readable here
# in this R cell.
census_api_key(Sys.getenv("CENSUS_API_KEY"))

R[write to console]: Loading required package: tidycensus

R[write to console]: To install your API key for use in future sessions, run this function with `install = TRUE`.



2. Decide which Census variables you want

    Use <https://censusreporter.org/> to figure out which tables you want. (if censusreporter is down, check out the code in the cell below)

    -   Scroll to the bottom of the page to see the tables.
    -   If you already know the table ID, stick that in the "Explore" section to learn more about that table.

    By default this code loads (B01003_001) which we found in censusreporter here: https://censusreporter.org/tables/B01003/

    - find some other variables that you're also interested in
    - don't forget to pick a geography like "tract", "county" or "block group". here is the list of [all geographies](https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus
    ).


In [21]:
%%R 
require("tigris")

R[write to console]: Loading required package: tigris

R[write to console]: To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.



# Q: What are the related columns and how to deal with them?

In [22]:
%%R 

# long-form data: with only one numeric data

# the variable B01003_001 was selectd from the census table 
# for population, which we found in censusreporter here:
# https://censusreporter.org/tables/B01003/


# Here are the various geographies you can use with tidycensus
# https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus

# Get variable from ACS
nyc_census_data <- get_acs(geography = "tract", 
                      state='NY',
                      county = c("New York", "Kings", "Bronx", "Queens", "Richmond"),
                      variables = c(
                        population = "B01003_001",
                        occupied = "B25002_002", 
                        vacant = "B25002_003",
                        owner_occupied = "B25003_002",
                        renter_occupied = "B25003_003",
                        owner_income = "B25119_002",
                        renter_income = "B25119_003",
                        black_african = "B02009_001"
                      ), 
                      year = 2021,
                      survey="acs5",
                      geometry=F,
                      cb = T)

nyc_census_data <- nyc_census_data #%>% 
    #erase_water() 

R[write to console]: Getting data from the 2017-2021 5-year ACS

R[write to console]: Using FIPS code '36' for state 'NY'

R[write to console]: Using FIPS code '061' for 'New York County'

R[write to console]: Using FIPS code '047' for 'Kings County'

R[write to console]: Using FIPS code '005' for 'Bronx County'

R[write to console]: Using FIPS code '081' for 'Queens County'

R[write to console]: Using FIPS code '085' for 'Richmond County'



### Possible Related Variables:

#### From Housing: 
##### 1. population: B01003_001

##### 2. Occupancy Status - Total: B25002_001
##### 3. Occupancy Status - Occupied: B25002_002
##### 4. Occupancy Status - Vacant: B25002_003

##### 5. Tenure - Total: B25003_001
##### 6. Tenure - Owner occupied: B25003_002
##### 7. Tenure - Renter occupied: B25003_003

##### (8. Median Year Structure Built by Tenure - Total: B25037_001
##### 9. Median Year Structure Built by Tenure - Owner occupied: B25037_002
##### 10. Median Year Structure Built by Tenure - Renter occupied: B25037_003)

##### 11. Median Household Income by Tenure - Total: B25119_001
##### 12. Median Household Income by Tenure - Owner occupied: B25119_002
##### 13. Median Household Income by Tenure - Renter occupied: B25119_003

##### 14. Value??

#### From Income & Earnings: 

In [23]:
%%R 
nyc_census_data

# A tibble: 18,616 × 5
   GEOID       NAME                                   variable    estimate   moe
   <chr>       <chr>                                  <chr>          <dbl> <dbl>
 1 36005000100 Census Tract 1, Bronx County, New York population      6661   702
 2 36005000100 Census Tract 1, Bronx County, New York black_afri…     3346   468
 3 36005000100 Census Tract 1, Bronx County, New York occupied           0    18
 4 36005000100 Census Tract 1, Bronx County, New York vacant             0    18
 5 36005000100 Census Tract 1, Bronx County, New York owner_occu…        0    18
 6 36005000100 Census Tract 1, Bronx County, New York renter_occ…        0    18
 7 36005000100 Census Tract 1, Bronx County, New York owner_inco…       NA    NA
 8 36005000100 Census Tract 1, Bronx County, New York renter_inc…       NA    NA
 9 36005000200 Census Tract 2, Bronx County, New York population      4453   563
10 36005000200 Census Tract 2, Bronx County, New York black_afri…     1450   546
# ℹ 1

In [24]:
%%R 

# pivot from wide to long
nyc_census_data <- nyc_census_data %>% 
  pivot_wider(
    names_from=variable, 
    values_from = c(estimate, moe),
    names_glue = "{variable}_{.value}"
  ) 

nyc_census_data

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
# ℹ 2

In [25]:
%%R 
nyc_census_data %>% print(n=20)

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
11 36

In [26]:
%%R -o nyc_census_data
nyc_census_data

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
# ℹ 2

## 👉 Merge it with your data

hint...`tidycensus` provides you data in long format you may need to pivot the census data from long to wide format before merging it with your data

In [27]:
df_with_geos.to_csv('df_with_geos.csv', index=False)

In [28]:
%%R
df_with_geos <- read.csv('df_with_geos.csv')

In [29]:
%%R
df_with_geos$GEOID <- as.character(df_with_geos$GEOID)
nyc_census_data$GEOID <- as.character(nyc_census_data$GEOID)

In [30]:
%%R
# Merge the data: 
library(dplyr)

merged_data <- left_join(df_with_geos, nyc_census_data, by = "GEOID")
merged_data

       Job.Number  Borough.Name Count.Permits First.Permit.Date Current.Date
1       120351662     Manhattan             0         5/13/2010     5/3/2025
2       120470409     Manhattan             0         9/16/2010     5/3/2025
3       120486633     Manhattan             0         9/29/2010     5/3/2025
4       120725705     Manhattan             0         6/16/2011     5/3/2025
5       120987236     Manhattan             0          3/1/2012     5/3/2025
6       121045127     Manhattan             0         4/23/2012     5/3/2025
7       121844941     Manhattan             0         12/5/2013     5/3/2025
8       122106078     Manhattan             0         8/19/2014     5/3/2025
9       122199236     Manhattan             0         12/9/2014     5/3/2025
10      122212131     Manhattan             0        12/16/2014     5/3/2025
11      122538995     Manhattan             0         12/7/2015     5/3/2025
12      122574570     Manhattan             0          1/6/2016     5/3/2025

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



   2025-05-03
1604 B01161144-I1      Brooklyn            NA        2025-01-17   2025-05-03
1605 B01161200-I1      Brooklyn            NA        2024-12-31   2025-05-03
1606 B01161370-I1      Brooklyn            NA        2024-12-31   2025-05-03
1607 B01161440-I1      Brooklyn            NA        2025-01-02   2025-05-03
1608 B01161962-I1      Brooklyn            NA        2025-01-06   2025-05-03
1609 B01161965-I1      Brooklyn            NA        2025-01-06   2025-05-03
1610 B01161991-I1      Brooklyn            NA        2025-01-03   2025-05-03
1611 B01162185-I1      Brooklyn            NA        2025-01-03   2025-05-03
1612 B01162268-I1      Brooklyn            NA        2025-01-30   2025-05-03
1613 B01162274-I1      Brooklyn            NA        2025-01-06   2025-05-03
1614 B01162278-I1      Brooklyn            NA        2025-01-30   2025-05-03
1615 B01162429-I1      Brooklyn            NA        2025-01-06   2025-05-03
1616 B01162516-I1      Brooklyn            NA        2025-01-0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



      Brooklyn            NA        2025-04-23   2025-05-03
1994 B01212089-I1      Brooklyn            NA        2025-04-22   2025-05-03
1995 B01212135-I1      Brooklyn            NA        2025-04-22   2025-05-03
1996 B01212138-I1      Brooklyn            NA        2025-04-22   2025-05-03
1997 B01212242-I1      Brooklyn            NA        2025-04-18   2025-05-03
1998 B01212249-I1      Brooklyn            NA        2025-04-17   2025-05-03
1999 B01212281-I1      Brooklyn            NA        2025-04-28   2025-05-03
2000 B01212622-I1      Brooklyn            NA        2025-04-21   2025-05-03
2001 B01212640-I1      Brooklyn            NA        2025-04-21   2025-05-03
2002 B01212746-I1      Brooklyn            NA        2025-04-21   2025-05-03
2003 B01212801-I1      Brooklyn            NA        2025-04-24   2025-05-03
2004 B01212965-I1      Brooklyn            NA        2025-04-21   2025-05-03
2005 B01213074-I1      Brooklyn            NA        2025-04-25   2025-05-03
2006 B01213288-I

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




1717    0.180821918             2025-05-25                     132.0
1718    0.197260274             2025-08-04                      35.0
1719    0.210958904             2025-06-02                      30.0
1720    0.216438356             2026-02-12                     176.0
1721    0.213698630             2026-01-26                      82.0
1722    0.200000000             2025-09-29                     143.0
1723    0.202739726             2025-07-07                     172.0
1724    0.041095890             2025-05-23                     200.0
1725    0.213698630             2026-01-04                     120.0
1726    0.175342466             2025-10-18                      82.0
1727    0.202739726             2025-08-04                     152.0
1728    0.191780822             2025-12-20                     235.0
1729    0.200000000             2025-08-20                      50.0
1730    0.175342466             2026-02-27                     326.0
1731    0.197260274             2

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




33              WOOD/STEEL                  R    2109600             203
34              STEEL/WOOD                  R    2010481             203
35              WOOD/STEEL                  R    2004795             201
36              WOOD/STEEL                  R    2012677             206
37              STEEL/WOOD                  R    2002893             204
38                  TIMBER                  R    3327720             309
39                  TIMBER                  R    3327719             309
40              WOOD/METAL                  R    3059929             301
41            WOOD & STEEL                  R    3000370             302
42          STEEL AND WOOD                  R    3338422             301
43          STEEL AND WOOD                  R    3071200             301
44          STEEL AND WOOD                  R    3338423             301
45              STEEL/WOOD                  R    3120574             314
46              STEEL/WOOD                  R    3

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



             302
1737                            Permit Entire    3014872             307
1738                            Permit Entire    3006986             306
1739                            Permit Entire    3332257             303
1740                            Permit Entire    3063710             301
1741                            Permit Entire    3400816             304
1742                            Permit Entire    3054750             303
1743                            Permit Entire    3074262             304
1744                            Permit Entire    3330672             309
1745                            Permit Entire    3061857             301
1746                            Permit Entire    3062778             301
1747                            Permit Entire    3061847             301
1748                            Permit Entire    3391186             302
1749                            Permit Entire    3326949             316
1750                            Pe

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




1381       40.72574       -73.95324         1005            LORIMER STREET
1382       40.69978       -73.92099          196           STANHOPE STREET
1383       40.67297       -73.95688          753           FRANKLIN AVENUE
1384       40.71296       -73.94087           67           BUSHWICK AVENUE
1385       40.69278       -73.90983          265          WEIRFIELD STREET
1386       40.58520       -73.93451         2800              COYLE STREET
1387       40.67766       -73.99955          147             LUQUER STREET
1388       40.65514       -73.95944          733           FLATBUSH AVENUE
1389       40.67167       -73.90927         1550      EAST NEW YORK AVENUE
1390       40.67167       -73.90927           40           GLENMORE AVENUE
1391       40.67167       -73.90927         1570      EAST NEW YORK AVENUE
1392       40.67167       -73.90927         1590      EAST NEW YORK AVENUE
1393       40.67167       -73.90927          300   MOTHER GASTON BOULEVARD
1394       40.71014     

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



           MJE PROFESSIONAL SERVICES       1
1002             3  7584   35        RIGID STRUCTURAL DESIGN  LLC       1
1003             3  5611   22        PAUL PERDEK  PROF. ENG. PLLC       1
1004             3    35 7501                                  ..       1
1005             3  4165   39      MSM ENGINEERING SERVICES  PLLC       1
1006             3   351   34                    LAWSON CHRISTIAN       1
1007             3  7224  128    STAVROS I. MALLIAROS  P.E.  P.C.       1
1008             3  2840    2        RIGID STRUCTURAL DESIGN  LLC       1
1009             3  2393   25                  ASR ENGINEERING PC       1
1010             3  3119    6                  ASR ENGINEERING PC       1
1011             3  7560    1                                  ..       1
1012             3  1822   55                                  ..       1
1013             3  1007  172       BLACK SHEEP ENGINEERING GROUP       1
1014             3  7625    1      DSENY ENGINEERING SERVICES  PC  

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



      Other Zoning Districts
692  DOB NOW Construction or Maintenance Commercial District/Overlay
693  DOB NOW Construction or Maintenance      Other Zoning Districts
694  DOB NOW Construction or Maintenance      Other Zoning Districts
695  DOB NOW Construction or Maintenance      Other Zoning Districts
696  DOB NOW                Local Law 11      Other Zoning Districts
697  DOB NOW                Local Law 11      Other Zoning Districts
698  DOB NOW Construction or Maintenance Commercial District/Overlay
699  DOB NOW Construction or Maintenance      Other Zoning Districts
700  DOB NOW Construction or Maintenance      Other Zoning Districts
701  DOB NOW Construction or Maintenance      Other Zoning Districts
702  DOB NOW Construction or Maintenance      Other Zoning Districts
703  DOB NOW Construction or Maintenance      Other Zoning Districts
704  DOB NOW                Local Law 11 Commercial District/Overlay
705  DOB NOW                Local Law 11 Commercial District/Overlay
706  

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 Construction or Maintenance      Other Zoning Districts
1359 DOB NOW Construction or Maintenance Commercial District/Overlay
1360 DOB NOW Construction or Maintenance      Other Zoning Districts
1361 DOB NOW Construction or Maintenance      Other Zoning Districts
1362 DOB NOW Construction or Maintenance Commercial District/Overlay
1363 DOB NOW Construction or Maintenance      Other Zoning Districts
1364 DOB NOW Construction or Maintenance      Other Zoning Districts
1365 DOB NOW Construction or Maintenance      Other Zoning Districts
1366 DOB NOW Construction or Maintenance Commercial District/Overlay
1367 DOB NOW Construction or Maintenance      Other Zoning Districts
1368 DOB NOW Construction or Maintenance      Other Zoning Districts
1369 DOB NOW Construction or Maintenance      Other Zoning Districts
1370 DOB NOW Construction or Maintenance      Other Zoning Districts
1371 DOB NOW Construction or Maintenance Commercial District/Overlay
1372 DOB NOW Construction or Maintenance Comme

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  2002
675  36047022800    36     47  22800  3002
676  36047028700    36     47  28700  2000
677  36047024600    36     47  24600  3000
678  36047118400    36     47 118400  2000
679  36047026100    36     47  26100  4002
680  36047043900    36     47  43900  1006
681  36047011901    36     47  11901  2018
682  36047008200    36     47   8200  1001
683  36047020100    36     47  20100  1001
684  36047119000    36     47 119000  1000
685  36047006400    36     47   6400  1000
686  36047074000    36     47  74000  1002
687  36047048000    36     47  48000  1003
688  36047092400    36     47  92400  1002
689  36047034700    36     47  34700  3001
690  36047034700    36     47  34700  3001
691  36047034700    36     47  34700  3001
692  36047090000    36     47  90000  5000
693  36047034700    36     47  34700  3001
694  36047034901    36     47  34901  1000
695  36047034901    36     47  34901  1000
696  36047088200    36     47  88200  1001
697  36047041900    36     47  41900  1001
698 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



       Census Tract 806, Kings County, New York                3466
622        Census Tract 882, Kings County, New York                6411
623        Census Tract 333, Kings County, New York                4578
624        Census Tract 222, Kings County, New York                4034
625        Census Tract 177, Kings County, New York                   0
626        Census Tract 485, Kings County, New York                2329
627        Census Tract 221, Kings County, New York                4058
628        Census Tract 571, Kings County, New York                4036
629        Census Tract 764, Kings County, New York                4230
630        Census Tract 772, Kings County, New York                3560
631     Census Tract 563.01, Kings County, New York                4387
632        Census Tract 253, Kings County, New York                4003
633        Census Tract 253, Kings County, New York                4003
634         Census Tract 66, Kings County, New York                3

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



              2231             129
1350                    105              1509             157
1351                   2523              1932             237
1352                   4048              1375             132
1353                    532              1239             131
1354                    274              1262              48
1355                    532              1239             131
1356                      9               714             101
1357                    392              2212             100
1358                    357              1292             151
1359                   2712              2152             162
1360                   1975              1684             182
1361                    203              1318             151
1362                   3913              1660             231
1363                    574              1769             146
1364                    712              1928             369
1365                   1590        

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




1785                    189              1778             197
1786                    235              2469             120
1787                    429              1039              44
1788                    101              2266             164
1789                   1836              1236             107
1790                   3140              1326             155
1791                    187              1363              92
1792                   1384              1961              92
1793                   2039              1506              82
1794                    222              1186             284
1795                    153              1810             172
1796                    366               697             122
1797                    266              1128             179
1798                    354              1196             124
1799                   3632              1865              64
1800                   1473              1162             129
1801   

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                     459                     1042                152083
1832                     361                      477                 83558
1833                     427                     1060                184688
1834                    1265                     1247                 68644
1835                      39                      527                249583
1836                     199                     2653                    NA
1837                     419                     1374                164279
1838                     288                      572                226250
1839                     281                     1606                148819
1840                    1005                     1357                186979
1841                     553                     1081                142321
1842                     177                     1027                    NA
1843                     310                      911                 96094
1844            

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                  51197           1024               210          143
207                   53886            664               239          339
208                   49655            456               465          120
209                  120975            404               197          143
210                  120975            404               197          143
211                  104135            388               349          119
212                   34946            651               229          155
213                   41875            179                13           52
214                   68533            539                88          189
215                   68533            539                88          189
216                  128772            404               239           87
217                   54420            557               496          165
218                  121354            529               167          258
219                   23657           1221

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




2141                 130625            368               158          158
2142                  83643            132                15          121
2143                 130239            767               349          431
2144                 124231           1669               190          801
2145                  93824            974               631          381
2146                  85017           1286               221          581
2147                 139432            566               158          423
2148                     NA           2718               596          747
2149                 198539            993               156          424
2150                 172548            322                48          178
2151                 102006            345                94          179
2152                 191097            728               235          284
2153                 198539            993               156          424
2154                  52500          

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




2075         16                109                 249            60083
2076         69                109                 127            39607
2077        207                265                 256               NA
2078        116                177                 241            61048
2079        207                265                 256               NA
2080         64                119                 137            17440
2081         75                 90                 249            72940
2082        174                196                 329            17406
2083        264                198                 374            33232
2084        256                480                 209               NA
2085        217                248                 183           127806
2086        242                171                 318            40318
2087        486                746                 716            72377
2088        207                 98                 636         

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [31]:
%%R
head(merged_data)

  Job.Number Borough.Name Count.Permits First.Permit.Date Current.Date
1  120351662    Manhattan             0         5/13/2010     5/3/2025
2  120470409    Manhattan             0         9/16/2010     5/3/2025
3  120486633    Manhattan             0         9/29/2010     5/3/2025
4  120725705    Manhattan             0         6/16/2011     5/3/2025
5  120987236    Manhattan             0          3/1/2012     5/3/2025
6  121045127    Manhattan             0         4/23/2012     5/3/2025
  Age..in.years. Permit.Expiration.Date Sidewalk.Shed.Linear.Feet
1       14.98082              4/22/2026                        56
2       14.63562              6/17/2025                        65
3       14.60000              9/20/2025                       100
4       13.88767              3/11/2026                       314
5       13.17808              6/18/2025                        77
6       13.03288              1/27/2026                       339
  Construction.Material Current.Job.Statu

In [32]:
%%R
# Save the merged data as a CSV file:
write.csv(merged_data, "merged_data.csv", row.names = FALSE)

In [33]:
merged_data_python = pd.read_csv("merged_data.csv")
merged_data_python.head()

Unnamed: 0,Job.Number,Borough.Name,Count.Permits,First.Permit.Date,Current.Date,Age..in.years.,Permit.Expiration.Date,Sidewalk.Shed.Linear.Feet,Construction.Material,Current.Job.Status,BIN.Number,Community.Board,Latitude.Point,Longitude.Point,House.Number,Street.Name,Borough.Digit,Block,Lot,Applicant.Business.Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK,NAME,population_estimate,black_african_estimate,occupied_estimate,vacant_estimate,owner_occupied_estimate,renter_occupied_estimate,owner_income_estimate,renter_income_estimate,population_moe,black_african_moe,occupied_moe,vacant_moe,owner_occupied_moe,renter_occupied_moe,owner_income_moe,renter_income_moe
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061015402,36,61,15402,3000,"Census Tract 154.02, New York County, New York",3579,931,1681,427,659,1022,212008.0,103182.0,1259,1007,314,227,235,206,127402.0,28807.0
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061005000,36,61,5000,2000,"Census Tract 50, New York County, New York",5042,70,2852,622,1424,1428,195956.0,138364.0,814,74,299,246,309,222,72736.0,33353.0
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.6,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061002800,36,61,2800,3001,"Census Tract 28, New York County, New York",6483,1146,3073,303,482,2591,156071.0,44028.0,901,639,306,168,162,321,73124.0,12781.0
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061011500,36,61,11500,2002,"Census Tract 115, New York County, New York",3367,409,1750,565,239,1511,217356.0,134427.0,701,229,194,163,89,189,85738.0,48688.0
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,36061008900,36,61,8900,4001,"Census Tract 89, New York County, New York",5844,306,3095,603,974,2121,158555.0,88665.0,620,231,345,206,224,344,23669.0,37651.0


In [34]:
print(len(merged_data_python))
print(len(merged_data_python.query("population_estimate.isna()")))

8477
0
