# Merge Census Data

## Setup Python and R environment

In [1]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# My commonly used R imports

require('tidyverse')

R[write to console]: Loading required package: tidyverse



── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [4]:
# See all columns
pd.set_option('display.max_columns', None)

## Load & Clean Data

👉 Load the data along with the census connectors below (the output of the `connect-to-census.ipynb` notebook) and do any cleanup you'd like to do.

### Load Data:

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("Active_Sheds.csv")
df.head()

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial
0,120351662,Manhattan,0.0,5/13/2010,4/19/2025,5454,5/22/2025,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay
1,120470409,Manhattan,0.0,9/16/2010,4/19/2025,5328,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts
2,120486633,Manhattan,0.0,9/29/2010,4/19/2025,5315,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts
3,120725705,Manhattan,0.0,6/16/2011,4/19/2025,5055,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay
4,120987236,Manhattan,0.0,3/1/2012,4/19/2025,4796,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts


In [7]:
%pip install requests-cache

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import csv
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import requests_cache

# Initialize the cache for geocoding API to avoid hitting the API repeatedly
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

# Geocode function that retrieves census geography data based on latitude and longitude
def geocode(lat, lng):
    try:
        url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
        params = {
            "x": lng,
            "y": lat,
            "benchmark": "Public_AR_Census2020",
            "vintage": "Census2020_Census2020",
            "format": "json"
        }
        response = cache.get(url, params=params)
        response.raise_for_status()  # Raise error for bad status codes
        data = response.json()
        
        census_data = data['result']['geographies']['Census Blocks'][0]
        
        # Extract required census geography fields
        return {
            "SUFFIX": census_data.get("SUFFIX", None),
            "POP100": census_data.get("POP100", None),
            "GEOID": census_data.get("GEOID", None),
            "CENTLAT": census_data.get("CENTLAT", None),
            "BLOCK": census_data.get("BLOCK", None),
            "AREAWATER": census_data.get("AREAWATER", None),
            "STATE": census_data.get("STATE", None),
            "BASENAME": census_data.get("BASENAME", None),
            "OID": census_data.get("OID", None),
            "LSADC": census_data.get("LSADC", None),
            "INTPTLAT": census_data.get("INTPTLAT", None),
            "FUNCSTAT": census_data.get("FUNCSTAT", None),
            "NAME": census_data.get("NAME", None),
            "OBJECTID": census_data.get("OBJECTID", None),
            "TRACT": census_data.get("TRACT", None),
            "CENTLON": census_data.get("CENTLON", None),
            "BLKGRP": census_data.get("BLKGRP", None),
            "AREALAND": census_data.get("AREALAND", None),
            "HU100": census_data.get("HU100", None),
            "INTPTLON": census_data.get("INTPTLON", None),
            "MTFCC": census_data.get("MTFCC", None),
            "LWBLKTYP": census_data.get("LWBLKTYP", None),
            "UR": census_data.get("UR", None),
            "COUNTY": census_data.get("COUNTY", None),
        }
    except Exception as e:
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

# Function to geocode data in chunks (to avoid consuming too much RAM)
def geocode_in_chunks(df, chunk_size=100):
    header = ['SUFFIX', 'POP100', 'GEOID', 'CENTLAT', 'BLOCK', 'AREAWATER', 'STATE', 'BASENAME', 'OID', 
              'LSADC', 'INTPTLAT', 'FUNCSTAT', 'NAME', 'OBJECTID', 'TRACT', 'CENTLON', 'BLKGRP', 'AREALAND', 
              'HU100', 'INTPTLON', 'MTFCC', 'LWBLKTYP', 'UR', 'COUNTY']
    
    with open('censusgeos.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()

        # Process the dataframe in chunks
        for i in tqdm(range(0, len(df), chunk_size), desc="Processing chunks"):
            chunk = df.iloc[i:i + chunk_size]
            latitudes = chunk['Latitude Point']
            longitudes = chunk['Longitude Point']

            # Geocode the chunk in parallel
            with ThreadPoolExecutor() as executor:
                results = list(executor.map(geocode, latitudes, longitudes))

            # Write the results for this chunk to the CSV file
            for result in results:
                if result:
                    writer.writerow(result)

# Load your dataset
df = pd.read_csv("Active_Sheds.csv")

# Ensure 'Latitude Point' and 'Longitude Point' are numeric
df['Latitude Point'] = pd.to_numeric(df['Latitude Point'], errors='coerce')
df['Longitude Point'] = pd.to_numeric(df['Longitude Point'], errors='coerce')

# Handle any missing or invalid lat/long by dropping those rows
df = df.dropna(subset=['Latitude Point', 'Longitude Point'])

# Call the geocoding function to process the data in chunks
geocode_in_chunks(df, chunk_size=100)  # Adjust chunk_size as needed

# Optionally, read the results from the CSV after processing
census_geos_df = pd.read_csv('censusgeos.csv')

# Ensure that the numeric columns like STATE, COUNTY, and TRACT are treated as strings
census_geos_df['STATE'] = census_geos_df['STATE'].astype(str)
census_geos_df['COUNTY'] = census_geos_df['COUNTY'].astype(str)
census_geos_df['TRACT'] = census_geos_df['TRACT'].astype(str)
census_geos_df['BLOCK'] = census_geos_df['BLOCK'].astype(str)

# Select only the columns you want to keep
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df[to_keep]

# Concatenate the geocoded data with the original dataframe
df_with_geos = pd.concat(
    [ 
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Processing chunks:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,4/19/2025,5454,5/22/2025,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,4/19/2025,5328,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,4/19/2025,5315,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,4/19/2025,5055,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,4/19/2025,4796,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001


In [9]:
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df[to_keep]
census_geos_df

Unnamed: 0,GEOID,STATE,COUNTY,TRACT,BLOCK
0,360610154023000,36,61,15402,3000
1,360610050002000,36,61,5000,2000
2,360610028003001,36,61,2800,3001
3,360610115002002,36,61,11500,2002
4,360610089004001,36,61,8900,4001
...,...,...,...,...,...
8463,360610277001000,36,61,27700,1000
8464,360610161001000,36,61,16100,1000
8465,360610002021002,36,61,202,1002
8466,360470033001001,36,47,3300,1001


In [10]:
df_with_geos = pd.concat(
    [ 
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,4/19/2025,5454,5/22/2025,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,4/19/2025,5328,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,4/19/2025,5315,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,4/19/2025,5055,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,4/19/2025,4796,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001


### Clean Data: 

In [11]:
# Convert 'Age (in days)' to age in years (ignoring leap years)
df_with_geos['Age'] = df_with_geos['Age'] / 365
df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,4/19/2025,14.942466,5/22/2025,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,4/19/2025,14.597260,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,4/19/2025,14.561644,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,4/19/2025,13.849315,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,4/19/2025,13.139726,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,103652192,Manhattan,0.0,2019-09-19,2025-04-19,5.583562,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610277001000,36,61,27700,1000
8464,104213637,Manhattan,0.0,2020-08-28,2025-04-19,4.641096,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610161001000,36,61,16100,1000
8465,104213682,Manhattan,0.0,2020-10-01,2025-04-19,4.547945,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610002021002,36,61,202,1002
8466,302585815,Brooklyn,0.0,2019-02-22,2025-04-19,6.156164,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,360470033001001,36,47,3300,1001


In [12]:
# Rename the 'Age' column to 'Age (in years)'
df_with_geos = df_with_geos.rename(columns={'Age': 'Age (in years)'})

df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age (in years),Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,4/19/2025,14.942466,5/22/2025,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,4/19/2025,14.597260,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,4/19/2025,14.561644,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,4/19/2025,13.849315,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,4/19/2025,13.139726,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,103652192,Manhattan,0.0,2019-09-19,2025-04-19,5.583562,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610277001000,36,61,27700,1000
8464,104213637,Manhattan,0.0,2020-08-28,2025-04-19,4.641096,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610161001000,36,61,16100,1000
8465,104213682,Manhattan,0.0,2020-10-01,2025-04-19,4.547945,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610002021002,36,61,202,1002
8466,302585815,Brooklyn,0.0,2019-02-22,2025-04-19,6.156164,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,360470033001001,36,47,3300,1001


In [13]:
df_with_geos['GEOID'] = \
    df_with_geos['STATE'] + \
    df_with_geos['COUNTY'].str.pad(width=3, side='left', fillchar='0') + \
    df_with_geos['TRACT'].str.pad(width=6, side='left', fillchar='0')

df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age (in years),Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,4/19/2025,14.942466,5/22/2025,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061015402,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,4/19/2025,14.597260,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061005000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,4/19/2025,14.561644,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061002800,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,4/19/2025,13.849315,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061011500,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,4/19/2025,13.139726,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,36061008900,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,103652192,Manhattan,0.0,2019-09-19,2025-04-19,5.583562,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061027700,36,61,27700,1000
8464,104213637,Manhattan,0.0,2020-08-28,2025-04-19,4.641096,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061016100,36,61,16100,1000
8465,104213682,Manhattan,0.0,2020-10-01,2025-04-19,4.547945,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061000202,36,61,202,1002
8466,302585815,Brooklyn,0.0,2019-02-22,2025-04-19,6.156164,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,36047003300,36,47,3300,1001


In [14]:
df_with_geos['Borough Name'].value_counts()

Borough Name
MANHATTAN        3815
BROOKLYN         2038
BRONX            1425
QUEENS           1038
STATEN ISLAND      91
Manhattan          35
Brooklyn           13
Bronx               8
Queens              3
Staten Island       2
Name: count, dtype: int64

In [15]:
# Standardize the 'Borough Name' column to proper case (first letter capitalized)
df_with_geos['Borough Name'] = df_with_geos['Borough Name'].replace({
    "MANHATTAN": "Manhattan",
    "BROOKLYN": "Brooklyn",
    "BRONX": "Bronx",
    "QUEENS": "Queens",
    "STATEN ISLAND": "Staten Island"
})

In [16]:
df_with_geos['Borough Name'].value_counts()

Borough Name
Manhattan        3850
Brooklyn         2051
Bronx            1433
Queens           1041
Staten Island      93
Name: count, dtype: int64

In [17]:
# BIN Number: Building Identification Number
# BIS: Building Information System
# SCA: School Construction Authority
# Permit-Entire	Permit Issued - Entire Job/Work

#### Current Job Status: R = Permit-Entire = Permit Issued - Entire Job/Work, but they use diff symbols depending whether the source is BIS or DOB NOW
#### , based on https://www.nyc.gov/site/buildings/industry/permit-type-and-job-status-codes.page

#### ProCert: The Department offers a Professional Certification (Pro Cert) Program which enables Professional Engineers (PE) and Registered Architects (RA) to certify that the plans they file with the Department are in compliance with all applicable laws. This reduces the amount of time a builder normally has to wait for a DOB permit by eliminating the process of Department plan examination and approval.

# Q: Would 1 refer to with certificate and 0 refer to no certificate? 

## 👉 Grab Census Data

1. loading the Census API key

In [18]:
# pip install dotenv

In [19]:
from dotenv import load_dotenv
load_dotenv() # <- searches for a file named .env and loads the environment variables in it

True

In [20]:
%%R 

require('tidycensus')

# because it an environment variable, we don't have to 
# explicitly pass this string to R, it is readable here
# in this R cell.
census_api_key(Sys.getenv("CENSUS_API_KEY"))

R[write to console]: Loading required package: tidycensus

R[write to console]: To install your API key for use in future sessions, run this function with `install = TRUE`.



2. Decide which Census variables you want

    Use <https://censusreporter.org/> to figure out which tables you want. (if censusreporter is down, check out the code in the cell below)

    -   Scroll to the bottom of the page to see the tables.
    -   If you already know the table ID, stick that in the "Explore" section to learn more about that table.

    By default this code loads (B01003_001) which we found in censusreporter here: https://censusreporter.org/tables/B01003/

    - find some other variables that you're also interested in
    - don't forget to pick a geography like "tract", "county" or "block group". here is the list of [all geographies](https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus
    ).


In [21]:
%%R 
require("tigris")

R[write to console]: Loading required package: tigris

R[write to console]: To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.



# Q: What are the related columns and how to deal with them?

In [22]:
%%R 

# long-form data: with only one numeric data

# the variable B01003_001 was selectd from the census table 
# for population, which we found in censusreporter here:
# https://censusreporter.org/tables/B01003/


# Here are the various geographies you can use with tidycensus
# https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus

# Get variable from ACS
nyc_census_data <- get_acs(geography = "tract", 
                      state='NY',
                      county = c("New York", "Kings", "Bronx", "Queens", "Richmond"),
                      variables = c(
                        population = "B01003_001",
                        occupied = "B25002_002", 
                        vacant = "B25002_003",
                        owner_occupied = "B25003_002",
                        renter_occupied = "B25003_003",
                        owner_income = "B25119_002",
                        renter_income = "B25119_003",
                        black_african = "B02009_001"
                      ), 
                      year = 2021,
                      survey="acs5",
                      geometry=F,
                      cb = T)

nyc_census_data <- nyc_census_data #%>% 
    #erase_water() 

R[write to console]: Getting data from the 2017-2021 5-year ACS

R[write to console]: Using FIPS code '36' for state 'NY'

R[write to console]: Using FIPS code '061' for 'New York County'

R[write to console]: Using FIPS code '047' for 'Kings County'

R[write to console]: Using FIPS code '005' for 'Bronx County'

R[write to console]: Using FIPS code '081' for 'Queens County'

R[write to console]: Using FIPS code '085' for 'Richmond County'



### Possible Related Variables:

#### From Housing: 
##### 1. population: B01003_001

##### 2. Occupancy Status - Total: B25002_001
##### 3. Occupancy Status - Occupied: B25002_002
##### 4. Occupancy Status - Vacant: B25002_003

##### 5. Tenure - Total: B25003_001
##### 6. Tenure - Owner occupied: B25003_002
##### 7. Tenure - Renter occupied: B25003_003

##### (8. Median Year Structure Built by Tenure - Total: B25037_001
##### 9. Median Year Structure Built by Tenure - Owner occupied: B25037_002
##### 10. Median Year Structure Built by Tenure - Renter occupied: B25037_003)

##### 11. Median Household Income by Tenure - Total: B25119_001
##### 12. Median Household Income by Tenure - Owner occupied: B25119_002
##### 13. Median Household Income by Tenure - Renter occupied: B25119_003

##### 14. Value??

#### From Income & Earnings: 

In [23]:
%%R 
nyc_census_data

# A tibble: 18,616 × 5
   GEOID       NAME                                   variable    estimate   moe
   <chr>       <chr>                                  <chr>          <dbl> <dbl>
 1 36005000100 Census Tract 1, Bronx County, New York population      6661   702
 2 36005000100 Census Tract 1, Bronx County, New York black_afri…     3346   468
 3 36005000100 Census Tract 1, Bronx County, New York occupied           0    18
 4 36005000100 Census Tract 1, Bronx County, New York vacant             0    18
 5 36005000100 Census Tract 1, Bronx County, New York owner_occu…        0    18
 6 36005000100 Census Tract 1, Bronx County, New York renter_occ…        0    18
 7 36005000100 Census Tract 1, Bronx County, New York owner_inco…       NA    NA
 8 36005000100 Census Tract 1, Bronx County, New York renter_inc…       NA    NA
 9 36005000200 Census Tract 2, Bronx County, New York population      4453   563
10 36005000200 Census Tract 2, Bronx County, New York black_afri…     1450   546
# ℹ 1

In [24]:
%%R 

# pivot from long to wide
nyc_census_data <- nyc_census_data %>% 
  pivot_wider(
    names_from=variable, 
    values_from = c(estimate, moe),
    names_glue = "{variable}_{.value}"
  ) 

nyc_census_data

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
# ℹ 2

In [25]:
%%R 
nyc_census_data %>% print(n=20)

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
11 36

In [26]:
%%R -o nyc_census_data
nyc_census_data

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
# ℹ 2

## 👉 Merge it with your data

hint...`tidycensus` provides you data in long format you may need to pivot the census data from long to wide format before merging it with your data

In [27]:
df_with_geos.to_csv('df_with_geos.csv', index=False)

In [28]:
%%R
df_with_geos <- read.csv('df_with_geos.csv')

In [29]:
%%R
df_with_geos$GEOID <- as.character(df_with_geos$GEOID)
nyc_census_data$GEOID <- as.character(nyc_census_data$GEOID)

In [30]:
%%R
# Merge the data: 
library(dplyr)

merged_data <- left_join(df_with_geos, nyc_census_data, by = "GEOID")
merged_data

       Job.Number  Borough.Name Count.Permits First.Permit.Date Current.Date
1       120351662     Manhattan             0         5/13/2010    4/19/2025
2       120470409     Manhattan             0         9/16/2010    4/19/2025
3       120486633     Manhattan             0         9/29/2010    4/19/2025
4       120725705     Manhattan             0         6/16/2011    4/19/2025
5       120987236     Manhattan             0          3/1/2012    4/19/2025
6       121045127     Manhattan             0         4/23/2012    4/19/2025
7       121844941     Manhattan             0         12/5/2013    4/19/2025
8       122008781     Manhattan             0         6/19/2014    4/19/2025
9       122106078     Manhattan             0         8/19/2014    4/19/2025
10      122199236     Manhattan             0         12/9/2014    4/19/2025
11      122212131     Manhattan             0        12/16/2014    4/19/2025
12      122538995     Manhattan             0         12/7/2015    4/19/2025

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




1885 B01192558-I1      Brooklyn            NA        2025-03-17   2025-04-19
1886 B01192682-I1      Brooklyn            NA        2025-03-11   2025-04-19
1887 B01192790-I1      Brooklyn            NA        2025-03-11   2025-04-19
1888 B01193015-I1      Brooklyn            NA        2025-03-17   2025-04-19
1889 B01193031-I1      Brooklyn            NA        2025-03-17   2025-04-19
1890 B01193118-I1      Brooklyn            NA        2025-03-21   2025-04-19
1891 B01193155-I1      Brooklyn            NA        2025-03-12   2025-04-19
1892 B01193266-I1      Brooklyn            NA        2025-03-12   2025-04-19
1893 B01193283-I1      Brooklyn            NA        2025-03-12   2025-04-19
1894 B01193466-I1      Brooklyn            NA        2025-04-15   2025-04-19
1895 B01193551-I1      Brooklyn            NA        2025-03-13   2025-04-19
1896 B01193610-I1      Brooklyn            NA        2025-03-12   2025-04-19
1897 B01193613-I1      Brooklyn            NA        2025-03-12   2025-04-1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



             301
237                             Permit Entire    3024466             306
238                             Permit Entire    3044501             303
239                             Permit Entire    3032720             308
240                             Permit Entire    3065464             301
241                             Permit Entire    3053786             303
242                             Permit Entire    3824802             308
243                             Permit Entire    3319419             302
244                             Permit Entire    3335929             302
245                             Permit Entire    3138725             312
246                             Permit Entire    3203569             315
247                             Permit Entire    3429214             308
248                             Permit Entire    3155853             310
249                             Permit Entire    3047914             303
250                             Pe

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



             SIDNEY PLACE
365        40.57716       -73.95331           65       ORIENTAL BOULEVARD
366        40.73149       -73.95351          176              JAVA STREET
367        40.73167       -73.95918           32             INDIA STREET
368        40.66268       -73.94975          345          LEFFERTS AVENUE
369        40.63482       -73.95773         1111             OCEAN AVENUE
370        40.72596       -73.93859          296            NASSAU AVENUE
371        40.68392       -73.99012          378            BALTIC STREET
372        40.73531       -73.95575          129             EAGLE STREET
373        40.69234       -73.98466          111        WILLOUGHBY STREET
374        40.57674       -73.96194         3091     BRIGHTON    5 STREET
375        40.70226       -73.97277            1            DOCK   72 WAY
376        40.64267       -73.92487         5455            KINGS HIGHWAY
377        40.63954       -74.00004          950                49 STREET
378        4

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



       1
377              3  5638   19                    LAWSON CHRISTIAN       1
378              3  2538    1            G. BLINN CONSULTING  LLC       1
379              3  7456    6             ENTHINK ENGINEERING LLC       0
380              3   231   19                         ASHRAF CORP       1
381              3  3177   25                                  PR       1
382              3  3177   15                                  PR       1
383              3  3520   32                         ASHRAF CORP       1
384              3  3787    1                 IKRA CONSULTING LLC       1
385              3  3787    1                 IKRA CONSULTING LLC       1
386              3  7137    1                 IKRA CONSULTING LLC       1
387              3  7140   16                 IKRA CONSULTING LLC       1
388              3  7140   16                 IKRA CONSULTING LLC       1
389              3  7140   16                 IKRA CONSULTING LLC       1
390              3  7140   16

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




248  DOB NOW                Local Law 11 Commercial District/Overlay
249  DOB NOW Construction or Maintenance      Other Zoning Districts
250  DOB NOW Construction or Maintenance      Other Zoning Districts
251  DOB NOW                Local Law 11      Other Zoning Districts
252  DOB NOW                Local Law 11      Other Zoning Districts
253  DOB NOW Construction or Maintenance      Other Zoning Districts
254  DOB NOW Construction or Maintenance      Other Zoning Districts
255  DOB NOW Construction or Maintenance Commercial District/Overlay
256  DOB NOW Construction or Maintenance      Other Zoning Districts
257  DOB NOW Construction or Maintenance      Other Zoning Districts
258  DOB NOW Construction or Maintenance Commercial District/Overlay
259  DOB NOW                Local Law 11      Other Zoning Districts
260  DOB NOW Construction or Maintenance      Other Zoning Districts
261  DOB NOW Construction or Maintenance Commercial District/Overlay
262  DOB NOW Construction or Main

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



    36     47  33900  3000
675  36047012200    36     47  12200  4001
676  36047048000    36     47  48000  2000
677  36047051602    36     47  51602  2001
678  36047005202    36     47   5202  2001
679  36047022000    36     47  22000  2001
680  36047029800    36     47  29800  2003
681  36047050100    36     47  50100  1002
682  36047115000    36     47 115000  1003
683  36047002200    36     47   2200  1001
684  36047020300    36     47  20300  2002
685  36047022800    36     47  22800  3002
686  36047028700    36     47  28700  2000
687  36047041700    36     47  41700  4001
688  36047024600    36     47  24600  3000
689  36047118400    36     47 118400  2000
690  36047026100    36     47  26100  4002
691  36047043900    36     47  43900  1006
692  36047011901    36     47  11901  2018
693  36047008200    36     47   8200  1001
694  36047020100    36     47  20100  1001
695  36047119000    36     47 119000  1000
696  36047006400    36     47   6400  1000
697  36047074000    36     

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




714         Census Tract 75, Kings County, New York                4811
715       Census Tract 3.01, Kings County, New York                3892
716        Census Tract 906, Kings County, New York                3476
717        Census Tract 906, Kings County, New York                3476
718        Census Tract 282, Kings County, New York                3623
719        Census Tract 333, Kings County, New York                4578
720          Census Tract 9, Kings County, New York                4687
721         Census Tract 59, Kings County, New York                1234
722         Census Tract 59, Kings County, New York                1234
723        Census Tract 183, Kings County, New York                2749
724        Census Tract 305, Kings County, New York                7144
725       Census Tract 5.01, Kings County, New York                4604
726        Census Tract 101, Kings County, New York                4393
727        Census Tract 525, Kings County, New York            

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



              1753              74
1830                    829               608              94
1831                   4907              3092             149
1832                    392              2212             100
1833                    470              2327             208
1834                   2018              2512              92
1835                   2466              1142             167
1836                   4907              3092             149
1837                    239              1121             182
1838                    625              1894             122
1839                    216               820             120
1840                   1857              1625             171
1841                    156               838             171
1842                   1653              1595             217
1843                     32              1654             156
1844                    392              2212             100
1845                    221        

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




273                   86016            356                76          133
274                   68533            539                88          189
275                  138966            554               295          233
276                  107319            379                83          210
277                   55417            233               174          107
278                   65437            805               775          193
279                   72813            530               438          199
280                   37031            858               651          273
281                   95568            375               159          176
282                   74006            581               532          173
283                   30536           1141                18          143
284                  128772            404               239           87
285                   34004            535               429          170
286                   38271          

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                129                 131               NA
362         129                161                 226           153771
363          72                 84                 249            40284
364         143                149                 157               NA
365         113                170                 343            16410
366         116                177                 241            61048
367         105                113                 206           123826
368         146                233                 207            22852
369          44                 63                  89            92132
370          27                 62                  98            45045
371          45                 84                 106            85678
372         116                103                 200            70462
373         109                123                 296               NA
374          86                 90                 279            21454
375    

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



             14973
1054              5635
1055             37572
1056             31800
1057             40373
1058             19077
1059             67544
1060              9653
1061             20174
1062             18593
1063                NA
1064             29335
1065             43105
1066             15532
1067              8680
1068             26429
1069             17187
1070              9639
1071             12681
1072                NA
1073                NA
1074             34872
1075             43895
1076             40167
1077             46576
1078             21857
1079             18870
1080             10574
1081             14973
1082             11438
1083             11438
1084             11438
1085             23415
1086             36275
1087             16253
1088              7615
1089             18795
1090              6040
1091             27459
1092             29046
1093             27459
1094             12240
1095             62978
1096           

In [31]:
%%R
head(merged_data)

  Job.Number Borough.Name Count.Permits First.Permit.Date Current.Date
1  120351662    Manhattan             0         5/13/2010    4/19/2025
2  120470409    Manhattan             0         9/16/2010    4/19/2025
3  120486633    Manhattan             0         9/29/2010    4/19/2025
4  120725705    Manhattan             0         6/16/2011    4/19/2025
5  120987236    Manhattan             0          3/1/2012    4/19/2025
6  121045127    Manhattan             0         4/23/2012    4/19/2025
  Age..in.years. Permit.Expiration.Date Sidewalk.Shed.Linear.Feet
1       14.94247              5/22/2025                        56
2       14.59726              6/17/2025                        65
3       14.56164              9/20/2025                       100
4       13.84932              3/11/2026                       314
5       13.13973              6/18/2025                        77
6       12.99452              1/27/2026                       339
  Construction.Material Current.Job.Statu

In [32]:
%%R
# Save the merged data as a CSV file:
write.csv(merged_data, "merged_data.csv", row.names = FALSE)

In [33]:
merged_data_python = pd.read_csv("merged_data.csv")
merged_data_python.head()

Unnamed: 0,Job.Number,Borough.Name,Count.Permits,First.Permit.Date,Current.Date,Age..in.years.,Permit.Expiration.Date,Sidewalk.Shed.Linear.Feet,Construction.Material,Current.Job.Status,BIN.Number,Community.Board,Latitude.Point,Longitude.Point,House.Number,Street.Name,Borough.Digit,Block,Lot,Applicant.Business.Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK,NAME,population_estimate,black_african_estimate,occupied_estimate,vacant_estimate,owner_occupied_estimate,renter_occupied_estimate,owner_income_estimate,renter_income_estimate,population_moe,black_african_moe,occupied_moe,vacant_moe,owner_occupied_moe,renter_occupied_moe,owner_income_moe,renter_income_moe
0,120351662,Manhattan,0.0,5/13/2010,4/19/2025,14.942466,5/22/2025,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061015402,36,61,15402,3000,"Census Tract 154.02, New York County, New York",3579,931,1681,427,659,1022,212008.0,103182.0,1259,1007,314,227,235,206,127402.0,28807.0
1,120470409,Manhattan,0.0,9/16/2010,4/19/2025,14.59726,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061005000,36,61,5000,2000,"Census Tract 50, New York County, New York",5042,70,2852,622,1424,1428,195956.0,138364.0,814,74,299,246,309,222,72736.0,33353.0
2,120486633,Manhattan,0.0,9/29/2010,4/19/2025,14.561644,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061002800,36,61,2800,3001,"Census Tract 28, New York County, New York",6483,1146,3073,303,482,2591,156071.0,44028.0,901,639,306,168,162,321,73124.0,12781.0
3,120725705,Manhattan,0.0,6/16/2011,4/19/2025,13.849315,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061011500,36,61,11500,2002,"Census Tract 115, New York County, New York",3367,409,1750,565,239,1511,217356.0,134427.0,701,229,194,163,89,189,85738.0,48688.0
4,120987236,Manhattan,0.0,3/1/2012,4/19/2025,13.139726,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,36061008900,36,61,8900,4001,"Census Tract 89, New York County, New York",5844,306,3095,603,974,2121,158555.0,88665.0,620,231,345,206,224,344,23669.0,37651.0


In [34]:
print(len(merged_data_python))
print(len(merged_data_python.query("population_estimate.isna()")))

8468
0
