# Merge Census Data

## Setup Python and R environment

In [1]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# My commonly used R imports

require('tidyverse')

R[write to console]: Loading required package: tidyverse



── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [4]:
# See all columns
pd.set_option('display.max_columns', None)

## Load & Clean Data

👉 Load the data along with the census connectors below (the output of the `connect-to-census.ipynb` notebook) and do any cleanup you'd like to do.

### Load Data:

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("Active_Sheds.csv")
df.head()

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,5468,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,5342,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,5329,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,5069,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,4810,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts


In [7]:
%pip install requests-cache

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import csv
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import requests_cache

# Initialize the cache for geocoding API to avoid hitting the API repeatedly
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

# Geocode function that retrieves census geography data based on latitude and longitude
def geocode(lat, lng):
    try:
        url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
        params = {
            "x": lng,
            "y": lat,
            "benchmark": "Public_AR_Census2020",
            "vintage": "Census2020_Census2020",
            "format": "json"
        }
        response = cache.get(url, params=params)
        response.raise_for_status()  # Raise error for bad status codes
        data = response.json()
        
        census_data = data['result']['geographies']['Census Blocks'][0]
        
        # Extract required census geography fields
        return {
            "SUFFIX": census_data.get("SUFFIX", None),
            "POP100": census_data.get("POP100", None),
            "GEOID": census_data.get("GEOID", None),
            "CENTLAT": census_data.get("CENTLAT", None),
            "BLOCK": census_data.get("BLOCK", None),
            "AREAWATER": census_data.get("AREAWATER", None),
            "STATE": census_data.get("STATE", None),
            "BASENAME": census_data.get("BASENAME", None),
            "OID": census_data.get("OID", None),
            "LSADC": census_data.get("LSADC", None),
            "INTPTLAT": census_data.get("INTPTLAT", None),
            "FUNCSTAT": census_data.get("FUNCSTAT", None),
            "NAME": census_data.get("NAME", None),
            "OBJECTID": census_data.get("OBJECTID", None),
            "TRACT": census_data.get("TRACT", None),
            "CENTLON": census_data.get("CENTLON", None),
            "BLKGRP": census_data.get("BLKGRP", None),
            "AREALAND": census_data.get("AREALAND", None),
            "HU100": census_data.get("HU100", None),
            "INTPTLON": census_data.get("INTPTLON", None),
            "MTFCC": census_data.get("MTFCC", None),
            "LWBLKTYP": census_data.get("LWBLKTYP", None),
            "UR": census_data.get("UR", None),
            "COUNTY": census_data.get("COUNTY", None),
        }
    except Exception as e:
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

# Function to geocode data in chunks (to avoid consuming too much RAM)
def geocode_in_chunks(df, chunk_size=100):
    header = ['SUFFIX', 'POP100', 'GEOID', 'CENTLAT', 'BLOCK', 'AREAWATER', 'STATE', 'BASENAME', 'OID', 
              'LSADC', 'INTPTLAT', 'FUNCSTAT', 'NAME', 'OBJECTID', 'TRACT', 'CENTLON', 'BLKGRP', 'AREALAND', 
              'HU100', 'INTPTLON', 'MTFCC', 'LWBLKTYP', 'UR', 'COUNTY']
    
    with open('censusgeos.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()

        # Process the dataframe in chunks
        for i in tqdm(range(0, len(df), chunk_size), desc="Processing chunks"):
            chunk = df.iloc[i:i + chunk_size]
            latitudes = chunk['Latitude Point']
            longitudes = chunk['Longitude Point']

            # Geocode the chunk in parallel
            with ThreadPoolExecutor() as executor:
                results = list(executor.map(geocode, latitudes, longitudes))

            # Write the results for this chunk to the CSV file
            for result in results:
                if result:
                    writer.writerow(result)

# Load your dataset
df = pd.read_csv("Active_Sheds.csv")

# Ensure 'Latitude Point' and 'Longitude Point' are numeric
df['Latitude Point'] = pd.to_numeric(df['Latitude Point'], errors='coerce')
df['Longitude Point'] = pd.to_numeric(df['Longitude Point'], errors='coerce')

# Handle any missing or invalid lat/long by dropping those rows
df = df.dropna(subset=['Latitude Point', 'Longitude Point'])

# Call the geocoding function to process the data in chunks
geocode_in_chunks(df, chunk_size=100)  # Adjust chunk_size as needed

# Optionally, read the results from the CSV after processing
census_geos_df = pd.read_csv('censusgeos.csv')

# Ensure that the numeric columns like STATE, COUNTY, and TRACT are treated as strings
census_geos_df['STATE'] = census_geos_df['STATE'].astype(str)
census_geos_df['COUNTY'] = census_geos_df['COUNTY'].astype(str)
census_geos_df['TRACT'] = census_geos_df['TRACT'].astype(str)
census_geos_df['BLOCK'] = census_geos_df['BLOCK'].astype(str)

# Select only the columns you want to keep
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df[to_keep]

# Concatenate the geocoded data with the original dataframe
df_with_geos = pd.concat(
    [ 
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Processing chunks:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,5468,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,5342,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,5329,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,5069,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,4810,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001


In [9]:
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df[to_keep]
census_geos_df

Unnamed: 0,GEOID,STATE,COUNTY,TRACT,BLOCK
0,360610154023000,36,61,15402,3000
1,360610050002000,36,61,5000,2000
2,360610028003001,36,61,2800,3001
3,360610115002002,36,61,11500,2002
4,360610089004001,36,61,8900,4001
...,...,...,...,...,...
8472,360610277001000,36,61,27700,1000
8473,360610161001000,36,61,16100,1000
8474,360610002021002,36,61,202,1002
8475,360470033001001,36,47,3300,1001


In [10]:
df_with_geos = pd.concat(
    [ 
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,5468,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,5342,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,5329,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,5069,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,4810,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001


### Clean Data: 

In [11]:
# Convert 'Age (in days)' to age in years (ignoring leap years)
df_with_geos['Age'] = df_with_geos['Age'] / 365
df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age,Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.600000,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8472,103652192,Manhattan,0.0,2019-09-19,2025-05-03,5.621918,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610277001000,36,61,27700,1000
8473,104213637,Manhattan,0.0,2020-08-28,2025-05-03,4.679452,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610161001000,36,61,16100,1000
8474,104213682,Manhattan,0.0,2020-10-01,2025-05-03,4.586301,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610002021002,36,61,202,1002
8475,302585815,Brooklyn,0.0,2019-02-22,2025-05-03,6.194521,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,360470033001001,36,47,3300,1001


In [12]:
# Rename the 'Age' column to 'Age (in years)'
df_with_geos = df_with_geos.rename(columns={'Age': 'Age (in years)'})

df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age (in years),Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610154023000,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610050002000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.600000,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,360610028003001,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,360610115002002,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,360610089004001,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8472,103652192,Manhattan,0.0,2019-09-19,2025-05-03,5.621918,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610277001000,36,61,27700,1000
8473,104213637,Manhattan,0.0,2020-08-28,2025-05-03,4.679452,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610161001000,36,61,16100,1000
8474,104213682,Manhattan,0.0,2020-10-01,2025-05-03,4.586301,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,360610002021002,36,61,202,1002
8475,302585815,Brooklyn,0.0,2019-02-22,2025-05-03,6.194521,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,360470033001001,36,47,3300,1001


In [13]:
df_with_geos['GEOID'] = \
    df_with_geos['STATE'] + \
    df_with_geos['COUNTY'].str.pad(width=3, side='left', fillchar='0') + \
    df_with_geos['TRACT'].str.pad(width=6, side='left', fillchar='0')

df_with_geos

Unnamed: 0,Job Number,Borough Name,Count Permits,First Permit Date,Current Date,Age (in years),Permit Expiration Date,Sidewalk Shed/Linear Feet,Construction Material,Current Job Status,BIN Number,Community Board,Latitude Point,Longitude Point,House Number,Street Name,Borough Digit,Block,Lot,Applicant Business Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061015402,36,61,15402,3000
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.73590,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061005000,36,61,5000,2000
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.600000,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061002800,36,61,2800,3001
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061011500,36,61,11500,2002
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,36061008900,36,61,8900,4001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8472,103652192,Manhattan,0.0,2019-09-19,2025-05-03,5.621918,2025-09-11,0.0,METAL & WOOD,R,1076751,112,40.85661,-73.92599,549,AUDOBON AVENUE,1,2160,18,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061027700,36,61,27700,1000
8473,104213637,Manhattan,0.0,2020-08-28,2025-05-03,4.679452,2025-08-02,0.0,METAL & WOOD,R,1030196,107,40.78174,-73.97806,138,WEST 78 STREET,1,1149,7,ROMA SCAFFOLDING INC,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061016100,36,61,16100,1000
8474,104213682,Manhattan,0.0,2020-10-01,2025-05-03,4.586301,2025-10-11,0.0,METAL & WOOD,R,1003736,103,40.71435,-73.98303,287,HENRY STREET,1,288,15,WHITESTONE CONSTRUCTION C,1,BIS SCA,Construction or Maintenance,Other Zoning Districts,36061000202,36,61,202,1002
8475,302585815,Brooklyn,0.0,2019-02-22,2025-05-03,6.194521,2026-01-23,0.0,METAL & WOOD,R,3058752,302,40.68891,-73.97643,1,FORT GREENE PLACE,3,2098,13,ROMA SCAFFOLDING INC,1,BIS SCA,Local Law 11,Other Zoning Districts,36047003300,36,47,3300,1001


In [14]:
df_with_geos['Borough Name'].value_counts()

Borough Name
MANHATTAN        3815
BROOKLYN         2029
BRONX            1434
QUEENS           1050
STATEN ISLAND      90
Manhattan          33
Brooklyn           13
Bronx               8
Queens              3
Staten Island       2
Name: count, dtype: int64

In [15]:
# Standardize the 'Borough Name' column to proper case (first letter capitalized)
df_with_geos['Borough Name'] = df_with_geos['Borough Name'].replace({
    "MANHATTAN": "Manhattan",
    "BROOKLYN": "Brooklyn",
    "BRONX": "Bronx",
    "QUEENS": "Queens",
    "STATEN ISLAND": "Staten Island"
})

In [16]:
df_with_geos['Borough Name'].value_counts()

Borough Name
Manhattan        3848
Brooklyn         2042
Bronx            1442
Queens           1053
Staten Island      92
Name: count, dtype: int64

In [17]:
# BIN Number: Building Identification Number
# BIS: Building Information System
# SCA: School Construction Authority
# Permit-Entire	Permit Issued - Entire Job/Work

#### Current Job Status: R = Permit-Entire = Permit Issued - Entire Job/Work, but they use diff symbols depending whether the source is BIS or DOB NOW
#### , based on https://www.nyc.gov/site/buildings/industry/permit-type-and-job-status-codes.page

#### ProCert: The Department offers a Professional Certification (Pro Cert) Program which enables Professional Engineers (PE) and Registered Architects (RA) to certify that the plans they file with the Department are in compliance with all applicable laws. This reduces the amount of time a builder normally has to wait for a DOB permit by eliminating the process of Department plan examination and approval.

# Q: Would 1 refer to with certificate and 0 refer to no certificate? 

## 👉 Grab Census Data

1. loading the Census API key

In [18]:
# pip install dotenv

In [19]:
from dotenv import load_dotenv
load_dotenv() # <- searches for a file named .env and loads the environment variables in it

False

In [20]:
%%R 

require('tidycensus')

# because it an environment variable, we don't have to 
# explicitly pass this string to R, it is readable here
# in this R cell.
census_api_key(Sys.getenv("CENSUS_API_KEY"))

R[write to console]: Loading required package: tidycensus

R[write to console]: To install your API key for use in future sessions, run this function with `install = TRUE`.



2. Decide which Census variables you want

    Use <https://censusreporter.org/> to figure out which tables you want. (if censusreporter is down, check out the code in the cell below)

    -   Scroll to the bottom of the page to see the tables.
    -   If you already know the table ID, stick that in the "Explore" section to learn more about that table.

    By default this code loads (B01003_001) which we found in censusreporter here: https://censusreporter.org/tables/B01003/

    - find some other variables that you're also interested in
    - don't forget to pick a geography like "tract", "county" or "block group". here is the list of [all geographies](https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus
    ).


In [21]:
%%R 
require("tigris")

R[write to console]: Loading required package: tigris

R[write to console]: To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.



# Q: What are the related columns and how to deal with them?

In [22]:
%%R 

# long-form data: with only one numeric data

# the variable B01003_001 was selectd from the census table 
# for population, which we found in censusreporter here:
# https://censusreporter.org/tables/B01003/


# Here are the various geographies you can use with tidycensus
# https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus

# Get variable from ACS
nyc_census_data <- get_acs(geography = "tract", 
                      state='NY',
                      county = c("New York", "Kings", "Bronx", "Queens", "Richmond"),
                      variables = c(
                        population = "B01003_001",
                        occupied = "B25002_002", 
                        vacant = "B25002_003",
                        owner_occupied = "B25003_002",
                        renter_occupied = "B25003_003",
                        owner_income = "B25119_002",
                        renter_income = "B25119_003",
                        black_african = "B02009_001"
                      ), 
                      year = 2021,
                      survey="acs5",
                      geometry=F,
                      cb = T)

nyc_census_data <- nyc_census_data #%>% 
    #erase_water() 

R[write to console]: Getting data from the 2017-2021 5-year ACS

R[write to console]: Using FIPS code '36' for state 'NY'

R[write to console]: Using FIPS code '061' for 'New York County'

R[write to console]: Using FIPS code '047' for 'Kings County'

R[write to console]: Using FIPS code '005' for 'Bronx County'

R[write to console]: Using FIPS code '081' for 'Queens County'

R[write to console]: Using FIPS code '085' for 'Richmond County'



### Possible Related Variables:

#### From Housing: 
##### 1. population: B01003_001

##### 2. Occupancy Status - Total: B25002_001
##### 3. Occupancy Status - Occupied: B25002_002
##### 4. Occupancy Status - Vacant: B25002_003

##### 5. Tenure - Total: B25003_001
##### 6. Tenure - Owner occupied: B25003_002
##### 7. Tenure - Renter occupied: B25003_003

##### (8. Median Year Structure Built by Tenure - Total: B25037_001
##### 9. Median Year Structure Built by Tenure - Owner occupied: B25037_002
##### 10. Median Year Structure Built by Tenure - Renter occupied: B25037_003)

##### 11. Median Household Income by Tenure - Total: B25119_001
##### 12. Median Household Income by Tenure - Owner occupied: B25119_002
##### 13. Median Household Income by Tenure - Renter occupied: B25119_003

##### 14. Value??

#### From Income & Earnings: 

In [23]:
%%R 
nyc_census_data

# A tibble: 18,616 × 5
   GEOID       NAME                                   variable    estimate   moe
   <chr>       <chr>                                  <chr>          <dbl> <dbl>
 1 36005000100 Census Tract 1, Bronx County, New York population      6661   702
 2 36005000100 Census Tract 1, Bronx County, New York black_afri…     3346   468
 3 36005000100 Census Tract 1, Bronx County, New York occupied           0    18
 4 36005000100 Census Tract 1, Bronx County, New York vacant             0    18
 5 36005000100 Census Tract 1, Bronx County, New York owner_occu…        0    18
 6 36005000100 Census Tract 1, Bronx County, New York renter_occ…        0    18
 7 36005000100 Census Tract 1, Bronx County, New York owner_inco…       NA    NA
 8 36005000100 Census Tract 1, Bronx County, New York renter_inc…       NA    NA
 9 36005000200 Census Tract 2, Bronx County, New York population      4453   563
10 36005000200 Census Tract 2, Bronx County, New York black_afri…     1450   546
# ℹ 1

In [24]:
%%R 

# pivot from wide to long
nyc_census_data <- nyc_census_data %>% 
  pivot_wider(
    names_from=variable, 
    values_from = c(estimate, moe),
    names_glue = "{variable}_{.value}"
  ) 

nyc_census_data

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
# ℹ 2

In [25]:
%%R 
nyc_census_data %>% print(n=20)

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
11 36

In [26]:
%%R -o nyc_census_data
nyc_census_data

# A tibble: 2,327 × 18
   GEOID      NAME  population_estimate black_african_estimate occupied_estimate
   <chr>      <chr>               <dbl>                  <dbl>             <dbl>
 1 360050001… Cens…                6661                   3346                 0
 2 360050002… Cens…                4453                   1450              1392
 3 360050004… Cens…                6000                   1572              2199
 4 360050016… Cens…                6038                   2593              2187
 5 360050019… Cens…                2168                    904               885
 6 360050019… Cens…                1399                    531               376
 7 360050019… Cens…                   0                      0                 0
 8 360050019… Cens…                   0                      0                 0
 9 360050020… Cens…                4694                   3021              1759
10 360050020… Cens…                4274                   1307              1904
# ℹ 2

## 👉 Merge it with your data

hint...`tidycensus` provides you data in long format you may need to pivot the census data from long to wide format before merging it with your data

In [27]:
df_with_geos.to_csv('df_with_geos.csv', index=False)

In [28]:
%%R
df_with_geos <- read.csv('df_with_geos.csv')

In [29]:
%%R
df_with_geos$GEOID <- as.character(df_with_geos$GEOID)
nyc_census_data$GEOID <- as.character(nyc_census_data$GEOID)

In [30]:
%%R
# Merge the data: 
library(dplyr)

merged_data <- left_join(df_with_geos, nyc_census_data, by = "GEOID")
merged_data

       Job.Number  Borough.Name Count.Permits First.Permit.Date Current.Date
1       120351662     Manhattan             0         5/13/2010     5/3/2025
2       120470409     Manhattan             0         9/16/2010     5/3/2025
3       120486633     Manhattan             0         9/29/2010     5/3/2025
4       120725705     Manhattan             0         6/16/2011     5/3/2025
5       120987236     Manhattan             0          3/1/2012     5/3/2025
6       121045127     Manhattan             0         4/23/2012     5/3/2025
7       121844941     Manhattan             0         12/5/2013     5/3/2025
8       122106078     Manhattan             0         8/19/2014     5/3/2025
9       122199236     Manhattan             0         12/9/2014     5/3/2025
10      122212131     Manhattan             0        12/16/2014     5/3/2025
11      122538995     Manhattan             0         12/7/2015     5/3/2025
12      122574570     Manhattan             0          1/6/2016     5/3/2025

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



            NA        2025-03-13   2025-05-03
1572 B01156376-I1      Brooklyn            NA        2024-12-17   2025-05-03
1573 B01156856-I1      Brooklyn            NA        2024-12-17   2025-05-03
1574 B01156926-I1      Brooklyn            NA        2024-12-24   2025-05-03
1575 B01156930-I1      Brooklyn            NA        2024-12-19   2025-05-03
1576 B01157308-I1      Brooklyn            NA        2024-12-20   2025-05-03
1577 B01157409-I1      Brooklyn            NA        2024-12-18   2025-05-03
1578 B01157585-I1      Brooklyn            NA        2024-12-19   2025-05-03
1579 B01158029-I1      Brooklyn            NA        2025-04-25   2025-05-03
1580 B01158168-I1      Brooklyn            NA        2024-12-23   2025-05-03
1581 B01158177-I1      Brooklyn            NA        2025-01-09   2025-05-03
1582 B01158195-I1      Brooklyn            NA        2025-01-08   2025-05-03
1583 B01158249-I1      Brooklyn            NA        2025-01-09   2025-05-03
1584 B01158416-I1      Brookly

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



    0.106849315             2025-12-22                     236.0
1631    0.252054795             2025-10-18                     338.0
1632    0.298630137             2025-09-08                     125.0
1633    0.183561644             2026-02-24                      41.0
1634    0.298630137             2025-08-01                      45.0
1635    0.295890411             2026-01-14                      60.0
1636    0.295890411             2025-11-03                     340.0
1637    0.298630137             2025-11-19                      35.0
1638    0.293150685             2025-06-28                      30.0
1639    0.134246575             2025-08-01                      16.0
1640    0.287671233             2025-11-03                     239.0
1641    0.290410959             2025-05-08                     111.0
1642    0.249315068             2025-12-21                      32.0
1643    0.249315068             2025-12-21                      32.0
1644    0.076712329             2026-0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



             303
1681                            Permit Entire    3182169             314
1682                            Permit Entire    3021582             306
1683                            Permit Entire    3079250             304
1684                            Permit Entire    3062846             301
1685                            Permit Entire    3135140             311
1686                            Permit Entire    3325574             304
1687                            Permit Entire    3393889             304
1688                            Permit Entire    3127675             314
1689                            Permit Entire    3107206             309
1690                            Permit Entire    3429069             303
1691                            Permit Entire    3413929             302
1692                            Permit Entire    3429618             317
1693                            Permit Entire    3429637             317
1694                            Pe

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



       40.64465       -74.01838         5524                  3 AVENUE
1738       40.67928       -73.98512          224                  3 AVENUE
1739       40.67965       -73.94432         1420             FULTON STREET
1740       40.73720       -73.95508         1138          MANHATTAN AVENUE
1741       40.69325       -73.92582         1152          LAFAYETTE AVENUE
1742       40.69466       -73.95762          653             MYRTLE AVENUE
1743       40.70420       -73.91756          374          STOCKHOLM STREET
1744       40.66590       -73.96027          960           FRANKLIN AVENUE
1745       40.71640       -73.95309            8          HAVEMEYER STREET
1746       40.71424       -73.96179          300            BEDFORD AVENUE
1747       40.71646       -73.95301            6          HAVEMEYER STREET
1748       40.70200       -73.98706          100                JAY STREET
1749       40.67120       -73.90606           89        CHRISTOPHER AVENUE
1750       40.68833       -73

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



             3  2313   24                                  PR       1
1574             3  5698   24                                  PR       1
1575             3  1266 7502                         ASHRAF CORP       1
1576             3   876    3                         ASHRAF CORP       1
1577             3  3690   15                  ASR ENGINEERING PC       1
1578             3  2784   12        RIGID STRUCTURAL DESIGN  LLC       1
1579             3   792   26                 NYMS CONSULTING LLC       0
1580             3  2279   30                                  PR       1
1581             3   156 7501                  KZ ENGINEERING  PC       1
1582             3  2698   32        PAUL PERDEK  PROF. ENG. PLLC       0
1583             3  2166 7501                         ASHRAF CORP       1
1584             3  6329   59                                  PR       1
1585             3  6667   24                 NYMS CONSULTING LLC       0
1586             3  1994   45      MSM ENG

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 Construction or Maintenance Commercial District/Overlay
1916 DOB NOW Construction or Maintenance      Other Zoning Districts
1917 DOB NOW                Local Law 11      Other Zoning Districts
1918 DOB NOW Construction or Maintenance Commercial District/Overlay
1919 DOB NOW Construction or Maintenance      Other Zoning Districts
1920 DOB NOW                Local Law 11      Other Zoning Districts
1921 DOB NOW Construction or Maintenance      Other Zoning Districts
1922 DOB NOW Construction or Maintenance      Other Zoning Districts
1923 DOB NOW Construction or Maintenance      Other Zoning Districts
1924 DOB NOW Construction or Maintenance Commercial District/Overlay
1925 DOB NOW Construction or Maintenance      Other Zoning Districts
1926 DOB NOW Construction or Maintenance      Other Zoning Districts
1927 DOB NOW Construction or Maintenance Commercial District/Overlay
1928 DOB NOW Construction or Maintenance Commercial District/Overlay
1929 DOB NOW Construction or Maintenance Comme

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  2001
1771 36047012700    36     47  12700  2000
1772 36047055400    36     47  55400  2000
1773 36047057500    36     47  57500  1001
1774 36047051900    36     47  51900  3006
1775 36047019100    36     47  19100  1012
1776 36047051900    36     47  51900  1009
1777 36047019900    36     47  19900  1001
1778 36047000100    36     47    100  3001
1779 36047055000    36     47  55000  3003
1780 36047000502    36     47    502  2001
1781 36047017700    36     47  17700  1000
1782 36047027700    36     47  27700  1001
1783 36047020300    36     47  20300  1003
1784 36047051300    36     47  51300  1000
1785 36047013300    36     47  13300  3001
1786 36047016500    36     47  16500  3002
1787 36047004900    36     47   4900  3000
1788 36047016700    36     47  16700  2000
1789 36047092200    36     47  92200  2000
1790 36047086800    36     47  86800  2000
1791 36047050300    36     47  50300  1000
1792 36047020100    36     47  20100  3000
1793 36047028300    36     47  28300  3001
1794

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




588                    1745              1325              85
589                     266              1128             179
590                    2573               825              30
591                     263              1799             301
592                       0               996              22
593                     188               690              56
594                    4597              1877             303
595                    1566              1828             317
596                     199              2694             292
597                    6287              2867             228
598                    1384              1961              92
599                     357              1785             292
600                    1384              1961              92
601                    2265              1527             210
602                      78              1138              59
603                     504              1184             145
604    

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                      482                 57679
1245                      65                     2458                250001
1246                     231                     1881                103922
1247                     231                     1881                103922
1248                     313                     1093                150321
1249                     103                      612                 71509
1250                    1265                     1247                 68644
1251                     223                     1161                 77813
1252                     155                      552                 94531
1253                     447                      840                    NA
1254                     568                      825                100096
1255                      13                     1510                    NA
1256                      13                     1510                    NA
1257                      13            

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



                  50278            100                34           48
1191                  77578            675               338          249
1192                  40057            380               332          136
1193                  84066            526               191          202
1194                  46420            420                80          124
1195                 106952            463               307          199
1196                  44034            906               276          253
1197                  45494            694               541          221
1198                  51197           1024               210          143
1199                  59427            496               228          170
1200                 103438            653                76          242
1201                  39732            612               521          187
1202                  39732            612               521          187
1203                  14446            609

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



           148513
1077         89                100                 108            24169
1078         60                307                  95            48887
1079        135                205                 131            28958
1080         93                162                 147            51773
1081         13                 13                  13               NA
1082         61                 91                 178            92615
1083         41                 77                 186            25612
1084        143                191                 305           102184
1085         76                 74                 106            62381
1086         17                 47                  91               NA
1087         34                100                 121            46757
1088         69                 81                 142            61461
1089        133                119                 184            30699
1090         87                 66            

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [31]:
%%R
head(merged_data)

  Job.Number Borough.Name Count.Permits First.Permit.Date Current.Date
1  120351662    Manhattan             0         5/13/2010     5/3/2025
2  120470409    Manhattan             0         9/16/2010     5/3/2025
3  120486633    Manhattan             0         9/29/2010     5/3/2025
4  120725705    Manhattan             0         6/16/2011     5/3/2025
5  120987236    Manhattan             0          3/1/2012     5/3/2025
6  121045127    Manhattan             0         4/23/2012     5/3/2025
  Age..in.years. Permit.Expiration.Date Sidewalk.Shed.Linear.Feet
1       14.98082              4/22/2026                        56
2       14.63562              6/17/2025                        65
3       14.60000              9/20/2025                       100
4       13.88767              3/11/2026                       314
5       13.17808              6/18/2025                        77
6       13.03288              1/27/2026                       339
  Construction.Material Current.Job.Statu

In [32]:
%%R
# Save the merged data as a CSV file:
write.csv(merged_data, "merged_data.csv", row.names = FALSE)

In [33]:
merged_data_python = pd.read_csv("merged_data.csv")
merged_data_python.head()

Unnamed: 0,Job.Number,Borough.Name,Count.Permits,First.Permit.Date,Current.Date,Age..in.years.,Permit.Expiration.Date,Sidewalk.Shed.Linear.Feet,Construction.Material,Current.Job.Status,BIN.Number,Community.Board,Latitude.Point,Longitude.Point,House.Number,Street.Name,Borough.Digit,Block,Lot,Applicant.Business.Name,ProCert,Source,activity,Commercial,GEOID,STATE,COUNTY,TRACT,BLOCK,NAME,population_estimate,black_african_estimate,occupied_estimate,vacant_estimate,owner_occupied_estimate,renter_occupied_estimate,owner_income_estimate,renter_income_estimate,population_moe,black_african_moe,occupied_moe,vacant_moe,owner_occupied_moe,renter_occupied_moe,owner_income_moe,renter_income_moe
0,120351662,Manhattan,0.0,5/13/2010,5/3/2025,14.980822,4/22/2026,56.0,STEEL,R,1050240,108,40.78184,-73.94844,1772,2 AVENUE,1,1555,4,CS BRIDGE CORP,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061015402,36,61,15402,3000,"Census Tract 154.02, New York County, New York",3579,931,1681,427,659,1022,212008.0,103182.0,1259,1007,314,227,235,206,127402.0,28807.0
1,120470409,Manhattan,0.0,9/16/2010,5/3/2025,14.635616,6/17/2025,65.0,STEEL/WOOD,R,1017833,105,40.7359,-73.98799,116,EAST 17 STREET,1,872,68,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061005000,36,61,5000,2000,"Census Tract 50, New York County, New York",5042,70,2852,622,1424,1428,195956.0,138364.0,814,74,299,246,309,222,72736.0,33353.0
2,120486633,Manhattan,0.0,9/29/2010,5/3/2025,14.6,9/20/2025,100.0,WOOD AND STEEL,R,1079685,103,40.72631,-73.97949,605,EAST 9 STREET,1,392,10,ROCKLEDGE SCAFFOLD CORP,1,BIS,Construction or Maintenance,Other Zoning Districts,36061002800,36,61,2800,3001,"Census Tract 28, New York County, New York",6483,1146,3073,303,482,2591,156071.0,44028.0,901,639,306,168,162,321,73124.0,12781.0
3,120725705,Manhattan,0.0,6/16/2011,5/3/2025,13.887671,3/11/2026,314.0,WOOD & STEEL,R,1026319,104,40.75827,-73.99532,443,WEST 40 STREET,1,1050,6,BS GROUP INC,1,BIS,Construction or Maintenance,Commercial District/Overlay,36061011500,36,61,11500,2002,"Census Tract 115, New York County, New York",3367,409,1750,565,239,1511,217356.0,134427.0,701,229,194,163,89,189,85738.0,48688.0
4,120987236,Manhattan,0.0,3/1/2012,5/3/2025,13.178082,6/18/2025,77.0,WOOD & STEEL,R,1083575,104,40.74551,-74.00375,444,WEST 21 STREET,1,718,1,ARSENAL SCAFFOLD INC,1,BIS,Construction or Maintenance,Other Zoning Districts,36061008900,36,61,8900,4001,"Census Tract 89, New York County, New York",5844,306,3095,603,974,2121,158555.0,88665.0,620,231,345,206,224,344,23669.0,37651.0


In [34]:
print(len(merged_data_python))
print(len(merged_data_python.query("population_estimate.isna()")))

8477
0
