# Notebook Title

## Setup Python and R environment
you can ignore this section

In [19]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [21]:
%%R

# My commonly used R imports

require('tidyverse')

## 👉 download your data

You can write code here to download your dataset. Or if you already have it, just leave the URL in the comments and just load it into a pandas or R (or both) dataframe.

In [22]:
#Motor vehicle collisions 2023, dropped rows without lat/long as there were 7300 rows and I dont have API credits https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data
import pandas as pd

df = pd.read_csv('df_cleaned.csv')

In [23]:
df =df.sample(10000)

In [24]:
print(df.dtypes)

CRASH DATE                        object
CRASH TIME                        object
BOROUGH                           object
ZIP CODE                         float64
LATITUDE                         float64
LONGITUDE                        float64
LOCATION                          object
ON STREET NAME                    object
CROSS STREET NAME                 object
OFF STREET NAME                   object
NUMBER OF PERSONS INJURED          int64
NUMBER OF PERSONS KILLED           int64
NUMBER OF PEDESTRIANS INJURED      int64
NUMBER OF PEDESTRIANS KILLED       int64
NUMBER OF CYCLIST INJURED          int64
NUMBER OF CYCLIST KILLED           int64
NUMBER OF MOTORIST INJURED         int64
NUMBER OF MOTORIST KILLED          int64
CONTRIBUTING FACTOR VEHICLE 1     object
CONTRIBUTING FACTOR VEHICLE 2     object
CONTRIBUTING FACTOR VEHICLE 3     object
CONTRIBUTING FACTOR VEHICLE 4     object
CONTRIBUTING FACTOR VEHICLE 5     object
COLLISION_ID                       int64
VEHICLE TYPE COD

## 👉 convert addresses --> lat/long 

See the [census-examples](https://github.com/data4news/census-examples) repository for examples. If you need help, try asking in the class slack channel. Chances are someone in the class is struggling with the same problem as you are so we might as well all learn together in the same slack channel! 

In [25]:
# Save the df_cleaned DataFrame as a CSV file
df.to_csv('df_cleaned3.csv', index=False)


## 👉 convert lat/long to census geography codes 

(like 'GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK', etc...)

Same note as above, see [census-examples](https://github.com/data4news/census-examples) repository for examples or ask in the class slack channel if stuck.

In [26]:
# Code adapted from:
# https://gis.stackexchange.com/questions/363830/applying-the-censusgeocode-package-to-an-entire-dataframe-of-geocoded-data
# Defines a geocode function that accepts lat/long and spits out geographies
# The code then runs that funciton in parllel (for speed).

import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import glob
import json
import requests
import pandas as pd
from pprint import pprint
from tqdm import tqdm


import requests_cache
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

def geocode(lat, lng):
    try:
        url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
        params = {
            "x": lng,
            "y": lat,
            "benchmark": "Public_AR_Census2020",
            "vintage": "Census2020_Census2020",
            "format": "json"
        }
        response = cache.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        census = data['result']['geographies']['Census Blocks'][0]
        return census
    except Exception as e:
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

def bulk_geocode(latitudes, longitudes):
    """
    Geocode a list of latitudes and longitudes in parallel (for speed).
    """

    with ThreadPoolExecutor() as tpe:
        latitudes = df['LATITUDE']
        longitudes = df['LONGITUDE']
        mapped_results = tpe.map(geocode, latitudes, longitudes)
        data = list(tqdm(mapped_results, total=len(df)))

    #return pd.DataFrame(data)
        return pd.DataFrame([x for x in data if x])

census_geos_df = bulk_geocode(df['LATITUDE'], df['LONGITUDE']) 
census_geos_df.head()
len(census_geos_df)


  1%|          | 72/10000 [00:01<02:44, 60.20it/s] 

Error geocoding (0.0, 0.0): 'Census Blocks'


  2%|▏         | 166/10000 [00:03<02:45, 59.59it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


  8%|▊         | 841/10000 [00:13<02:18, 66.19it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 13%|█▎        | 1303/10000 [00:20<02:13, 64.99it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 15%|█▍        | 1458/10000 [00:23<02:11, 64.85it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 19%|█▊        | 1857/10000 [00:29<02:06, 64.43it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 24%|██▍       | 2409/10000 [00:37<01:41, 74.48it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 25%|██▌       | 2539/10000 [00:39<01:54, 65.04it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 26%|██▌       | 2574/10000 [00:40<01:48, 68.21it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 38%|███▊      | 3764/10000 [00:57<01:24, 73.43it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 40%|████      | 4043/10000 [01:01<01:33, 63.87it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 44%|████▍     | 4388/10000 [01:06<01:16, 73.39it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 48%|████▊     | 4790/10000 [01:12<01:14, 69.99it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 52%|█████▏    | 5234/10000 [01:18<01:08, 69.17it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 54%|█████▍    | 5377/10000 [01:20<01:06, 69.05it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 55%|█████▍    | 5475/10000 [01:21<01:05, 69.48it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 64%|██████▍   | 6380/10000 [01:33<01:05, 54.91it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 69%|██████▉   | 6878/10000 [01:40<00:40, 76.74it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 78%|███████▊  | 7848/10000 [01:52<00:24, 89.57it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'
Error geocoding (0.0, 0.0): 'Census Blocks'


 80%|████████  | 8041/10000 [01:54<00:24, 80.30it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 86%|████████▌ | 8555/10000 [02:01<00:19, 73.44it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'
Error geocoding (0.0, 0.0): 'Census Blocks'


 86%|████████▌ | 8604/10000 [02:01<00:17, 81.94it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 91%|█████████▏| 9127/10000 [02:07<00:09, 92.90it/s] 

Error geocoding (0.0, 0.0): 'Census Blocks'


 92%|█████████▏| 9161/10000 [02:08<00:10, 80.97it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 92%|█████████▏| 9197/10000 [02:08<00:08, 90.26it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 96%|█████████▌| 9552/10000 [02:13<00:05, 81.25it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 96%|█████████▌| 9583/10000 [02:13<00:04, 86.52it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


 99%|█████████▉| 9940/10000 [02:17<00:00, 101.80it/s]

Error geocoding (0.0, 0.0): 'Census Blocks'


100%|██████████| 10000/10000 [02:18<00:00, 72.26it/s]


9970

In [27]:
census_geos_df

Unnamed: 0,SUFFIX,POP100,GEOID,CENTLAT,BLOCK,AREAWATER,STATE,BASENAME,OID,LSADC,...,TRACT,CENTLON,BLKGRP,AREALAND,HU100,INTPTLON,MTFCC,LWBLKTYP,UR,COUNTY
0,,105,360610033002007,+40.7207072,2007,0,36,2007,210701008621492,BK,...,003300,-074.0055917,2,4221,52,-074.0055917,G5040,L,U,061
1,,39,360810769021002,+40.7152988,1002,0,36,1002,210701006114638,BK,...,076902,-073.8333178,1,16299,11,-073.8333178,G5040,L,U,081
2,,266,360050345002000,+40.9051707,2000,0,36,2000,210701006026415,BK,...,034500,-073.8989332,2,22929,127,-073.8989332,G5040,L,U,005
3,,213,360810840002005,+40.6752289,2005,0,36,2005,210701006107453,BK,...,084000,-073.8199189,2,18335,72,-073.8199189,G5040,L,U,081
4,,210,360470519003007,+40.7138453,3007,0,36,3007,210701004649189,BK,...,051900,-073.9588926,3,8415,106,-073.9588926,G5040,L,U,047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9965,,0,360610210005001,+40.8128572,5001,0,36,5001,210701008619778,BK,...,021000,-073.9345132,5,3770,0,-073.9345132,G5040,L,U,061
9966,,704,360470257001003,+40.6959147,1003,0,36,1003,210701004657171,BK,...,025700,-073.9478539,1,20401,166,-073.9478539,G5040,L,U,047
9967,,380,360471198003003,+40.6771610,3003,0,36,3003,210701004652395,BK,...,119800,-073.8919216,3,14177,131,-073.8919216,G5040,L,U,047
9968,,473,360810149001000,+40.7618158,1000,0,36,1000,210701006114446,BK,...,014900,-073.9138956,1,23152,232,-073.9138956,G5040,L,U,081


## 👉 Output Data

Output your dataframe containing your data and the Census connector codes (like tract, block, etc...).

In [16]:
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df[to_keep]
census_geos_df

Unnamed: 0,GEOID,STATE,COUNTY,TRACT,BLOCK
0,360810769021002,36,081,076902,1002
1,360470458002000,36,047,045800,2000
2,360810125002000,36,081,012500,2000
3,360050199004002,36,005,019900,4002
4,360850170051019,36,085,017005,1019
...,...,...,...,...,...
9956,360470085001000,36,047,008500,1000
9957,360470119011009,36,047,011901,1009
9958,360050425005000,36,005,042500,5000
9959,360610303001000,36,061,030300,1000


In [17]:
df_with_geos = pd.concat(
    [ 
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,GEOID,STATE,COUNTY,TRACT,BLOCK
0,2023-08-08,17:11:00,QUEENS,11375.0,40.71512,-73.83224,"(40.71512, -73.83224)",QUEENS BOULEVARD,78 AVENUE,,...,Sedan,Station Wagon/Sport Utility Vehicle,,,,360810769021002,36,81,76902,1002
1,2023-10-26,18:00:00,,,40.624554,-73.9671,"(40.624554, -73.9671)",EAST 9 STREET,,,...,Sedan,Bus,,,,360470458002000,36,47,45800,2000
2,2023-07-18,16:07:00,QUEENS,11102.0,40.772083,-73.91752,"(40.772083, -73.91752)",,,24-46 29 STREET,...,Sedan,Station Wagon/Sport Utility Vehicle,,,,360810125002000,36,81,12500,2000
3,2023-05-07,02:30:00,BRONX,10452.0,40.83632,-73.92748,"(40.83632, -73.92748)",,,1133 OGDEN AVENUE,...,Sedan,Sedan,Sedan,,,360050199004002,36,5,19900,4002
4,2023-01-22,03:47:00,,,40.5405,-74.177315,"(40.5405, -74.177315)",SNEDEN AVENUE,,,...,Sedan,Pick-up Truck,,,,360850170051019,36,85,17005,1019


In [18]:
# Saving the DataFrame as a CSV file
df_with_geos.to_csv('geocoded_col_10k.csv', index=False)
