# This notebook cleans up the cities data, selects only US cities, and finds lat/lng/county for each city. 

Note: the kernel won't run without setting BING_API_KEY to your key. 

In [None]:
!pip install geocoder
!pip install uszipcode

In [None]:
import os
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.preprocessing import RobustScaler, QuantileTransformer
import geopandas as gpd
import geocoder
from tqdm import *
import plotly.express as px

from uszipcode import Zipcode
from uszipcode import SearchEngine
# Shared uszipcode search handle; the county-derivation cell calls
# `search.by_city_and_state(...)` on it and reads zipcode, county, population and
# population_density from the results.
# NOTE(review): `simple_zipcode=False` presumably selects the fuller database that
# carries those fields — confirm against the installed uszipcode version.
search = SearchEngine(simple_zipcode=False)

Let's open the cities disclosing file and select the cities in the US. 

In [None]:
# Load the 2020 CDP "Cities Disclosing" table and keep only the US rows.
cities_disc_2020 = pd.read_csv("/kaggle/input/cdp-unlocking-climate-solutions/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv")

# Take an explicit copy: new columns are attached to this subset later on, and
# writing to a filtered slice triggers pandas' SettingWithCopyWarning.
is_us_row = cities_disc_2020["Country"] == "United States of America"
cities_disc_2020_sub = cities_disc_2020[is_us_row].copy()

# Plain Python lists, positionally aligned with the rows of the US subset.
orgs_2020 = cities_disc_2020_sub["Organization"].tolist()  # organization names in the US
cities_2020 = cities_disc_2020_sub["City"].tolist()  # city names in the US


In [None]:
# Bing Maps key used by every geocoding request below. It is read from the
# BING_API_KEY environment variable so the secret does not have to be saved in the
# notebook; when the variable is unset the original placeholder is used, and you
# can still overwrite it here by hand.
BING_API_KEY = os.environ.get("BING_API_KEY", "YOUR_API_KEY")


Let's do a Bing geocoding query for all the cities. There were a few quirks but the following conditionals seem to work! 

In [None]:
# Geocode every organisation with Bing and keep the raw response of each request.
#
# Query string rules:
#   * Toledo and Birmingham must keep their "City of" prefix to return the right result.
#   * Any other name beginning with "Ci" loses its first 8 characters (the length of "City of ").
#   * Everything else is sent unchanged.
# NOTE(review): the "Ci" test also catches names such as "City and County of Honolulu",
# which is then queried as " County of Honolulu" — confirm this is intended.
keep_full_name = ("City of Toledo", "City of Birmingham")

gs_2020 = []
for org_name in tqdm(orgs_2020):
    if org_name in keep_full_name or org_name[0:2] != "Ci":
        query = org_name
    else:
        query = org_name[8:]
    gs_2020.append(geocoder.bing(query, key=BING_API_KEY).raw)


Okay, now we want to find the counties that a city spans. Sometimes a city can span multiple counties (e.g., NYC), so we also query the population of the city within each county and weight each county accordingly. The code below does that. It's a little messy but works. 

In [None]:
# Derive, for every US city, its county (or counties), population-based county
# weights, population density, one zip code, and the Bing location fields.
# All output lists are positionally aligned with `cities_2020` / `orgs_2020` / `gs_2020`.
counties_2020 = []  # per city: one county name, or a list of names when the city spans several
counties_weights_2020 = []  # per city: 1.0, or a list of population shares aligned with the county list
bbox_2020 = []  # bounding box reported by Bing; reliability unclear
loc_2020 = []  # "POINT (a b)" string built from Bing's point coordinates (index 1, then index 0)
states_2020 = []  # Bing `adminDistrict`
cities_bing_name_2020 = []  # Bing formatted address (more reliable than the raw name)
pop_densities_2020 = []  # population density; NaN when unavailable
match_multiple_county = np.zeros(len(cities_2020), dtype=bool)  # True where the county entry is a list
zip_2020 = []  # one zip code per city; NaN when no zip code could be derived

for i in tqdm(range(len(cities_2020))):
    address = gs_2020[i]["address"]
    split_city = address["formattedAddress"].split(", ")

    if orgs_2020[i] == "New York City":
        # NYC is hand-coded (figures from Wikipedia, per the original note). The counties
        # are stored as a real list — like every other multi-county city — so the
        # matching step iterates over names rather than over characters of a string.
        counties_2020.append(["New York County", "Kings County", "Bronx County", "Richmond County", "Queens County"])
        counties_weights_2020.append([1.628/8.336, 2.559/8.336, 1.412/8.336, 0.476/8.336, 2.253/8.336])
        result = search.by_city_and_state(split_city[0], split_city[1], returns=-1)
        pop_densities_2020.append(result[0].population_density)
        match_multiple_county[i] = True
        zip_2020.append(result[0].zipcode)

    elif orgs_2020[i] == "District of Columbia":
        # DC is hand-coded as well.
        counties_2020.append(orgs_2020[i])
        counties_weights_2020.append(1.0)
        pop_densities_2020.append(np.nan)
        zip_2020.append(20001)

    elif len(split_city) == 2:
        # The address split into something like ["Seattle", "WA"], i.e. a city:
        # look up every zip code that belongs to it.
        result = search.by_city_and_state(split_city[0], split_city[1], returns=-1)

        if len(result) == 0:
            # uszipcode has nothing for this city: fall back to Bing's county.
            counties_2020.append(address["adminDistrict2"])
            counties_weights_2020.append(1.0)
            pop_densities_2020.append(np.nan)
            zip_2020.append(np.nan)
        else:
            zip_2020.append(result[0].zipcode)

            # Keep only zip codes that carry a county, and derive the per-zip arrays
            # below from this same filtered list so the boolean masks line up.
            with_county = [z for z in result if z.county]
            zip_counties = np.array([z.county for z in with_county])
            unique_counties = np.unique(zip_counties)  # several zip codes can share a county

            if len(unique_counties) == 1:
                # Exactly one county for this city.
                counties_2020.append(unique_counties[0])
                counties_weights_2020.append(1.0)
                first_density = result[0].population_density
                pop_densities_2020.append(np.nan if first_density is None else first_density)
            else:
                # Missing populations / densities count as 0.
                pops = np.array([z.population if z.population is not None else 0 for z in with_county])
                pop_density = np.array(
                    [z.population_density if z.population_density is not None else 0 for z in with_county]
                )
                # Aggregate per county: summed population, mean density.
                pops_county = np.array([np.sum(pops[zip_counties == c]) for c in unique_counties])
                pop_density_county = np.array([np.mean(pop_density[zip_counties == c]) for c in unique_counties])
                # County weights are population shares.
                # NOTE: if every population is 0 these come out as NaN.
                weights = pops_county / pops_county.sum()
                pop_density_city = np.sum(pop_density_county * weights)

                counties_2020.append(unique_counties.tolist())
                counties_weights_2020.append(weights.tolist())
                pop_densities_2020.append(pop_density_city)
                match_multiple_county[i] = True
    else:
        # Not a plain "City, ST" address: take the county from Bing with full weight
        # (the weight used to be filled with the county name by mistake).
        counties_2020.append(address["adminDistrict2"])
        counties_weights_2020.append(1.0)
        pop_densities_2020.append(np.nan)
        zip_2020.append(np.nan)

    bbox_2020.append(gs_2020[i]["bbox"])
    loc_2020.append("POINT (%f %f)" % (gs_2020[i]["point"]["coordinates"][1], gs_2020[i]["point"]["coordinates"][0]))
    states_2020.append(address["adminDistrict"])
    cities_bing_name_2020.append(address["formattedAddress"])


Most datasets at the county level use the FIPS code as a unique identifier since a lot of counties can have the same name. We use the CDC SVI data to match the counties we derived above to their respective FIPS codes.

In [None]:
# County-level CDC Social Vulnerability Index table; its COUNTY / ST_ABBR / FIPS
# columns are used below to turn county names into FIPS codes.
df = pd.read_csv("/kaggle/input/cdp-unlocking-climate-solutions//Supplementary Data/CDC Social Vulnerability Index 2018/SVI2018_US_COUNTY.csv")

# Work from the lists derived above. The `counties` / `state` columns are only
# attached to `cities_disc_2020_sub` further down, so reading them here would fail
# on a fresh kernel (Restart & Run All).
counties = list(counties_2020)
states = list(states_2020)

In [None]:
import ast

# Suffixes removed from a county name before it is compared with the SVI `COUNTY`
# column. Only the first suffix whose word occurs in the name is removed. The
# single-county and multi-county paths deliberately keep their own rule sets.
_SINGLE_COUNTY_SUFFIXES = (" County", " Borough", " Parish")
_MULTI_COUNTY_SUFFIXES = (" County", " Borough", " Municipality", " Parish", " city")
_MULTI_COUNTY_SPECIAL_CASES = {"Municipality of Anchorage": "Anchorage"}


def _normalize_county_name(name, suffixes, special_cases=None):
    """Return `name` with the first applicable suffix removed and whitespace collapsed.

    `special_cases` maps exact names to their replacement and is checked first.
    Names that match no rule are returned unchanged.
    """
    if special_cases and name in special_cases:
        return special_cases[name]
    for suffix in suffixes:
        if suffix.strip() in name:
            return " ".join(name.replace(suffix, "").split())
    return name


def _find_county_row(svi, name, state, missing_ok):
    """Return the positional row of county `name` in the SVI frame `svi`.

    A unique name match wins. Otherwise a name containing "DC" maps to the first
    DC row, and remaining candidates are disambiguated by `state` (ST_ABBR).
    When nothing matches the name at all, NaN is returned if `missing_ok` is true.

    Raises:
        IndexError: if no candidate row matches `state`.
    """
    idxs = np.where(svi["COUNTY"] == name)[0]
    if len(idxs) == 1:  # simple match
        return idxs[0]
    if "DC" in name:
        return np.where(np.asarray(svi["ST_ABBR"] == "DC"))[0][0]
    if len(idxs) == 0 and missing_ok:
        return np.nan
    # Several counties share the name, so the state has to match too.
    state_hits = np.where(svi["ST_ABBR"].iloc[idxs] == state)[0]
    if len(state_hits) == 0:
        raise IndexError("no SVI row found for county %r in state %r" % (name, state))
    return idxs[state_hits][0]


# match[i] is one SVI row position for a single-county city, or a list of row
# positions (NaN where a county could not be found) for a multi-county city.
match = []

for i in range(len(counties)):  # loop over all derived counties
    if not match_multiple_county[i]:  # only one county
        counties[i] = _normalize_county_name(counties[i], _SINGLE_COUNTY_SUFFIXES)
        match.append(_find_county_row(df, counties[i], states[i], missing_ok=False))
    else:  # has multiple counties
        if isinstance(counties[i], str):
            # A multi-county entry stored as the text of a list, e.g. '["A County", "B County"]':
            # parse it so we iterate over county names, not over characters.
            counties[i] = ast.literal_eval(counties[i])
        mcounties = counties[i]
        tmatch = []
        for j in range(len(mcounties)):
            # Normalised names are written back into the list in place.
            mcounties[j] = _normalize_county_name(
                mcounties[j], _MULTI_COUNTY_SUFFIXES, _MULTI_COUNTY_SPECIAL_CASES
            )
            tmatch.append(_find_county_row(df, mcounties[j], states[i], missing_ok=True))
        match.append(tmatch)

In [None]:
# Translate the matched SVI row positions into FIPS codes.
# match_fips[i] is a single code for a single-county city and a list of codes for a
# multi-county city; flag_multiple marks the latter.
match_fips = []
flag_multiple = np.zeros(len(match), dtype=bool)  # builtin bool: the `np.bool` alias was removed from NumPy
for i, rows in enumerate(match):
    if isinstance(counties[i], str):  # also true for np.str_, which subclasses str
        match_fips.append(df["FIPS"].iloc[rows])
    else:
        # An unmatched county is recorded upstream as NaN; carry it through as a
        # missing FIPS code instead of passing NaN to `.iloc`.
        match_fips.append([np.nan if pd.isna(row) else df["FIPS"].iloc[row] for row in rows])
        flag_multiple[i] = True

A couple of the external datasets are at the MSA (metropolitan statistical area) level instead of the county level. We will choose a random zip code and derive the MSA name and MSA code using data from the US Department of Labor (fs11_gpci_by_msa-ZIP.xls).

In [None]:
# Hand-picked zip codes, keyed by organisation name, for the entries whose zip code
# could not be derived automatically (they are NaN in `zip_2020`).
zips_2020_unassigned = {
    "City of Piedmont, CA": "94618",
    "Orange County, NC": "27243",
    "Cuyahoga County": "44118",
    "Santa Fe County": "87507",
    "Boulder County": "80302",
    "Summit County, UT": "84061",
    "Dane County": "53508",
    "Broward County, FL": "33024",
    "City of Milwaukie, OR": "97206",
    "City and County of Honolulu": "96795",
}

# Fill the gaps in place; an organisation missing from the table raises KeyError.
for position, zip_code in enumerate(zip_2020):
    if np.isnan(float(zip_code)):
        zip_2020[position] = zips_2020_unassigned[orgs_2020[position]]

In [None]:
# ZIP code -> MSA crosswalk (sheet "fs11gpci by ZIP, owcp"; the first 10 rows are skipped).
zz = pd.read_excel("/kaggle/input/msabyzip/fs11_gpci_by_msa-ZIP.xls", sheet_name="fs11gpci by ZIP, owcp", skiprows=10)

# Look up the MSA number and name for each city's zip code.
MSA = []
MSA_name = []
for zip_code in zip_2020:
    cdf = zz[zz["ZIP CODE"] == int(zip_code)]
    if cdf.empty:
        raise LookupError("ZIP code %s not found in the MSA crosswalk" % zip_code)
    # Use the first matching row for both fields. `int(Series)` is deprecated in
    # pandas and fails outright when a ZIP code matches more than one row.
    MSA.append(int(cdf["MSA No."].iloc[0]))
    MSA_name.append(str(cdf["MSA Name"].iloc[0]))

In [None]:
# Attach every derived list to the US subset as a new column, in this order.
# Each list is positionally aligned with the rows of `cities_disc_2020_sub`.
new_columns = {
    "counties": counties_2020,
    "counties_weights": counties_weights_2020,
    "bbox": bbox_2020,
    "loc": loc_2020,
    "state": states_2020,
    "city_bing": cities_bing_name_2020,
    "pop_density": pop_densities_2020,
    "FIPS": match_fips,
    "flag_multiple_counties": flag_multiple,
    "MSA": MSA,
    "MSA_name": MSA_name,
    "random_zipcode": zip_2020,
}
for column_name, values in new_columns.items():
    cities_disc_2020_sub[column_name] = values

In [None]:
# Write the enriched US-cities table to the working directory.
output_csv = "cities_updated_geo_us_2020.csv"
cities_disc_2020_sub.to_csv(output_csv)