# This notebook cleans up the corporations data and finds lat/lng/city for each corporation. 

Note: the geocoding cells won't run until `GOOGLE_MAPS_API_KEY` is set to your own Google Maps API key.

In [None]:
!pip install geocoder
!pip install uszipcode

In [None]:
import os
import time

import geocoder
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from tqdm import *


In [None]:
# 2020 climate-change survey responses from corporations.
corporations_2020 = pd.read_csv("/kaggle/input/cdp-unlocking-climate-solutions/Corporations/Corporations Responses/Climate Change/2020_Full_Climate_Change_Dataset.csv")

# Supplementary corporation-location table, restricted to the rows that belong
# to the 2020 climate-change survey. The explicit .copy() makes the filtered
# frame independent of the full table, so adding and updating columns on it
# does not raise SettingWithCopyWarning or write into a temporary.
supplied_geo = pd.read_csv("../input/cdp-unlocking-climate-solutions/Supplementary Data/Locations of Corporations/NA_HQ_public_data.csv")
supplied_geo = supplied_geo.loc[supplied_geo["survey_name"] == "Climate Change 2020"].copy()

# Coordinate columns start empty; the geocoding cell fills them in.
supplied_geo["lat"] = np.nan
supplied_geo["lng"] = np.nan

In [None]:
# Sorted array with one entry per distinct account number in the survey.
account_col = corporations_2020["account_number"]
acct_numbers = np.unique(account_col)

# Lookup from account number to organization name. If an account number occurs
# on several rows, the name from the last of those rows is the one kept.
acct_org = {
    number: name
    for number, name in zip(
        np.array(account_col),
        np.array(corporations_2020["organization"]),
    )
}

Get all cities and states that are supplied and set up a boolean flag to figure out which ones we don't have info for. 

In [None]:
n_accounts = len(acct_numbers)

# Object arrays so that each slot can hold either a string or NaN. Every slot
# starts as NaN ("not supplied") and is only overwritten when the supplementary
# data provides both a city and a state for that account.
cities = np.full(n_accounts, np.nan, dtype=object)
states = np.full(n_accounts, np.nan, dtype=object)
# Built-in bool: the np.bool alias was removed from NumPy and raises AttributeError.
has_geo_flag = np.zeros(n_accounts, dtype=bool)

for i, acct in enumerate(acct_numbers):
    cdf = supplied_geo[supplied_geo["account_number"] == acct]
    if len(cdf) == 0:
        # Account is not in the supplementary data; it is geocoded later.
        continue

    # Only the first matching row is consulted.
    city = cdf["address_city"].iloc[0]
    state = cdf["address_state"].iloc[0]

    # Missing values are read back from the CSV as float NaN, so "is a string"
    # is the test for "was actually supplied".
    if isinstance(city, str) and isinstance(state, str):
        cities[i] = city
        states[i] = state
        has_geo_flag[i] = True

In [None]:
# Read the key from the environment so it never has to be saved inside the
# notebook. The "api_key" placeholder is kept as the fallback; the geocoding
# cell won't run successfully until a real key is provided.
GOOGLE_MAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "api_key")

The query below finds lat/lng, city, state, and country for all corporations. 

In [None]:
gs = []  # raw geocoder result objects, one per account, in acct_numbers order
countries = np.zeros_like(cities)  # object array, filled per account below
latlng = []  # the geocoder's `latlng` value per account, in acct_numbers order


def _lat_lng_or_nan(geocode_result):
    """Return (lat, lng) from a geocoder result, or (nan, nan) if it has no coordinates."""
    coords = geocode_result.latlng
    if not coords:  # covers both None and an empty list
        return np.nan, np.nan
    return coords[0], coords[1]


for i in tqdm(range(len(acct_numbers))):  # loop over all accounts
    acct = acct_numbers[i]

    # Boolean row mask for this account in the supplementary data. Writes go
    # through supplied_geo.loc[mask, column] so they land in supplied_geo itself;
    # chained `supplied_geo[column].iloc[...] = value` may only modify a
    # temporary Series.
    acct_rows = supplied_geo["account_number"] == acct
    cdf = supplied_geo[acct_rows]

    if len(cdf) == 0:
        # Account is not in the supplementary data: search by organization name
        # alone and take city, state and country from the geocoder result.
        query_string = acct_org[acct]
        g = geocoder.google(query_string, key=GOOGLE_MAPS_API_KEY)
        gs.append(g)
        cities[i] = g.city
        states[i] = g.state
        countries[i] = g.country
        latlng.append(g.latlng)
        continue

    if has_geo_flag[i]:
        # City and state were supplied: geocode them only to get coordinates.
        query_string = cities[i] + " " + states[i]
        g = geocoder.google(query_string, key=GOOGLE_MAPS_API_KEY)
        countries[i] = cdf["hq_country"].iloc[0]
    else:
        # City and/or state is missing: search by organization name plus
        # whatever location details the supplementary data does provide.
        query_string = acct_org[acct]

        city = cdf["address_city"].iloc[0]
        state = cdf["address_state"].iloc[0]
        country = cdf["hq_country"].iloc[0]

        for part in (city, state, country):
            if isinstance(part, str):  # skip values that were not supplied
                query_string = query_string + " " + part

        g = geocoder.google(query_string, key=GOOGLE_MAPS_API_KEY)

        cities[i] = g.city
        states[i] = g.state
        # Prefer the supplied country; fall back to the geocoder's answer.
        countries[i] = country if isinstance(country, str) else g.country

        # Record the geocoder's answer in the supplementary table as well.
        supplied_geo.loc[acct_rows, "address_city"] = g.city
        supplied_geo.loc[acct_rows, "address_state"] = g.state
        supplied_geo.loc[acct_rows, "hq_country"] = g.country

    gs.append(g)
    latlng.append(g.latlng)

    # A lookup that finds nothing has no usable coordinates; store NaN for it
    # instead of relying on a bare `except` to swallow the indexing error.
    lat, lng = _lat_lng_or_nan(g)
    supplied_geo.loc[acct_rows, "lat"] = lat
    supplied_geo.loc[acct_rows, "lng"] = lng

In [None]:
# Pre-create the output columns on the survey frame.
# The text columns use object dtype so that strings can be written into them
# later without changing the column's dtype. The coordinates start as NaN so
# that a row which is never filled in cannot be mistaken for the real location
# (0.0, 0.0).
for text_column in ("city", "state", "country"):
    corporations_2020[text_column] = pd.Series(
        np.nan, index=corporations_2020.index, dtype=object
    )
corporations_2020["lat"] = np.nan
corporations_2020["lng"] = np.nan

Now that we have the cities, states, countries, and coordinates for all the corporations, we can modify the original survey data and add those as fields. 

In [None]:
# The per-account results are positional: entry i of cities / states /
# countries / latlng belongs to acct_numbers[i]. Fail early if the geocoding
# cell was interrupted or re-run partially and the lists are out of step.
assert len(latlng) == len(acct_numbers), "latlng is out of step with acct_numbers"


def _none_to_nan(values):
    """Return `values` as a list with every None replaced by NaN."""
    return [np.nan if value is None else value for value in values]


# One list of per-account values for each output column. An account whose
# geocoding returned no coordinates (None or an empty list) gets NaN.
per_account = {
    "city": _none_to_nan(cities),
    "state": _none_to_nan(states),
    "country": _none_to_nan(countries),
    "lat": [coords[0] if coords else np.nan for coords in latlng],
    "lng": [coords[1] if coords else np.nan for coords in latlng],
}

# Broadcast each account's value to all of its survey rows with a single
# lookup per column. This assigns whole columns on the frame itself, so it
# does not depend on chained `frame[column].iloc[...] = value` writing
# through, and it avoids scanning the full frame once per account.
account_col = corporations_2020["account_number"]
for column, values in per_account.items():
    corporations_2020[column] = account_col.map(dict(zip(acct_numbers, values)))
    
    

In [None]:
# corporations_2020.to_csv("data/Supplementary Data/corporations_2020_climate_change_geo.csv") save file for later use should show up in external