# Calculation notebook
This notebook holds the data extraction and processing part of the ADP:ABC team submission. It is split into the 3 main headings which align with the Sections in the descriptive notebook, and creation of additional helpful datasets:
* Preparing external data
* Section A: Ambition KPI
* Section B: Benefits
* Section C: Collaboration
* Code appendix

In [None]:
# Settings
# Reference year for the analysis: the reported population figure whose year lies
# closest to this value is treated as a city's latest estimate.
CURRENT_YEAR = 2020

In [None]:
import pandas as pd

# Read the full CDP cities questionnaire export for each reporting year
fname = '../input/cdp-unlocking-climate-solutions/Cities/Cities Responses/{}_Full_Cities_Dataset.csv'
df_cities, df_cities_2019, df_cities_2018 = (
    pd.read_csv(fname.format(year)) for year in ('2020', '2019', '2018')
)
# Stack the three years into one frame, most recent questionnaire first
df_citites_all_years = pd.concat([df_cities, df_cities_2019, df_cities_2018])

# Preparing external data
## Append city population and GDP data
Our ambition KPI will be in 'per capita' terms, and evaluated in the context of city GDP. The following cells prepare population projections and GDP data for joining with the CDP questionnaire data.

### City population data
We source projections for _country level_ populations from the World Bank. We then project each city's latest population estimate (CDP city questionnaire Q0.5 or Q0.6 depending on the year) by the yearly percentage change in the relevant country's urban population by this World Bank forecast.

#### Limitations
* Taiwan and the State of Palestine are not represented in the World Bank data; we sourced UN estimates. These are at a total country level and not urban population, so the projection for these cities assumes a flat level of urbanisation to 2050.
* Where the latest population figure from the questionnaire is for a year preceding 2010 (or after 2050, though no cases apply) we assume the population is equal in 2010. This is so that we can join on the World Bank data, which starts in 2010.

In [None]:
from typing import List, Tuple, Set
import math
import collections

# Record for a single population figure: the year the figure applies to, the
# head-count itself, and the questionnaire year in which it was reported.
Population = collections.namedtuple(
    'Population', 'year population reported_year'
)

# Collect the population question from each year's questionnaire: it is numbered
# 0.5 in the 2020 and 2019 data and 0.6 in the 2018 data
df = pd.concat([
    df_cities[df_cities['Question Number'] == '0.5'],
    df_cities_2019[df_cities_2019['Question Number'] == '0.5'],
    df_cities_2018[df_cities_2018['Question Number'] == '0.6']
])

# Names given to the transposed answer columns 0..3. The list is in alphabetical
# order, which is the order the answers arrive in after sorting on 'Column Name'
# (assumes the questionnaire's 'Column Name' values are exactly these four
# strings - TODO confirm)
ordered_questions = [
    'Current population', 
    'Current population year', 
    'Projected population', 
    'Projected population year'
]  # alphabetical, matching the sort on 'Column Name'
ordered_questions_dict = {i: q for i, q in enumerate(ordered_questions)}

# Create tuples for all population records / forecasts
# Note: no questionnaires have multiple rows for this question so it has been omitted,
# but in future be careful to check this; we use Column Name as we believe it is less
# likely to change between questionnaires for different years than Column Number
def create_year_pop_tuples(r):
    """
    Build the two Population records held by one transposed questionnaire row:
    first the current figure, then the projected figure. Both are tagged with
    the row's 'Year Reported to CDP'.
    """
    year_and_count_columns = (
        ('Current population year', 'Current population'),
        ('Projected population year', 'Projected population'),
    )
    return [
        Population(r[year_col], r[count_col], r['Year Reported to CDP'])
        for year_col, count_col in year_and_count_columns
    ]

# Transpose relevant rows to columns: one row per organization / country /
# reporting year, with that group's answers spread across integer-labelled
# columns which are then renamed via ordered_questions_dict
df_pop = (
    df.sort_values(by=['Year Reported to CDP', 'Column Name'])
      .groupby(['Organization', 'Country', 'Year Reported to CDP'])['Response Answer'] 
      .apply(lambda df: df.reset_index(drop=True))
      .unstack()
      .reset_index()
      .rename(columns=ordered_questions_dict)
)
# Create tuples
# Answers are cast to float first, so missing answers become NaN
df_pop[ordered_questions] = df_pop[ordered_questions].astype(float)
df_pop['year_pop'] = df_pop.apply(create_year_pop_tuples, axis=1)
# 'year_pop' holds a list per row, so summing within an organization
# concatenates the lists from all of its reporting years
df_pop = (
    df_pop.groupby(['Organization', 'Country'])
          .agg({'year_pop': 'sum'})
          .reset_index()
)

# Where multiple values exist for the same population year, take the most recent questionnaire's
def take_latest_population_estimate(l: List[Tuple[int]]) -> Set[Tuple[int]]:
    """
    Keep a single population record per population year.

    Accepts a list of namedtuples with attributes 'year', 'population' and
    'reported_year'. Records holding a NaN float in any field are discarded.
    For each remaining 'year', the record with the most recent 'reported_year'
    is kept; if several records share that 'reported_year', the one with the
    highest 'population' is taken.

    Returns the surviving records as a set (empty when nothing survives).
    """
    best_by_year = {}
    for t in l:
        if any(isinstance(n, float) and math.isnan(n) for n in t):
            continue  # incomplete record: remove nan
        incumbent = best_by_year.get(t.year)
        # Rank by recency of the questionnaire first, then by population size
        if incumbent is None or (
            (t.reported_year, t.population)
            > (incumbent.reported_year, incumbent.population)
        ):
            best_by_year[t.year] = t
    return set(best_by_year.values())

# Reduce each organization's records to a set with one Population tuple per year
df_pop['year_pop'] = df_pop['year_pop'].apply(take_latest_population_estimate)

In [None]:
# Load World Bank urban population estimates and projections
fpath = "../input/global-population-estimates/data.csv"
df_wcp = pd.read_csv(fpath)

# Year columns are labelled like '2010 [YR2010]' in the file; map each one to the
# plain year string
cols_rename = {f'{y} [YR{y}]': str(y) for y in range(2010, 2051)}
cols = ['Country Name', 'Country Code', 'Series Name'] + list(cols_rename)
series = ['Urban population']  # 'Urban population growth (annual %)', 'Population, total'

# Keep only the wanted series and columns, normalise country names, rename years
is_wanted_series = df_wcp['Series Name'].isin(series)
df_wcp = df_wcp.loc[is_wanted_series, cols]
df_wcp['Country Name'] = df_wcp['Country Name'].apply(lambda s: s.lower())
df_wcp = df_wcp.rename(columns=cols_rename)

In [None]:
## Manual adjustments

# Fill missing population data from CDP questionnaires
# Each value is Population(year, population, reported_year); the trailing comment
# names the source of the figure.
# NOTE(review): several keys contain mis-encoded characters (e.g. 'Hj√∏rring').
# Confirm they match the 'Organization' values in df_pop - a key that matches
# nothing is skipped silently by the substring match below.
d = {
    'Comune di Lucca': Population(2017, 88397, 2020),  # lstat
    'Comune di Reggio Emilia': Population(2017, 171491, 2020),  # lstat
    'Dura Municipality': Population(2007, 28268, 2020),  # Wikipedia 
    'Melton City Council': Population(2018, 51100, 2020),  # ONS UK 
    'Piura': Population(2017, 484475, 2020),  # Wikipedia
    'Municipalidad de Colina': Population(2002, 77815, 2020),  # Wikipedia 
    'Municipality of Hj√∏rring': Population(2020, 25780, 2020),  # Wikipedia 
    'Municipality of Ilha de Mozambique': Population(2020, 14000, 2020),  # Wikipedia
    'Municipality of Villanueva': Population(2018, 433734, 2018),  # Wikipedia, census
    'Municipalit√© de Rabat': Population(2014, 577827, 2020),  # Wikipedia
    'Municipio de Maneiro': Population(2011, 48952, 2011),  # Wikipedia, census
    'Munic√≠pio de C√¢mara de Lobos': Population(2020, 36000, 2020),  # UNESCO estimate
    'Prefeitura de Alex√¢nia': Population(2007, 20033, 2020),  # Wikipedia
    'Prefeitura de Pedra Bela': Population(2015, 6044, 2020),  # Wikipedia
    'Prefeitura de S√£o Carlos': Population(2010, 116765, 2020),  # source not recorded
    'Puente Piedra': Population(2011, 10556, 2011),  # Wikipedia, census
    'West Coast District Municipality': Population(2018, 450610, 2018),  # SA-DSD estimate
    'Wrexham council': Population(2018, 136126, 2018)  # ONS UK
}

# Replace 'year_pop' for every organization whose name contains the key
# (str.contains interprets the key as a regular expression by default)
for city_name, city_pop_tuple in d.items():
    df_pop.loc[df_pop['Organization'].str.contains(city_name), 'year_pop'] = (
        set((city_pop_tuple, ))
    )

# Report how many organizations are still left with an empty 'year_pop' set
msg = '{} organizations without population information.'
print(msg.format(df_pop[df_pop['year_pop'].apply(lambda x: len(x) < 1)].shape[0]))


# Add Taiwan and Palestine (population in thousands) to World Bank data
# UN Department of Economic and Social Affairs: World Population Prospects 2019
# Source: https://population.un.org/wpp/DataQuery/
# Licence: Creative Commons BY 3.0 IGO: 
#          http://creativecommons.org/licenses/by/3.0/igo/
# Note: by using total population (data limitation) we're assuming no 
# change in urbanisation
taiwan_pop = {2010: 23188, 2011: 23269, 2012: 23347, 2013: 23422, 2014: 23492, 2015: 23557, 2016: 23618, 2017: 23675, 2018: 23726, 2019: 23774, 2020: 23817, 2021: 23855, 2022: 23889, 2023: 23918, 2024: 23943, 2025: 23965, 2026: 23983, 2027: 23998, 2028: 24007, 2029: 24012, 2030: 24011, 2031: 24005, 2032: 23992, 2033: 23972, 2034: 23945, 2035: 23908, 2036: 23863, 2037: 23809, 2038: 23746, 2039: 23674, 2040: 23593, 2041: 23503, 2042: 23405, 2043: 23299, 2044: 23187, 2045: 23069, 2046: 22946, 2047: 22818, 2048: 22686, 2049: 22551, 2050: 22413}
palestine_pop = {2010: 4056, 2011: 4150, 2012: 4242, 2013: 4334, 2014: 4429, 2015: 4529, 2016: 4636, 2017: 4747, 2018: 4863, 2019: 4981, 2020: 5101, 2021: 5223, 2022: 5346, 2023: 5469, 2024: 5594, 2025: 5718, 2026: 5843, 2027: 5967, 2028: 6092, 2029: 6217, 2030: 6342, 2031: 6467, 2032: 6593, 2033: 6719, 2034: 6845, 2035: 6971, 2036: 7097, 2037: 7223, 2038: 7349, 2039: 7474, 2040: 7599, 2041: 7724, 2042: 7848, 2043: 7972, 2044: 8095, 2045: 8217, 2046: 8339, 2047: 8460, 2048: 8580, 2049: 8698, 2050: 8816}
# Figures above are in thousands: convert to head-counts and use string year keys
# so they line up with the renamed World Bank year columns
taiwan_pop = {str(year): thousands * 1000 for year, thousands in taiwan_pop.items()}
palestine_pop = {str(year): thousands * 1000 for year, thousands in palestine_pop.items()}

# One row per territory, followed by the same descriptive columns as df_wcp
df_manual_pop = pd.DataFrame([taiwan_pop, palestine_pop], index=[0, 1])
df_manual_pop = df_manual_pop.assign(**{
    'Country Name': ['taiwan, greater china', 'state of palestine'],
    'Country Code': ['TWN', 'PSE'],
    'Series Name': 'Total population',
})
df_wcp = pd.concat([df_wcp, df_manual_pop]).reset_index()


# Map country names to World Bank dataset for merging
# Keys are lower-cased CDP country names; values are the names to merge on.
# NOTE(review): _replace performs a single lookup, so 'united republic of
# tanzania' ends up as 'tanzania, united republic of' and never reaches
# 'tanzania' - confirm which spelling the population data uses.
# NOTE(review): 'state of palestine' is mapped away from the 'state of palestine'
# label given to the manually added population row - confirm that the two
# still join.
country_name_map_wb = {
    'united kingdom of great britain and northern ireland': 'united kingdom',
    'china, hong kong special administrative region': 'hong kong sar, china',
    'republic of korea': 'korea, rep.',
    'republic of moldova': 'moldova',
    'state of palestine': 'palestinian territory, occupied',
    'united republic of tanzania': 'tanzania, united republic of',
    'bolivia (plurinational state of)': 'bolivia',
    'viet nam': 'vietnam',
    "c√¥te d'ivoire": "cote d'ivoire",
    'democratic republic of the congo': 'congo, dem. rep.',
    'united states of america': 'united states',
    'venezuela (bolivarian republic of)': 'venezuela, rb',
    'tanzania, united republic of': 'tanzania',
    'russia': 'russian federation'
}

def _replace(s, d):
    """ Return d[s] when s is a key of d; otherwise hand s back unchanged. """
    return d.get(s, s)

# Lower-case each CDP country name and translate it through the name map
df_pop['country_lower'] = df_pop['Country'].apply(
    lambda s: _replace(s.lower(), country_name_map_wb)
)
# Left join: organizations whose country finds no match are kept
_df_pop = df_pop.merge(
    df_wcp,
    how='left',
    left_on='country_lower',
    right_on='Country Name',
)

In [None]:
# Apply growth in urban population to city population
# Make the assumption that the latest historical estimate of city population is correct
# Note: nasty error handling: years outside of 2010 <= year <= 2050 converted to 2010
import numpy as np


def get_latest_current_population_estimate(l : List[Tuple[int]]) -> Tuple[int]:
    """
    Pick the record whose 'year' lies closest to CURRENT_YEAR. An empty
    collection yields the placeholder Population(0, 0, 0).
    """
    def years_from_current(t):
        return np.abs(t.year - CURRENT_YEAR)

    if l:
        return min(l, key=years_from_current)
    return Population(0, 0, 0)


# Record closest to CURRENT_YEAR for each organization (Population(0, 0, 0) when
# the organization has no records)
_df_pop['latest_year_pop_tuple'] = (
    _df_pop['year_pop'].apply(get_latest_current_population_estimate)
)
_df_pop['latest_year_pop'] = (
    _df_pop['latest_year_pop_tuple'].apply(lambda t: int(t.population))
)
# dangerous assumption; to be replaced!
# Years outside 2010..2050 are treated as 2010 so that a year column exists for them
_df_pop['latest_year'] = _df_pop['latest_year_pop_tuple'].apply(
    lambda t: int(t.year) if t.year >= 2010 and t.year <= 2050 else 2010  
)
# City population as a proportion of the merged country figure for that year;
# max(..., 1) keeps the denominator from being zero
_df_pop['prop_urban_pop_in_city'] = _df_pop.apply(
    lambda r: r['latest_year_pop'] / max(r.loc[str(r['latest_year'])], 1), 
    axis=1
)
# Scale every year column by that proportion to get the city-level projection
for c in cols_rename.values():
    _df_pop[c] = _df_pop[c] * _df_pop['prop_urban_pop_in_city']

A sample of the population data:

In [None]:
# Show three randomly chosen rows as a visual check
_df_pop.sample(3)

### City GDP
We use city GDP to contrast the ambition of a city with its GDP, an indicator of the wealth of the city. The following cells import and clean this data. We use the most recently available data for the 2018 year.

#### Limitations
* GDP data at regional or city level was only identified for the United States. As such, while the method is suitable for any country, the analysis is limited to the US.
* The GDP data available is for Metropolitan Statistical Areas (MSA), as defined by the US BEA. Broadly, this is comprised of a single city and suits our purpose, however when multiple cities are in close proximity they fall under the same MSA and their GDP is combined. In this case we use the MSA's combined population figures to give each city the same, shared GDP per capita figure. These cities are marked with the same 'idx' field in the output.
* Approximately 40% of city organizations who report to CDP do not have cities covered within the BEA data. To estimate the GDP per capita of these cities:
 * We calculate the 'uplift' in GDP per capita of cities who _are_ represented relative to the GDP per capita of the entire state.
 * We perform the population-weighted average of these uplifts.
 * If no data exists for the state, apply the median GDP per capita of the US cities who are represented in the CDP questionnaire.

In [None]:
## City GDP
import pandas as pd
import re


# Metropolitan-area GDP and population tables; only the 2018 column of each is used
fpath_gdp = '../input/cagdp1-gdp-by-us-metropolitan-area/CAGDP1_GDP_by_metropolitan_area.csv'
fpath_pop = '../input/cagdp1-gdp-by-us-metropolitan-area/CAINC1_2018_population_by_metropolitan_area.csv'
_df_gdp = pd.read_csv(fpath_gdp).rename(columns={'2018': '2018_gdp'})
_df_gdp_pop = pd.read_csv(fpath_pop)[['GeoFips', '2018']].rename(
    columns={'2018': '2018_pop'}
)
# Attach population to GDP by area code and report the change in row count
df_gdp = _df_gdp.merge(_df_gdp_pop, on='GeoFips', how='left')
msg = 'Merged 2018 population by metropolitan area with GDP data, ' + \
      'dropping {} records.'
print(msg.format(_df_gdp.shape[0] - df_gdp.shape[0]))

# presumably the source reports GDP in thousands of dollars - TODO confirm
df_gdp['2018_gdp'] = df_gdp['2018_gdp'] * 1000
df_gdp['2018_gdp_per_capita'] = df_gdp['2018_gdp'] / df_gdp['2018_pop'] 
df_gdp = df_gdp.iloc[1:]     # cut off total US record

# Extract country code, clean city name
def extract_country_code(s):
    """
    Return the first stand-alone pair of capital letters found in str(s), or
    None when there is no such pair.
    """
    found = re.search(r'\b[A-Z]{2}\b', str(s))
    return found.group() if found else None
            
# 'state' is the first two-capital-letter token in GeoName (only the first one
# when several are present); 'city' is the lower-cased text before the first comma
df_gdp['state'] = df_gdp['GeoName'].apply(extract_country_code)
df_gdp['city'] = df_gdp['GeoName'].apply(lambda s: s.split(',')[0].lower())
cols = ['city', 'state', '2018_gdp', '2018_pop', '2018_gdp_per_capita']
df_gdp = df_gdp[cols]

In [None]:
# Add state information
# State-level GDP and population tables, joined on GeoFips like the city tables
fpath_gdp_state = '../input/cagdp1-gdp-by-us-metropolitan-area/SAGDP1_2018_GDP_by_state.csv'
fpath_pop_state = '../input/cagdp1-gdp-by-us-metropolitan-area/SAINC1_2018_population_by_state.csv'
_df_gdp_state = pd.read_csv(fpath_gdp_state).rename(columns={'2018': '2018_gdp_state'})
_df_gdp_pop_state = pd.read_csv(fpath_pop_state)[['GeoFips', '2018']].rename(
    columns={'2018': '2018_pop_state'}
)
df_gdp_state = _df_gdp_state.merge(
    _df_gdp_pop_state, 
    on='GeoFips', 
    how='left'
)

# presumably the source reports state GDP in millions of dollars - TODO confirm
df_gdp_state['2018_gdp_state'] = df_gdp_state['2018_gdp_state'] * 1e6
df_gdp_state['2018_gdp_per_capita_state'] = (
    df_gdp_state['2018_gdp_state'] / df_gdp_state['2018_pop_state'] 
)

# Full name -> two-letter postal code for US states, the District of Columbia and
# US territories; used to turn the state table's GeoName into a state code
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Translate state names into postal codes; a GeoName that is not in the lookup
# gets NaN. (A trailing .dropna() has no effect here: assigning the result to a
# column re-aligns it on the index and the NaN rows come straight back.)
df_gdp_state['state_id'] = df_gdp_state['GeoName'].map(us_state_abbrev)

# Merge state information into city GDP dataframe
# Rows without a state code are excluded from the lookup side: pandas matches
# null join keys with each other, so they would otherwise be attached to every
# metropolitan area for which no state code could be extracted.
state_gdp_lookup = df_gdp_state.loc[
    df_gdp_state['state_id'].notna(),
    ['state_id', '2018_gdp_per_capita_state']
]
df_gdp = df_gdp.merge(
    state_gdp_lookup,
    left_on='state',
    right_on='state_id',
    how='left'
)

In [None]:
# GDP information for some metropolitan areas is split between multiple
# cities. Split these cities whilst we perform city-level calculations -
# we'll give cities in the municipalities the same GDP per capita, eventually

# Add index, split into rows, use groupby later
# 'idx' remembers which metropolitan-area record each split row came from
df_gdp['idx'] = df_gdp.index

# One entry per hyphen-separated city name. explode() repeats the original index
# label for every part, which lets join() fan each record out into one row per city
s = df_gdp['city'].str.split('-').explode()
s.name = 'city_split'
df_gdp = df_gdp.join(s)

In [None]:
from typing import Tuple, List
from math import cos, asin, sqrt, pi
import ast


def get_city_with_county_and_lat_long() -> pd.DataFrame:
    """
    Attach US state / county metadata to the US cities in the city location data.

    The pre-prepared city location dataset (see Appendix: get city latitude /
    longitude) carries coordinates but no state, which makes it difficult to join
    on external datasets that are keyed by state or county. Each US city is
    therefore matched to the closest entry of the supplementary US cities data,
    using the Haversine distance (straight line on a sphere) between
    latitude/longitude values, and the metadata of that entry is merged in.

    Returns:
        The US rows of the location dataset restricted to the organization, city,
        state, county, country and coordinate columns, with 'city' lower-cased.
    """
    # US city lat-long and clean organization field
    # (Later used to create the visualisation, includes all CDP reporting cities)
    fpath = '../input/cdp-city-locations-with-latlong/city_locs_with_latlong.csv'
    df_city_locs = pd.read_csv(
        fpath,
        index_col='Unnamed: 0',
        converters={'lat_long': ast.literal_eval}  # load tuple
    )
    # Take an explicit copy: a column is added below, and writing to a filtered
    # slice of df_city_locs triggers pandas' SettingWithCopy warning
    df_us_city_locs = df_city_locs[df_city_locs['alpha2_code'] == 'US'].copy()

    # Supplementary data for US state
    fpath = '../input/cdp-unlocking-climate-solutions/Supplementary Data/Simple Maps US Cities Data/uscities.csv'
    df_cities_meta = pd.read_csv(fpath)

    def distance(latlong1, latlong2):
        """
        Optimised Haversine distance calculation.
        Source: https://stackoverflow.com/questions/27928
        """
        lat1, lon1 = latlong1
        lat2, lon2 = latlong2
        p = pi/180  # degrees -> radians
        a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
        return 12742 * asin(sqrt(a))

    def find_closest_latlong(latlong: Tuple[float, float],
                             latlongs: List[Tuple[float, float]]) -> Tuple[float, float]:
        """ Return the member of latlongs with the smallest distance to latlong. """
        return min(latlongs, key=lambda x: distance(latlong, x))

    # Match every US city to the nearest supplementary coordinate pair; this is a
    # full scan of the supplementary list for each city
    cities_meta_latlong = list(zip(df_cities_meta['lat'], df_cities_meta['lng']))
    df_cities_meta['lat_long'] = cities_meta_latlong
    df_us_city_locs['lat_long_match'] = df_us_city_locs['lat_long'].apply(
        lambda x: find_closest_latlong(x, cities_meta_latlong)
    )

    # Both frames carry a 'lat_long' column, so the merge suffixes them _x / _y
    df_us_city_locs = df_us_city_locs.merge(
        df_cities_meta,
        left_on='lat_long_match',
        right_on='lat_long',
        how='left'
    )

    # Note: we take population data from World bank projections, not the supplementary data,
    # and hence it is discarded here
    cols = ['organization', 'organization_clean', 'city', 'city_ascii', 'state_id',
            'state_name', 'county_name', 'country',  'country_lower', 'alpha2_code',
            'lat_long_y']
    df_us_city_locs = df_us_city_locs.loc[:, cols]
    df_us_city_locs['city'] = df_us_city_locs['city'].str.lower()
    return df_us_city_locs


# US city locations enriched with state / county metadata
df_us_city_locs = get_city_with_county_and_lat_long()

In [None]:
# Merge cleaned city data with GDP data
# A city matches a GDP record when its name equals one of the record's
# hyphen-split city names and the state codes agree
df_us_city_gdp = pd.merge(
    df_us_city_locs,
    df_gdp,
    how='left',
    left_on=['city', 'state_id'],
    right_on=['city_split', 'state'],
)

# Keep the location-side columns plus the GDP figures, then strip merge suffixes
cols = [
    'organization', 'organization_clean', 'city_x', 'state_id_x', 'state_name',
    'county_name', 'country_lower', 'alpha2_code', 'lat_long_y', '2018_gdp',
    '2018_pop', '2018_gdp_per_capita', '2018_gdp_per_capita_state', 'idx',
]
df_us_city_gdp = df_us_city_gdp[cols]
df_us_city_gdp = df_us_city_gdp.rename(
    columns={'city_x': 'city', 'state_id_x': 'state', 'lat_long_y': 'lat_long'}
)

In [None]:
# Distribution of the GDP per capita figures matched so far
df_us_city_gdp['2018_gdp_per_capita'].hist(bins=20)

In [None]:
## Fill missing GDP per capita values
# For each state, get a weighted average 'uplift' in GDP per capita, relative
# to the state GDP per capita, of the cities for which we do have GDP information.
# This will act as a proxy for urban-vs-rural GDP (as production will tend to
# pool in the cities or manufacturing areas). We're going to apply this uplift
# to state-level GDP (which has far better availability) to estimate GDP per
# capita for cities without GDP figures.
def f(r):
    """ City GDP/capita as a proportion of state GDP/capita, weighted by city population. """
    return (r['2018_gdp_per_capita'] / r['2018_gdp_per_capita_state']) * r['2018_pop']

# Get weighted average city GDP/capita uplift by state
# Per state: sum(population * city-to-state GDP/capita ratio) / sum(population)
weighted_avg_city_vs_state_gdp_per_capita_uplift = (
    df_us_city_gdp.groupby('state')
                  .apply(lambda g: f(g).sum() / g['2018_pop'].sum())
                  .rename('2018_gdp_per_capita_state_uplift')
                  .reset_index()
)
# Add state GDP information
# Left join: states without an uplift keep NaN in the new columns
df_gdp_state = df_gdp_state.merge(
    weighted_avg_city_vs_state_gdp_per_capita_uplift,
    left_on='state_id',
    right_on='state',
    how='left'
)
# Calculate GDP per capita estimate by applying uplift to state GDP/capita
df_gdp_state['2018_gdp_per_capita_estimate'] = (
    df_gdp_state['2018_gdp_per_capita_state'] * 
    df_gdp_state['2018_gdp_per_capita_state_uplift']
)

# Merge with US city dataframe
# Every city receives its state's estimate, whether or not it has an actual figure
df_us_city_gdp = df_us_city_gdp.merge(
    df_gdp_state[['state', '2018_gdp_per_capita_estimate']], 
    on='state', 
    how='left'
)

# Fix missing values
# List the organizations for which no state-based estimate could be derived
_df = df_us_city_gdp[pd.isnull(df_us_city_gdp['2018_gdp_per_capita_estimate'])]
l = _df[['organization', 'county_name', 'state']].to_dict(orient='records')
print(f"Missing information for {len(set(_df['state']))} states. Organizations:")
for d in l:
    print(f"  - {d['organization']}, {d['county_name']}, {d['state']}")

# Fall back to the median GDP per capita of the cities that have an actual figure
city_gdp_per_capita_median = df_us_city_gdp['2018_gdp_per_capita'].median()
print(f'Using city median GDP per capita: ${city_gdp_per_capita_median:.0f}.')
# Assign the filled column back explicitly: an in-place fillna on the object
# returned by .loc[...] may act on a temporary copy and leave the frame unchanged
df_us_city_gdp['2018_gdp_per_capita_estimate'] = (
    df_us_city_gdp['2018_gdp_per_capita_estimate'].fillna(city_gdp_per_capita_median)
)

In [None]:
def f(x):
    """ Label a GDP per capita figure: 'ESTIMATE' when it is NaN, 'ACTUAL' otherwise. """
    if np.isnan(x):
        return 'ESTIMATE'
    return 'ACTUAL'

def g(r):
    """ Return the row's actual GDP per capita, or its estimate when the actual is NaN. """
    actual = r['2018_gdp_per_capita']
    return r['2018_gdp_per_capita_estimate'] if np.isnan(actual) else actual

    
# Flag whether each city's figure is an actual value or an estimate, then
# combine the two into a single value column
df_us_city_gdp['2018_gdp_per_capita_status'] = (
    df_us_city_gdp['2018_gdp_per_capita'].apply(f)
)
df_us_city_gdp['2018_gdp_per_capita_value'] = (
    df_us_city_gdp.apply(g, axis=1)
)
# Keep the combined value and its status; the separate actual / estimate columns
# are not carried forward
cols = ['organization', 'organization_clean', 'city', 'county_name', 
        'state', 'state_name', 'country_lower', 'alpha2_code', 'lat_long', 
        '2018_gdp', '2018_pop', '2018_gdp_per_capita_status', 
        '2018_gdp_per_capita_value', '2018_gdp_per_capita_state', 'idx']
df_us_city_gdp = df_us_city_gdp.loc[:, cols]

A sample of the GDP per capita data:

In [None]:
# Show three randomly chosen rows as a visual check
df_us_city_gdp.sample(3)

Save the GDP per capita dataframe for use in later analysis:

In [None]:
cols = ['organization', 'city', '2018_gdp_per_capita_status', '2018_gdp_per_capita_value']
_df_us_city_gdp = df_us_city_gdp[cols].copy()

# Create manual clusters of GDP
# In future this could be performed with a simple clustering algorithm or by number of
# standard-deviations from the mean.
# Bands are (.., 45000], (45000, 55000], (55000, 71000], (71000, ..): each later
# assignment overwrites the earlier one for the rows above its threshold. The
# first band includes 45000 itself so that no value is left without a group.
gdp_value = _df_us_city_gdp['2018_gdp_per_capita_value']
_df_us_city_gdp.loc[gdp_value <= 45000, 'group'] = 'low'
_df_us_city_gdp.loc[gdp_value > 45000, 'group'] = 'medium-low'
_df_us_city_gdp.loc[gdp_value > 55000, 'group'] = 'medium-high'
_df_us_city_gdp.loc[gdp_value > 71000, 'group'] = 'high'
_df_us_city_gdp.to_csv('city_gdp_per_capita.csv')

_df_us_city_gdp['2018_gdp_per_capita_value'].hist(bins=20)

## Calculate ambition KPI
Below, we calculate the ambition KPI for cities across the world who have reported sufficient information to generate an emissions trajectory.

#### Limitations
* We produce the KPI only for cities who report emissions targets which:
  * cover all emissions sources included in the city inventory (Q5.0a)
  * cover the entire city area or larger, relative to the city boundary (Q5.0a). We treat these boundaries equally.
* We assume a _linear trajectory_ from the starting year to the target year.
* So as not to unfairly penalise cities with long-term predictions, we make the assumption that the target for years beyond the last target specified is kept constant. I.e., if the city has a target to 2030, the target performance in all future years remains at the 2030 level.
* Where multiple targets exist for a city in the same year, we take the lower target. 
* We assume the 2020 emissions performance of the city is in line with its target, linearly between the closest targets or base years which span the 2020 year. I.e., if we know the 2015 city emissions were 10MtCO2e and the 2025 target is 5MtCO2e, we'd assume a 2020 performance of 7.5MtCO2e.

In [None]:
# Settings
# NOTE(review): neither constant is used in the cells shown here. Presumably
# MAX_YEAR is the last year of the emissions trajectories and 'constant' holds
# the final target level for later years - confirm against where they are used.
MAX_YEAR = 2050
EXTRAPOLATION_STRATEGY = 'constant'

In [None]:
city_list = set(df_cities['Organization'])

# Rows of Q5.0a whose first column states that the target covers every emission
# source in the city inventory
_df = df_cities[
    (df_cities['Question Number'] == '5.0a') &
    (df_cities['Column Number'] == 1) &
    (df_cities['Response Answer'] == 'All emissions sources included in city inventory')
]
# Collapse to one entry per organization and count them. The count is a
# one-element Series labelled 'Row Number', so read it by position with .iloc
# (integer [0] on a label-indexed Series is deprecated positional access)
c = _df.groupby('Organization').agg({'Row Number': 'min'}).count().iloc[0]
print('City count: {}'.format(len(city_list)))
print('Cities reporting emissions targets: {}'.format(c))

In [None]:
## Apply conditions
_df = df_cities[df_cities['Question Number'] == '5.0a']

# All emission sources covered in target
# (answers are filtered per organization; droplevel removes the group key again)
l1 = lambda g: g[g['Response Answer'] == 'All emissions sources included in city inventory']
_df_q1 = _df.groupby('Organization').apply(l1).droplevel(0)
# Target is city-wide (at minimum)
l2 = lambda g: g[g['Response Answer'].isin([
    'Same ‚Äì covers entire city and nothing else', 
    'Larger ‚Äì covers the whole city and adjoining areas'
])]
_df_q2 = _df.groupby('Organization').apply(l2).droplevel(0)

# A target row qualifies when the same (Organization, Row Number) pair satisfies
# both conditions; keep every Q5.0a answer belonging to those rows
cols = ['Organization', 'Row Number']
_df_q = _df_q1.merge(_df_q2, on=['Organization', 'Row Number'], how='inner')[cols]
print('Organizations with targets meeting criteria: {}'.format(len(set(_df_q['Organization']))))
df_ambitions = _df.merge(_df_q[cols], on=cols, how='inner')

In [None]:
# Question = 5.0a, Column Number = 12 has multiple values; compress rows into single list
df_ambitions_col12_list = (
    df_ambitions[df_ambitions['Column Number'] == 12]
        .groupby(['Organization', 'Column Number', 'Row Number'])['Response Answer']
        .apply(list)
        .reset_index()
)

a = df_ambitions.shape[0]
# Both frames have a 'Response Answer' column, so the merge yields
# 'Response Answer_x' (original answer) and 'Response Answer_y' (column 12 list)
df_ambitions = df_ambitions.merge(
    df_ambitions_col12_list, on=['Organization', 'Column Number', 'Row Number'], how='left'
)
# For column 12 only, replace the single answer by the full list of answers
df_ambitions.loc[df_ambitions['Column Number'] == 12, 'Response Answer_x'] = (
    df_ambitions.loc[df_ambitions['Column Number'] == 12, 'Response Answer_y']
)
# Each column 12 row now repeats the same list, so keep only the first per cell
df_ambitions = (
    df_ambitions.drop_duplicates(subset=['Organization', 'Column Number', 'Row Number'], keep='first')
                .rename(columns={'Response Answer_x': 'Response Answer'})
                .drop('Response Answer_y', axis=1)
)
b = df_ambitions.shape[0]
print(f'Combined initiatives to reduce duplicate data, removing {a - b} rows.')

In [None]:
# Create column map to make feature creation steps simpler:
# zero-based column position -> column name
column_map = {
    number - 1: name
    for number, name in zip(df_ambitions['Column Number'], df_ambitions['Column Name'])
}

In [None]:
# Transpose data into column format
# One row per (Organization, Row Number), i.e. per target; that target's answers
# are laid out across integer-labelled columns in 'Column Number' order
df_ambitions_T = (
    df_ambitions.sort_values(by=['Organization', 'Column Number', 'Row Number'])
                .groupby(['Organization', 'Row Number'])['Response Answer']
                .apply(lambda df: df.reset_index(drop=True))
                .unstack()
                .reset_index()
)

In [None]:
# Filter base year emissions (errors found, see Yaounde)
import numpy as np
from scipy.stats import zscore, norm

# Remove records where base year emissions are invalid (they equal the base year) 
# and drop other rows with NaN values
# Columns: 3 = base year, 5 = base year emissions, 7 = target year,
# 8 = target year emissions
a = df_ambitions_T.shape[0]
df_ambitions_T.loc[df_ambitions_T[5] == df_ambitions_T[3], 5] = np.nan
df_ambitions_T = df_ambitions_T.dropna(subset=[3, 5, 7, 8])
b = df_ambitions_T.shape[0]
print(f'Removed {a - b} rows with outliers in base year emissions or NaN values.')

In [None]:
# Add World Bank population projections to convert to emission per capita,
# then rename the key column to lower-case 'organization'
df_ambitions_T = (
    df_ambitions_T.merge(_df_pop, how='left', on='Organization')
                  .rename(columns={'Organization': 'organization'})
)

In [None]:
# Take 2010 or 2050 population for years outside of this
# base_year = 3, base_year emissions = 5, target_year = 7, target_year emissions = 8
def _clamp_year_to_population_range(y, first='2010', last='2050'):
    """
    Return y unchanged when first <= int(y) <= last; otherwise return the nearer
    bound, as the string label of the corresponding population column.
    """
    year = int(y)
    if year < int(first):
        return first
    if year > int(last):
        return last
    return y


# Apply both bounds to both years in one step, so that the upper clamp cannot
# overwrite the lower clamp
df_ambitions_T['_3'] = df_ambitions_T[3].apply(_clamp_year_to_population_range)
df_ambitions_T['_7'] = df_ambitions_T[7].apply(_clamp_year_to_population_range)

# Use lookup function to get population in target / base year
# (for every row, pick the value in the column named by that row's year label)
# NOTE: DataFrame.lookup is deprecated in newer pandas releases
df_ambitions_T['_3_pop'] = df_ambitions_T.lookup(
    df_ambitions_T.index, df_ambitions_T['_3']
)
df_ambitions_T['_7_pop'] = df_ambitions_T.lookup(
    df_ambitions_T.index, df_ambitions_T['_7']
)

# Divide emissions by population => emission per capita
df_ambitions_T['_5'] = df_ambitions_T[5].astype(float) / df_ambitions_T['_3_pop']
df_ambitions_T['_8'] = df_ambitions_T[8].astype(float) / df_ambitions_T['_7_pop']

In [None]:
# Create city emissions trajectories, based on their base/target info
# base_year = 3, base_year emissions = 5, target_year = 7, target_year emissions = 8
# Each target row contributes two (year, emissions) points: base and target
df_ambitions_T['t1'] = df_ambitions_T[[3, 5]].apply(tuple, axis=1)
df_ambitions_T['t2'] = df_ambitions_T[[7, 8]].apply(tuple, axis=1)
df_ambitions_T['row_trajectory'] = df_ambitions_T[['t1', 't2']].apply(list, axis=1)
# Summing the per-row lists joins all of an organization's points into one list
df_trajectories = (
    df_ambitions_T.groupby('organization')['row_trajectory']
                  .apply(sum)
                  .reset_index()
                  .rename(columns={'row_trajectory': 'org_trajectory'})
)
df_ambitions_T = df_ambitions_T.merge(df_trajectories, on='organization', how='left')

# Repeat the above for per capita trajectories
# ('_3' / '_7' are the clamped years, '_5' / '_8' the emissions per capita;
# 't1' and 't2' are overwritten with the per capita points)
# TODO: make function
df_ambitions_T['t1'] = df_ambitions_T[['_3', '_5']].apply(tuple, axis=1)
df_ambitions_T['t2'] = df_ambitions_T[['_7', '_8']].apply(tuple, axis=1)
df_ambitions_T['row_trajectory_per_capita'] = (
    df_ambitions_T[['t1', 't2']].apply(list, axis=1)
)
rename_dict = {'row_trajectory_per_capita': 'org_trajectory_per_capita'}
df_trajectories = (
    df_ambitions_T.groupby('organization')['row_trajectory_per_capita']
                  .apply(sum)
                  .reset_index()
                  .rename(columns=rename_dict)
)
df_ambitions_T = df_ambitions_T.merge(df_trajectories, on='organization', how='left')

We've taken only the year and base / target date and emissions combinations as a tuple, however we've prepared the data such that it could be extracted as a dictionary for additional data when plotting.

In [None]:
## Create visualisation tooltips
from typing import Callable, List, Tuple

def sort_list_of_tuples_by_first_element(l: List[Tuple[int, int]],
                                         descending: bool = False
                                        ) -> List[Tuple[int, int]]:
    """
    De-duplicate the tuples and order them by first element ascending, breaking
    ties by second element descending. `descending=True` reverses that ordering.
    """
    def sort_key(t):
        return t[0], -t[1]

    unique_tuples = list(set(l))
    unique_tuples.sort(key=sort_key, reverse=descending)
    return unique_tuples


def get_relative_tuples(f: Callable, l: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """
    Express the second element of every tuple relative to a reference tuple.

    The reference tuple is chosen by calling f (e.g. `min` or `max`) on l with a
    key of (year, -value), so `min` selects the earliest year and, among entries
    for that year, the largest value.
    E.g. with f=min: [(0, 20), (1, 8)] => [(0, 1.0), (1, 0.4)].

    Elements may be numbers or numeric strings; the returned first elements are
    ints and the returned second elements are floats.
    """
    def _rank(pair):
        return int(float(pair[0])), -float(pair[1])

    reference = f(l, key=_rank)
    denominator = float(reference[1])
    relative = []
    for pair in l:
        relative.append((int(pair[0]), float(pair[1]) / denominator))
    return relative


def generate_trajectory_tooltips(l: List[Tuple[int, int]]) -> str:
    """
    Render trajectory tuples as tooltip text.

    Produces one 'year -> percent%' entry per distinct year, joined with '<br>'.
    When a year occurs more than once, the largest value is the one shown.
    """
    # Sorted by year with the largest value first within a year, so the first
    # value seen for each year is the one to keep.
    ordered = sort_list_of_tuples_by_first_element(l)
    first_value_by_year = {}
    for year, value in ordered:
        first_value_by_year.setdefault(year, value)

    template = '{:.0f} -> {:.0f}%'
    entries = [
        template.format(year, value * 100)
        for year, value in first_value_by_year.items()
    ]
    return '<br>'.join(entries)


# Trajectory relative to the earliest year's emissions (get_relative_tuples with
# `min` picks the earliest year as the reference point).
df_ambitions_T['org_trajectory_rel'] = (
    df_ambitions_T['org_trajectory'].apply(lambda t: get_relative_tuples(min, t))
)
# Human-readable version of the relative trajectory, used in the plot tooltips.
df_ambitions_T['org_trajectory_rel_str'] = (
    df_ambitions_T['org_trajectory_rel'].apply(generate_trajectory_tooltips)
)

# Note: relative trajectory not required for per capita trajectories (not written)
# Calculated below after inclusion of current year

#### Current performance
Ideally, we'd like to be able to check ambition *relative to a city's current condition*, and hence evaluate its current progress towards its target(s). However, responses to the question 'Percentage of target achieved so far' appear to have quality issues: a significant proportion of organisations do not submit an entry for this field, and a visual check of some of those who do suggests it is not completed consistently (for example, City of Cleveland uses the same percentage against different targets, so it is not clear which base level of emissions the percentage is relative to). Instead, we could attempt to make use of data given in Q4.6: 70% of cities report current-year emissions inventories in Q4.0. However, the basis on which the targets and emissions inventories sit is particularly complex (see the sub-questions within Q4.6 and the variability in inventory data collection methodologies), and the two would need to be comparable for us to plot them; this was considered too in-depth given the available time.

As such, we simplify the plot to be relative to the target in the current year (2020). Where no such target exists, we take the linear interpolation between the most recent base year emissions and next target year emissions. This assumes that the city has been decarbonising in line with its latest target up to 2020, evidently falsely benefitting cities who are behind their target (assuming a linear trajectory) and hiding over-achievement of cities who are ahead of their target.

In [None]:
from bisect import bisect_left, bisect_right

def linear_interpolation_from_trajectory(trajectory: List[Tuple[int, int]], 
                                         interp_year: int = 2020) -> float:
    """
    Provides linear interpolation of targets for year 'interp_year' (no extrapolation).

    Years and emissions may be numbers or numeric strings. When a year appears
    more than once, the lowest emissions value for that year is used everywhere
    (exact match, both interpolation brackets and the "latest target" fallback).

    Returns
    ------
    - The emissions for interp_year if it exists as a base/target year.
    - The linear interpolation between the surrounding years otherwise.
    - The latest target emissions if interp_year is greater than the max year.
    - None if interp_year is less than the min year (a base year), or if the
      trajectory is empty.
    """
    # Collapse to a single (lowest) emissions value per year. Previously only the
    # exact-match and lower-bracket paths used the lowest value, while the upper
    # bracket used the highest value of a duplicated year.
    lowest_by_year = {}
    for point in trajectory:
        year, emissions = int(float(point[0])), float(point[1])  # convert str->number
        if year not in lowest_by_year or emissions < lowest_by_year[year]:
            lowest_by_year[year] = emissions

    if not lowest_by_year:
        return None

    # If interp_year exists as a base/target, return it
    if interp_year in lowest_by_year:
        return lowest_by_year[interp_year]

    years = sorted(lowest_by_year)
    if years[0] > interp_year:
        return None
    if years[-1] < interp_year:
        return lowest_by_year[years[-1]]

    # interp_year lies strictly between two known years: find the bracket
    upper = bisect_right(years, interp_year)
    year_lo, year_hi = years[upper - 1], years[upper]
    value_lo, value_hi = lowest_by_year[year_lo], lowest_by_year[year_hi]
    return (
        value_hi + (value_lo - value_hi) *
        (year_hi - interp_year) / (year_hi - year_lo)
    )


# Estimate each organisation's emissions in CURRENT_YEAR from its absolute and
# per capita trajectories. The estimate is missing where the whole trajectory
# lies after CURRENT_YEAR (the interpolation helper returns None in that case).
df_ambitions_T['current_year_emissions'] = df_ambitions_T['org_trajectory'].apply(
    lambda t: linear_interpolation_from_trajectory(t, CURRENT_YEAR)
)
df_ambitions_T['current_year_emissions_per_capita'] = (
    df_ambitions_T['org_trajectory_per_capita'].apply(
        lambda t: linear_interpolation_from_trajectory(t, CURRENT_YEAR)
    )
)
# Label shown in the tooltip; set to 'interpolated' for every row, including
# rows whose trajectory already contains an explicit CURRENT_YEAR entry.
df_ambitions_T['current_year_emissions_type'] = 'interpolated'

In [None]:
# Add the current emissions interpolation into the trajectory and re-update the 
# relative trajectory values. This leaves the original trajectory string 
# (org_trajectory_rel_str) for the plot tooltips, excluding this fictitious base/
# target year entry from the plot.
# The appended point is stored as a pair of strings; get_relative_tuples converts
# both elements back to numbers.
df_ambitions_T['org_trajectory'] = df_ambitions_T.apply(
    lambda r: r['org_trajectory'] + [
        (str(CURRENT_YEAR), str(r['current_year_emissions']))
    ], axis=1
)
df_ambitions_T['org_trajectory_rel'] = (
    df_ambitions_T['org_trajectory'].apply(lambda t: get_relative_tuples(min, t))
)

# And for the per capita trajectory
df_ambitions_T['org_trajectory_per_capita'] = df_ambitions_T.apply(
    lambda r: r['org_trajectory_per_capita'] + [
        (str(CURRENT_YEAR), str(r['current_year_emissions_per_capita']))
    ], axis=1
)
df_ambitions_T['org_trajectory_rel_per_capita'] = (
    df_ambitions_T['org_trajectory_per_capita'].apply(
        lambda t: get_relative_tuples(min, t)
    )
)

In [None]:
import collections
import numpy as np
from shapely.geometry import MultiPoint, Polygon, LineString
from shapely.affinity import scale
from shapely.ops import split


def f(g, x, axis):
    """ 
    Apply function g to iterable x, comparing elements by position `axis`, and
    return the value found at that position in the selected element.
    
    Examples
    ------
    >>> x = {(2000, 100), (2015, 50), (2030, 25)}
    >>> f(min, x, 1)
    25
    >>> f(max, x, 1)
    100
    """
    def _component(element):
        return element[axis]

    selected = g(x, key=_component)
    return selected[axis]



def create_polygon(d: List[Tuple[int, int]], base_year: int = 2020, max_year: int = 2050, 
                   extrapolation_strategy: str = 'constant') -> Polygon:
    """ 
    Creates a shapely.geometry.polygon object from a list of coordinates.

    The polygon is the area under the trajectory between base_year and max_year,
    rescaled by 1 / (max_year - base_year) horizontally and by 1 / (emissions at
    base_year) vertically.
    
    Parameters
    ------
    :param d: a list of tuples containing the year and associated quantity (i.e. tCO2e).
    :param base_year: left-hand cut-off for the polygon; years < base_year are clipped.
    :param max_year: right-hand cut-off for the polygon; years > max_year are clipped.
    :param extrapolation_strategy: method for extending d to max_year when the greatest
        year in d is less than the max_year. Options: 'constant', None.

    Notes
    ------
    Prints a message and scales by 1.0 when d has no positive entry for base_year.
    
    Examples
    ------
    >>> create_polygon([(1990, 1), (2010, 0.9), (2030, 0.5), (2050, 0)], 2000, 2050)
    """
    # Imported locally under an alias because the notebook later rebinds the
    # global name `scale` to a number (map zoom setting), which would break this
    # function if it were called again afterwards.
    from shapely.affinity import scale as _rescale

    # In case of multiple targets on the same date, take lowest emissions.
    # This is because, unless there are administrative errors, this situation should 
    # only occur for target years (emissions for base/current years should be known 
    # with certainty), and we hypothesize that lower targets would be employed only 
    # in the case of exceeding an existing target; targets should not be increased 
    # due to poor performance.
    # (Keyed by year: the previous implementation keyed by emissions, which dropped
    # later years sharing an emissions value and kept duplicate years.)
    lowest_by_year = {}
    for year, emissions in d:
        year = int(year)
        if year not in lowest_by_year or emissions < lowest_by_year[year]:
            lowest_by_year[year] = emissions

    # Extrapolate to end year, only when the trajectory stops short of it
    if extrapolation_strategy == 'constant' and max(lowest_by_year) < max_year:
        lowest_by_year[max_year] = min(lowest_by_year.values())

    # Create polygon, adding zero at start / end
    points = sorted(lowest_by_year.items())
    ring = [(points[0][0], 0)] + points + [(points[-1][0], 0)]
    mp_poly = Polygon(ring)

    # Cut at base / max year if required
    top = max(y for _, y in ring)
    min_year_line = LineString([(base_year, 0), (base_year, top)])
    max_year_line = LineString([(max_year, 0), (max_year, top)])
    # `.geoms` is used because geometry collections are not indexable/iterable
    # directly in Shapely 2.
    mp_poly = split(mp_poly, max_year_line).geoms[0]
    pieces = list(split(mp_poly, min_year_line).geoms)
    if len(pieces) > 1:
        mp_poly = pieces[1]

    # If missing base year value or its <= 0, use y=1 for base_year.
    # Looked up in the trajectory itself so the zero closing vertices added above
    # cannot be mistaken for the base year value.
    base_year_y = lowest_by_year.get(base_year)
    if base_year_y is None or base_year_y <= 0:
        base_year_y = 1.
        msg = 'Missing base_year={} entry, defaulting to base_year_emissions=1.0.'
        print(msg.format(base_year))

    mp_poly = _rescale(
        mp_poly, 
        xfact=1/(max_year-base_year),
        yfact=1/(base_year_y - 0),  # take y from base_year
        origin='center'
    )

    return mp_poly


# Build one polygon per row from the relative per capita trajectory, using
# create_polygon's default base_year / max_year / extrapolation settings.
df_ambitions_T['polygon'] = (
    df_ambitions_T['org_trajectory_rel_per_capita'].apply(create_polygon)
)

In [None]:
# Get resulting ambition metric
# ambition = 1 - (area of the rescaled polygon), so a smaller area under the
# trajectory gives a larger ambition value.
df_ambitions_T['area'] = df_ambitions_T['polygon'].apply(lambda x: x.area)
df_ambitions_T['ambition'] = 1 - df_ambitions_T['area']
# Preview a random sample of five rows as the cell output
df_ambitions_T[['organization', 'ambition']].sample(5)

## Visualise emissions, ambition and KPI
### Formatting, location and plot components
The following cells prepare the compiled data for plotting in an interactive Bokeh visualisation.

In [None]:
## Prepare data for visualisation
import ast

# City locations data: see Code Appendix section below for preparation.
# The 'lat_long' column is stored as text and parsed back into a tuple on load.
df_city_locs = pd.read_csv(
    '../input/cdp-city-locations-with-latlong/city_locs_with_latlong.csv',
    index_col='Unnamed: 0',
    converters={'lat_long': ast.literal_eval},
)

# Attach the location of each organisation (left join keeps every ambition row)
_locations = df_city_locs[['organization', 'lat_long']]
_df = df_ambitions_T.merge(_locations, on='organization', how='left')

# Report, then drop, the rows that have no location
_has_location = _df['lat_long'].notnull()
msg = 'Removed {} rows with missing lat-long information.'
print(msg.format((~_has_location).sum()))
_df = _df[_has_location]

In [None]:
def latlong_to_web_mercator(df):
    """
    Add Web Mercator coordinates to df from its 'lat_long' column.

    Splits the (latitude, longitude) pairs into 'lat' and 'long' columns, then
    adds 'web_merc_x' and 'web_merc_y'. df is modified in place and also returned.
    """
    sphere_radius = 6378137  # sphere radius used for the projection
    lat_long_pairs = df['lat_long'].tolist()
    df[['lat', 'long']] = pd.DataFrame(lat_long_pairs, index=df.index)

    metres_per_degree = sphere_radius * np.pi / 180.0
    df['web_merc_x'] = df['long'] * metres_per_degree

    half_angle = (90 + df['lat']) * np.pi / 360.0
    df['web_merc_y'] = np.log(np.tan(half_angle)) * sphere_radius
    return df

# Adds 'lat', 'long', 'web_merc_x' and 'web_merc_y' columns to the plot data
_df = latlong_to_web_mercator(_df)

In [None]:
# Keep only the columns needed for the plot
cols = ['organization', 3, 4, 5, 7, 11, 'org_trajectory_rel_str', 
        'org_trajectory_rel', 'org_trajectory_rel_per_capita', 'polygon', 
        'web_merc_x', 'web_merc_y',  'current_year_emissions', 
        'current_year_emissions_per_capita', 'current_year_emissions_type']
_df = _df[cols].copy()

# Friendly names for the plot. Key 10 is not among the selected columns above,
# so that entry has no effect here.
col_rename = {
    'organization': 'city',
    3: 'base_year',
    4: 'target_introduced',
    5: 'base_year_emissions_tco2e',
    # '_5': 'base_year_emissions_tco2e_per_capita',
    10: 'paris_agreement_alignment',
    11: 'linked_initiatives'
}

# One row per organisation: the first row after sorting by columns 3 then 7
# (base year, then target year, per the numbering used for the trajectories).
_df = (
    _df.sort_values(by=['organization', 3, 7])
       .drop_duplicates(subset=['organization'], keep='first')
       .fillna('')
       .drop(7, axis=1)
       .rename(columns=col_rename)
)
# NOTE(review): fillna('') above turns any missing current_year_emissions into an
# empty string, which would make this division fail - presumably no such rows
# remain at this point; confirm.
_df['current_year_emissions_mtco2e'] = _df['current_year_emissions'] / 1e6
_df = _df.round({'current_year_emissions_mtco2e': 2})

In [None]:
# Remove polygon area outliers (i.e. Orlando, Yaounde) for colour normalisation when plotting
# Areas whose z-score is outside (-2, 2) are replaced by mean +/- 2 standard deviations
_df['area'] = [polygon.area for polygon in _df['polygon']]
_df['z_score'] = zscore(_df['area'])

_area_mean = np.mean(_df['area'])
_area_std = np.std(_df['area'])
area_max = 2 * _area_std + _area_mean
area_min = -2 * _area_std + _area_mean

# Cap the upper tail first, then the lower tail
_upper_capped = np.where(_df['z_score'] < 2, _df['area'], area_max)
_df['area_trimmed'] = np.where(_df['z_score'] > -2, _upper_capped, area_min)

In [None]:
# Compare the raw and trimmed area distributions (drawn one after the other)
_df['area'].hist(bins=50)
_df['area_trimmed'].hist(bins=50)

In [None]:
# Ambition colour
import matplotlib

# An extra value of 1.1 is appended before normalising so the real data does not
# reach the top (yellow) end of the colour map; the extra entry is dropped again
# by the [:-1] slice below.
a = list(_df['area_trimmed']) + [1.1]  # Viridis yellow can be hard to see => shift
# Map each trimmed area to a Viridis colour and format it as '#rrggbb'
_df['colour'] = [
    "#%02x%02x%02x" % (int(r), int(g), int(b)) for r, g, b, _ 
    in 255 * matplotlib.cm.viridis(matplotlib.colors.Normalize()(a)[:-1])
]

In [None]:
# Create tooltip text and image
import cairosvg
import base64

def svg_to_b64_png(svgs):
    """
    Convert SVG documents to PNG data URLs.

    Each item of svgs is rendered to PNG with cairosvg and returned as a
    'data:image/png;base64,...' string, in the same order as the input.
    """
    prefix = 'data:image/png;base64,'
    return [
        prefix + base64.b64encode(cairosvg.svg2png(markup)).decode('utf-8')
        for markup in svgs
    ]


# SVG markup and area of each polygon, for display in the tooltip
_df['polygon_image'] = _df['polygon'].apply(lambda p: p._repr_svg_())
_df['polygon_area'] = _df['polygon'].apply(lambda p: p.area)
_df.drop(columns=['polygon', 'linked_initiatives'], axis=1, inplace=True)  # not JSON compliant
# Embed each SVG as a base64 PNG data URL so the tooltip can show it as an image
_df['image_files'] = svg_to_b64_png(_df['polygon_image'])
# Copy of the trajectory string under the name referenced by the tooltip template
_df['_org_trajectory_rel'] = _df['org_trajectory_rel_str']

In [None]:
# Save plot data for display notebook (written to the current working directory)
_df.to_csv('ambition_bokeh_plot_data.csv')

In [None]:
from bokeh.models import *
from bokeh.plotting import *
from bokeh.io import *
from bokeh.tile_providers import *
from bokeh.palettes import *
from bokeh.transform import *
from bokeh.layouts import *

# NOTE: this rebinds the global name `scale`, which was imported from
# shapely.affinity earlier in the notebook and is used by create_polygon;
# re-running the polygon cell after this one will fail unless that import is
# run again.
scale = 2000
plot_height = 600
x = _df['web_merc_x']
y = _df['web_merc_y']

# Centre map on London (51.5074, 0.1278)
# The initial view extends scale ** 2 Web Mercator units either side of the centre
x_min=int(14226.630 - (scale ** 2))
x_max=int(14226.630 + (scale ** 2))
y_min=int(6711542.475 - (scale ** 2))
y_max=int(6711542.475 + (scale ** 2))

tile_provider = get_provider(CARTODBPOSITRON)

plot = figure(
    title='City emissions trajectories (2020-2050)',
    match_aspect=True,
    tools='wheel_zoom,pan,reset,save',
    x_range=(x_min, x_max),
    y_range=(y_min, y_max),
    x_axis_type='mercator',
    y_axis_type='mercator',
    height=plot_height,
    width=750
)

plot.grid.visible = True
# NOTE: `map` shadows the Python builtin of the same name for the rest of the notebook
map = plot.add_tile(tile_provider)
map.level = 'underlay'  # draw the map tiles beneath the data glyphs
plot.xaxis.visible = False
plot.yaxis.visible = False
plot.title.text_font_size='20px'

# Render Bokeh output inline in the notebook
output_notebook()

In [None]:
def bubble_map(plot, df, radius_col, scale, leg_label):
    """
    Add a bubble map with hover tooltips to an existing Bokeh plot.

    Parameters
    ------
    :param plot: Bokeh figure to draw on; a circle renderer and a hover tool are
        added to it, and its legend location / click policy are set.
    :param df: data for the bubbles. Must contain 'web_merc_x', 'web_merc_y',
        'colour', radius_col and the fields referenced by the tooltip template.
        A 'radius' column is added to df in place.
    :param radius_col: column whose values (truncated to whole numbers) are
        multiplied by scale to give each bubble's radius.
    :param scale: multiplier applied to radius_col.
    :param leg_label: legend label for the bubbles.
    """
    df['radius'] = [int(float(i)) * scale for i in df[radius_col]]
    source = ColumnDataSource(df)
    c = plot.circle(
        x='web_merc_x', y='web_merc_y', color='colour', source=source, size=1, 
        fill_alpha=0.4, radius='radius', legend_label=leg_label, hover_color='red'
    )

    # The circle renderer is registered once, via `renderers=[c]`; it was
    # previously also appended a second time after construction.
    circle_hover = HoverTool(
        mode='mouse', point_policy='follow_mouse', renderers=[c],
        tooltips="""
            <strong>@city</strong><br>
            Current: @current_year_emissions_mtco2e MtCO2e (@current_year_emissions_type)<br>
                &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
                @current_year_emissions_per_capita tCO2e per capita<br>
            Targets: <br>
            <center>
                <div>
                    <font size="1", style="float: center">@_org_trajectory_rel{safe}</font><br>
                </div>
            </center>
            2020-2050 target reduction profile (per capita):<br>
            <font size="1", style="float: center">Area: @polygon_area</font><br>
            <center>
                <div>
                    <img
                    src="@image_files" height="60" alt="image"
                    style="float: center; margin: 0px 15px 15px 0px; image-rendering: pixelated;"
                    border="2"
                    ></img>
                </div>
            </center>
        """
    )
    plot.tools.append(circle_hover)
    plot.legend.location = 'top_right'
    plot.legend.click_policy = 'hide'  # clicking the legend entry toggles the bubbles



bubble_map(
    plot=plot,
    df=_df, 
    radius_col='current_year_emissions', 
    leg_label='Cities', 
    scale=0.01
)

# Create colour bar legend with custom title
# A linear mapper is used: the bubble colours come from a linear normalisation of
# the trimmed areas, and a log mapper cannot represent the lower bound of 0.
color_bar = ColorBar(
    color_mapper=LinearColorMapper(palette="Viridis256", low=0, high=1),
    ticker=BasicTicker(),
    label_standoff=12, border_line_color=None, location=(0,0)
)
color_bar_title = '<<< less ambitious                                        more ambitious >>>'
# The colour bar lives in its own narrow, toolbar-less figure placed beside the map
color_bar_plot = figure(
    title=color_bar_title, title_location='right', height=plot_height, width=100, 
    toolbar_location=None, min_border=0, outline_line_color=None
)

color_bar_plot.add_layout(color_bar, 'right')
color_bar_plot.title.align='center'
color_bar_plot.title.text_font_size = '9pt'

layout = row(plot, color_bar_plot)

show(layout)

# Benefits
This section is sorted by co-benefit:
* Health
* Employment
* Congestion
* Inclusion
* NLP

Some metrics use an NLP model to label whether a sequence of free text discusses the implementation of, or an action related to, one of these co-benefits. These are computed separately from their respective co-benefit.

## Health
### Life expectancy at birth

In [None]:
# Dataframe which will compile our results
df_b = get_city_with_county_and_lat_long()

# Merge ambition metric for the 85 US cities we have sufficient data for
# (left join: cities without an ambition value are kept, with a missing value)
df_ambition = df_ambitions_T[['organization', 'ambition']].drop_duplicates()
df_b = df_b.merge(df_ambition, on='organization', how='left')

In [None]:
## Helper functions
# Helpful renaming dict for joining dataframes: maps the questionnaire's column
# names to the lower-case names used in the results dataframe
org_country_rename_dict = {'Organization': 'organization', 'Country': 'country'}

# Placeholder answer text that the cells below filter out or treat as missing
qna = 'Question not applicable'

def normalize_column(df: pd.DataFrame, col: str = None) -> pd.Series:
    """
    Min-max scale one column of df to the range [0, 1].

    The column is converted to float first and df itself is left untouched.
    If every non-missing value is identical, those values become 1.0. Missing
    values stay missing in both cases.
    """
    values = df[col].astype(float)
    lowest, highest = values.min(), values.max()
    if lowest == highest:
        # Constant column: keep NaN where it was, 1.0 everywhere else
        return values.where(values.isna(), 1.0)
    return (values - lowest) / (highest - lowest)


def get_ordered_repeating_values(df, order_col, return_col=None):
    """ 
    Sort a CDP-questionnaire-format dataframe and list one organisation's values.

    Returns a copy of df sorted by 'organization' then 'order_col', together with
    the list of 'return_col' values (defaulting to 'order_col') belonging to a
    sample organisation, in sorted order. The sample organisation is the one
    found on the second row of the sorted dataframe, and its values are not
    de-duplicated. This is useful when there are multiple columns or rows per
    question and you're transposing to get columns as rows (for renaming columns).
    """
    if not return_col:
        return_col = order_col

    ordered = df.copy().sort_values(by=['organization', order_col])
    sample_org = ordered.iloc[1, :].loc['organization']

    sample_rows = ordered[ordered['organization'] == sample_org]
    sample_values = list(sample_rows[return_col])
    return ordered, sample_values


def get_bounds_using_iqr(df: pd.DataFrame, col: str) -> Tuple[float, float]:
    """
    Return (lower, upper) outlier bounds for a column.

    The bounds are the 25th percentile minus, and the 75th percentile plus,
    1.5 times the interquartile range. Missing values are ignored and the
    remaining values are converted to float.
    """
    observed = df[col].dropna().astype(float)
    first_quartile, third_quartile = np.percentile(observed, [25, 75])
    spread = third_quartile - first_quartile
    return (
        float(first_quartile - 1.5 * spread),
        float(third_quartile + 1.5 * spread),
    )

In [None]:
import pandas as pd
import ast

# US life expectancy at birth by state and census tract, 2010-2015
fpath = '../input/us-life-expectancy-at-birth-20102015/US_life_expectancy_at_birth_by_state_and_census_tract_2010_2015.csv'
df_life_exp = pd.read_csv(fpath)

# US city table with county information, used to attach life expectancy to cities
df_us_city_locs = get_city_with_county_and_lat_long()

In [None]:
# County name without anything after the first comma and without the word 'County'
df_life_exp['county_clean'] = (
    df_life_exp['County'].apply(lambda s: s.split(',')[0].replace('County', '').strip())
)
# Median life expectancy per county name.
# NOTE(review): grouping is by county name only, so same-named counties in
# different states would be pooled together - confirm this is acceptable.
df_life_exp = (
    df_life_exp.groupby('county_clean')
               .agg({'Life Expectancy': 'median'})
               .reset_index()
               .rename(columns={
                   'Life Expectancy': 'life_expectancy',
                   'county_clean': 'county_name'
               })
)

In [None]:
# Attach county-level life expectancy to each city (missing where no county matches)
df_us_city_locs = df_us_city_locs.merge(
    df_life_exp[['county_name', 'life_expectancy']],
    on='county_name',
    how='left'
)

In [None]:
# Get state median life expectancy (median over the cities in each state)
median_life_expectancy_by_state = (
    df_us_city_locs.groupby('state_id')['life_expectancy'].median()
)

# Fill states with no life expectancy data with US state median
# (i.e. the median of the per-state medians), then convert to a lookup dict
median_life_expectancy_by_state = (
    median_life_expectancy_by_state.fillna(median_life_expectancy_by_state.median())
                                   .to_dict()
)
df_us_city_locs['life_expectancy_state'] = (
    df_us_city_locs['state_id'].map(median_life_expectancy_by_state)
)

In [None]:
# Fill missing city life expectancy with the state-level value and report how
# many rows were filled.
_n = pd.isnull(df_us_city_locs['life_expectancy'])
a = int(_n.sum())  # rows missing before the fill
# Assign the result back rather than filling in place on a column selection,
# which does not reliably update the parent dataframe.
df_us_city_locs['life_expectancy'] = df_us_city_locs['life_expectancy'].fillna(
    df_us_city_locs['life_expectancy_state']
)
# Rows still missing after the fill. The null mask must be recomputed here:
# reusing the pre-fill mask made a - b always equal to zero.
b = int(pd.isnull(df_us_city_locs['life_expectancy']).sum())
c = df_us_city_locs.shape[0]

msg = 'Filled missing values for life expectancy in {} counties ({:.1f}%).'
print(msg.format(a-b, (a-b)/c*100))

In [None]:
# From here on df_life_exp is the city-level table; add the min-max scaled value
df_life_exp = df_us_city_locs.copy()
df_life_exp['life_expectancy_rel'] = normalize_column(df_life_exp, 'life_expectancy')

In [None]:
# Merge to results dataframe (raw and min-max scaled life expectancy)
df_b = df_b.merge(
    df_life_exp[['organization', 'life_expectancy', 'life_expectancy_rel']], 
    on='organization',
    how='left'
)

### Air quality in cities
Pollution level for the most recent year available for PM2.5, PM10, NO2, O3, SO2. Source: (Q10.14)

In [None]:
# The two Q10.14 columns of interest: reporting year and average concentration
cols = [
    'Most  recent years available (select year)',
    'Average concentration for most recent year available (ug/m3)'
]
df_aq = df_cities[
    (df_cities['Question Number'] == '10.14') &
    (df_cities['Column Name'].isin(cols)) &
    (df_cities['Country'] == 'United States of America')
].rename(columns={'Organization': 'organization'})

# Treat 'Question not applicable' as missing
df_aq['Response Answer'] = df_aq['Response Answer'].replace(qna, np.nan)
# `cols` now holds the 'Row Name' values of one sample organisation, ordered by
# 'Row Number'
df_aq, cols = get_ordered_repeating_values(df_aq, 'Row Number', 'Row Name')


# Rename columns after transpose
# Tailor for year for this question (questions for year are the same) and we don't 
# want to overwrite them in the dict; add '_year' flag.
rename_dict_root = {
    'NO2 (1 year (annual) mean)': 'q10.14_no2_annual_mean_rel',
    'O3 (Daily maximum 8 hour mean)': 'q10.14_o3_daily_max_8hr_mean_rel',
    'PM10 (1 year (annual) mean)': 'q10.14_pm10_annual_mean_rel',
    'PM10 (Maximum 24-hour average)': 'q10.14_pm10_24hr_mean_rel',
    'PM2.5 (1 year (annual) mean)': 'q10.14_pm2.5_annual_mean_rel',
    'PM2.5 (Maximum 24-hour average)': 'q10.14_pm2.5_max_24hr_mean_rel',
    'SO2 (Maximum 24-hour average)': 'q10.14_so2_max_24hr_mean_rel'
}
# The first occurrence of each row name gets a '_year' suffix; later occurrences
# keep the plain name.
cols_with_year = []
for c in cols:
    for existing, new in rename_dict_root.items():
        c = c.replace(existing, new)
    if c + '_year' not in cols_with_year:
        cols_with_year.append(c + '_year')
    else:
        cols_with_year.append(c)
# Positional index -> column name, applied to the integer columns produced by
# the unstack below
ordered_questions_dict = {i: c for i, c in enumerate(cols_with_year)}
ordered_questions_dict

# Transpose relevant rows to columns
# NOTE(review): the names above follow a 'Row Number' ordering (year/value pairs
# per pollutant), while the answers below are ordered by 'Column Number' first
# and 'Row Number' second - verify that position i in both orderings refers to
# the same question cell.
df_aq = (
    df_aq.sort_values(by=['Column Number', 'Row Number'])
      .groupby(['organization'])['Response Answer']
      .apply(lambda df: df.reset_index(drop=True))
      .unstack()
      .reset_index()
      .rename(columns=ordered_questions_dict)
)

# Lower concentration is better, so invert the min-max scaled values
for c in rename_dict_root.values():
    df_aq[c] = 1 - normalize_column(df_aq, c)

In [None]:
# Merge to results dataframe (all air quality columns, joined on organization)
df_b = df_b.merge(df_aq, on='organization', how='left')

### Green areas
Area of parks per capita. Source: Q11 and Q0.5 of city questionnaire.

In [None]:
cols = ['Organization', 'Country', 'Response Answer']
# .copy() so the assignments below modify an independent dataframe
df_green = df_cities.loc[
    (df_cities['Question Number'] == '11.0') &
    (~df_cities['Response Answer'].isin([qna])) &
    (df_cities['Country'] == 'United States of America')
].loc[:, cols].copy()

# Remove effect of outliers using interquartile range
# (i.e. City of Lagos, City of Cape Town)
# Only the answer is capped at the upper bound. Assigning to whole rows would
# also overwrite 'Organization' and 'Country', so capped cities could no longer
# be matched to their population or to the results dataframe.
_, ub = get_bounds_using_iqr(df_green, 'Response Answer')
is_outlier = df_green['Response Answer'].astype(float) >= ub
df_green.loc[is_outlier, 'Response Answer'] = ub

# Make per capita
population = _df_pop[['Organization', 'latest_year_pop', 'latest_year']].copy()
df_green = df_green.merge(population, on='Organization', how='left')
df_green['park_space_sq_km_per_capita'] = (
    df_green['Response Answer'].astype(float) / df_green['latest_year_pop'] 
)
df_green['park_space_sq_km_per_capita_rel'] = normalize_column(
    df_green, 'park_space_sq_km_per_capita'
)
# Cities without a usable value score zero
df_green['park_space_sq_km_per_capita_rel'] = (
    df_green['park_space_sq_km_per_capita_rel'].fillna(0)
)

In [None]:
# Merge to results dataframe (only the scaled per capita park space is kept)
df_green.rename(columns={'Organization': 'organization'}, inplace=True)
cols = ['organization', 'park_space_sq_km_per_capita_rel']
df_b = df_b.merge(df_green[cols], on='organization', how='left')

### Access to potable water
Share of total population having access to potable water (Q14.1):

In [None]:
cols = ['Organization', 'Country', 'Response Answer']
# .copy() so the assignments below modify an independent dataframe
df_water = df_cities.loc[
    (df_cities['Question Number'] == '14.1') &
    (df_cities['Country'] == 'United States of America'), 
    cols
].copy()
# Replace zero with NaN
# The comparison is made on the float-converted answers: comparing the raw
# answers (text such as '0') against the number 0 never matches.
water_share = df_water['Response Answer'].astype(float)
df_water['Response Answer'] = water_share.mask(water_share == 0)
df_water['potable_water_rel'] = normalize_column(df_water, 'Response Answer')
# Cities without a usable answer score zero
df_water['potable_water_rel'] = df_water['potable_water_rel'].fillna(0)

In [None]:
# Merge to results dataframe (only the scaled potable water share is kept)
df_water.rename(columns={'Organization': 'organization'}, inplace=True)
cols = ['organization', 'potable_water_rel']
df_b = df_b.merge(df_water[cols], on='organization', how='left')

### Food security

In [None]:
# Q12.6: Percentage of population that is food insecure
cols = ['Organization', 'Country', 'Response Answer']
df_food = df_cities.loc[
    (df_cities['Question Number'] == '12.6') &
    (df_cities['Column Number'] == 1) &
    (df_cities['Country'] == 'United States of America') &
    (~df_cities['Response Answer'].isin([qna, np.nan])),
    cols
]
# Lower food insecurity is better, so invert the min-max scaled value
df_food['q12.6'] = 1 - normalize_column(df_food, 'Response Answer')
df_food = df_food.drop('Response Answer', axis=1)

# Merge to results dataframe
df_food.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(df_food, on=['organization', 'country'], how='left')

### Supplementary
Extracting data from other questions involved in the KPI:

In [None]:
# Q2.0b: Is public health listed as one of the areas/sectors covered by the risk and 
# vulnerability assessment (Column_Number=8)?
cols = ['Organization', 'Country', 'Response Answer']
_df = df_cities.loc[
    (df_cities['Question Number'] == '2.0b') &
    (df_cities['Column Number'] == 8) &
    (df_cities['Country'] == 'United States of America'),
    cols
]

# 1.0 if the (lower-cased) answer mentions 'health', else 0.0; missing answers
# count as 0.0
_df['Response Answer'] = _df['Response Answer'].str.lower().fillna('')
_df['Response Answer'] = _df['Response Answer'].str.contains('health').astype('int')
_df['Response Answer'] = _df['Response Answer'].astype(float)
# An organisation scores 1.0 if any of its rows mentions health
_df = _df.groupby(['Organization', 'Country']).max().reset_index()
_df = _df.rename(columns={'Response Answer': 'q2.0b'})

# Merge to results dataframe
_df.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(_df, on=['organization', 'country'], how='left')

In [None]:
# Q2.2: Factors that affect ability to adapt: is public health included in the assessment?
cols = ['Organization', 'Country', 'Response Answer']
_df = df_cities.loc[
    (df_cities['Question Number'] == '2.2') &
    (df_cities['Column Number'] == 4) &
    (df_cities['Country'] == 'United States of America'),
    cols
]

# 1 if not empty
# (null -> -1 + 1 = 0, non-null -> -0 + 1 = 1)
_df['Response Answer'] = -pd.isnull(_df['Response Answer']).astype(int) + 1
# An organisation scores 1 if any of its rows has an answer
_df = _df.groupby(['Organization', 'Country']).max().reset_index()
_df = _df.rename(columns={'Response Answer': 'q2.2'})

# Merge to results dataframe
_df.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(_df, on=['organization', 'country'], how='left')

In [None]:
# Q3.0: Do any of the actions include public health as a co-benefit?
cols = ['Organization', 'Country', 'Response Answer']
# Co-benefit answers that are counted as health-related
answers = ['Improved resource quality (e.g. air, water)', 'Improved public health']
_df = df_cities.loc[
    (df_cities['Question Number'] == '3.0') &
    (df_cities['Column Number'] == 6) &
    (df_cities['Country'] == 'United States of America') &
    (df_cities['Response Answer'].isin(answers)),
    cols
]
# 1 for every organisation with at least one matching answer. Organisations with
# none are absent here and end up with a missing value after the left merge.
_df = _df.groupby(['Organization', 'Country']).count().reset_index()
_df['Response Answer'] = _df['Response Answer'].apply(lambda c: c > 0).astype(int)
_df = _df.rename(columns={'Response Answer': 'q3.0_health'})

# Merge to results dataframe
_df.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(_df, on=['organization', 'country'], how='left')

In [None]:
# Q10.7: Do you have a low or zero-emission zone in your city?
cols = ['Organization', 'Country', 'Response Answer']
_df = df_cities.loc[
    (df_cities['Question Number'] == '10.7') &
    (df_cities['Country'] == 'United States of America') &
    (df_cities['Response Answer'] == 'Yes'),
    cols
]
# q10.7 is the number of 'Yes' rows per organisation.
# NOTE(review): unlike q3.0_health this count is not reduced to 0/1 - presumably
# each organisation has at most one Q10.7 row; confirm.
_df = _df.groupby(['Organization', 'Country']).count().reset_index()
_df = _df.rename(columns={'Response Answer': 'q10.7'})

# Merge to results dataframe
_df.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(_df, on=['organization', 'country'], how='left')

In [None]:
# Q10.9: How many public access EV charging points do you have in your city and/or 
# metropolitan area for the following types. (Met. area => Column Number = 2)
cols = ['Organization', 'Country', 'Response Answer']
_df = df_cities.loc[
    (df_cities['Question Number'] == '10.9') &
    (df_cities['Column Number'] == 2) &
    (df_cities['Row Name'] == 'All types') &
    (df_cities['Country'] == 'United States of America') &
    (~df_cities['Response Answer'].isin([qna, np.nan])),
    cols
]

# Make per capita
population = _df_pop[['Organization', 'latest_year_pop', 'latest_year']].copy()
_df = _df.merge(population, on='Organization', how='left')
_df['ev_charging_points_per_capita'] = (
    _df['Response Answer'].astype(float) / _df['latest_year_pop'] 
)
# Min-max scale across the responding cities
_df['ev_charging_points_per_capita_rel'] = normalize_column(
    _df, 'ev_charging_points_per_capita'
)

# Merge to results dataframe
cols = ['organization', 'country', 'ev_charging_points_per_capita_rel']
_df.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(_df[cols], on=['organization', 'country'], how='left')

In [None]:
# Q14.2: Please select the actions you are taking to reduce the risks to your city's 
# water security
#    - If the answer is yes and Q14.3 includes an adaptation action - full score
#    - If the answer is yes and no adaptation action included in Q14.3 - zero score.

cols = ['Organization', 'Country', 'Question Number', 'Column Name', 'Response Answer']
_df = df_cities.loc[
    (df_cities['Question Number'].isin(['14.2', '14.3'])) &
    (df_cities['Column Name'].isin([np.nan, 'Adaptation action'])) &
    (df_cities['Country'] == 'United States of America') &
    (~df_cities['Response Answer'].isin([qna, np.nan])),
    cols
] 

# Find orgs with answers to both 14.2, 14.3 where 14.3 includes 'Adaptation action'
# (1 if any of the organisation's rows has Column Name 'Adaptation action', else 0)
adaptation_action_incl = (
    _df.groupby('Organization')['Column Name']
       .apply(lambda r: 'Adaptation action' in list(r))
       .astype(int)
       .rename('q14.2')
)

# Attach the flag and keep a single row per organisation
_df = (
    _df.merge(adaptation_action_incl, on='Organization', how='left')
       .drop_duplicates(subset=['Organization'])
)
_df = _df[['Organization', 'Country', 'q14.2']]

# Merge to results dataframe
_df.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(_df, on=['organization', 'country'], how='left')

In [None]:
# Q12.4: How does your city increase access to sustainable foods?
cols = ['Organization', 'Country', 'Response Answer']
_df = df_cities.loc[
    (df_cities['Question Number'] == '12.4') &
    (df_cities['Column Number'] == 1) &
    (df_cities['Country'] == 'United States of America'),
    cols
]

# Collect every answer of an organisation, then score 0.25 per 'Yes'
_df = _df.groupby(['Organization', 'Country'])['Response Answer'].apply(list).reset_index()
mapping = {'No': 0., 'Yes': 0.25, np.nan: 0, 'Do not know': 0}
# Answers that are not in the mapping (including a missing value that is not the
# very same NaN object used as the key above) score zero instead of making the
# sum fail on None.
_df['q12.4'] = _df['Response Answer'].apply(
    lambda l: sum(mapping.get(_l, 0) for _l in l)
)
_df = _df[['Organization', 'Country', 'q12.4']]

# Merge to results dataframe
_df.rename(columns=org_country_rename_dict, inplace=True)
_df['q12.4'] = normalize_column(_df, 'q12.4')
df_b = df_b.merge(_df, on=['organization', 'country'], how='left')

## Employment
### Unemployment rate

In [None]:
fpath = '../input/cdp-unlocking-climate-solutions/Supplementary Data/CDC Social Vulnerability Index 2018/SVI2018_US_COUNTY.csv'
cols = {'ST_ABBR': 'state_id', 'COUNTY': 'county_name', 'MP_UNEMP': 'unemp_rate'}
df_emp = pd.read_csv(fpath, usecols=cols.keys())
df_emp = df_emp.rename(columns=cols)
# Keep non-negative rates only; .copy() so the capping below modifies an
# independent dataframe
df_emp = df_emp.loc[df_emp['unemp_rate'] >= 0].copy()

# Remove effect of outliers using interquartile range
# (i.e. Loving, TX: 54.8% unemployment)
# Only the rate is capped at the upper bound. Assigning to whole rows would also
# overwrite 'state_id' and 'county_name', so capped counties could no longer be
# matched in the merge below.
_, ub = get_bounds_using_iqr(df_emp, 'unemp_rate')
is_outlier = df_emp['unemp_rate'].astype(float) >= ub
df_emp.loc[is_outlier, 'unemp_rate'] = ub

# Normalize after merge as cities exist in emp dataset which don't in results df
cols = ['county_name', 'state_id']
df_b = df_b.merge(df_emp[cols + ['unemp_rate']], on=cols, how='left')
# Lower unemployment is better, so invert the min-max scaled rate
df_b['emp_rate_rel'] = 1 - normalize_column(df_b, 'unemp_rate')
df_b = df_b.drop('unemp_rate', axis=1)

### Share of green jobs
Share of green jobs in the city (Q6.15):

In [None]:
cols = ['Organization', 'Country', 'Response Answer']
df_gjobs = df_cities.loc[
    (df_cities['Question Number'] == '6.15') &
    (df_cities['Column Number'] == 1) &
    (~df_cities['Response Answer'].isin([qna])) &
    (df_cities['Country'] == 'United States of America')
].loc[:, cols]

# Make per capita
population = _df_pop[['Organization', 'latest_year_pop', 'latest_year']].copy()
df_gjobs = df_gjobs.merge(population, on='Organization', how='left')
df_gjobs['green_jobs_per_capita'] = (
    df_gjobs['Response Answer'].astype(float) / df_gjobs['latest_year_pop'] 
)
df_gjobs['green_jobs_per_capita_rel'] = normalize_column(
    df_gjobs, 'green_jobs_per_capita'
)
# Assign the result back rather than filling in place on a column selection,
# which does not reliably update the parent dataframe.
df_gjobs['green_jobs_per_capita_rel'] = (
    df_gjobs['green_jobs_per_capita_rel'].fillna(0)
)

# Keep only the cities for which a per capita figure could be computed
df_gjobs = df_gjobs[~pd.isnull(df_gjobs['green_jobs_per_capita'])]

# Merge to results dataframe
cols = ['organization', 'country', 'green_jobs_per_capita_rel']
df_gjobs.rename(columns=org_country_rename_dict, inplace=True)
df_b = df_b.merge(df_gjobs[cols], on=['organization', 'country'], how='left')

## Congestion
### Average commute time
Average commute time is taken from the US Census Bureau in the format 'count of occurrences in bin per county'. We assume the unbounded upper bin has a median of 65 minutes, i.e. we treat it as a bin of the same width as the unbounded lower bin (for which we take x < 10 => x = 5). The median is calculated using the following formula:

<center>$$\text{median} = L + w \, \frac{n/2 - c}{f}$$</center>

where: 
* $L$ is the lower limit of the bin containing the median
* $w$ is the width of that bin 
* $n$ is the total population
* $c$ is the cumulative count (cumulative frequency) up to $L$ (the end of the previous bin)
* $f$ is the count (frequency) within the bin containing the median

In [None]:
fpath = '../input/average-commute-time-by-city-us-census-bureau/ACSDT1Y2019.B08135_data_with_overlays_2020-11-29T095052.csv'
# Read the header row only, to select every column except the margin-of-error ones
cols = pd.read_csv(fpath, header=1, nrows=1).columns.tolist()
cols = [c for c in cols if not 'Margin of Error!' in c]
df_ct = pd.read_csv(fpath, header=1, usecols=cols)

# Shorten the column names: strip the common prefix, replace spaces with
# underscores and lower-case
prepending = 'Estimate!!Aggregate travel time to work (in minutes):!!'
df_ct = df_ct.rename(columns={
    c: c.replace(prepending, '').replace(' ', '_').lower() for c in df_ct.columns
})
df_ct = df_ct.rename(columns={
    'estimate!!aggregate_travel_time_to_work_(in_minutes):': 'sum_of_travel_time'
})

# Mid-point ('median') and width, in minutes, assumed for each travel-time bin.
# The two open-ended bins are given a width of 10 and mid-points of 5 and 65.
Bin = collections.namedtuple('Bin', ['median', 'width'])
bins_map = {
    'less_than_10_minutes': Bin(5, 10),
    '10_to_14_minutes': Bin(12.5, 5),
    '15_to_19_minutes': Bin(17.5, 5),
    '20_to_24_minutes': Bin(22.5, 5),
    '25_to_29_minutes': Bin(27.5, 5),
    '30_to_34_minutes': Bin(32.5, 5),
    '35_to_44_minutes': Bin(40, 10),
    '45_to_59_minutes': Bin(52.5, 15),
    '60_or_more_minutes': Bin(65, 10)
}

In [None]:
def get_median_from_bins(r: pd.Series, bin_cols: List[str], bins_map: Dict[str, Bin]):
    """
    Interpolate a median from one row of binned values.

    Parameters:
        r: row holding one value per bin; only the entries named in
            ``bin_cols`` are used.
        bin_cols: bin column names, ordered from the lowest to the highest bin.
        bins_map: maps each bin column to a ``Bin`` with its representative
            ('median') value and its width.

    Returns:
        ``L + w * (n / 2 - c) / f`` evaluated for the bin in which the running
        total first reaches half of the overall total.

    NOTE(review): every bin value is multiplied by the bin's representative
    value before the totals are formed, so n, c and f are weighted totals
    rather than raw bin values - confirm this is the intended weighting.
    """
    values = r[bin_cols]
    labels = values.index
    midpoints = pd.Series([bins_map[k].median for k in labels], index=labels)
    widths = pd.Series([bins_map[k].width for k in labels], index=labels)

    # Weighted value per bin, its grand total and its running total
    weighted = midpoints * values
    total = weighted.sum()
    running = weighted.cumsum()

    # First bin whose running total reaches half of the grand total
    label = labels[running.searchsorted(total / 2)]

    lower_limit = midpoints.loc[label] - widths.loc[label] / 2
    width = widths.loc[label]
    in_bin = weighted.loc[label]
    below_bin = running.loc[label] - in_bin
    return lower_limit + width * (total / 2 - below_bin) / in_bin
    

# Bin columns listed from the shortest to the longest commute; the order
# matters because get_median_from_bins accumulates over them in sequence.
bin_cols = [
    'less_than_10_minutes', '10_to_14_minutes', '15_to_19_minutes',
    '20_to_24_minutes', '25_to_29_minutes', '30_to_34_minutes',
    '35_to_44_minutes', '45_to_59_minutes', '60_or_more_minutes',
]


def _row_median(row):
    """Interpolated median commute time for a single df_ct row."""
    return get_median_from_bins(row, bin_cols, bins_map)


df_ct['av_commute_mins'] = df_ct.apply(_row_median, axis=1)

In [None]:
# Merge with US city locations information
cols = ['geographic_area_name', 'av_commute_mins']
# Copy so that the derived columns below are written to an independent frame
# rather than to a slice of the previous df_ct (avoids SettingWithCopyWarning).
df_ct = df_ct[cols].copy()
# The area name is split on ',': the first part is the county (with the word
# 'County' removed), the second part is the state.
df_ct['county_name'] = df_ct['geographic_area_name'].apply(
    lambda s: s.split(',')[0].replace('County', '').strip().lower()
)
df_ct['state_name'] = df_ct['geographic_area_name'].apply(
    lambda s: s.split(',')[1].strip().lower()
)

# Merge with results dataframe (keys are lower-cased on both sides)
df_b['state_name'] = df_b['state_name'].str.lower()
df_b['county_name'] = df_b['county_name'].str.lower()
df_b = df_b.merge(
    df_ct[['county_name', 'state_name', 'av_commute_mins']], 
    on=['county_name', 'state_name'], 
    how='left'
)

# Fill missing counties with the state median and make relative.
# Series.median() skips NaN. The previous np.median(r) returned NaN for any
# state that had at least one missing county, so nothing was ever filled at
# state level and every gap fell through to the US-wide median below.
a = df_b['av_commute_mins'].isna().sum()
df_b['av_commute_mins'] = (
    df_b.groupby('state_id')['av_commute_mins']
        .transform(lambda r: r.fillna(r.median()))
)
b = df_b['av_commute_mins'].isna().sum()

# Whatever is still missing (states without any value) gets the US-wide median
us_median = df_b['av_commute_mins'].median()
df_b['av_commute_mins'] = df_b['av_commute_mins'].fillna(us_median)
# Inverted so that a shorter commute gives a higher relative score
df_b['av_commute_mins_rel'] = 1 - normalize_column(df_b, 'av_commute_mins')

c = df_b['av_commute_mins_rel'].isna().sum()
print(f"Filled {a-b} county records with the state average commute time.")
print(f"Filled {b-c} county records with the US average commute time: " +
      f"{us_median:.1f}.")

### Share of transportation 
Excluding private transport/taxis (Q10.1):

In [None]:
# Q10.1 responses of US cities; 'Organization' is renamed to match df_b's key
df_transp = df_cities.loc[
    (df_cities['Question Number'] == '10.1') &
    (df_cities['Country'] == 'United States of America')
].rename(columns={'Organization': 'organization'})

# Treat the 'question not applicable' marker (qna, defined elsewhere) as missing
df_transp['Response Answer'] = df_transp['Response Answer'].replace(qna, np.nan)

# Transpose relevant rows to columns
# NOTE(review): get_ordered_repeating_values is defined elsewhere; from its use
# here it returns the frame plus the ordered 'Column Name' values - confirm.
df_transp, cols = get_ordered_repeating_values(df_transp, 'Column Name')
df_transp = (
    df_transp.sort_values(by=['Column Number'])
             .groupby(['organization'])['Response Answer']
             # Renumber each city's answers 0..k-1 so that unstack() turns
             # the answer position into a column
             .apply(lambda df: df.reset_index(drop=True))
             .unstack()
             .reset_index()
             # Positional columns 0..k-1 take the names collected in cols
             .rename(columns={i: c for i, c in enumerate(cols)})
)

# Scale sum down to 100, but not up to 100
# min_count=1 leaves the sum as NaN for rows without any numeric answer
df_transp['sum'] = df_transp[cols].astype(float).sum(axis=1, min_count=1)
msg = 'Transport split completed for {}/{} ({:.1f}%) records.'
# a: rows whose shares add up to at least 100; b: all rows
a = df_transp[df_transp['sum'] >= 100].shape[0]
b = df_transp.shape[0]
print(msg.format(a, b, a/b*100))

def scale_to_100(r: pd.Series, cols: List[str]) -> pd.Series:
    """
    Rescale the entries of ``r`` named in ``cols`` so that they add up to 100.

    Every listed entry is converted to float, multiplied by 100 and divided
    by ``r['sum']``; ``r['sum']`` is then set to 100. The row is modified in
    place and the same object is returned.
    """
    for name in cols:
        share = float(r[name])
        r[name] = share * 100 / r['sum']
    r['sum'] = 100.
    return r

def _rescale_row(row):
    """Scale one over-reported row back down to a total of 100."""
    return scale_to_100(row, cols)


# Only rows whose shares add up to more than 100 are scaled (down, never up)
_over_100 = df_transp['sum'] > 100
df_transp.loc[_over_100] = df_transp.loc[_over_100].apply(_rescale_row, axis=1)

# Get percentage which is not private car / taxi
undesired_cols = ['Private motorized transport', 'Taxis or For Hire Vehicles']
cols = [name for name in cols if name not in undesired_cols]
df_transp['sum_public_tr'] = (
    df_transp[cols].astype(float)
                   .sum(axis=1, min_count=1)
)
# Rows without any usable share normalize to NaN and are scored 0
df_transp['public_tr_rel'] = (
    normalize_column(df_transp, 'sum_public_tr').fillna(0)
)
df_transp = df_transp[['organization', 'public_tr_rel']]

# Merge to results dataframe
df_b = df_b.merge(df_transp, how='left', on='organization')

### Supplementary
Extracting data from other questions involved in the KPI:

In [None]:
# Q2.1: Is transport identified as one of the areas/sectors covered by the
# risk and vulnerability assessment? Cities answering 'Transport' score 1.
cols = ['Organization', 'Country', 'Response Answer']
_mask = (
    (df_cities['Question Number'] == '2.1')
    & (df_cities['Column Number'] == 6)
    & (df_cities['Country'] == 'United States of America')
    & (df_cities['Response Answer'] == 'Transport')
)

# One row per city, flagged with 1, keyed like the results dataframe
_df = (
    df_cities.loc[_mask, cols]
             .drop_duplicates()
             .rename(columns={'Response Answer': 'q2.1'})
             .assign(**{'q2.1': 1})
             .rename(columns=org_country_rename_dict)
)

# Merge to results dataframe
df_b = df_b.merge(_df, how='left', on=['organization', 'country'])

In [None]:
# Q5.4: Describe the anticipated outcomes of the most impactful mitigation
# actions your city is currently undertaking; the total cost of the action and
# how much is being funded by the local government.
# If one or more mitigation actions related to "Mass Transit" is/are
# included - full score.
cols = ['Organization', 'Country', 'Response Answer']
_mask = (
    (df_cities['Question Number'] == '5.4')
    & (df_cities['Column Number'] == 1)
    & (df_cities['Country'] == 'United States of America')
    & (df_cities['Response Answer'].str.contains('Mass Transit'))
)

# The answer is replaced by the score 1 before de-duplicating, which leaves
# one row per city however many matching actions were reported.
_df = (
    df_cities.loc[_mask, cols]
             .assign(**{'Response Answer': 1})
             .drop_duplicates()
             .rename(columns={'Response Answer': 'q5.4'})
             .rename(columns=org_country_rename_dict)
)

# Merge to results dataframe
df_b = df_b.merge(_df, how='left', on=['organization', 'country'])

In [None]:
# Q5.5a: Please attach your city's climate change mitigation plan below. If
# your city has both mitigation and energy access plans, please make sure to
# attach all relevant documents below.
#    - If area covered includes "Transport (Mobility)" - 1 (Column Number = 5)
#    - If not included - 0
cols = ['Organization', 'Country', 'Response Answer']
_mask = (
    (df_cities['Question Number'] == '5.5a')
    & (df_cities['Column Number'] == 5)
    & (df_cities['Country'] == 'United States of America')
    & (df_cities['Response Answer'] == 'Transport (Mobility)')
)

# One row per city, scored 1, keyed like the results dataframe
_df = (
    df_cities.loc[_mask, cols]
             .assign(**{'Response Answer': 1})
             .drop_duplicates()
             .rename(columns={'Response Answer': 'q5.5a'})
             .rename(columns=org_country_rename_dict)
)

# Merge to results dataframe
df_b = df_b.merge(_df, how='left', on=['organization', 'country'])

## Inclusion

In [None]:
fpath = '../input/urban-institute-racial-and-economic-indexes/urban_inclusion_indexes.csv'
df_inclusion = pd.read_csv(fpath)
df_inclusion['city'] = df_inclusion['city'].str.lower()

cols = ['economic_inclusion_index_2016', 'racial_inclusion_index_2016']

# Merge with results dataset (some cities in the inclusion dataset don't exist
# in the CDP responses, hence safe to do normalization after merge)
df_inclusion.rename(columns={'state': 'state_id'}, inplace=True)
df_inclusion = df_inclusion[['city', 'state_id'] + cols]
df_b = df_b.merge(df_inclusion, on=['city', 'state_id'], how='left')

# Normalize, then fill cities without an index with the median of the
# normalized values (the median is kept for the message below)
cols_median = {}
for c in cols:
    df_b[c] = normalize_column(df_b, c)
    cols_median[c] = df_b[c].median()
    df_b[c] = df_b[c].fillna(cols_median[c])
    
# a: cities covered by the inclusion dataset; b: cities that reported to CDP
a = set(df_inclusion['city'])
b = set(df_us_city_locs['city'])
# Each label is paired with its own index; the two medians were previously
# printed under each other's label.
msg = (
    f"Racial / economic inclusion index not available for {len(b - a)}/{len(b)} " +
    f"({len(b - a) / len(b) * 100:.1f}%) cities who reported to CDP. These will " +
    f"be filled with the median normalized values - " +
    f"racial: {cols_median['racial_inclusion_index_2016']:.3f} " +
    f"economic: {cols_median['economic_inclusion_index_2016']:.3f}."
)
print(msg)

### Supplementary
Extracting data from other questions involved in the KPI:

In [None]:
# Q3.0: Please describe the main actions you are taking to reduce the risk to,
# and vulnerability of, your city's infrastructure, services, citizens, and
# businesses from climate change as identified in the Climate...
#    - At least one action includes "Social inclusion, social justice" as a
#      co-benefit - 1
#    - Not included - 0
# NOTE(review): the original comment referred to Column Number = 2, while the
# filter below selects Column Number 6 - confirm which column is intended.
cols = ['Organization', 'Country', 'Response Answer']
_mask = (
    (df_cities['Question Number'] == '3.0')
    & (df_cities['Column Number'] == 6)
    & (df_cities['Country'] == 'United States of America')
    & (df_cities['Response Answer'] == 'Social inclusion, social justice')
)

# One row per city, scored 1, keyed like the results dataframe
_df = (
    df_cities.loc[_mask, cols]
             .assign(**{'Response Answer': 1})
             .drop_duplicates()
             .rename(columns={'Response Answer': 'q3.0_inclusion'})
             .rename(columns=org_country_rename_dict)
)

# Merge to results dataframe
df_b = df_b.merge(_df, how='left', on=['organization', 'country'])

## Labelling city actions: BERT
Code based on multi-label classifier implementation of HuggingFace library, here: https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX

See the CDP:ABC-NLP project notebook for the model.

In [None]:
# Create a simple test dataframe to check; used in development only
# This is completely separate from the training and validation data
# This is Andrew Ng's "blurry cat picture" data :)
# Label positions follow the model's output columns used later in this
# notebook: [employment, inclusion, health, congestion].
df_cheeky_test = pd.DataFrame({
    'response_answer_translated': [
        'We are considering the health of the public.',
        'This will create jobs.',
        'We will make sure to look at all the options.',
        'Our hospitals have set aside additional capacity',
        'Vulnerable people are catered for.',
        'Employment opportunities have been created.',
        'We are making sure vulnerable people have health care plans',
        'We are providing additional funding for improving the transporting network',
        'This will improve general health.',
        'Making sure vulnerable groups are considered when planning health measures',
        'Including traffic reduction schemes to improve health'
    ],
    'label': [
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 1, 0],
        [0, 0, 0, 1],
        # Health, not employment: this sentence was previously labelled [1, 0, 0, 0]
        [0, 0, 1, 0],
        [0, 1, 1, 0],
        [0, 0, 1, 1]
    ]
})

### Answering questions with the model

In [None]:
import tensorflow as tf
import torch
import pandas as pd

# Report whether TensorFlow can see a GPU (informational only)
device_name = tf.test.gpu_device_name()
print(
    'Found GPU at: {}'.format(device_name)
    if device_name == '/device:GPU:0'
    else 'No GPU found.'
)


# Pick the torch device; `gpu` is the flag the later cells branch on
gpu = torch.cuda.is_available()
if not gpu:
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')
else:
    device = torch.device('cuda')
    torch.cuda.empty_cache()
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Settings - only change if respective changes made in NLP notebook first
TOKEN_LENGTH = 512  # max_length passed to the tokenizer (inputs are padded to it)
BATCH_SIZE = 16     # batch size of the prediction DataLoader
SEED = 42           # not referenced in the visible part of this notebook
threshold = 0.5     # sigmoid output above which a label counts as predicted

# Model to load
load_model = True   # load saved weights (only done when a GPU is available)
# model_tag and model_epoch form the weights file name: model_<tag>_<epoch>
model_tag = 4
model_epoch = 750

In [None]:
from transformers.modeling_bert import BertPreTrainedModel, BertModel
from torch.nn import BCEWithLogitsLoss

class BertFromPreTrained(BertPreTrainedModel):
    """
    Wrapper around ``BertModel`` used purely as a text encoder: ``forward``
    returns the second element of the encoder output (``pooled_output``).
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # `labels` is accepted but not used.
        # NOTE(review): the arguments are passed to BertModel positionally as
        # (input_ids, token_type_ids, attention_mask). Confirm this matches the
        # positional order of BertModel.forward in the installed transformers
        # version; any change here must be mirrored in the NLP notebook that
        # trained the classification head, otherwise the features will differ.
        _sequence_output, pooled_output = self.bert(
            input_ids, token_type_ids, attention_mask
        )
        return pooled_output

    def _set_encoder_trainable(self, trainable):
        """Switch gradient tracking on or off for every encoder parameter."""
        for param in self.bert.parameters():
            param.requires_grad = trainable

    def freeze_bert_encoder(self):
        self._set_encoder_trainable(False)

    def unfreeze_bert_encoder(self):
        self._set_encoder_trainable(True)

In [None]:
# Instantiate the encoder from the published 'bert-base-uncased' checkpoint
pretrained_bert = BertFromPreTrained.from_pretrained(
    'bert-base-uncased',  # Use the 12-layer BERT model, with an uncased vocab.
    output_attentions=False,
    output_hidden_states=False,
)

# Move the encoder to the selected device and switch off gradients for all of
# its parameters: it is used for feature extraction only
pretrained_bert.to(device)
pretrained_bert.freeze_bert_encoder()

In [None]:
import torch
import torch.nn.functional as F
from sklearn.datasets import make_multilabel_classification
from torch import optim

class Network(torch.nn.Module):
    """
    Classification head applied to the 768-wide encoder output:
    dropout (p=0.1) -> linear 768->32 -> ReLU -> linear 32->4.
    Returns raw logits; no sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        # Attribute names double as state-dict keys for the saved weights
        self.dropout = torch.nn.Dropout(0.1)
        self.layer1 = torch.nn.Linear(768, 32)
        self.logits = torch.nn.Linear(32, 4)

    def forward(self, x):
        hidden = F.relu(self.layer1(self.dropout(x)))
        return self.logits(hidden)


# The model is created on the CPU here; it is only moved to `device` once the
# weights have been loaded.
model = Network()

In [None]:
# Load weights
# Only done when a GPU is available; without one the model keeps its random
# initial weights (the prediction cells below are guarded by `if gpu` as well)
if load_model and gpu:
    # File name is built from the model_tag / model_epoch settings
    fpath = f'../input/cdp-abc-nlp/model_{model_tag}_{model_epoch}'
    print(f'Model fpath: {fpath}')
    model.load_state_dict(torch.load(fpath))
    model.to(device)

In [None]:
import pandas as pd
import functools
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import torch.nn.functional as F


def make_predictions(model, df_test: pd.DataFrame, text_col: str, threshold: float = 0.5):
    """
    Run the classification head over the texts in ``df_test[text_col]``.

    Each text is tokenized with the 'bert-base-uncased' tokenizer, encoded in
    batches by ``pretrained_bert`` and passed through ``model``; a sigmoid is
    applied to the model output.

    Returns:
        A pair of stacked arrays, one row per text: the sigmoid outputs
        thresholded at ``threshold`` (as floats), and the sigmoid outputs
        themselves.
    """

    def _build_dataloader(df_test: pd.DataFrame, text_col: str):
        """Tokenize every text and wrap the tensors in a sequential DataLoader."""
        print('Number of test sequences: {:,}\n'.format(df_test.shape[0]))
        texts = df_test[text_col].values

        # Load tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        encodings = [
            tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=TOKEN_LENGTH,
                pad_to_max_length=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            for text in texts
        ]
        input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
        attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

        dataset = TensorDataset(input_ids, attention_masks)
        return DataLoader(
            dataset, sampler=SequentialSampler(dataset), batch_size=BATCH_SIZE
        )

    # Prediction on test set
    model.eval()
    to_probability = torch.nn.Sigmoid()

    probabilities = []
    for batch in _build_dataloader(df_test, text_col):
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # Forward pass without gradients; no token type ids and no labels
        with torch.no_grad():
            pooled = pretrained_bert(b_input_ids, None, b_input_mask)
            batch_probs = to_probability(model(pooled))

        probabilities.append(batch_probs.detach().cpu().numpy())

    print('Completed predictions.')
    binary = [(p > threshold).astype(float) for p in probabilities]
    return np.vstack(binary), np.vstack(probabilities)


def _predict_on_dataframe(model, output_col_names: List[str], df_test: pd.DataFrame, 
                          text_col: str, threshold: float = 0.5):
    """
    Append the thresholded predictions for ``df_test[text_col]`` as columns.

    The input frame is re-indexed with ``reset_index()`` (its previous index
    becomes a column) and one column per name in ``output_col_names`` is
    placed next to it. The unthresholded outputs are discarded.
    """
    binary_preds, _unused_probs = make_predictions(model, df_test, text_col, threshold)
    df_labels = pd.DataFrame(binary_preds, columns=output_col_names)
    return pd.concat([df_test.reset_index(), df_labels], axis=1)


# Add partial function for our use case: the model and the label names are
# fixed, so callers pass only the dataframe, the text column and the threshold
output_col_names = ['employment', 'inclusion', 'health', 'congestion']
predict_on_dataframe = functools.partial(
    _predict_on_dataframe, model, output_col_names
)

In [None]:
# Q3.0. Do any of the actions explain how public health is incorporated in the
# adaptation actions? Scored 1 when the model assigns the 'health' label to at
# least one of the city's free-text answers (Column Number = 8).
if gpu:
    q_col = 'q3.0_nlp_h'
    cols = ['Organization', 'Country', 'Response Answer']
    _has_text = ~df_cities['Response Answer'].isin([qna, np.nan])
    _selected = (
        (df_cities['Question Number'] == '3.0')
        & (df_cities['Column Number'] == 8)
        & (df_cities['Country'] == 'United States of America')
        & _has_text
    )
    _df = predict_on_dataframe(
        df_cities.loc[_selected, cols], 'Response Answer', threshold=threshold
    )

    # Collapse to one row per city: 1.0 if any answer was labelled, else 0.0
    _df = (
        _df.rename(columns={'health': q_col})
           .groupby(['Organization', 'Country'])
           .agg({q_col: 'sum'})
           .reset_index()
    )
    _df[q_col] = (_df[q_col] > 0).astype(float)

    # Merge to results dataframe; cities without a usable answer score 0
    _df = _df.rename(columns=org_country_rename_dict)
    cols = ['organization', 'country', q_col]
    df_b = df_b.merge(_df[cols], how='left', on=['organization', 'country'])
    df_b[q_col] = df_b[q_col].fillna(0)

In [None]:
# Q3.5. Please explain how your city has addressed vulnerable groups through
# transformative action - employment is mentioned. Scored 1 when the model
# assigns the 'employment' label to at least one of the city's answers.
if gpu:
    q_col = 'q3.5_nlp_e'
    cols = ['Organization', 'Country', 'Response Answer']
    _has_text = ~df_cities['Response Answer'].isin([qna, np.nan])
    _selected = (
        (df_cities['Question Number'] == '3.5')
        & (df_cities['Country'] == 'United States of America')
        & _has_text
    )
    _df = predict_on_dataframe(
        df_cities.loc[_selected, cols], 'Response Answer', threshold=threshold
    )

    # Collapse to one row per city: 1.0 if any answer was labelled, else 0.0
    _df = (
        _df.rename(columns={'employment': q_col})
           .groupby(['Organization', 'Country'])
           .agg({q_col: 'sum'})
           .reset_index()
    )
    _df[q_col] = (_df[q_col] > 0).astype(float)

    # Merge to results dataframe; cities without a usable answer score 0
    _df = _df.rename(columns=org_country_rename_dict)
    cols = ['organization', 'country', q_col]
    df_b = df_b.merge(_df[cols], how='left', on=['organization', 'country'])
    df_b[q_col] = df_b[q_col].fillna(0)

In [None]:
# Q6.2a. Does your city collaborate in partnership with businesses in your
# city on sustainability projects? Scored 1 when the model assigns the
# 'employment' label to at least one answer in Column Number 3.
if gpu:
    q_col = 'q6.2a_nlp_e'
    cols = ['Organization', 'Country', 'Response Answer']
    _has_text = ~df_cities['Response Answer'].isin([qna, np.nan])
    _selected = (
        (df_cities['Question Number'] == '6.2a')
        & (df_cities['Column Number'] == 3)
        & (df_cities['Country'] == 'United States of America')
        & _has_text
    )
    _df = predict_on_dataframe(
        df_cities.loc[_selected, cols], 'Response Answer', threshold=threshold
    )

    # Collapse to one row per city: 1.0 if any answer was labelled, else 0.0
    _df = (
        _df.rename(columns={'employment': q_col})
           .groupby(['Organization', 'Country'])
           .agg({q_col: 'sum'})
           .reset_index()
    )
    _df[q_col] = (_df[q_col] > 0).astype(float)

    # Merge to results dataframe; cities without a usable answer score 0
    _df = _df.rename(columns=org_country_rename_dict)
    cols = ['organization', 'country', q_col]
    df_b = df_b.merge(_df[cols], how='left', on=['organization', 'country'])
    df_b[q_col] = df_b[q_col].fillna(0)

In [None]:
# Q6.3. Describe how your local/regional government collaborates and
# coordinates horizontally on climate action. Scored 1 when the model assigns
# the 'employment' label to at least one answer in Column Number 2.
if gpu:
    q_col = 'q6.3_nlp_e'
    cols = ['Organization', 'Country', 'Response Answer']
    _has_text = ~df_cities['Response Answer'].isin([qna, np.nan])
    _selected = (
        (df_cities['Question Number'] == '6.3')
        & (df_cities['Column Number'] == 2)
        & (df_cities['Country'] == 'United States of America')
        & _has_text
    )
    _df = predict_on_dataframe(
        df_cities.loc[_selected, cols], 'Response Answer', threshold=threshold
    )

    # Collapse to one row per city: 1.0 if any answer was labelled, else 0.0
    _df = (
        _df.rename(columns={'employment': q_col})
           .groupby(['Organization', 'Country'])
           .agg({q_col: 'sum'})
           .reset_index()
    )
    _df[q_col] = (_df[q_col] > 0).astype(float)

    # Merge to results dataframe; cities without a usable answer score 0
    _df = _df.rename(columns=org_country_rename_dict)
    cols = ['organization', 'country', q_col]
    df_b = df_b.merge(_df[cols], how='left', on=['organization', 'country'])
    df_b[q_col] = df_b[q_col].fillna(0)

In [None]:
# Q3.5: Please explain how your city has addressed vulnerable groups through
# transformative action. (Description includes any actions to support
# inclusion/address inequality.) Scored 1 when the model assigns the
# 'inclusion' label to at least one of the city's answers.
if gpu:
    q_col = 'q3.5_nlp_i'
    cols = ['Organization', 'Country', 'Response Answer']
    _has_text = ~df_cities['Response Answer'].isin([qna, np.nan])
    _selected = (
        (df_cities['Question Number'] == '3.5')
        & (df_cities['Country'] == 'United States of America')
        & _has_text
    )
    _df = predict_on_dataframe(
        df_cities.loc[_selected, cols], 'Response Answer', threshold=threshold
    )

    # Collapse to one row per city: 1.0 if any answer was labelled, else 0.0
    _df = (
        _df.rename(columns={'inclusion': q_col})
           .groupby(['Organization', 'Country'])
           .agg({q_col: 'sum'})
           .reset_index()
    )
    _df[q_col] = (_df[q_col] > 0).astype(float)

    # Merge to results dataframe; cities without a usable answer score 0
    _df = _df.rename(columns=org_country_rename_dict)
    cols = ['organization', 'country', q_col]
    df_b = df_b.merge(_df[cols], how='left', on=['organization', 'country'])
    df_b[q_col] = df_b[q_col].fillna(0)

# Collaboration
### Corporate location
Prepare corporation locations for later merging with the cities dataset.

In [None]:
import pandas as pd
fpath = '../input/cdp-unlocking-climate-solutions/Supplementary Data/Locations of Corporations/NA_HQ_public_data.csv'
cols = ['account_number', 'organization', 'hq_country', 'address_city', 'address_state']
df_corp_locs = pd.read_csv(fpath, usecols=cols)

# Error handling: manual corrections for individual accounts
us = 'United States of America'
# Spelling corrected (was 'United Kindom ...'). The value is only used to move
# account 1271 out of the US, so the filter below is unaffected.
uk = 'United Kingdom of Great Britain and Northern Ireland'
df_corp_locs.loc[df_corp_locs['account_number'] == 1271, 'hq_country'] = uk
df_corp_locs.loc[df_corp_locs['account_number'] == 1464, 'hq_country'] = us
df_corp_locs.loc[df_corp_locs['account_number'] == 1464, 'address_city'] = 'new york'
df_corp_locs.loc[df_corp_locs['account_number'] == 1464, 'address_state'] = 'NY'
df_corp_locs.loc[df_corp_locs['account_number'] == 73516, 'address_city'] = 'chicago'
df_corp_locs.loc[df_corp_locs['account_number'] == 73516, 'address_state'] = 'IL'

# Filter US
# Copy so that the column assignments below act on an independent frame and
# not on a slice of the unfiltered data (avoids SettingWithCopyWarning).
df_corp_locs = df_corp_locs[df_corp_locs['hq_country'] == us].copy()

# Get state code
_us_state_abbrev = {k.lower(): v for k, v in us_state_abbrev.items()}
# NOTE(review): astype(str) turns a missing state into the string 'nan', so
# the dropna() at the end of this cell will not remove rows that lack a state
# unless _replace (defined elsewhere) maps it back - confirm this is intended.
df_corp_locs['address_state'] = (
    df_corp_locs['address_state']
        .astype(str)
        .apply(lambda s: _replace(s.lower(), _us_state_abbrev).lower())
)
df_corp_locs['address_city'] = df_corp_locs['address_city'].str.lower()
df_corp_locs = df_corp_locs.drop_duplicates().dropna().reset_index(drop=True)

### Supplementary

In [None]:
# C12.3: Do you engage in activities that could either directly or indirectly
# influence public policy on climate-related issues through any of the
# following? Each distinct form of reported engagement scores 1.

fpath = '../input/cdp-unlocking-climate-solutions/Corporations/Corporations Responses/Climate Change/2020_Full_Climate_Change_Dataset.csv'
df_corp_2020 = pd.read_csv(fpath, low_memory=False)

# Number of distinct C12.3 answers per company
cols = ['account_number', 'response_value']
_df = (
    df_corp_2020.loc[df_corp_2020['question_number'] == 'C12.3', cols]
                .groupby('account_number')
                .agg({'response_value': 'nunique'})
                .rename(columns={'response_value': 'c12.3'})
                .reset_index()
)

# Merge and normalize; companies without a C12.3 answer keep NaN
df_corp_locs = df_corp_locs.merge(_df, how='left', on='account_number')
df_corp_locs['c12.3'] = normalize_column(df_corp_locs, 'c12.3')

Collaboration by company sector and activities:

In [None]:
# A collaboration area is one exact (activities, sectors) combination as it
# appears in the corporate disclosure data; both fields are comma-separated
# strings and are compared for exact equality further down.
CollabArea = collections.namedtuple('CollabArea', ['activities', 'sectors'])

# Combinations treated as relevant: the first group covers health care and
# medical supply activities, the second group transport and logistics.
collab_areas = [
    CollabArea('Health care facilities', 'Health care provision'),
    CollabArea('Health care facilities, Health care services', 'Health care provision, Other services'),
    CollabArea('Health care facilities, Health care services, Health care supplies, Medical equipment', 'Health care provision, Medical equipment & supplies, Other services'),
    CollabArea('Health care facilities, Health care services, Health care supplies, Medical equipment, Other food processing, Pharmaceuticals', 'Biotech & pharma, Food & beverage processing, Health care provision, Medical equipment & supplies, Other services'),
    CollabArea('Health care facilities, Health care services, Medical equipment', 'Health care provision, Medical equipment & supplies, Other services'),
    CollabArea('Health care services', 'Other services'),
    CollabArea('Health care services, Health care supplies, Insurance, Supermarkets, food & drugstores', 'Convenience retail, Financial services, Medical equipment & supplies, Other services'),
    CollabArea('Health care services, Health care supplies, Medical equipment, Personal care & household products, Pharmaceuticals', 'Biotech & pharma, Chemicals, Medical equipment & supplies, Other services'),
    CollabArea('Health care services, Insurance', 'Financial services, Other services'),
    CollabArea('Health care services, Medical equipment', 'Medical equipment & supplies, Other services'), 
    CollabArea('Health care supplies', 'Medical equipment & supplies'),
    CollabArea('Health care supplies, Medical equipment', 'Medical equipment & supplies'),
    CollabArea('Medical equipment', 'Medical equipment & supplies'),
    CollabArea('Medical equipment, Pharmaceuticals', 'Biotech & pharma, Medical equipment & supplies'),
    CollabArea('Intermodal transport', 'Intermodal transport & logistics'),
    CollabArea('Intermodal transport, Logistics - 3rd party, Road freight', 'Intermodal transport & logistics, Road transport'),
    CollabArea('Logistics - 3rd party, Logistics - transport', 'Intermodal transport & logistics'),
    CollabArea('Logistics - 3rd party, Logistics - transport, Transportation support services, Vehicles & machinery rental & leasing', 'Industrial support services, Intermodal transport & logistics, Trading, wholesale, distribution, rental & leasing'),
    CollabArea('Logistics - transport', 'Intermodal transport & logistics'),
    CollabArea('Rail freight, Transportation support services', 'Industrial support services, Rail transport'),
    CollabArea('Specialist retail, Transportation equipment wholesale & dealing', 'Discretionary retail, Trading, wholesale, distribution, rental & leasing'),
    CollabArea('Transportation infrastructure & other construction', 'Construction')
]

# Get company sector
fpath = '../input/cdp-unlocking-climate-solutions/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv'
df_corp_sec = pd.read_csv(fpath)
df_corp_sec = df_corp_sec[['account_number', 'organization', 'country', 'activities', 'sectors']]

# Loop invariants: the C12.3 rows and the whitespace-trimmed match fields
_is_c12_3 = df_corp_2020['question_number'] == 'C12.3'
_activities = df_corp_sec['activities'].str.strip()
_sectors = df_corp_sec['sectors'].str.strip()

# Per-area results are gathered and concatenated once, starting from the same
# empty frame as before
_frames = [pd.DataFrame([])]
for ca in collab_areas:
    # Find companies with relevant sectors and activities
    _df = df_corp_sec.loc[
        (_activities == ca.activities) & (_sectors == ca.sectors)
    ]
    accounts = set(_df['account_number'])

    # Extract count of distinct collaboration types
    _df_collab = (
        df_corp_2020.loc[df_corp_2020['account_number'].isin(accounts) & _is_c12_3]
                    .groupby('account_number')
                    .agg({'response_value': 'nunique'})
                    .rename(columns={'response_value': 'c12.3_sector'})
                    .reset_index()
    )
    _frames.append(_df_collab)

df_collab = pd.concat(_frames)

# Merge with the corporate locations (normalization happens after the
# per-city aggregation further down)
df_corp_locs = df_corp_locs.merge(df_collab, how='left', on='account_number')

In [None]:
# Sum the company scores per HQ city/state, then normalize across cities
df_corp_collab = (
    df_corp_locs.groupby(['address_city', 'address_state'])
                .agg({'c12.3': 'sum', 'c12.3_sector': 'sum'})
                .reset_index()
)

# Report HQ cities that have no counterpart in the results dataframe
a = set(df_corp_collab['address_city'])
b = set(df_b['city'])
_unmatched = len(a - b)
print(f"{_unmatched} cities are referenced as one or more corporations' HQs where" +
      " the city does not exist in the 2020 CDP Cities return.")

for _score_col in ('c12.3', 'c12.3_sector'):
    df_corp_collab[_score_col] = normalize_column(df_corp_collab, _score_col)

df_corp_collab = df_corp_collab[['address_city', 'address_state', 'c12.3', 'c12.3_sector']]
# State codes are upper-cased again so they can be matched against df_b's state_id
df_corp_collab['address_state'] = df_corp_collab['address_state'].str.upper()

In [None]:
# Attach the per-city corporate scores; the HQ key columns are only needed
# for the join and are dropped again afterwards
df_b = (
    df_b.merge(
            df_corp_collab,
            how='left',
            left_on=['city', 'state_id'],
            right_on=['address_city', 'address_state'],
        )
        .drop(columns=['address_city', 'address_state'])
)

In [None]:
# Q6.2: Does your city collaborate in partnership with businesses in your city
# on sustainability projects? Each distinct form of reported collaboration
# (question 6.2a, Column Number = 2) scores 1.
cols = ['Organization', 'Country', 'Response Answer']
_mask = (
    (df_cities['Question Number'] == '6.2a')
    & (df_cities['Column Number'] == 2)
    & (df_cities['Country'] == 'United States of America')
    & (~df_cities['Response Answer'].isna())
)

# Number of distinct answers per city
_df = (
    df_cities.loc[_mask, cols]
             .groupby(['Organization', 'Country'])
             .agg({'Response Answer': 'nunique'})
             .rename(columns={'Response Answer': 'q6.2'})
             .reset_index()
)

# Normalize across cities, then merge to results dataframe
_df['q6.2'] = normalize_column(_df, 'q6.2')
_df = _df.rename(columns=org_country_rename_dict)
df_b = df_b.merge(_df, how='left', on=['organization', 'country'])

In [None]:
# Q6.2 (sector view): collaboration with businesses restricted to the areas
# (question 6.2a, Column Number = 2) and types (Column Number = 3) listed
# below. Each distinct matching answer scores 1.
cols = ['Organization', 'Country', 'Response Answer']
# A comma was missing after 'Building and Infrastructure', which silently
# merged it with 'Social Services' into one string that matches nothing.
answers_collab_area = [
    'Public Health and Safety', 
    'Transport (Mobility)', 
    'Building and Infrastructure',
    'Social Services',
]
_df_a = df_cities.loc[
    (df_cities['Question Number'] == '6.2a') &
    (df_cities['Column Number'] == 2) &
    (df_cities['Country'] == 'United States of America') &
    (df_cities['Response Answer'].isin(answers_collab_area)),
    cols
]

answers_collab_type = [
    'Labour market training initiatives', 
    'Capacity development',
]
_df_t = df_cities.loc[
    (df_cities['Question Number'] == '6.2a') &
    (df_cities['Column Number'] == 3) &
    (df_cities['Country'] == 'United States of America') &
    (df_cities['Response Answer'].isin(answers_collab_type)),
    cols
]

# The answers are text, so 'sum' concatenated the strings; count the distinct
# answers instead, as the q6.2 cell does.
_df = pd.concat([_df_a, _df_t], ignore_index=True)
_df = (
    _df.groupby(['Organization', 'Country'])
       .agg({'Response Answer': 'nunique'})
       .rename(columns={'Response Answer': 'q6.2_sectors'})
       .reset_index()
)

_df.shape
# Merge to results dataframe (left disabled, as before)
#_df.rename(columns=org_country_rename_dict, inplace=True)
#_df['q6.2_sectors'] = normalize_column(_df, 'q6.2_sectors')
# df_b = df_b.merge(_df, on=['organization', 'country'], how='left')

In [None]:
# There are no examples for q6.2_sector: submit zeroes.
# The constant column is created so that 'q6.2_sector' exists on df_b for
# the later c2 calculation, which averages it with 'c12.3_sector'.
df_b['q6.2_sector'] = 0

### Compile results
Compile and clean the results dataframe for Sections A, B and C (df_b). Note: information yet to be added from the NLP work.

In [None]:
# Columns of df_b in which a missing value is replaced by 0.0 below.
cols_to_fill_zero = [
    'life_expectancy_rel', 'park_space_sq_km_per_capita_rel', 
    'potable_water_rel', 'q2.0b', 'q2.2', 'q3.0_health', 'q10.7', 
    'ev_charging_points_per_capita_rel', 'q14.2',  'q12.4', 'emp_rate_rel', 
    'green_jobs_per_capita_rel', 'public_tr_rel', 'q2.1', 'q5.4', 'q5.5a', 
    'q3.0_inclusion', 'q12.6', 'q10.14_pm2.5_annual_mean_rel_year', 
    'q10.14_pm2.5_max_24hr_mean_rel_year',
    'q10.14_pm10_annual_mean_rel_year', 'q10.14_pm10_24hr_mean_rel_year',
    'q10.14_no2_annual_mean_rel_year', 'q10.14_o3_daily_max_8hr_mean_rel_year',
    'q10.14_so2_max_24hr_mean_rel_year', 'q10.14_pm2.5_annual_mean_rel', 
    'q10.14_pm2.5_max_24hr_mean_rel', 'q10.14_pm10_annual_mean_rel', 
    'q10.14_pm10_24hr_mean_rel', 'q10.14_no2_annual_mean_rel', 
    'q10.14_o3_daily_max_8hr_mean_rel', 'q10.14_so2_max_24hr_mean_rel',
    'c12.3', 'c12.3_sector', 'q6.2'
]

# Fill NaN with 0.0 in the listed columns only; columns that are not listed
# keep their missing values.
df_b.loc[:, cols_to_fill_zero] = df_b.loc[:, cols_to_fill_zero].fillna(0.)
# NOTE(review): 'lat_long_y' looks like a merge-suffixed column name -
# confirm against the merge that produced it.
df_b = df_b.rename(columns={'lat_long_y': 'lat_long'})

## Calculating metrics
### Section B

In [None]:
# Section B
# Relative air-quality columns that are averaged into the health score.
air_quality_cols = [
    'q10.14_pm2.5_annual_mean_rel',
    'q10.14_pm10_annual_mean_rel',
    'q10.14_no2_annual_mean_rel',
    'q10.14_o3_daily_max_8hr_mean_rel',
    'q10.14_so2_max_24hr_mean_rel',
]


def b1_health(r):
    """Return the weighted health score for one row.

    `r` is any mapping indexable by column name (for example a dataframe
    row). Life expectancy carries a weight of 0.5; the mean of the
    `air_quality_cols` values, park space, potable water and 'q12.6' each
    carry a weight of 0.125.
    """
    weighted_parts = [
        r['life_expectancy_rel'] * 0.5,
        sum(r[col] for col in air_quality_cols) / len(air_quality_cols) * 0.125,
        r['park_space_sq_km_per_capita_rel'] * 0.125,
        r['potable_water_rel'] * 0.125,
        r['q12.6'] * 0.125,
    ]
    return sum(weighted_parts)


def b1_employment(r):
    """Return the weighted employment score for one row.

    Combines 'emp_rate_rel' (weight 0.9) with
    'green_jobs_per_capita_rel' (weight 0.1).
    """
    weights = (
        ('emp_rate_rel', 0.9),
        ('green_jobs_per_capita_rel', 0.1),
    )
    return sum(r[col] * weight for col, weight in weights)


def b1_congestion(r):
    """Return the weighted congestion score for one row.

    Combines 'av_commute_mins_rel' (weight 0.75) with 'public_tr_rel'
    (weight 0.25).
    """
    weights = (
        ('av_commute_mins_rel', 0.75),
        ('public_tr_rel', 0.25),
    )
    return sum(r[col] * weight for col, weight in weights)


def b1_inclusion(r):
    """Return the inclusion score for one row.

    Equal-weight (0.5 each) combination of
    'economic_inclusion_index_2016' and 'racial_inclusion_index_2016'.
    """
    weights = (
        ('economic_inclusion_index_2016', 0.5),
        ('racial_inclusion_index_2016', 0.5),
    )
    return sum(r[col] * weight for col, weight in weights)


# Apply each b1 component function row-wise and store it as its own column
df_b['b1_health'] = df_b.apply(b1_health, axis=1)
df_b['b1_employment'] = df_b.apply(b1_employment, axis=1)
df_b['b1_congestion'] = df_b.apply(b1_congestion, axis=1)
df_b['b1_inclusion'] = df_b.apply(b1_inclusion, axis=1)

# b1 is the unweighted row mean of the four component columns
df_b['b1'] = df_b[
    ['b1_health', 'b1_employment', 'b1_congestion', 'b1_inclusion']
].mean(axis=1)

In [None]:
# Section B
def b2_health(r):
    """Return the unweighted mean of the b2 health columns for one row."""
    columns = (
        'q2.0b',
        'q2.2',
        'q3.0_health',
        'q3.0_nlp_h',
        'q10.7',
        'ev_charging_points_per_capita_rel',
        'q14.2',
        'q12.4',
    )
    return np.mean([r[name] for name in columns])


def b2_employment(r):
    """Return the unweighted mean of the b2 employment columns for one row."""
    columns = ('q3.5_nlp_e', 'q6.2a_nlp_e', 'q6.3_nlp_e')
    return np.mean([r[name] for name in columns])


def b2_congestion(r):
    """Return the unweighted mean of the b2 congestion columns for one row."""
    columns = ('q2.1', 'q5.4', 'q5.5a')
    return np.mean([r[name] for name in columns])


def b2_inclusion(r):
    """Return the unweighted mean of the b2 inclusion columns for one row."""
    columns = ('q3.0_inclusion', 'q3.5_nlp_i')
    return np.mean([r[name] for name in columns])


# Apply each b2 component function row-wise and store it as its own column
df_b['b2_health'] = df_b.apply(b2_health, axis=1)
df_b['b2_employment'] = df_b.apply(b2_employment, axis=1)
df_b['b2_congestion'] = df_b.apply(b2_congestion, axis=1)
df_b['b2_inclusion'] = df_b.apply(b2_inclusion, axis=1)

# b2 is the unweighted row mean of the four component columns
df_b['b2'] = df_b[
    ['b2_health', 'b2_employment', 'b2_congestion', 'b2_inclusion']
].mean(axis=1)

### Section C

In [None]:
# Section C
def c1(r):
    """Return the unweighted mean of 'q6.2' and 'c12.3' for one row."""
    columns = ('q6.2', 'c12.3')
    return np.mean([r[name] for name in columns])


# c1: row-wise mean computed by c1(); c2: row mean of the two sector columns
df_b['c1'] = df_b.apply(c1, axis=1)
df_b['c2'] = df_b[
    ['q6.2_sector', 'c12.3_sector']
].mean(axis=1)

In [None]:
# Write the compiled results dataframe to 'abc_results.csv' (a path
# relative to the working directory) and confirm on stdout.
df_b.to_csv('abc_results.csv')
print('Saved!')

In [None]:
# Display the column index of the results dataframe as the cell output
df_b.columns

# Code appendix
### Get city latitude / longitude
We use the Python geopy library to access the GeoNames API in order to return the latitude and longitude of the cities in the dataset for plotting. The technical challenge here is to identify the correct city from the CDP survey 'organization' field, which contains the name of the organisation submitting the questionnaire and not the raw city name. In order to perform this efficiently from both a code running and writing perspective, we employ the following methodology:
* Make all tokens lower case, non-accented, punctuation removed.
* Remove 'stop words' related to organisations or bloat terminology (i.e. 'City of ')
* Call the GeoNames API using an ISO-3166 country-code to discern cities of the same name.

Approximately 10% of the dataset had to be completed by hand. Typically, the changes to the 'organization_clean' column required to get GeoNames to recognise the city were very minor, and this automated process could be improved with minor effort.

In [None]:
# prevents run by Kaggle notebook on save
if False:

    import functools
    import re
    import string
    import unidecode
    from typing import Set


    def _clean_string(stop_words: Set[str], s: str):
        """ Strip punctuation, lower-case, de-accent and drop the stop words.

        Every stop word is removed in three passes: as a whole word, as a
        word ending and as a word beginning. The result is whitespace-stripped.
        """
        without_punctuation = s.translate(str.maketrans('', '', string.punctuation))
        s = unidecode.unidecode(without_punctuation.lower())
        for stop_word in stop_words:
            patterns = (
                r'\b' + stop_word + r'\b',
                stop_word + r'\b',
                r'\b' + stop_word,
            )
            for pattern in patterns:
                s = re.sub(pattern, '', s)
        return s.strip()

    stop_words = {
        ' city', 'govt ', 'campos de ', 'prefeitura de ', 'ciudad ',
        'town of ', 'village of ', 'ville de', 'executive', 'intendencia',
        'village', 'intermedio', 'ambiente', 'peoples', 'combined',
        'authority', 'borough', 'distrital', 'administration', 'ajuntament',
        'alcaldia', 'assembly', 'ayuntamiento', 'city of', ' of ', ' de ',
        ' di ', ' da ', 'capital', 'council', 'comune', 'kommune',
        'district', 'federal', 'government', 'junta', 'regency', 'region',
        'county', 'municipality', 'municipal', 'municipalidad', 'municipio',
        'metropolitan', 'metropole', 'gemeente', 'prefeitura',
        'metropolitana', 'territory', 'township', 'xiv'
    }
    # Longest stop words first, so the longest match is taken first; from
    # here on stop_words is a list rather than a set
    stop_words = sorted(stop_words, key=len, reverse=True)
    clean_string = functools.partial(_clean_string, stop_words)

In [None]:
if False:
    # One row per (organization, country) pair, with lower-cased column names
    df_city_locs = (
        df_cities[['Organization', 'Country']]
        .drop_duplicates()
        .copy()
        .rename(columns={'Organization': 'organization', 'Country': 'country'})
    )
    df_city_locs['organization_clean'] = df_city_locs['organization'].apply(clean_string)
    df_city_locs['country_lower'] = df_city_locs['country'].apply(str.lower)

    # CDP country names (lower case) -> the spelling used for the
    # ISO-3166 country-code matching
    country_name_map = {
        'united kingdom of great britain and northern ireland': 'united kingdom',
        'china, hong kong special administrative region': 'hong kong',
        'republic of korea': 'korea, republic of (south korea)',
        'republic of moldova': 'moldova, republic of',
        'taiwan, greater china': 'taiwan',
        'russian federation': 'russia',
        'state of palestine': 'palestinian territory, occupied',
        'united republic of tanzania': 'tanzania, united republic of',
        'bolivia (plurinational state of)': 'bolivia'
    }

    def _replace(s, d):
        """Return d[s] when s is a key of d, otherwise s unchanged."""
        return d.get(s, s)

    df_city_locs['country_lower'] = df_city_locs['country_lower'].apply(
        lambda s: _replace(s, country_name_map)
    )

In [None]:
if False:
    # Get ISO country codes
    rename_dict = {
        'English short name lower case': 'country_lower',
        'Alpha-2 code': 'alpha2_code'
    }
    df_country_codes = pd.read_csv(
        '../input/countries-iso-codes/wikipedia-iso-country-codes.csv',
        keep_default_na=False  # parse Namibia country code 'NA'
    ).rename(columns=rename_dict)
    df_country_codes['country_lower'] = (
        df_country_codes['country_lower'].apply(str.lower)
    )

    # Attach the alpha-2 code to every city; cities whose country name has
    # no match keep a missing code (left join)
    df_city_locs = pd.merge(
        df_city_locs,
        df_country_codes[['country_lower', 'alpha2_code']],
        on='country_lower',
        how='left'
    )

In [None]:
if False:
    from geopy import geocoders

    # GeoNames account name: set up at http://www.geonames.org/login and
    # ensure the webservice is enabled for the account
    username = ''

    gn = geocoders.GeoNames(username=username)

    # List cities A-Z (in case we run out of free API call 'credits'; tracks progress)
    city_locs_list = df_city_locs.to_dict(orient='records')
    city_locs_list.sort(key=lambda d: d['organization'])

    # Get latitude / longitude information
    city_locs_list_with_geocode = []
    failed_locs = []
    for d in city_locs_list:
        # If making many requests, consider using a 1s time delay
        geocode_call = gn.geocode(d['organization_clean'], country=d['alpha2_code'])
        if geocode_call is None:
            # Keep the failures so that they can be completed by hand
            failed_locs.append(d)
            msg = 'No geocode information found for: {} ({})'
            print(msg.format(d['organization_clean'], d['alpha2_code']))
            continue
        d['lat_long'] = (geocode_call.latitude, geocode_call.longitude)
        city_locs_list_with_geocode.append(d)

    # Summary of the share of cities that were geocoded
    n_found = len(city_locs_list_with_geocode)
    n_total = len(city_locs_list)
    msg = 'Found lat/long info for {}/{} ({:.1f}%) of cities.'
    print(msg.format(n_found, n_total, n_found / n_total * 100))

### Get translations of freetext answers
We can make up to 20k word translations each day using the GoogleTranslate free API. This is OK when running the model on new lines but makes creating an initial model training set more difficult as we quickly run out of calls, hence we place this code in the annex. We run them as follows:

In [None]:
if False:
    # Free-text answers to translate: Q3.0 action descriptions, all of Q3.5
    # and the Q5.4 'Scope and impact of action' column
    df_3_0 = df_cities.loc[
        (df_cities['Question Number'] == '3.0')
        & (df_cities['Column Name'] == 'Action description and implementation progress')
    ]
    df_3_5 = df_cities.loc[df_cities['Question Number'] == '3.5']
    df_5_4 = df_cities.loc[
        (df_cities['Question Number'] == '5.4')
        & (df_cities['Column Name'] == 'Scope and impact of action')
    ]

    _df = pd.concat([df_3_0, df_3_5, df_5_4], axis=0)

    cols = [
        'Account Number', 'Organization', 'Country',
        'Question Number', 'Column Number', 'Response Answer'
    ]
    a = _df.shape[0]
    # Drop 'Question not applicable' answers, then any row with a missing
    # value in the kept columns
    is_applicable = ~_df['Response Answer'].isin(['Question not applicable'])
    df = _df[is_applicable][cols].dropna()
    b = df.shape[0]

    print('Removed {} NaN records, leaving {} records.'.format(a - b, b))

    # We don't want to waste time trying to translate English answers, so try to pre-empt responses
    # which will be returned in English for all cities within a country. Tackle only the countries
    # with the largest numbers of submissions

    # List of countries with all responses in English
    en_countries = [
        'United Kingdom of Great Britain and Northern Ireland',
        'United States of America',
        'Canada', 'Australia', 'New Zealand', 'Denmark', 'Italy',
        'Netherlands', 'Sweden',
    ]

    # NOTE(review): the two expressions below are exploratory; their results
    # are not assigned, so `df` still contains the en_countries rows.
    df[~df['Country'].isin(en_countries)]
    df['Country'].value_counts()

In [None]:
if False:
    from polyglot.detect import Detector

    list_of_dicts = df.to_dict(orient='records')

    # Use polyglot to detect languages to reduce the number of calls we'll make to the Google API
    list_of_dicts_reduced = []
    for d in list_of_dicts:
        if 'response_answer_translated' not in d:
            try:
                lang = Detector(d['Response Answer']).languages[0].code
            except Exception:
                # Detection failed: treat the language as unknown so that the
                # answer is still queued for translation. `Exception` (rather
                # than a bare except) keeps Ctrl-C / SystemExit working.
                lang = ''
            if lang != 'en':
                # Each record is wrapped in a one-element list; the
                # translation loop reads it back as d[0]
                list_of_dicts_reduced.append([d])

In [None]:
if False:
    import sys
    import time
    import deep_translator


    # Translate every queued answer that has no translation yet. Re-running
    # the cell resumes where a previous run stopped, because translated
    # records already carry the 'response_answer_translated' key.
    for d in list_of_dicts_reduced:
        if 'response_answer_translated' not in d[0]:
            time.sleep(1)  # Google requests 0.2s delay but can complain at <1s
            try:
                t = (
                    deep_translator.GoogleTranslator(source='auto', target='en')
                                   .translate(d[0]['Response Answer'])
                )
                d[0]['response_answer_translated'] = t
            except deep_translator.exceptions.NotValidLength:
                # Text rejected for its length: record an empty translation
                d[0]['response_answer_translated'] = ''
            except deep_translator.exceptions.TooManyRequests:
                # Request quota exhausted: stop, the remaining records stay
                # untranslated for a later run
                print('Too many requests!')
                break
            except deep_translator.exceptions.TranslationNotFound:
                # Was `except: <name>:`, a SyntaxError that prevented the
                # whole cell from compiling
                d[0]['response_answer_translated'] = ''
            except Exception:
                # Report and move on; the record stays untranslated so a
                # re-run retries it
                print("Unexpected error:", sys.exc_info()[0])

In [None]:
if False:
    # NOTE(review): `df_list_of_dicts_reduced` is not defined in any visible
    # cell; presumably it is a DataFrame built from the records held in
    # `list_of_dicts_reduced` - confirm before enabling this cell.
    df_translated = df.merge(
        df_list_of_dicts_reduced,
        on=['Account Number', 'Question Number', 'Column Number', 'Response Answer'],
        how='left'
    )
    # Columns present on both sides but not used as merge keys are suffixed
    # _x (left) / _y (right); keep only the left-hand copies
    df_translated = df_translated.drop(columns=['Organization_y', 'Country_y'])
    # NOTE(review): 'Response Answer' is a merge key, so it is not suffixed
    # and the 'Response Answer_x' entry below matches no column.
    df_translated = df_translated.rename(columns={
        'Organization_x': 'organization',
        'Country_x': 'country',
        'Response Answer_x': 'response_answer'
    })

    save = False
    if save:
        df_translated.to_csv('bert_translations.csv')
        # Only report success when the file was actually written
        print('Saved!')
    df_translated.head()