In [None]:
import os
import re
import json 

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.graph_objs as go 
import plotly.express as px 
from plotly.offline import init_notebook_mode

#import geopandas as gpd 
import folium

# Initialise plotly.offline's notebook mode so Plotly figures can be rendered in this notebook.
init_notebook_mode(connected=True)

In [None]:
# Load the input tables.
# NOTE: the four '... [INSERT HERE]' strings below are placeholders, not real paths --
# replace each one with the location of the corresponding CSV before running this cell.
cc = pd.read_csv('Corporate Climate Change Dataset [INSERT HERE]')
ws = pd.read_csv('Corporate Water Security Dataset [INSERT HERE]')
# Later cells read the 'Question Number', 'Organization' and 'Response Answer' columns of this frame.
cities = pd.read_csv('Cities Dataset [INSERT HERE]')
svi = pd.read_csv('EXTERNAL - Social Vulnerability Index data [INSERT HERE] -- from CDC')
# These two files are read by relative path, i.e. from the current working directory.
cities_uslocs = pd.read_csv('uscities.csv')
# Later cells read the 'organization', 'address_city' and 'address_state' columns of this frame.
cities_cdp = pd.read_csv('NA_HQ_public_data.csv')

In [None]:
def list_dedupe(x):
    """Return the items of ``x`` as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    first_seen = {}
    for item in x:
        # setdefault only inserts when the key is new, so insertion order
        # records the first time each item was encountered.
        first_seen.setdefault(item, None)
    return list(first_seen)

# Lookup table: full US state / territory name -> two-letter abbreviation.
# Covers the 50 states, the District of Columbia and five territories
# (American Samoa, Guam, Northern Mariana Islands, Puerto Rico, Virgin Islands).
# Entries are grouped by initial letter, in alphabetical order.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'American Samoa': 'AS', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA', 'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA', 'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT', 'Virgin Islands': 'VI', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

In [None]:
# Responses to question 6.2 of the Cities dataset, with the responding
# organisation exposed under the column name 'City'. Missing answers are
# labelled 'No Response'.
# The fill is done on the frame (returning a new frame) rather than with
# `cities_6_2['Response Answer'].fillna(..., inplace=True)`: an in-place method
# on a column selection is chained assignment, which pandas 2.x warns about and
# which has no effect on the frame once Copy-on-Write is enabled.
cities_6_2 = (
    cities[cities['Question Number'] == '6.2']
    .rename(columns={'Organization': 'City'})
    .fillna({'Response Answer': 'No Response'})
)

# Map CDP metadata for US state abbreviations.
# Guarded so the cell can be re-run: 'address_state' is removed once 'state'
# has been derived from it, so a second run must skip this step.
if 'address_state' in cities_cdp.columns:
    cities_cdp['state'] = (
        cities_cdp['address_state']
        .map(us_state_abbrev)                  # full state name -> abbreviation
        .fillna(cities_cdp['address_state'])   # keep the original value when it is not in the lookup
        .replace({'ALBERTA': 'AB'})
    )
    cities_cdp = cities_cdp.drop(columns=['address_state'])

# Assign the result back instead of Series.replace(..., inplace=True), for the
# same chained-assignment reason as above.
cities_cdp['address_city'] = cities_cdp['address_city'].replace({'CALGARY': 'Calgary'})

# Form hybrid 'city_state' variable, e.g. "<address_city>, <state>".
cities_cdp['city_state'] = cities_cdp['address_city'].str.cat(cities_cdp['state'], sep=", ")

In [None]:
# Count HQs/city: number of non-null 'organization' values per
# (address_city, state, city_state) group, largest count first.
hq_columns = ['organization', 'address_city', 'state', 'city_state']
city_keys = ['address_city', 'state', 'city_state']

orgs_per_city = cities_cdp[hq_columns].groupby(city_keys).count()
orgs_per_city = orgs_per_city.sort_values(by=['organization'], ascending=False)

# Turn the group keys back into ordinary columns and give the count a descriptive name.
cities_count = orgs_per_city.reset_index().rename(columns={'organization': 'num_orgs'})


In [None]:
# Match every HQ city name to the CDP city organisation(s) whose full name
# contains it, then attach those organisation names to the per-city counts.
#
# Changes from the previous version of this cell:
#   * it referenced `cities_df`, which is never defined (NameError); the frame
#     is called `cities`.
#   * rows were added one at a time with DataFrame.append, which was removed in
#     pandas 2.0 (and was quadratic); matches are now collected in a list and
#     the frame is built once.
#   * the city name was passed to re.search as a regular expression, so names
#     containing characters such as '.' or '(' could mis-match or raise; a plain
#     substring test is what the matching rule actually calls for.
#   * the temporary index/ID columns and ID dictionaries were dropped: they were
#     added to `cities` / `cities_count` in place and then discarded, and only
#     the two name columns survive into the result.

# Unique names, in first-seen order. Missing values cannot be matched, so skip them.
city_name = list_dedupe(cities_count['address_city'].dropna().tolist())
city_org_name = list_dedupe(cities['Organization'].dropna().tolist())

# Case-sensitive match: the city name appears as a substring of the full city org name.
# One city can match several organisations, and one organisation several cities.
city_org_matches = [
    {'address_city': city, 'City Org': org}
    for city in city_name
    for org in city_org_name
    if city in org
]

# Passing `columns` keeps both columns present even when nothing matched.
city_names_df = pd.DataFrame(city_org_matches, columns=['address_city', 'City Org'])

# join city_org names and counts (cities without a matching organisation keep NaN in 'City Org')
cities_count = pd.merge(cities_count, city_names_df, on='address_city', how='left')

In [None]:
# join city org counts w/ question 6.2 response
cities_6_2 = cities_6_2[['City', 'Response Answer']].rename(columns={'City': 'City Org'})
# Cities with no matching 6.2 answer are labelled 'No Response'. The fill is
# applied to the frame rather than via `column.fillna(..., inplace=True)`, which
# is chained assignment and does not update the frame under Copy-on-Write.
cities_count = (
    pd.merge(cities_count, cities_6_2, on='City Org', how='left')
    .rename(columns={'Response Answer': 'Sustainability Project Collab.'})
    .fillna({'Sustainability Project Collab.': 'No Response'})
)

# Number of organisations per city for the first 40 rows of cities_count,
# with the question 6.2 response shown as the bar colour.
# NOTE(review): the left merges above can repeat a city when it matches several
# organisations or answers, so 40 rows may cover fewer than 40 cities -- confirm.
plt.figure(figsize=(15, 8))
sns.barplot(x='city_state', y='num_orgs',
            hue='Sustainability Project Collab.',
            palette='OrRd_r', data=cities_count.iloc[0:40, :])
plt.xlabel('City')
plt.ylabel('Number of organizations')
# 'fontweight' was misspelled 'fontwieght', which matplotlib rejects as an unknown text property.
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light', fontsize='medium');

# same data as the bar chart above, but as a bubble map
