In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Display settings: show up to 500 rows and do not truncate columns when a DataFrame is rendered.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

import random
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk import pos_tag
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
import seaborn as sns

# Global matplotlib setting: font size used for axis labels in the figures created below.
plt.rcParams.update({
    'axes.labelsize': 18
})

%matplotlib inline

In [None]:
def read_csv(path):
    """Load the CSV at ``path`` into a DataFrame.

    Thin wrapper around ``pd.read_csv`` called with its default options;
    ``path`` may be anything ``pd.read_csv`` accepts.
    """
    frame = pd.read_csv(path)
    return frame

def get_questions_by_org(cities_df):
    """Count distinct question numbers per organization and reporting year.

    Returns a DataFrame with the columns 'Organization',
    'Year Reported to CDP' and 'Question Number', where the last column holds
    the number of unique question numbers found in each group.
    """
    per_org_year = cities_df.groupby(['Organization', 'Year Reported to CDP'])
    unique_counts = per_org_year.agg({'Question Number': 'nunique'})
    return unique_counts.reset_index()

def get_org_with_missing_bars(questions_by_year, organizations, columns=None, years=(2018, 2019, 2020)):
    """Build zero-valued placeholder rows for (organization, year) pairs with no data.

    Parameters
    ----------
    questions_by_year : pandas.DataFrame
        Must contain the columns 'Organization' and 'Year Reported to CDP'.
    organizations : iterable
        Organizations to check, in the order the rows should be emitted.
    columns : list of str, optional
        Column names for the three-column result (organization, year, value).
    years : iterable of int, optional
        Reporting years every organization is expected to have. Defaults to
        2018-2020, the years that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row ``(organization, missing_year, 0)`` per missing pair. Missing
        years are listed in ascending order for each organization.
    """
    expected_years = set(years)
    rows = []

    for org in organizations:
        org_mask = questions_by_year['Organization'] == org
        reported = set(questions_by_year.loc[org_mask, 'Year Reported to CDP'].values)
        # Sort so the output order does not depend on set iteration order.
        rows.extend((org, year, 0) for year in sorted(expected_years - reported))

    return pd.DataFrame(rows, columns=columns)

In [None]:
def stacked_bar_plot(data, labels, legends, title, colors, **rcParams):
    """Draw a horizontal stacked bar chart and annotate every non-zero segment.

    Parameters
    ----------
    data : sequence of sequences
        One entry per stacked series; each entry holds one width per label.
    labels : sequence
        Y tick labels, one per bar.
    legends : sequence of str
        Legend entries, passed to ``plt.legend``.
    title : str
        Figure title.
    colors : sequence
        ``colors[k]`` is the fill colour of series ``k``.
    **rcParams
        Only ``figsize`` is read; the figure defaults to ``(12, 8)``.
    """
    def autolabel(current_rects, sum_widths, xpos='center', color='white'):
        # Write the integer width in the middle of each segment; zero-width
        # segments are left unlabelled.
        for i, rect in enumerate(current_rects):
            width = int(rect.get_width())
            yloc = rect.get_y() + rect.get_height() / 2

            if width == 0:
                continue
            ax.annotate('{}'.format(width), xy=(width / 2 + sum_widths[i], yloc), color=color, weight='bold',
                        size=10, ha='center', va='center')

    def get_sum_bars(index, bars):
        # Element-wise total of all series stacked before `index`, i.e. the
        # left offset at which series `index` starts.
        sum_bars = np.zeros(len(bars[index]))
        for vects in bars[0:index]:
            for i, elem in enumerate(vects):
                sum_bars[i] += elem

        return sum_bars

    fig, ax = plt.subplots(figsize=rcParams.get('figsize', (12, 8)))

    y_pos = np.arange(len(labels))

    # Use the `data` argument: the previous version read a global named `bars`
    # and therefore ignored whatever the caller passed in.
    rects = []
    for k, series in enumerate(data):
        rects.append(ax.barh(y_pos, series, left=get_sum_bars(k, data), color=colors[k], edgecolor='yellow'))

    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)

    for k, container in enumerate(rects):
        # Accumulated width of the segments drawn to the left of this series.
        offsets = np.zeros(len(container))
        for previous in rects[0:k]:
            for j, patch in enumerate(previous):
                offsets[j] += patch.get_width()

        autolabel(container, offsets)

    plt.title(title, fontsize=13)
    plt.legend(legends)
    plt.show()
    
def autolabel(current_rects, sum_widths, xpos='center', color='white', ax=None):
    """Annotate each bar with its integer width, placed at the bar's right end.

    Parameters
    ----------
    current_rects : iterable
        Bar patches exposing ``get_width``, ``get_y`` and ``get_height``.
    sum_widths : sequence
        Accepted for signature compatibility; not used for positioning.
    xpos : str
        Accepted for signature compatibility; not used.
    color : str
        Text colour of the annotation.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``). The
        previous version read an ``ax`` name that was never defined in this
        scope and raised ``NameError`` when called.
    """
    if ax is None:
        ax = plt.gca()

    for rect in current_rects:
        width = int(rect.get_width())
        yloc = rect.get_y() + rect.get_height() / 2
        ax.annotate('{}'.format(width), xy=(width, yloc), color=color, weight='bold', size=13, verticalalignment='center')
    
    
def display_bar_plot(bars, labels, title):
    """Draw a single-series horizontal bar chart on a 10x7 figure and show it.

    ``bars`` holds one width per bar, ``labels`` the matching y tick labels
    and ``title`` the figure title. The module-level ``autolabel`` helper is
    called to annotate the bars.
    """
    fig, ax = plt.subplots(figsize=(10, 7))

    positions = np.arange(len(bars))
    drawn = ax.barh(positions, bars, color='#0504aa', alpha=0.7, edgecolor='blue')
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)

    # A single series has nothing stacked to its left, hence the zero offsets.
    zero_offsets = [0] * len(drawn)
    autolabel(drawn, zero_offsets)

    plt.title(title, fontsize=14)
    plt.show()
    

In [None]:
def create_ngrams(response, n):
    """Return all whitespace-token n-grams of ``response`` as space-joined strings.

    Parameters
    ----------
    response : str
        Text to split on whitespace.
    n : int
        Number of tokens per n-gram; must be at least 1.

    Returns
    -------
    list of str
        The ``len(tokens) - n + 1`` consecutive n-grams, or an empty list when
        the text has fewer than ``n`` tokens.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    tokens = response.split()

    # The last valid start index is len(tokens) - n. The previous bound
    # (len(tokens) - 1) was only correct for bigrams: it dropped the final
    # unigram and produced truncated trailing n-grams for n >= 3.
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [None]:
# Root folder of the CDP competition data. Note the trailing slash: the loading
# cells append another "/" when building file paths.
SOURCE_PATH = "../input/cdp-unlocking-climate-solutions/"
# Folder with the supplementary datasets.
SUPPLE_PATH = "../input/cdp-unlocking-climate-solutions/Supplementary Data/"

# Colour palette passed to the plotting helpers through their `colors` argument.
COLORS = ['#0504aa', '#34495E', '#A15BF0']

#### 1. Read all files

In [None]:
!ls -l '/kaggle/input/cdp-unlocking-climate-solutions/Corporations/Corporations Disclosing/Water Security/'

In [None]:
# Every dataset family ships one CSV per survey year (2018, 2019, 2020) whose
# file name starts with the year, so each family is loaded with one generator
# and unpacked into its three per-year frames.

# cities response
cities_2018, cities_2019, cities_2020 = (
    read_csv(f"{SOURCE_PATH}/Cities/Cities Responses/{year}_Full_Cities_Dataset.csv")
    for year in (2018, 2019, 2020)
)

# cities disclosing
cities_dis_2018, cities_dis_2019, cities_dis_2020 = (
    read_csv(f"{SOURCE_PATH}/Cities/Cities Disclosing/{year}_Cities_Disclosing_to_CDP.csv")
    for year in (2018, 2019, 2020)
)

# corporations response: water security
cr_ws_2018, cr_ws_2019, cr_ws_2020 = (
    read_csv(f"{SOURCE_PATH}/Corporations/Corporations Responses/Water Security/{year}_Full_Water_Security_Dataset.csv")
    for year in (2018, 2019, 2020)
)

# corporations response: climate change
cr_cc_2018, cr_cc_2019, cr_cc_2020 = (
    read_csv(f"{SOURCE_PATH}/Corporations/Corporations Responses/Climate Change/{year}_Full_Climate_Change_Dataset.csv")
    for year in (2018, 2019, 2020)
)

# Corporations Disclosing water security
cr_dis_ws_2018, cr_dis_ws_2019, cr_dis_ws_2020 = (
    read_csv(f"{SOURCE_PATH}/Corporations/Corporations Disclosing/Water Security/{year}_Corporates_Disclosing_to_CDP_Water_Security.csv")
    for year in (2018, 2019, 2020)
)

# Corporations Disclosing climate change
cr_dis_cc_2018, cr_dis_cc_2019, cr_dis_cc_2020 = (
    read_csv(f"{SOURCE_PATH}/Corporations/Corporations Disclosing/Climate Change/{year}_Corporates_Disclosing_to_CDP_Climate_Change.csv")
    for year in (2018, 2019, 2020)
)


In [None]:
# CDP's recommendations (Excel workbooks): supplementary datasets to include and questions to focus on.
supp_cdp_rec = pd.read_excel(f"{SUPPLE_PATH}/Recommendations from CDP/CDP_recommendations_for_supplementary_datasets_to_include.xlsx")

supp_cdp_qs = pd.read_excel(f"{SUPPLE_PATH}/Recommendations from CDP/CDP_recommendations_for_questions_to_focus_on.xlsx")

# Simple Maps US cities table.
us_cities_mappings = read_csv(f"{SUPPLE_PATH}/Simple Maps US Cities Data/uscities.csv")

# Locations of corporations (file name suggests North-American HQs; verify against the data).
corp_locations = read_csv(f"{SUPPLE_PATH}/Locations of Corporations/NA_HQ_public_data.csv")

# CDC Social Vulnerability Index 2018: county-level file ...
cdc_VI_County = read_csv(f"{SUPPLE_PATH}/CDC Social Vulnerability Index 2018/SVI2018_US_COUNTY.csv")

# ... and the SVI2018_US file from the same folder.
cdc_VI = read_csv(f"{SUPPLE_PATH}/CDC Social Vulnerability Index 2018/SVI2018_US.csv")

# data dictionary
# Data dictionaries for the cities response and cities disclosing datasets.
dd = read_csv(f"{SOURCE_PATH}/Cities/Cities Responses/Full_Cities_Response_Data_Dictionary.csv")

dd_cities_dis = read_csv(f"{SOURCE_PATH}/Cities/Cities Disclosing/Cities_Disclosing_to_CDP_Data_Dictionary.csv")

In [None]:
# English stop-word list from the NLTK stopwords corpus.
STOPS = stopwords.words('english')

In [None]:
def normalize_text(sentence):
    """Split ``sentence`` into sentences and lower-case each of them.

    Sentence boundaries come from NLTK's ``sent_tokenize``; the result is a
    list of lower-cased sentence strings in their original order.
    """
    return [part.lower() for part in sent_tokenize(sentence)]

#### 2. Join data

In [None]:
# Stack the 2018, 2019 and 2020 frames of each dataset family into one frame.
# pd.concat keeps each input's own index here, so index labels can repeat across years.
cities_df = pd.concat([cities_2018, cities_2019, cities_2020])                  # Cities Response

cities_dis_df = pd.concat([cities_dis_2018, cities_dis_2019, cities_dis_2020])  # Cities Disclosure

cr_ws_df = pd.concat([cr_ws_2018, cr_ws_2019, cr_ws_2020])                      # Corporations Response Water Security

cr_cc_df = pd.concat([cr_cc_2018, cr_cc_2019, cr_cc_2020])                      # Corporations Response Climate Change

cr_dis_ws_df = pd.concat([cr_dis_ws_2018, cr_dis_ws_2019, cr_dis_ws_2020])      # Corporations Disclosure Water Security

cr_dis_cc_df = pd.concat([cr_dis_cc_2018, cr_dis_cc_2019, cr_dis_cc_2020])      # Corporations Disclosure Climate Change

In [None]:
# Attach the disclosure metadata to every response row. Left join, so all response rows are kept;
# columns present in both frames (other than the join keys) get the '_dis' suffix on the disclosure side.
cities_df_merged = cities_df.merge(cities_dis_df, on=['CDP Region', 'Country', 'Account Number', 'Organization', 'Year Reported to CDP'], how='left', suffixes=('', '_dis'))

In [None]:
# For each 'Parent Section', collect the array of distinct 'Section' values it contains.
sections_df = cities_df.groupby('Parent Section').agg({
    'Section': 'unique'
}).reset_index()

#### 3. Currencies across organizations

In [None]:
# Rows answering question 0.4 (presumably the reporting-currency question, going by
# the plot title used below; confirm against the data dictionary).
temp = cities_df[cities_df['Question Number'] == '0.4']

# Per distinct answer: number of distinct organizations and the array of CDP regions giving it.
res = temp.groupby('Response Answer').agg({
    'Organization': 'nunique',
    'CDP Region': 'unique'
}).reset_index()

# Flatten each region array into one comma-separated string.
# NOTE(review): str.join raises TypeError if a region is missing (NaN); assumes none are.
res['CDP Region'] = res['CDP Region'].apply(lambda x: ",".join(x))

In [None]:
N = 10
# The N answers reported by the most organizations (fewer if `res` has fewer rows).
d = res.sort_values(by=['Organization'], ascending=[0]).head(N)
# Shuffle the selected rows so the bars are not drawn in ranked order. Sampling
# len(d) items instead of a fixed N avoids the ValueError random.sample raises
# when fewer than N rows exist.
d = d.loc[random.sample(list(d.index), len(d))]
bars = []
bars.append(list(d['Organization'].values))

stacked_bar_plot(bars, labels=d['Response Answer'], legends=['All'], title='# Organizations with currency', colors=COLORS)

# Drop the scratch variables so they do not leak into later cells.
del d
del N
del bars

#### 4. Introduction

In [None]:
def join_by_column_name(cities_df, used_cols):
    """Turn the 'Introduction' population answers into one row per account and year.

    The rows whose 'Column Name' is 'Current population', 'Projected population',
    'Current population year' or 'Projected population year' are joined on
    ('Year Reported to CDP', 'Account Number'), and each row set's
    'Response Answer' ends up in a column named after its 'Column Name'.

    Parameters
    ----------
    cities_df : pandas.DataFrame
        Needs the columns 'Parent Section' and 'Column Name' plus ``used_cols``.
    used_cols : list of str
        Columns carried through the join. Must contain the two join keys,
        'Column Name' and 'Response Answer'.

    Returns
    -------
    pandas.DataFrame
        The join keys followed by the four population columns. When no rows
        match, an empty frame with those columns is returned. The previous
        version raised KeyError (no 'Introduction' section) or IndexError
        (empty join) in that case.
    """
    keys = ['Year Reported to CDP', 'Account Number']
    intro = cities_df[cities_df['Parent Section'] == 'Introduction']

    def rows_for(column_name):
        # Rows of the Introduction section holding one specific answer column.
        return intro.loc[intro['Column Name'] == column_name, used_cols]

    merged_df = rows_for('Current population').merge(
        rows_for('Projected population'), on=keys, suffixes=('_cur', '_proj'))
    # The third merge adds un-suffixed 'Column Name'/'Response Answer'; the
    # fourth then collides with them, which yields pandas' default '_x'/'_y'.
    merged_df = merged_df.merge(rows_for('Current population year'), on=keys)
    merged_df = merged_df.merge(rows_for('Projected population year'), on=keys)

    merged_df = merged_df[filter_columns(merged_df.columns, used_cols)]

    # Each suffixed answer column comes from exactly one 'Column Name' filter,
    # so the target names are known without peeking at the first row.
    merged_df = merged_df.rename(columns={
        'Response Answer_cur': 'Current population',
        'Response Answer_proj': 'Projected population',
        'Response Answer_x': 'Current population year',
        'Response Answer_y': 'Projected population year'
    })

    return merged_df.drop(columns=['Column Name_cur', 'Column Name_proj', 'Column Name_x', 'Column Name_y'])

def filter_columns(all_cols, used_cols):
    """Keep the names in ``all_cols`` whose part before the first '_' is in ``used_cols``.

    The order of ``all_cols`` is preserved and a list is returned.
    """
    kept = []
    for name in all_cols:
        base = name.split('_')[0]
        if base in used_cols:
            kept.append(name)
    return kept

In [None]:
# Order the responses by year, account, question, then column and row number.
# NOTE: this rebinds `cities_df`; earlier cells saw the unsorted frame.
cities_df = cities_df.sort_values(by=['Year Reported to CDP', 'Account Number', 'Question Number', 'Column Number', 'Row Number'])

In [None]:
# Group handles over the responses: one per (year, account, question) ...
# NOTE: the name `grouped` is reassigned to an aggregated frame in a later cell.
grouped = cities_df.groupby(['Year Reported to CDP', 'Account Number', 'Question Number'])

# ... and one per parent section.
grouped_sect = cities_df.groupby(['Parent Section'])

In [None]:
# One row per (year, account) with current/projected population and their years.
merged_df = join_by_column_name(cities_df, ['Year Reported to CDP', 'Account Number', 'Column Name', 'Response Answer'])
# Drop rows where any of the answers is missing.
merged_df = merged_df.dropna()

# Exclude rows whose current-population year is one of these values
# (presumably mistyped years; TODO confirm against the raw answers).
merged_df = merged_df[~merged_df['Current population year'].isin(['216', '7', '19', '217'])]
# Populations are converted to float so they can be subtracted below.
merged_df['Current population'] = merged_df['Current population'].astype(float)
merged_df['Projected population'] = merged_df['Projected population'].astype(float)

# Distinct (account, organization, country) combinations for the accounts that survived the cleaning.
city_names = cities_df.loc[cities_df['Account Number'].isin(merged_df['Account Number'])][['Account Number', 'Organization', 'Country']].drop_duplicates()
# NOTE(review): an account with more than one Organization/Country combination
# would duplicate its rows in this join; verify uniqueness per account.
merged_df = merged_df.merge(city_names, on='Account Number', how='inner')

In [None]:
# Absolute projected change in population.
merged_df['pop_diff'] = (merged_df['Projected population'] - merged_df['Current population'])

# Number of years between the current and the projected population figures.
merged_df['years'] = merged_df['Projected population year'].astype(int) - merged_df['Current population year'].astype(int)

# Average change per year. A zero-year span is masked to NaN so the division
# cannot produce +/-inf, which would later break the integer bar labels.
merged_df['unit_diff'] = merged_df['pop_diff'] / merged_df['years'].where(merged_df['years'] != 0)
                                                                                                                      

In [None]:
N = 10
# First N rows reported in 2020 whose projected population equals the current one.
d = merged_df[(merged_df['Year Reported to CDP'] == 2020) & (merged_df['pop_diff'] == 0)][:N]
bars = []
# NOTE: every plotted value is 0 by construction of the filter above.
bars.append(list(d['pop_diff'].values))

# Tick labels are "<organization>\n<country>".
stacked_bar_plot(bars, labels=d['Organization'].str.cat(d['Country'], sep="\n"), legends=['2020'], title='Population with no increase', colors=COLORS, **dict(figsize=(7, 5)))

# Drop the scratch variables so they do not leak into later cells.
del d
del N
del bars

In [None]:
# Preview the first two rows of the population frame.
merged_df[:2]

In [None]:
N = 10
# 2020 rows with a positive yearly change ...
d = merged_df[(merged_df['unit_diff'] > 0) & (merged_df['Year Reported to CDP'] == 2020)]
# ... restricted to the N largest absolute changes (N was previously defined
# but a literal 10 was used instead).
d = d.loc[d['pop_diff'].sort_values()[::-1].index[:N]]
# Shuffle the selection; sampling len(d) items avoids the ValueError
# random.sample raises when fewer than N rows qualify.
d = d.loc[random.sample(list(d.index), len(d))]

bars = []
bars.append(list(d['unit_diff'].values))

# The title previously read 'Population with no increase' (copied from the
# preceding cell) although this chart shows positive yearly growth.
stacked_bar_plot(bars, labels=d['Organization'].str.cat(d['Country'], sep="\n"), legends=['2020'], title='Projected population increase per year', colors=COLORS, **dict(figsize=(10, 7)))

# Drop the scratch variables so they do not leak into later cells.
del d
del N
del bars

#### 5. Climate Hazards

In [None]:
# Distinct (question number, question name) pairs under the climate-hazards parent
# section; three spellings of the section name are matched.
cities_df[cities_df['Parent Section'].isin(['Climate Hazards', 'Climate Hazards & Vulnerability', 'Climate Hazards and Vulnerability'])]\
[['Question Number', 'Question Name']].drop_duplicates()

### 5.1 Risk Assessment actions

In [None]:
# Distinct wordings of question 2.0 found in the responses.
cities_df[(cities_df['Question Number'] == '2.0')]['Question Name'].unique()

In [None]:
# Responses to question 2.0 that actually carry an answer.
q2_df = cities_df[(cities_df['Question Number'] == '2.0') & ~(cities_df['Response Answer'].isnull())]

In [None]:
# One row per account with the number of its rows in q2_df ('size').
res = q2_df.groupby('Account Number')['Response Answer'].size().reset_index(name='size')

# Add one zero-initialised column per distinct answer; a later cell fills them in.
for ans in q2_df['Response Answer'].unique():
    res[ans] = 0

In [None]:
def count_words(answers, key):
    """Return how many items of ``answers`` compare equal to ``key``.

    The count is produced with ``np.sum`` over a list of ones, one per match.
    """
    hits = []
    for answer in answers:
        if answer == key:
            hits.append(1)
    return np.sum(hits)

In [None]:
# Count every (account, answer) pair in one pass. The previous version re-ran
# q2_df.groupby(...).get_group(...) for every account and every answer, which
# is quadratic in the number of accounts.
answer_counts = pd.crosstab(q2_df['Account Number'], q2_df['Response Answer'])

# Share of each account's answers that equal `ans` (count divided by 'size').
# reindex aligns the counts with the row order of `res`.
for ans in q2_df['Response Answer'].unique():
    res[ans] = answer_counts[ans].reindex(res['Account Number']).to_numpy() / res['size']

In [None]:
# Percentage of accounts that gave each answer at least once, indexed by the answer text.
values = pd.Series(
    [
        np.sum(res[ans] > 0) / len(res['Account Number']) * 100
        for ans in q2_df['Response Answer'].unique()
    ],
    index=list(q2_df['Response Answer'].unique()),
)

In [None]:
# Map each account number to the '-'-joined list of answers it gave at least once,
# in the order the answers first appear in q2_df. Rows of `res` are addressed by
# the labels 0..len(res)-1.
q2_mappings = {
    res.loc[i, 'Account Number']: "-".join(
        ans for ans in q2_df['Response Answer'].unique() if res.loc[i, ans] > 0
    )
    for i in range(len(res))
}

In [None]:
bars = []
# Number of accounts sharing each answer combination (the values of q2_mappings).
d = pd.Series(q2_mappings.values()).value_counts()
# Only combinations shared by more than 10 accounts are plotted.
bars.append(d[d > 10])

stacked_bar_plot(bars, labels=d[d>10].index, legends=['2018 + 2019 + 2020'], title='Cities with Risk Assessment actions', colors=COLORS, **dict(figsize=(8, 7)))

# NOTE: unlike earlier plotting cells, `bars` is left defined here.
del d

In [None]:
# Rebind q2_df to the follow-up questions 2.0a-2.0d (it previously held question 2.0).
q2_df = cities_df[(cities_df['Question Number'].isin(['2.0a', '2.0b', '2.0c', '2.0d']))]

In [None]:
# Distinct wordings of question 2.0a.
q2_df[q2_df['Question Number'] == '2.0a']['Question Name'].unique()

### Primary methodology

In [None]:
# Number of distinct organizations per 'Primary methodology' answer to question 2.0a.
# NOTE: this rebinds `grouped`, which earlier held a GroupBy object.
grouped = q2_df[(q2_df['Question Number'] == '2.0a') & (q2_df['Column Name'] == 'Primary methodology')].groupby('Response Answer').agg({
    'Organization': 'nunique'
}).reset_index()

# Most frequently named methodologies first.
grouped = grouped.sort_values(by=['Organization'], ascending=[0])

In [None]:
bars = []
# Keep the methodologies named by more than one organization.
d = grouped[grouped['Organization'] > 1]
bars.append(d['Organization'])

stacked_bar_plot(bars, labels=d['Response Answer'], legends=['2018 + 2019 + 2020'], title='Primary methods', colors=COLORS, **dict(figsize=(8, 9)))

# NOTE: `bars` is left defined after this cell.
del d

In [None]:
# Question 2.0a rows for every column other than 'Primary methodology'.
# NOTE: `df` is overwritten by a later cell before it is displayed.
df = q2_df[(q2_df['Question Number'] == '2.0a') & ~(q2_df['Column Name'] == 'Primary methodology')]

In [None]:
# Distinct wordings of question 2.1.
cities_df[(cities_df['Question Number'] == '2.1')]['Question Name'].unique()


In [None]:
# Keyword list; not referenced by any other cell in the visible part of the notebook.
keywords = ['heat', 'rain', 'economic', 'environment', 'forest', 'hazards']

In [None]:
# All responses to question 2.1 (rebinds `df`).
df = cities_df[(cities_df['Question Number'] == '2.1')]

In [None]:
bars = []
# Number of response rows per answer to the revision-process question.
# groupby(...).size() replaces groupby(...).apply(len): same counts, without
# calling a Python function on every group's sub-frame.
d = cities_df[(cities_df['Question Name'] == 'Does your city have an update / revision process for the climate risk or vulnerability assessment?')]\
.groupby('Response Answer').size()
bars.append(d)

stacked_bar_plot(bars, labels=d.index, legends=['2018 + 2019 + 2020'], title='Cities with climate revision plan', colors=COLORS, **dict(figsize=(7, 5)))

# NOTE: `bars` is left defined after this cell.
del d

In [None]:
# Distinct answers in `df` (question 2.1 at this point), in order of first appearance.
list(df['Response Answer'].drop_duplicates())

In [None]:
# vectorizer = CountVectorizer()

# X = vectorizer.fit_transform(q2_df['Response Answer'].dropna())