In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.preprocessing import RobustScaler, QuantileTransformer
import geopandas as gpd
%matplotlib inline
import time 
from tqdm import *

In [None]:
# Load the 2020 corporate climate-change responses (one row per response value).
corporations_2020 = pd.read_csv("../input/externald/Supplementary Data/corporations_2020_climate_change_geo.csv")

# Lookup table: account number -> organization name. When an account appears on
# several rows, the last row's organization wins (plain dict semantics).
acct_corps = dict(
    zip(
        corporations_2020["account_number"].to_numpy(),
        corporations_2020["organization"].to_numpy(),
    )
)

def fetch_answer_individual(account, question_number, org_type="city", year=2018, corp_res_type=None, column_number=None, row_number=None):
    """Return the response rows one account gave to one question.

    The rows are read from one of three module-level DataFrames, selected by
    ``org_type`` / ``corp_res_type``:

    * ``org_type="city"``                         -> ``cities_2020_merged_sub``
    * ``org_type="corp"``, ``corp_res_type="cc"`` -> ``corporations_2020``
    * ``org_type="corp"``, ``corp_res_type="ws"`` -> ``corporations_2020_water``

    Parameters
    ----------
    account
        Account number to match.
    question_number
        Question identifier to match (e.g. ``"C4.1a"``).
    org_type : str
        ``"city"`` or ``"corp"``.
    year
        Accepted for backward compatibility; not used by the lookup.
    corp_res_type : str or None
        ``"cc"`` or ``"ws"``; only consulted when ``org_type == "corp"``.
    column_number, row_number
        Optional extra filters. ``None`` means "do not filter on this field".

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order (possibly empty).

    Raises
    ------
    ValueError
        If the ``org_type`` / ``corp_res_type`` combination is not supported.
        (Previously this path printed a message and then failed with an
        ``UnboundLocalError`` on ``return answer``.)
    """
    # The city table uses title-cased column names; both corporate tables use
    # snake_case names. Everything else about the lookup is identical.
    city_columns = ("Question Number", "Account Number", "Column Number", "Row Number")
    corp_columns = ("question_number", "account_number", "column_number", "row_number")

    if org_type == "city":
        df, columns = cities_2020_merged_sub, city_columns
    elif org_type == "corp" and corp_res_type == "cc":
        df, columns = corporations_2020, corp_columns
    elif org_type == "corp" and corp_res_type == "ws":
        df, columns = corporations_2020_water, corp_columns
    else:
        raise ValueError(
            "Unsupported org_type/corp_res_type combination: "
            "org_type={!r}, corp_res_type={!r}. Expected org_type='city', or "
            "org_type='corp' with corp_res_type 'cc' or 'ws'.".format(org_type, corp_res_type)
        )

    question_col, account_col, column_col, row_col = columns

    # Build one boolean mask instead of chaining intermediate frames.
    mask = (df[question_col] == question_number) & (df[account_col] == account)
    if column_number is not None:
        mask = mask & (df[column_col] == column_number)
    if row_number is not None:
        mask = mask & (df[row_col] == row_number)

    return df[mask]

In [None]:
def standardize_rank(arr, direction=1):
    """Rank the values of ``arr`` and scale the ranks by the largest rank.

    Invalid entries (NaN / inf) are masked before ranking; they receive rank 0
    from ``rankdata`` and are turned into NaN in the result.

    Parameters
    ----------
    arr : array-like
        Numeric values to rank.
    direction : int
        ``1`` returns ``rank / max_rank``; any other value returns
        ``1 - rank / max_rank``.

    Returns
    -------
    numpy.ndarray
        Scaled ranks, with NaN where the input was invalid.
    """
    valid_only = np.ma.masked_invalid(arr)
    ranks = stats.mstats.rankdata(valid_only)
    # Rank 0 marks a masked (invalid) entry; expose it as NaN.
    ranks = np.where(ranks == 0, np.nan, ranks)
    scaled = ranks / np.nanmax(ranks)
    return scaled if direction == 1 else 1 - scaled


In [None]:
# (lat, lng) pair and account number for every response row.
all_corp_locations = np.column_stack(
    (corporations_2020["lat"].to_numpy(), corporations_2020["lng"].to_numpy())
)
all_corp_accounts = corporations_2020["account_number"].to_numpy()

# Sorted unique accounts plus the position of each account's first row.
unique_corp_accounts, unique_corp_idxs = np.unique(all_corp_accounts, axis=0, return_index=True)

# Per-account attributes, all taken from that first row.
corp_locs_lat = all_corp_locations[unique_corp_idxs, 0]
corp_locs_lng = all_corp_locations[unique_corp_idxs, 1]
corp_names, corp_cities, corp_states, corp_countries = (
    corporations_2020[field].iloc[unique_corp_idxs].to_numpy()
    for field in ("organization", "city", "state", "country")
)

# Coordinates with missing values replaced by the sentinel -999.
corp_locs = np.nan_to_num(np.column_stack((corp_locs_lat, corp_locs_lng)), nan=-999)

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

# Module-level VADER analyzer shared by polarity_score_corp so it is built once.
# NOTE(review): presumably needs the nltk `vader_lexicon` resource (and `punkt`
# for sent_tokenize) to be downloaded in this environment — confirm.
sid = SentimentIntensityAnalyzer()

def polarity_score_corp(question, column, sent_thresh=2):
    """Average VADER compound sentiment per corporate account for one answer column.

    For every account in ``unique_corp_accounts`` the free-text responses to
    ``question`` / ``column`` in ``corporations_2020`` are joined with spaces,
    split into sentences, and scored sentence by sentence with ``sid``.

    Parameters
    ----------
    question
        Value matched against ``question_number``.
    column
        Value matched against ``column_number``.
    sent_thresh : int
        An account is only scored when its text has *more than* this many
        sentences; otherwise its score is NaN.

    Returns
    -------
    list of float
        One entry per element of ``unique_corp_accounts``, in the same order:
        the mean compound score, or NaN when there was too little text.
    """
    answers = corporations_2020[
        (corporations_2020["question_number"] == question)
        & (corporations_2020["column_number"] == column)
    ]

    # Group the responses by account in one pass instead of rescanning the
    # whole subset for every account. Missing responses are skipped: joining
    # them as the literal text "nan" could add spurious neutral sentences that
    # dilute the average and inflate the sentence count.
    responses_by_account = {}
    for account, value in zip(answers["account_number"], answers["response_value"]):
        if pd.notna(value):
            responses_by_account.setdefault(account, []).append(str(value))

    compound_score = []
    for account in tqdm(unique_corp_accounts):
        text = ' '.join(responses_by_account.get(account, []))
        sents = tokenize.sent_tokenize(text)
        if len(sents) > sent_thresh:
            total = sum(sid.polarity_scores(s)['compound'] for s in sents)
            compound_score.append(total / len(sents))
        else:
            compound_score.append(np.nan)

    return compound_score

In [None]:
# Sentiment per account for three free-text answer columns, scored in this order:
# (question, column) -> risk, opportunity, potential.
# NOTE(review): the question meanings are inferred from the variable names — confirm.
risk_corp_sentiment, opportunity_corp_sentiment, potential_corp_sentiment = [
    polarity_score_corp(question, column)
    for question, column in (("C2.2a", 2.0), ("C2.4a", 6.0), ("C12.3a", 3.0))
]

In [None]:
# Load the annual financial data that is keyed by CDP account number.
financial_data = pd.read_csv("../input/annual-financial-data-for-hybrid-cdp-kpi/cdp_financial_data.csv")

# One frame of financial rows per unique corporate account (empty when the
# account has no financial data). The former bare `except: pass` around the
# append was removed: it could not be triggered by the append itself and would
# only have hidden genuine errors.
finance_acct = [
    financial_data[financial_data["account_number"] == account]
    for account in unique_corp_accounts
]

# Number of financial rows matched per account, aligned with unique_corp_accounts.
match_len = [len(matches) for matches in finance_acct]

# All matched financial rows stacked together.
# Note: the name `df` is rebound by the later KPI-table cell.
df = pd.concat(finance_acct)

In [None]:
# Yearly emissions-reduction rate per account, from the account's C4.1a answers.
# Result is aligned with unique_corp_accounts; accounts without a C4.1a answer get NaN.
# Only the first response found for each column number is used.
tons_reduced_per_year = []
for account in unique_corp_accounts:
    cdf = fetch_answer_individual(account, "C4.1a", org_type="corp", year=2020, corp_res_type="cc")
    if len(cdf) == 0:
        tons_reduced_per_year.append(np.nan)
        continue

    base_emissions = float(cdf.loc[cdf["column_number"] == 6.0, "response_value"].iloc[0])
    base_emissions_year = float(cdf.loc[cdf["column_number"] == 5.0, "response_value"].iloc[0])
    # NOTE(review): column 9 is multiplied directly by the base emissions; if it
    # is reported as a percentage the product is 100x too large — confirm units.
    reduction_emissions = float(cdf.loc[cdf["column_number"] == 9.0, "response_value"].iloc[0]) * base_emissions
    goal_year = float(cdf.loc[cdf["column_number"] == 8.0, "response_value"].iloc[0])

    try:
        rate = reduction_emissions / (goal_year - base_emissions_year)
    except ZeroDivisionError:
        # Goal year equals base year: treat the whole reduction as one year's
        # worth. Only this case is caught; anything else should surface.
        rate = reduction_emissions
    tons_reduced_per_year.append(rate)

In [None]:
# tons_reduced_per_year is already computed by the previous cell. This cell used
# to be a verbatim copy of it, refetching every account's C4.1a answers a second
# time for an identical result. Keep only a cheap alignment check.
assert len(tons_reduced_per_year) == len(unique_corp_accounts), (
    "tons_reduced_per_year must have one entry per unique corporate account"
)

In [None]:
# EBITDA per account, aligned with unique_corp_accounts: the value from the
# account's first financial row, or NaN when the account has no financial data.
# NOTE(review): no later cell visible in this notebook reads `ebdtas`.
ebdtas = []
for account in unique_corp_accounts:
    account_ebitda = financial_data.loc[financial_data["account_number"] == account, "EBITDA"]
    ebdtas.append(account_ebitda.iloc[0] if len(account_ebitda) > 0 else np.nan)


In [None]:
# Conversion factor: tons of reduced emissions counted as one life saved.
# NOTE(review): the source of 258200 is not recorded in this notebook — cite it.
tons_to_save_life = 258200

# Lives saved over a 20-year horizon at each account's yearly reduction rate.
lives_saved_twenty_years = 20 * (np.asarray(tons_reduced_per_year) / tons_to_save_life)

In [None]:
# Element-wise product of the 20-year lives-saved estimate and the yearly tons reduced.
# NOTE(review): despite the "ebidta" name, no EBITDA value enters this product
# (`ebdtas` is computed earlier but not used here) — confirm the intended formula.
ebidta_lives_saved_emissions = lives_saved_twenty_years*np.array(tons_reduced_per_year)
# Rank of the product scaled by the largest rank (NaN where the product is invalid).
ebidta_lives_saved_emissions_rank = standardize_rank(ebidta_lives_saved_emissions)

In [None]:
# The per-account identity arrays (corp_locs_lat/lng, corp_names, corp_cities,
# corp_states, corp_countries) and the three sentiment lists are already
# computed by earlier cells from inputs that have not changed since. This cell
# used to recompute all of them, including three more full sentiment passes
# over every account. Only the new binding is kept, plus cheap checks that the
# existing columns are aligned with the account list.
lives_saved = lives_saved_twenty_years

assert len(corp_names) == len(unique_corp_accounts)
assert len(risk_corp_sentiment) == len(unique_corp_accounts)
assert len(opportunity_corp_sentiment) == len(unique_corp_accounts)
assert len(potential_corp_sentiment) == len(unique_corp_accounts)
assert len(lives_saved) == len(unique_corp_accounts)

In [None]:
# Assemble the KPI table: one row per unique corporate account.
# The table is built from a dict of columns rather than a single np.c_ stack:
# stacking text and numeric arrays forces them all into one common dtype,
# whereas this way each column keeps its own dtype (numeric columns stay numeric).
df = pd.DataFrame({
    "account_number": unique_corp_accounts,
    "organization": corp_names,
    "city": corp_cities,
    "state": corp_states,
    "country": corp_countries,
    "lat": corp_locs_lat,
    "long": corp_locs_lng,
    "ebidta_lives_saved": ebidta_lives_saved_emissions,
    "ebidta_lives_saved_rank": ebidta_lives_saved_emissions_rank,
    "sentiment_risk": risk_corp_sentiment,
    "sentiment_opportunity": opportunity_corp_sentiment,
    "sentiment_collaboration": potential_corp_sentiment,
    "lives_saved": lives_saved,
})

In [None]:
# Write the KPI table to the working directory. No `index` argument is passed,
# so pandas' default applies and the row index is written as the first column.
df.to_csv("corporate_kpis_general.csv")