# **EXTRACTING ESG KPI FROM SENTENCES**


In [5]:
# Load Libraries
import pandas as pd

## Load Data

- Extracted, cleaned sentences
- Extracted, uncleaned sentences

In [None]:
# Read the cleaned-sentences Excel workbook into a DataFrame.
clean_sentences_path = "/content/drive/MyDrive/Colab Notebooks/ESG_KPI_Omdena/Data/Clean sentences.xlsx"
clean_sentences_df = pd.read_excel(clean_sentences_path)

## Extract Key Phrases From Sentences



In [None]:
#install Keybert for keyphrase/keyword extraction 
!pip install keybert
#using sentence transformer model in keybert
!pip install -U sentence-transformers

In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [164]:

class KeyPhraseExtraction:
  '''Custom class for key-phrase extraction built on top of KeyBERT.

  The wrapped KeyBERT instance is stored in ``self.model``; both extraction
  methods forward to its ``extract_keywords`` method and return whatever it
  returns.
  '''
  def __init__(self,model_name = 'nbroad/ESG-BERT', initialize_sent_transformer = True):
    '''Create the underlying KeyBERT model.

    Args:
      model_name: Name/path of the embedding model to use.
      initialize_sent_transformer: When True, ``model_name`` is first loaded
        as a ``SentenceTransformer`` and that object is handed to KeyBERT;
        when False, ``model_name`` is passed to KeyBERT directly.
    '''
    if initialize_sent_transformer:
      # Use the requested model_name; previously the checkpoint was hard-coded
      # here, so a caller-supplied model_name was silently ignored.
      sent_trans_model = SentenceTransformer(model_name)
      self.model = KeyBERT(model = sent_trans_model)
    else:
      self.model = KeyBERT(model = model_name)

  def extract_keyphrases_mmr(self,sent,keyphrase_ngram_range=(2,5),diversity = 0.7, top_n=5, stop_words='english'):
    '''Extract key phrases from ``sent`` with ``use_mmr=True``.

    Args:
      sent: Text to extract key phrases from.
      keyphrase_ngram_range: (min, max) n-gram length of candidate phrases.
      diversity: Diversity setting forwarded to KeyBERT.
      top_n: Number of key phrases requested.
      stop_words: Stop-word setting forwarded to KeyBERT.

    Returns:
      The result of ``KeyBERT.extract_keywords`` for these settings.
    '''
    return self.model.extract_keywords(
        sent,
        use_mmr=True,
        keyphrase_ngram_range=keyphrase_ngram_range,
        diversity=diversity,
        top_n=top_n,
        stop_words=stop_words,
    )

  def extract_keyphrases_maxsum(self,sent,keyphrase_ngram_range=(2,5),nr_candidates = 15, top_n=5, stop_words='english'):
    '''Extract key phrases from ``sent`` with ``use_maxsum=True``.

    Args:
      sent: Text to extract key phrases from.
      keyphrase_ngram_range: (min, max) n-gram length of candidate phrases.
      nr_candidates: Number of candidates forwarded to KeyBERT.
      top_n: Number of key phrases requested.
      stop_words: Stop-word setting forwarded to KeyBERT.

    Returns:
      The result of ``KeyBERT.extract_keywords`` for these settings.
    '''
    return self.model.extract_keywords(
        sent,
        use_maxsum=True,
        keyphrase_ngram_range=keyphrase_ngram_range,
        nr_candidates=nr_candidates,
        top_n=top_n,
        stop_words=stop_words,
    )
  

In [165]:
#Extract Key Phrases
# Create the extractor in this cell so a fresh kernel does not depend on a
# `kpe_instance` left over from an earlier session.
kpe_instance = KeyPhraseExtraction()

# Each extraction result item is indexed with [0] to keep only the phrase text;
# the stored value per sentence is therefore a list of phrases.
clean_sentences_df['Key_Phrases'] = clean_sentences_df['Sentences'].apply(
    lambda sent: [keyphrase[0] for keyphrase in kpe_instance.extract_keyphrases_mmr(sent=sent)]
)

In [166]:
# Persist the current clean_sentences_df to a pickle file in the working directory.
clean_sentences_df.to_pickle(path='clean_sentences_KPE.pkl')

In [6]:
# Display the sentences frame as the cell's rich output.
clean_sentences_df

Unnamed: 0,Sentences,Key_Phrases
0,62.2% use of electricity from renewable_source...,[electricity renewable_sources achieved total ...
1,"at present, the energy expenditure has been re...","[present energy expenditure reduced 15, expend..."
2,170 business leaders call on eu decision-maker...,"[support greenhouse gas ghg emissions, reducti..."
3,taking the tolerance values of the measuring s...,"[reference estimated managed reduce emission, ..."
4,"with the help of energy management, the divisi...","[reduce energy consumption 1000 mwh, compared ..."
...,...,...
121,a forest plantation managed by new forests in ...,"[carbon dioxide annually estimated carbon, pla..."
122,"waste_reduction, recycling business initiative...",[waste_reduction recycling business initiative...
123,"waste_reduction: 71,670 tons/year sugar cane r...","[biomass power generation, waste_reduction 71 ..."
124,the amount of fossil-based electricity has bee...,"[fossil based electricity reduced 91, percent ..."


In [6]:
#clean_sentences_df = pd.read_pickle('/content/clean_sentences_KPE (1).pkl')

## KPI Mapping Using ESG_BERT Embedding Similarity

In [None]:
!pip install simpletransformers

In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from simpletransformers.language_representation import RepresentationModel


In [10]:
class BertSim:
    """Sentence-similarity helper backed by a simpletransformers RepresentationModel.

    Sentences are embedded with ``encode_sentences(..., combine_strategy="mean")``
    and compared with cosine similarity.
    """

    def __init__(self, model_type = "bert", model_name = "nbroad/ESG-BERT", use_cuda = True):
        """Load the representation model.

        Args:
            model_type: Model type passed to ``RepresentationModel``.
            model_name: Model name/path passed to ``RepresentationModel``.
            use_cuda: Forwarded to ``RepresentationModel``. Defaults to True
                (the previously hard-coded value); pass False to run without
                a GPU.
        """
        self.model = RepresentationModel(
                        model_type = model_type,
                        model_name = model_name,
                        use_cuda = use_cuda
                    )

    def get_model(self):
        """Return the underlying representation model object."""
        return self.model

    def get_sentence_embeddings(self, sentences):
        """Encode ``sentences`` into embeddings.

        Args:
            sentences: A single string or any iterable of strings (list,
                tuple, pandas Series, ...).

        Returns:
            Whatever ``encode_sentences`` returns for the given sentences.
        """
        if isinstance(sentences, str):
            # A bare string would otherwise be iterated character by character.
            sentences = [sentences]
        else:
            # Callers in this notebook pass a pandas Series; hand the encoder a
            # plain list so it never sees label-based indexing.
            sentences = list(sentences)
        return self.model.encode_sentences(sentences, combine_strategy = "mean")

    def get_similarity_matrix(self, sentences_1, sentences_2):
        """Return the cosine-similarity matrix between two sentence collections.

        Rows correspond to ``sentences_1`` and columns to ``sentences_2``.
        """
        return cosine_similarity(
            self.get_sentence_embeddings(sentences_1),
            self.get_sentence_embeddings(sentences_2)
        )

In [177]:
def get_sentences_kpi(model_class, sentence,kpi_df):
    """Return the KPI in ``kpi_df`` that is most similar to ``sentence``.

    Args:
        model_class: Object exposing ``get_similarity_matrix(a, b)``.
        sentence: Text to map to a KPI.
        kpi_df: Candidate KPI table; the columns ``"combined_kpi"`` (text that
            is compared against the sentence), ``"KPI"`` and
            ``"ESG category"`` are read.

    Returns:
        Tuple ``(kpi, sim_score, category)`` taken from the best-matching row.

    Raises:
        ValueError: If ``kpi_df`` has no rows, since there is nothing to
            compare the sentence against.
    """
    if len(kpi_df) == 0:
        # Fail here with a clear message instead of deep inside the encoder /
        # similarity computation.
        raise ValueError("kpi_df is empty: no candidate KPI to compare the sentence against")

    sim_matrix = model_class.get_similarity_matrix([sentence], kpi_df["combined_kpi"])
    # A single query sentence is passed, so the flat argmax is the position of
    # the best candidate; it is used positionally (iloc) below.
    sim_index = sim_matrix.argmax()
    sim_score = sim_matrix.max()
    best_row = kpi_df.iloc[sim_index]
    return best_row["KPI"], sim_score, best_row["ESG category"]

In [None]:
#Generate KPI , Similarity Scores , ESG Category Mappings
key_phrase_list = clean_sentences_df['Key_Phrases']

# get_sentences_kpi requires a reference KPI table as its third argument (the
# call previously omitted it and raised TypeError). That table is not loaded
# anywhere in this notebook, so fail early with an actionable message.
# NOTE(review): `generic_kpi_df` is the name assumed here for the generic KPI
# list - confirm the source file and load it in an earlier cell.
if 'generic_kpi_df' not in globals():
  raise NameError(
      "generic_kpi_df is not defined: load the generic KPI list (with columns "
      "'KPI', 'ESG category' and 'combined_kpi') before running this cell"
  )

# Create the similarity model in this cell so a fresh kernel does not depend on
# a `model_class` left over from an earlier session.
model_class = BertSim()

# Collect one record per sentence and build the frame once, instead of growing
# it row by row.
mapping_rows = []
for key_phrases in key_phrase_list:
  kpi_list = []
  category_list = []
  score_list = []
  for key_phrase in key_phrases:
    kpi, sim_score, category = get_sentences_kpi(model_class, key_phrase, generic_kpi_df)
    kpi_list.append(kpi)
    score_list.append(sim_score)
    category_list.append(category)
  mapping_rows.append({
      'ESG_Category': category_list,
      'Mapped_KPI': kpi_list,
      'Similarity_Score': score_list,
  })

# Rows are labelled 0..n-1 by position, as before.
kpi_df = pd.DataFrame(mapping_rows, columns=['ESG_Category','Mapped_KPI','Similarity_Score'])

In [141]:
# Collapse a row's list to a single-element list when every entry in it is
# identical; rows with mixed values keep their original per-phrase list.
for source_col, combined_col in (('Mapped_KPI', 'Combined_KPI'), ('ESG_Category', 'Combined_Category')):
  kpi_df[combined_col] = kpi_df[source_col].apply(
      lambda values: values if len(set(values)) != 1 else list(set(values))
  )

In [143]:
# Place the sentence frame and the KPI mapping side by side (aligned on index),
# then export the combined frame to Excel.
clean_sent_kpi = pd.concat(objs=[clean_sentences_df, kpi_df], axis='columns')
clean_sent_kpi.to_excel(excel_writer='CLEAN_SENT_GENERIC_KPI_MAPPING.xlsx')

In [148]:
# Display the combined sentence / generic-KPI mapping frame as the cell's rich output.
clean_sent_kpi

Unnamed: 0,Sentences,Key_Phrases,ESG_Category,Mapped_KPI,Similarity_Score,Combined_KPI,Combined_Category
0,62.2% use of electricity from renewable_source...,[electricity renewable_sources achieved total ...,"[Environmental, Environmental, Environmental, ...","[energy consumption, energy consumption, energ...","[0.91401374, 0.90717655, 0.836861, 0.97449636,...",[energy consumption],[Environmental]
1,"at present, the energy expenditure has been re...","[present energy expenditure reduced 15, expend...","[Environmental, Economic, Environmental, Envir...","[energy consumption, Corporate income tax, ene...","[0.8970034, 0.51539993, 0.8052012, 0.8965416, ...","[energy consumption, Corporate income tax, ene...","[Environmental, Economic, Environmental, Envir..."
2,170 business leaders call on eu decision-maker...,"[support greenhouse gas ghg emissions, reducti...","[Environmental, Governance, Environmental, Env...","[GHG emissions, net revenues, GHG emissions, G...","[0.69169486, 0.4378304, 0.77837104, 0.6697927,...","[GHG emissions, net revenues, GHG emissions, G...","[Environmental, Governance, Environmental, Env..."
3,taking the tolerance values of the measuring s...,"[reference estimated managed reduce emission, ...","[Environmental, Environmental, Economic, Envir...","[GHG emissions, GHG emissions, Corporate incom...","[0.6032284, 0.6493584, 0.53069377, 0.63353264,...","[GHG emissions, GHG emissions, Corporate incom...","[Environmental, Environmental, Economic, Envir..."
4,"with the help of energy management, the divisi...","[reduce energy consumption 1000 mwh, compared ...","[Environmental, Economic, Environmental, Envir...","[energy consumption, community investments, en...","[0.9421796, 0.5854432, 0.72099483, 0.7198863, ...","[energy consumption, community investments, en...","[Environmental, Economic, Environmental, Envir..."
...,...,...,...,...,...,...,...
121,a forest plantation managed by new forests in ...,"[carbon dioxide annually estimated carbon, pla...","[Environmental, Environmental, Environmental, ...","[GHG emissions, restored habitat areas, recycl...","[0.6681555, 0.6665286, 0.43836945, 0.56134325,...","[GHG emissions, restored habitat areas, recycl...",[Environmental]
122,"waste_reduction, recycling business initiative...",[waste_reduction recycling business initiative...,"[Environmental, Environmental, Environmental, ...","[recycled materials, non-renewable materials, ...","[0.61960626, 0.7124392, 0.6900979, 0.70979166,...","[recycled materials, non-renewable materials, ...","[Environmental, Environmental, Environmental, ..."
123,"waste_reduction: 71,670 tons/year sugar cane r...","[biomass power generation, waste_reduction 71 ...","[Environmental, Environmental, Environmental, ...","[energy consumption, organic pollutants, recyc...","[0.7994809, 0.47750506, 0.42381394, 0.46169093...","[energy consumption, organic pollutants, recyc...","[Environmental, Environmental, Environmental, ..."
124,the amount of fossil-based electricity has bee...,"[fossil based electricity reduced 91, percent ...","[Environmental, Economic, Environmental, Envir...","[energy consumption, third-party sales, energy...","[0.78861237, 0.47423828, 0.7629609, 0.76556104...","[energy consumption, third-party sales, energy...","[Environmental, Economic, Environmental, Envir..."


## Map Granular KPI Using Mapped ESG Category

In [183]:
# CURATED KPI LIST

# Load the granular KPI list, blank out missing values, build the text used for
# similarity matching (KPI name + description) and rename the 'ESG' column to
# the 'ESG category' name that get_sentences_kpi reads.
granular_kpi_df = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ESG_KPI_Omdena/Data/Granular KPI list.csv')
    .fillna(' ')
    .assign(combined_kpi=lambda frame: frame['KPI'] + " " + frame['Description of KPI'])
    .rename(columns={'ESG': "ESG category"})
)
granular_kpi_df.columns

Index(['Description of KPI', 'KPI', 'ESG category', 'category',
       'gri_disclosure_sub_code', 'combined_kpi'],
      dtype='object')

In [185]:
#Generate KPI , Similarity Scores , ESG Category Mappings
# Expects `model_class` (an object exposing get_similarity_matrix) to already
# exist in the kernel.
key_phrase_list = clean_sent_kpi['Key_Phrases']
combined_categories = clean_sent_kpi['Combined_Category']

# Candidate KPI rows per ESG category, filtered once per category and reused
# instead of re-filtering granular_kpi_df for every key phrase.
granular_by_category = {}

# Collect one record per sentence and build the frame once, instead of growing
# it row by row.
granular_rows = []
for key_phrases, esg_cat_list in zip(key_phrase_list, combined_categories):
  if len(esg_cat_list) == 1:
    # Single agreed category: map the first three key phrases, joined into one
    # query, against that category's KPIs.
    queries = [(" ".join(key_phrases[:3]), esg_cat_list[0])]
  else:
    # Otherwise map each key phrase within the category found for it.
    queries = [(key_phrase, esg_cat_list[j]) for j, key_phrase in enumerate(key_phrases)]

  kpi_list = []
  score_list = []
  for query, esg_category in queries:
    if esg_category not in granular_by_category:
      granular_by_category[esg_category] = granular_kpi_df[granular_kpi_df['ESG category']==esg_category]
    kpi, sim_score, _ = get_sentences_kpi(model_class, query, granular_by_category[esg_category])
    kpi_list.append(kpi)
    score_list.append(sim_score)
  granular_rows.append({'Granular_KPI': kpi_list, 'Granular_Similarity_Score': score_list})

# Rows are labelled 0..n-1 by position, as before.
kpi_df = pd.DataFrame(granular_rows, columns=['Granular_KPI','Granular_Similarity_Score'])

In [191]:
# Append the granular mapping columns to the generic mapping frame (aligned on
# index), export the result to Excel, and display it.
clean_sent_gen_gran_kpi = pd.concat(objs=[clean_sent_kpi, kpi_df], axis='columns')
clean_sent_gen_gran_kpi.to_excel(excel_writer='Clean_Sentences_Generic_Granular_KPI.xlsx')
clean_sent_gen_gran_kpi

In [140]:
# #MAX SUM
# kw_model = KeyBERT(model= model)
# keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=(2, 4), stop_words='english',
#                               use_maxsum=True, nr_candidates=15, top_n=5)