In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from simpletransformers.language_representation import RepresentationModel

# Notebook display settings: no cap on printed rows, up to 100 characters per cell.
pd.set_option('display.max_rows', None)
# Use the fully-qualified option key; abbreviated keys are resolved by pattern
# matching and stop working if another option ever matches the same pattern.
pd.set_option('display.max_colwidth', 100)

In [2]:
class BertSim:
    """Sentence-similarity helper around a simpletransformers ``RepresentationModel``.

    Sentences are encoded into vectors by the wrapped model and compared
    pairwise with scikit-learn's ``cosine_similarity``.
    """

    def __init__(self, model_type = "bert", model_name = "nbroad/ESG-BERT", use_cuda = None):
        """Load the representation model.

        Args:
            model_type: Architecture identifier forwarded to ``RepresentationModel``.
            model_name: Checkpoint name or path forwarded to ``RepresentationModel``.
            use_cuda: ``True``/``False`` to force GPU/CPU. ``None`` (default)
                uses the GPU when torch reports one and the CPU otherwise.
        """
        if use_cuda is None:
            # Auto-detect instead of forcing CUDA so the class also loads on
            # CPU-only machines. Imported locally to keep the dependency
            # confined to this code path.
            import torch
            use_cuda = torch.cuda.is_available()

        self.model = RepresentationModel(
                        model_type = model_type,
                        model_name = model_name,
                        use_cuda = use_cuda
                    )

    def get_model(self):
        """Return the wrapped ``RepresentationModel`` instance."""
        return self.model

    def get_sentence_embeddings(self, sentences):
        """Encode sentences with the wrapped model using the "mean" combine strategy.

        Args:
            sentences: A single string or any iterable of strings (list, tuple,
                pandas Series, ...).

        Returns:
            Whatever ``RepresentationModel.encode_sentences`` returns for the
            given sentences (one embedding per sentence is assumed by
            ``get_similarity_matrix``).
        """
        if isinstance(sentences, str):
            # A bare string would otherwise be treated as a sequence of characters.
            sentences = [sentences]
        else:
            # Materialize as a plain list so index-carrying containers
            # (e.g. a filtered pandas Series) are consumed in positional order.
            sentences = list(sentences)
        return self.model.encode_sentences(sentences, combine_strategy = "mean")

    def get_similarity_matrix(self, sentences_1, sentences_2):
        """Return the cosine-similarity matrix between two groups of sentences.

        Entry ``[i, j]`` compares the i-th item of ``sentences_1`` with the
        j-th item of ``sentences_2``.
        """
        embeddings_1 = self.get_sentence_embeddings(sentences_1)
        embeddings_2 = self.get_sentence_embeddings(sentences_2)
        return cosine_similarity(embeddings_1, embeddings_2)
        

### Preparing Dataset

In [3]:
# Read the sentence table and keep only the rows whose "Topics" value is present.
data_df = (
    pd.read_csv("../Data/fb62e200-0fa3-11ec-82a8-0242ac130003_topics.csv")
    .dropna(subset = ["Topics"])
)

In [4]:
# Load the GRI KPI list.
kpi_df = pd.read_csv("../Data/GRI KPI list.csv")
# Carry the last seen description down into rows where it is missing
# (presumably the CSV only fills it on the first row of each group — verify).
# `Series.ffill()` replaces the deprecated `fillna(method = "ffill")` spelling.
kpi_df["Description of KPI"] = kpi_df["Description of KPI"].ffill()
# Text that is actually embedded: the description followed by the KPI name.
kpi_df["Descriptive KPI"] = kpi_df["Description of KPI"] + " " + kpi_df["KPI"]

### Compare Sentences and KPIs

In [5]:
def get_sentences_kpi(model_class, sentences_df = None, kpis_df = None):
    """Match every sentence to its most similar KPI.

    Args:
        model_class: Object exposing ``get_similarity_matrix(sentences_1, sentences_2)``
            that returns a 2-D (sentences x KPIs) similarity matrix.
        sentences_df: DataFrame with a "sentences" column. Defaults to the
            notebook-level ``data_df``.
        kpis_df: DataFrame with "KPI" and "Descriptive KPI" columns. Defaults
            to the notebook-level ``kpi_df``.

    Returns:
        DataFrame with one row per sentence and the columns "sentences",
        "KPI" (name of the best-scoring KPI) and "score" (its similarity).
    """
    # Fall back to the notebook globals only when no frame is passed explicitly.
    sentences_df = data_df if sentences_df is None else sentences_df
    kpis_df = kpi_df if kpis_df is None else kpis_df

    sentences = sentences_df["sentences"].tolist()
    if not sentences:
        # Nothing to match; skip the model call entirely.
        return pd.DataFrame(columns = ["sentences", "KPI", "score"])

    sim_matrix = np.asarray(
        model_class.get_similarity_matrix(sentences, kpis_df["Descriptive KPI"].tolist())
    )
    # Row-wise reduction: best KPI position and its score for each sentence.
    best_index = sim_matrix.argmax(axis = 1)
    best_score = sim_matrix.max(axis = 1)

    return pd.DataFrame({
        "sentences": sentences,
        # Positional lookup: matrix columns follow the row order of kpis_df.
        "KPI": kpis_df["KPI"].iloc[best_index].tolist(),
        "score": best_score,
    })

In [6]:
# Similarity helper backed by the ESG-BERT checkpoint.
esg_bert_sim = BertSim(model_type = "bert", model_name = "nbroad/ESG-BERT")

Some weights of the model checkpoint at nbroad/ESG-BERT were not used when initializing BertForTextRepresentation: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Similarity helper backed by the bert-base-uncased checkpoint, for comparison.
bert_uncased_sim = BertSim(model_type = "bert", model_name = "bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTextRepresentation: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Best-scoring KPI per sentence according to the ESG-BERT helper; preview the first rows.
esg_bert_df = get_sentences_kpi(model_class = esg_bert_sim)
esg_bert_df.head(n = 5)

Unnamed: 0,sentences,KPI,score
0,"putting the growth of nations back on track will require a sustained effort over time, and also ...",total number of critical concerns communicated to highest governance body,0.400573
1,"during these months in which our attention has been focussed on peoples health, the challenges o...",total number of critical concerns communicated to highest governance body,0.453412
2,this figure supposes more than 80 times the_emissions we needed to produce and purify more than ...,particulate matter,0.561987
3,"in 2020, more than eight out of every ten euros invested by acciona businesses financed projects...",co2 equivalent of biogenic co2 emissions,0.874777
4,technologies integrated in acciona projects that have allowed us to increase our positive impact...,total cooling energy sold,0.566743


In [9]:
# Best-scoring KPI per sentence according to the bert-base-uncased helper; preview the first rows.
bert_uncased_df = get_sentences_kpi(model_class = bert_uncased_sim)
bert_uncased_df.head(n = 5)

Unnamed: 0,sentences,KPI,score
0,"putting the growth of nations back on track will require a sustained effort over time, and also ...",minimum number of weeks' notice of operational changes to employees and representatives,0.753975
1,"during these months in which our attention has been focussed on peoples health, the challenges o...",minimum number of weeks' notice of operational changes to employees and representatives,0.763086
2,this figure supposes more than 80 times the_emissions we needed to produce and purify more than ...,amount of energy consumption reduced,0.811423
3,"in 2020, more than eight out of every ten euros invested by acciona businesses financed projects...",amount of energy consumption reduced,0.832019
4,technologies integrated in acciona projects that have allowed us to increase our positive impact...,amount of energy consumption reduced,0.769879


### Testing Performance

In [10]:
# Hand-picked sentence pair used to spot-check both models.
s1 = [
    'using these pillars of the global sustainability agenda to guide us, '
    'we regularly review sustainability issues and completed a materiality assessment!'
]
s2 = [
    'to ensure that we build a sustainable business and meet stakeholder expectations.'
]

In [11]:
# Cosine similarity of the sentence pair under the ESG-BERT helper.
esg_bert_sim.get_similarity_matrix(sentences_1 = s1, sentences_2 = s2)

array([[0.5810851]], dtype=float32)

In [12]:
# Cosine similarity of the same sentence pair under the bert-base-uncased helper.
bert_uncased_sim.get_similarity_matrix(sentences_1 = s1, sentences_2 = s2)

array([[0.81749976]], dtype=float32)