In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from simpletransformers.language_representation import RepresentationModel

# Notebook-wide display settings:
#   - no cap on the number of rows rendered for a DataFrame/Series,
#   - text cells are truncated after 100 characters.
pd.set_option("display.max_rows", None)
pd.set_option("max_colwidth", 100)

In [2]:
class BertSim:
    """Sentence-similarity helper built on a simpletransformers ``RepresentationModel``.

    The wrapped model turns sentences into fixed-size embeddings (mean-pooled
    over tokens); similarities between two sentence collections are then
    computed with cosine similarity.
    """

    def __init__(self, model_type="bert", model_name="nbroad/ESG-BERT", use_cuda=None):
        """Load the representation model.

        Args:
            model_type: Model family passed to ``RepresentationModel``.
            model_name: Checkpoint name or path passed to ``RepresentationModel``.
            use_cuda: ``True``/``False`` to force GPU/CPU. ``None`` (default)
                uses the GPU when ``torch.cuda.is_available()`` reports one and
                falls back to the CPU otherwise, so the class can be
                constructed on machines without CUDA.
        """
        if use_cuda is None:
            # Imported locally because torch is not imported at file level;
            # it is already a dependency of simpletransformers.
            import torch

            use_cuda = torch.cuda.is_available()

        self.model = RepresentationModel(
            model_type=model_type,
            model_name=model_name,
            use_cuda=use_cuda,
        )

    def get_model(self):
        """Return the underlying ``RepresentationModel`` instance."""
        return self.model

    def get_sentence_embeddings(self, sentences):
        """Encode ``sentences`` into one mean-pooled embedding per sentence.

        Args:
            sentences: A single string or any iterable of strings (list,
                tuple, pandas Series, generator, ...).

        Returns:
            Whatever ``encode_sentences`` returns for the given sentences with
            ``combine_strategy="mean"``.
        """
        if isinstance(sentences, str):
            # A bare string is one sentence, not an iterable of characters.
            sentences = [sentences]
        else:
            # Hand the encoder a plain list so slicing/batching inside it does
            # not depend on the container type (e.g. a Series with a
            # non-default index).
            sentences = list(sentences)
        return self.model.encode_sentences(sentences, combine_strategy="mean")

    def get_similarity_matrix(self, sentences_1, sentences_2):
        """Return pairwise cosine similarities between two sentence collections.

        Row ``i`` / column ``j`` holds the similarity between the embedding of
        ``sentences_1[i]`` and the embedding of ``sentences_2[j]``.
        """
        return cosine_similarity(
            self.get_sentence_embeddings(sentences_1),
            self.get_sentence_embeddings(sentences_2)
        )

### Preparing Dataset

In [3]:
# Load the extracted sentences and keep only the rows that have a value in
# the "Topics" column.
data_df = (
    pd.read_csv("fb62e200-0fa3-11ec-82a8-0242ac130003_topics.csv")
    .dropna(subset=["Topics"])
)

In [4]:
kpi_df = pd.read_csv("GRI KPI list.csv")
# Forward-fill missing descriptions from the nearest preceding row.
# `Series.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
# NOTE(review): presumably the CSV states a description only on the first row
# of each KPI group - confirm against the file.
kpi_df["Description of KPI"] = kpi_df["Description of KPI"].ffill()
# Text that is embedded for matching: "<description> <KPI>".
kpi_df["Descriptive KPI"] = kpi_df["Description of KPI"] + " " + kpi_df["KPI"]

### Compare Sentences and KPI

In [5]:
# Build the similarity helper around the ESG-BERT checkpoint.
bert_sim = BertSim(model_type="bert", model_name="nbroad/ESG-BERT")

Some weights of the model checkpoint at nbroad/ESG-BERT were not used when initializing BertForTextRepresentation: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# One row per report sentence, one column per descriptive KPI text.
sim_matrix = bert_sim.get_similarity_matrix(
    sentences_1=data_df["sentences"],
    sentences_2=kpi_df["Descriptive KPI"],
)
sim_matrix.shape

(1320, 162)

In [7]:
# For every sentence (row of sim_matrix), the column position of its
# highest-scoring KPI.
sim_data = list(np.argmax(sim_matrix, axis=1))

# Start from the sentence column (keeps data_df's index), then attach the
# matched KPI names positionally.
df = data_df[["sentences"]].copy()
df["KPI"] = kpi_df["KPI"].iloc[sim_data].tolist()

In [8]:
# Preview the first five sentence -> KPI matches.
df.head(n=5)

Unnamed: 0,sentences,KPI
1,"putting the growth of nations back on track will require a sustained effort over time, and also ...",total number of critical concerns communicated to highest governance body
2,"during these months in which our attention has been focussed on peoples health, the challenges o...",total number of critical concerns communicated to highest governance body
7,this figure supposes more than 80 times the_emissions we needed to produce and purify more than ...,particulate matter
8,"in 2020, more than eight out of every ten euros invested by acciona businesses financed projects...",co2 equivalent of biogenic co2 emissions
10,technologies integrated in acciona projects that have allowed us to increase our positive impact...,total cooling energy sold
