In [1]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
import pandas as pd
import scipy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Notebook-wide display settings: show every row of a displayed DataFrame and
# up to 100 characters per cell. Full option names are used because partial
# names are resolved by pattern matching and can become ambiguous.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)

In [2]:
class BertEncode:
    """Sentence encoder built on a pretrained transformer model.

    Each sentence is tokenized (padded/truncated to ``max_length``), passed
    through the model, the last ``num_layers`` hidden states are summed, and the
    result is mean-pooled over the non-padding tokens to give one dense vector
    per sentence.
    """

    def __init__(self, model_name : str, max_length = 128, num_layers = 4):
        """Load config, tokenizer and model for ``model_name``.

        Args:
            model_name: Identifier or path accepted by ``from_pretrained``.
            max_length: Token length every sentence is padded/truncated to.
            num_layers: How many of the last hidden states are summed before
                pooling. Must be at least 1.

        Raises:
            ValueError: If ``num_layers`` is smaller than 1.
        """
        # Fail fast, before the comparatively slow model loading.
        self._check_num_layers(num_layers)
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config = self.config)
        self.model.eval()
        self.num_layers = num_layers
        self.max_length = max_length

    @staticmethod
    def _check_num_layers(num_layers):
        """Reject layer counts below 1.

        ``hidden_states[-0:]`` is the *whole* tuple, so ``num_layers == 0``
        would silently sum every layer instead of none.
        """
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers!r}")

    def __initTokens__(self, sentences):
        """Tokenize ``sentences`` and store the result in ``self.tokens``.

        ``self.tokens`` becomes a dict with the stacked ``input_ids`` and
        ``attention_mask`` tensors, one row per sentence.

        Raises:
            ValueError: If ``sentences`` is empty.
        """
        # Materialise once: accepts any iterable (list, tuple, pandas Series, ...).
        sentences = list(sentences)
        if not sentences:
            raise ValueError("sentences must contain at least one sentence")

        # A single batched call replaces the per-sentence (deprecated)
        # ``encode_plus`` loop and the manual ``torch.stack``.
        encoded = self.tokenizer(sentences, max_length = self.max_length,
                                 truncation=True, padding='max_length',
                                 return_tensors='pt')
        # Only these two entries are forwarded to the model, as before.
        self.tokens = {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
        }

    def __getDenseVectors__(self, aggr_last_layers : torch.Tensor, attention_mask = None) -> np.ndarray:
        """Mean-pool token embeddings over the non-padding positions.

        Args:
            aggr_last_layers: Token-level embeddings; dimension 1 is the token
                axis that is pooled away.
            attention_mask: Mask with 1 for real tokens and 0 for padding. When
                omitted, the mask stored by the most recent ``__initTokens__``
                call is used.

        Returns:
            NumPy array with one pooled vector per sentence.
        """
        if attention_mask is None:
            attention_mask = self.tokens['attention_mask']
        # Broadcast the mask over the embedding dimension.
        mask = attention_mask.unsqueeze(-1).expand(aggr_last_layers.size()).float()
        summed = torch.sum(aggr_last_layers * mask, 1)
        # Clamp so a row consisting only of padding cannot divide by zero.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / token_counts
        return mean_pooled.detach().numpy()

    def __getOutputs__(self, sentences) -> torch.Tensor:
        """Run the model on ``sentences`` and sum the last ``num_layers`` hidden states.

        Raises:
            ValueError: If ``self.num_layers`` is smaller than 1 or ``sentences``
                is empty.
        """
        # Re-checked here because the attribute can be changed after construction.
        self._check_num_layers(self.num_layers)
        self.__initTokens__(sentences)
        with torch.no_grad():
            outputs = self.model(**self.tokens)
            aggr_last_layers = torch.stack(outputs.hidden_states[-self.num_layers:]).sum(0)
            return aggr_last_layers

    def get_sentence_embeddings(self, sentences):
        """Return one mean-pooled embedding (NumPy row) per sentence."""
        aggr_last_layers = self.__getOutputs__(sentences)
        # Pass the mask explicitly so pooling does not rely on hidden state.
        return self.__getDenseVectors__(aggr_last_layers, self.tokens['attention_mask'])

    def get_similarity_matrix(self, sentences_1, sentences_2):
        """Cosine similarity between every sentence in ``sentences_1`` (rows)
        and every sentence in ``sentences_2`` (columns)."""
        return cosine_similarity(
            self.get_sentence_embeddings(sentences_1),
            self.get_sentence_embeddings(sentences_2)
        )

In [3]:
# Input sentences: keep only rows that have a value in 'Topics', then cap the
# working set at the first 100 of those rows.
data_df = pd.read_csv("../Data/input_datasets/fb62e200-0fa3-11ec-82a8-0242ac130003/fb62e200-0fa3-11ec-82a8-0242ac130003_topics.csv")
data_df = data_df[data_df['Topics'].notna()].iloc[:100]

# KPI catalogue: rows without a description inherit the nearest preceding one.
# `.ffill()` replaces `fillna(method="ffill")`, which is deprecated in recent pandas.
kpi_df = pd.read_csv("../Data/kpi_lists/GRI KPI list.csv")
kpi_df["Description of KPI"] = kpi_df["Description of KPI"].ffill()
# Text that gets embedded for each KPI: "<description>. <KPI>".
kpi_df["Descriptive KPI"] = kpi_df["Description of KPI"] + ". " + kpi_df["KPI"]

In [4]:
# Model identifier handed to BertEncode, which loads config, tokenizer and
# weights for it via `from_pretrained`.
model_name = "nbroad/ESG-BERT"

In [5]:
def get_sentences_kpi(model_class):
    """Pair every sentence in ``data_df`` with its most similar KPI.

    Reads the notebook-level frames ``data_df`` (column ``"sentences"``) and
    ``kpi_df`` (columns ``"Descriptive KPI"`` and ``"KPI"``).

    Args:
        model_class: Object exposing ``get_similarity_matrix(a, b)`` that
            returns one row of similarity scores per sentence.

    Returns:
        DataFrame with columns ``sentences``, ``KPI`` (the best-scoring KPI for
        that sentence) and ``score`` (the corresponding similarity).
    """
    similarities = model_class.get_similarity_matrix(
        data_df["sentences"], kpi_df["Descriptive KPI"]
    )

    # For each sentence row, remember which KPI column wins and by how much.
    best_positions = []
    best_scores = []
    for row in similarities:
        best_positions.append(row.argmax())
        best_scores.append(row.max())

    result = pd.DataFrame()
    result["sentences"] = list(data_df["sentences"])
    result["KPI"] = list(kpi_df["KPI"].iloc[best_positions])
    result["score"] = list(best_scores)
    return result

In [6]:
# Encoder with 128-token inputs that pools only the final hidden layer.
sim = BertEncode(model_name, max_length=128, num_layers=1)

Some weights of the model checkpoint at nbroad/ESG-BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Match every sentence in data_df to its highest-scoring KPI using the encoder
# created above, then preview the first rows of the result.
esg_bert_df = get_sentences_kpi(sim)
esg_bert_df.head()

Unnamed: 0,sentences,KPI,score
0,"putting the growth of nations back on track will require a sustained effort over time, and also ...",total number of critical concerns communicated to highest governance body,0.455011
1,"during these months in which our attention has been focussed on peoples health, the challenges o...",total number of critical concerns communicated to highest governance body,0.511303
2,this figure supposes more than 80 times the_emissions we needed to produce and purify more than ...,persistent organic pollutants,0.52794
3,"in 2020, more than eight out of every ten euros invested by acciona businesses financed projects...",co2 equivalent of GHG emissions reduced,0.903068
4,technologies integrated in acciona projects that have allowed us to increase our positive impact...,total cooling energy sold,0.561492
