Reference for the BERT text-similarity approach used in this notebook: https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import scipy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Notebook display settings: no row limit when rendering frames, and at most
# 100 characters shown per cell.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 100

In [2]:
# Load the topics CSV; later cells read its 'Topics' and 'sentences' columns.
data_df = pd.read_csv(filepath_or_buffer="fb62e200-0fa3-11ec-82a8-0242ac130003_topics.csv")

In [3]:
# Keep only the rows that have a value in 'Topics' (rows where it is NaN are dropped).
# The statement used to be pasted twice on one line, which is a SyntaxError.
data_df = data_df[data_df['Topics'].notna()]

In [4]:
# Load the GRI KPI list; its "KPI" column supplies the candidate labels used later.
kpi_df = pd.read_csv(filepath_or_buffer="GRI KPI list.csv")

In [5]:
# Candidate Hugging Face checkpoint id.
# NOTE: the next cell reassigns model_name, so this value is not the one used below.
model_name = "sentence-transformers/all-distilroberta-v1"

In [6]:
# Checkpoint id handed to BertSim further down (replaces any earlier model_name).
model_name = 'nbroad/ESG-BERT'

In [7]:
class BertSim:
    """Pair each input sentence with its most similar KPI using transformer embeddings.

    KPI texts and sentences are tokenized together (KPIs first), embedded with the
    Hugging Face model named by ``model_name``, mean-pooled over the non-padding
    tokens and compared with cosine similarity.
    """

    def __init__(self, sent_data : list, kpi_data : list, model_name : str, max_length = 32,
                 batch_size = None):
        """Load the tokenizer/model and tokenize all texts.

        Args:
            sent_data: sentences to be matched.
            kpi_data: KPI descriptions the sentences are matched against.
            model_name: checkpoint id passed to ``AutoTokenizer`` / ``AutoModel``.
            max_length: every text is truncated/padded to this many tokens.
            batch_size: number of texts per forward pass in ``getEmbeddings``;
                ``None`` (default) embeds everything in a single pass.

        Raises:
            ValueError: if ``batch_size`` is given and smaller than 1.
        """
        if batch_size is not None and batch_size < 1:
            raise ValueError("batch_size must be a positive integer or None")

        # list() guards against Series/ndarray inputs, where `+` would add
        # element-wise instead of concatenating.
        kpi_texts = list(kpi_data)
        sentence_texts = list(sent_data)

        self.tokens = {'input_ids': [], 'attention_mask': []}
        self.data = kpi_texts + sentence_texts  # KPIs occupy indices [0, kpi_length)
        self.kpi_length = len(kpi_texts)
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()  # inference only: make sure dropout is disabled
        self.__initTokens__()

    def __initTokens__(self):
        """Tokenize ``self.data`` into stacked ``input_ids`` / ``attention_mask`` tensors.

        The result is stored in ``self.tokens``. Lists are built locally and
        assigned at the end so that calling this method again simply rebuilds
        the tensors.
        """
        input_ids = []
        attention_masks = []
        for sentence in self.data:
            # Calling the tokenizer directly replaces the deprecated encode_plus.
            encoded = self.tokenizer(sentence, max_length=self.max_length,
                                     truncation=True, padding='max_length',
                                     return_tensors='pt')
            # return_tensors='pt' yields a leading batch axis of size 1; drop it.
            input_ids.append(encoded['input_ids'][0])
            attention_masks.append(encoded['attention_mask'][0])

        # reformat list of tensors into single tensor
        self.tokens = {
            'input_ids': torch.stack(input_ids),
            'attention_mask': torch.stack(attention_masks),
        }

    def __getDenseVectors__(self, embeddings : torch.Tensor) -> np.ndarray:
        """Mean-pool token embeddings over the positions enabled in the attention mask.

        Args:
            embeddings: token-level embeddings whose leading axes line up with
                ``self.tokens['attention_mask']`` and whose last axis is the
                feature dimension.

        Returns:
            A NumPy array with one pooled vector per text.
        """
        attention_mask = self.tokens['attention_mask'].to(embeddings.device)
        mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
        summed = torch.sum(embeddings * mask, 1)
        # clamp avoids a division by zero for a row whose mask is all zeros
        counts = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counts
        # .cpu() keeps the NumPy conversion valid when the tensor is not on the CPU
        return mean_pooled.detach().cpu().numpy()

    def getEmbeddings(self) -> torch.Tensor:
        """Run the model over all tokenized texts and return ``last_hidden_state``.

        The forward pass runs under ``torch.no_grad()`` so that no autograd graph
        is retained, and in chunks of ``self.batch_size`` texts when that is set.
        """
        total = self.tokens['input_ids'].shape[0]
        step = max(int(self.batch_size or total), 1)
        hidden_states = []
        with torch.no_grad():
            for start in range(0, total, step):
                batch = {key: value[start:start + step]
                         for key, value in self.tokens.items()}
                hidden_states.append(self.model(**batch).last_hidden_state)
        if len(hidden_states) == 1:
            return hidden_states[0]
        return torch.cat(hidden_states, dim=0)

    def getSimilarityData(self):
        """Return the cosine-similarity matrix with one row per sentence and one column per KPI."""
        embeddings = self.getEmbeddings()
        mean_pooled = self.__getDenseVectors__(embeddings)
        sims = cosine_similarity(
            mean_pooled[self.kpi_length:],   # sentences
            mean_pooled[:self.kpi_length]    # KPIs
        )
        return sims

    def getSimilarityDataFrame(self):
        """Return a DataFrame with each sentence and the KPI it is most similar to.

        Columns are ``"sentences"`` and ``"KPI"``.
        """
        sims = self.getSimilarityData()
        best_kpi = [row.argmax() for row in sims]
        # KPI texts sit at the front of self.data, so the argmax index addresses
        # them directly.
        return pd.DataFrame({
            "sentences": self.data[self.kpi_length:],
            "KPI": [self.data[i] for i in best_kpi],
        })


In [8]:
# Build the matcher: sentences come from the topics frame, candidate labels from
# the KPI list; every text is padded/truncated to 64 tokens.
sim = BertSim(
    sent_data=list(data_df["sentences"]),
    kpi_data=list(kpi_df["KPI"]),
    model_name=model_name,
    max_length=64,
)

Some weights of the model checkpoint at nbroad/ESG-BERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Embed all texts and pair every sentence with its highest-scoring KPI.
sim_df = sim.getSimilarityDataFrame()

In [10]:
# Persist the sentence/KPI matches to disk.
sim_df.to_csv(path_or_buf="ESG_BERT_Sim.csv")

In [11]:
# Show the sentence/KPI matches (display.max_rows was set to None at the top of the notebook).
sim_df

Unnamed: 0,sentences,KPI
0,"putting the growth of nations back on track will require a sustained effort over time, and also ...",total number of critical concerns communicated to highest governance body
1,"during these months in which our attention has been focussed on peoples health, the challenges o...",total number of critical concerns communicated to highest governance body
2,this figure supposes more than 80 times the_emissions we needed to produce and purify more than ...,total water withdrawal in megaliters
3,"in 2020, more than eight out of every ten euros invested by acciona businesses financed projects...",Scope 2 GHG emissions of CO2 equivalent
4,technologies integrated in acciona projects that have allowed us to increase our positive impact...,total cooling energy sold
5,"making them central to our decisions has been a priority in the cycle now coming to an end, and ...",total monetary value of development grants
6,in this financial year some of the measures we have adopted have included flexible working days ...,number of employees returned to work after parental leave
7,sor pant an = 11 report 2020 index 1 letter from the chairman 2 the first company of a new secto...,total number of operations
8,"acciona has been designated a sustainability leader in the sector by s&p global, has received fr...",revenues from third-party sales
9,"from the outset we have maintained a proactive and preventive approach, working hand in hand wit...",total number of critical concerns communicated to highest governance body
