# Using Fuzzy Matching with Granular KPI

In [1]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

Reading files
--

In [2]:
# Load the sentences to be matched; the cells below read its "Sentences" column.
data_df = pd.read_csv("clean_sentences.csv")
data_df

Unnamed: 0,Sentences
0,62.2% use of electricity from renewable_source...
1,"at present, the energy expenditure has been re..."
2,170 business leaders call on eu decision-maker...
3,taking the tolerance values of the measuring s...
4,"with the help of energy management, the divisi..."
...,...
121,a forest plantation managed by new forests in ...
122,"waste_reduction, recycling business initiative..."
123,"waste_reduction: 71,670 tons/year sugar cane r..."
124,the amount of fossil-based electricity has bee...


In [3]:
# Discard rows with a missing sentence so only real text reaches the matcher.
data_df = data_df.loc[~data_df["Sentences"].isna()]
data_df.shape

(126, 1)

In [4]:
# Load the granular GRI KPI list; the matching below reads its "KPI" column.
kpi_df = pd.read_csv("GRI KPI list - Granular KPI list.csv")
kpi_df

Unnamed: 0,Description of KPI,KPI,ESG,category,gri_disclosure_sub_code
0,"Scale of the organization, including: Total nu...",total number of employees,Governance,102.07,ai
1,"Scale of the organization, including: total nu...",total number of operations,Governance,102.07,aii
2,"Scale of the organization, including: Scale of...",net sales (for private sector),Governance,102.07,aiii
3,,net revenues (for public sector),Governance,102.07,aiii
4,"Scale of the organization, including: quantity...",quantity of products,Governance,102.07,av
...,...,...,...,...,...
157,,percentage of investment agreements that inclu...,Social,412.03,a
158,Percentage of new suppliers that were screened...,Percentage of new suppliers that were screened...,Social,414.01,a
159,Number of suppliers assessed for social impacts,Number of suppliers assessed for social impacts,Social,414.02,a
160,Number of suppliers identified as having signi...,Number of suppliers having significant actual ...,Social,414.02,b


Using Fuzzy Matching Logic
--

In [5]:
# Similarity scorer applied to every (sentence, KPI) pair in the cells below.
# NOTE(review): fuzzywuzzy describes token_set_ratio as a word-order-insensitive
# token comparison returning an integer score — confirm it is the intended scorer.
metric = fuzz.token_set_ratio

In [6]:
# Double-bracket selection keeps each column two-dimensional, shape (n, 1),
# which is why the matcher reads an entry as arr[row][0].
kpi = kpi_df[["KPI"]].to_numpy()
sent = data_df[["Sentences"]].to_numpy()

In [7]:
# Parallel processing is used so the pairwise comparison keeps up when the
# number of sentences/KPIs grows.
def parallel_fuzzy_match(idxa, idxb):
    """Score one (sentence, KPI) pair.

    Parameters
    ----------
    idxa : int
        Row index into the module-level ``sent`` array.
    idxb : int
        Row index into the module-level ``kpi`` array.

    Returns
    -------
    list
        ``[sentence, kpi, score]`` where ``score`` is ``metric(sentence, kpi)``.
    """
    sentence = sent[idxa][0]
    kpi_name = kpi[idxb][0]
    return [sentence, kpi_name, metric(sentence, kpi_name)]


# One task per (sentence, KPI) pair, iterated sentence-major so the result list
# is grouped by sentence. The original line ended with a stray "\" before the
# closing parenthesis, which is a syntax error; it has been removed.
results = Parallel(n_jobs=-1, verbose=1)(
    delayed(parallel_fuzzy_match)(idx1, idx2)
    for idx1 in range(len(sent))
    for idx2 in range(len(kpi))
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 10256 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 20412 out of 20412 | elapsed:    3.4s finished


In [8]:
# Turn the flat list of [sentence, KPI, score] triples into a labelled table.
results = pd.DataFrame(
    data=results,
    columns=["Sentences", "KPI", "Token_Set_Ratio"],
)
results

Unnamed: 0,Sentences,KPI,Token_Set_Ratio
0,62.2% use of electricity from renewable_source...,total number of employees,48
1,62.2% use of electricity from renewable_source...,total number of operations,47
2,62.2% use of electricity from renewable_source...,net sales (for private sector),26
3,62.2% use of electricity from renewable_source...,net revenues (for public sector),27
4,62.2% use of electricity from renewable_source...,quantity of products,19
...,...,...,...
20407,"since 2015, use has been reduced by 26 percent...",percentage of investment agreements that inclu...,38
20408,"since 2015, use has been reduced by 26 percent...",Percentage of new suppliers that were screened...,36
20409,"since 2015, use has been reduced by 26 percent...",Number of suppliers assessed for social impacts,28
20410,"since 2015, use has been reduced by 26 percent...",Number of suppliers having significant actual ...,34


In [9]:
# Sanity check: one row per (sentence, KPI) pair, i.e. len(sent) * len(kpi) rows.
results.shape

(20412, 3)

In [12]:
# Keep, for every unique sentence, the single row with the highest Token Set Ratio.
# The sort must be stable: pandas' default quicksort may reorder rows that share
# the same score, which would make the KPI kept for a tied sentence arbitrary.
# With a stable sort, ties resolve to the KPI that comes first in the KPI list.
results = results.sort_values("Token_Set_Ratio", ascending=False, kind="mergesort")
results = results.drop_duplicates(subset=["Sentences"], keep="first")
# Restore the original ordering; the index is the row's position in the full
# pairwise table built above.
results = results.sort_index()
results.shape

(126, 3)

In [13]:
# Best-matching KPI per sentence; index values still refer to rows of the full pairwise table.
results

Unnamed: 0,Sentences,KPI,Token_Set_Ratio
71,62.2% use of electricity from renewable_source...,amount of energy consumption reduced,89
233,"at present, the energy expenditure has been re...",amount of energy consumption reduced,56
414,170 business leaders call on eu decision-maker...,Scope 1 GHG emissions of CO2 equivalent,58
578,taking the tolerance values of the measuring s...,co2 equivalent of biogenic co2 emissions,64
719,"with the help of energy management, the divisi...",amount of energy consumption reduced,74
...,...,...,...
19693,a forest plantation managed by new forests in ...,emissions in base year,48
19915,"waste_reduction, recycling business initiative...",Percentage of security personnel who have rece...,42
19943,"waste_reduction: 71,670 tons/year sugar cane r...",operation costs,75
20153,the amount of fossil-based electricity has bee...,total electricity sold,67


In [14]:
# Persist the best-match table.
# NOTE(review): index=False is not passed, so pandas' default also writes the row
# index as an extra leading column — confirm that is wanted.
results.to_csv("Fuzzy_Token Set Ratio_Granular_KPI_Mapping.csv")