# Using Fuzzy Matching with Generic KPI

In [1]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

Reading files
--

In [2]:
# Load the sentences to be matched from clean_sentences.csv and display them.
data_df = pd.read_csv(filepath_or_buffer="clean_sentences.csv")
data_df

Unnamed: 0,Sentences
0,62.2% use of electricity from renewable_source...
1,"at present, the energy expenditure has been re..."
2,170 business leaders call on eu decision-maker...
3,taking the tolerance values of the measuring s...
4,"with the help of energy management, the divisi..."
...,...
121,a forest plantation managed by new forests in ...
122,"waste_reduction, recycling business initiative..."
123,"waste_reduction: 71,670 tons/year sugar cane r..."
124,the amount of fossil-based electricity has bee...


In [3]:
# Discard rows whose 'Sentences' value is missing, then show the remaining shape.
data_df = data_df.dropna(subset=["Sentences"])
data_df.shape

(126, 1)

In [4]:
# Load the generic KPI list (the 'KPI' column is used for matching below).
kpi_df = pd.read_csv(filepath_or_buffer="GRI KPI list - Generic KPI List.csv")
kpi_df

Unnamed: 0,KPI,ESG category
0,employees,Governance
1,operations,Governance
2,net sales,Governance
3,net revenues,Governance
4,products,Governance
5,services,Governance
6,critical concerns,Governance
7,compensation,Governance
8,operation costs,Economic
9,employee wages,Economic


Using Fuzzy Matching Logic
--

In [5]:
# Scoring function applied to every (sentence, KPI) pair below; swap this
# assignment to try a different fuzzywuzzy scorer without touching the rest.
metric = fuzz.token_set_ratio

In [6]:
# Single-column frames become 2-D arrays with one column, so an individual
# string is read as arr[i][0] further down.
sent = data_df[["Sentences"]].to_numpy(copy=True)
kpi = kpi_df[["KPI"]].to_numpy(copy=True)

In [7]:
# Parallel processing is used so the pairwise scoring still works when we get lots of data.
def parallel_fuzzy_match(idxa,idxb):
    """Score one sentence/KPI pair.

    idxa indexes a row of the module-level `sent` array and idxb a row of
    the module-level `kpi` array. Returns a three-element list:
    [sentence, kpi, metric(sentence, kpi)].
    """
    sentence = sent[idxa][0]
    kpi_name = kpi[idxb][0]
    return [sentence, kpi_name, metric(sentence, kpi_name)]

# One task per (sentence, KPI) combination: sentences form the outer loop and
# KPIs the inner loop.
results = Parallel(n_jobs=-1, verbose=1)(
    delayed(parallel_fuzzy_match)(idx1, idx2)
    for idx1 in range(len(sent))
    for idx2 in range(len(kpi))
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 6224 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 6804 out of 6804 | elapsed:    2.9s finished


In [8]:
# Turn the [sentence, kpi, score] records into a labelled table and display it.
results = pd.DataFrame(
    data=results,
    columns=["Sentences", "KPI", "Token_Set_Ratio"],
)
results

Unnamed: 0,Sentences,KPI,Token_Set_Ratio
0,62.2% use of electricity from renewable_source...,employees,11
1,62.2% use of electricity from renewable_source...,operations,12
2,62.2% use of electricity from renewable_source...,net sales,12
3,62.2% use of electricity from renewable_source...,net revenues,15
4,62.2% use of electricity from renewable_source...,products,11
...,...,...,...
6799,"since 2015, use has been reduced by 26 percent...",incidents of discrimination,22
6800,"since 2015, use has been reduced by 26 percent...",security personnel,18
6801,"since 2015, use has been reduced by 26 percent...",human rights policies,19
6802,"since 2015, use has been reduced by 26 percent...",incidents of violations,20


In [12]:
# Keep, for every unique sentence, the single row with the highest Token Set Ratio.
#
# The sort must be stable: sort_values defaults to quicksort, which may reorder
# rows that share the same score. Which KPI then survives drop_duplicates for a
# sentence with tied top scores would be arbitrary. With mergesort, ties are
# resolved in favour of the row that came first in `results`.
results = results.sort_values("Token_Set_Ratio", ascending=False, kind="mergesort")
results = results.drop_duplicates(subset=["Sentences"], keep="first")
# Put the surviving rows back in their original order.
results = results.sort_index()
results.shape

(126, 3)

In [13]:
# Display the highest-scoring KPI kept for each sentence.
results

Unnamed: 0,Sentences,KPI,Token_Set_Ratio
32,62.2% use of electricity from renewable_source...,fuel consumption,100
87,"at present, the energy expenditure has been re...",energy consumption,50
147,170 business leaders call on eu decision-maker...,GHG emissions,100
173,taking the tolerance values of the measuring s...,payments to providers of capital,27
249,"with the help of energy management, the divisi...",energy consumption,100
...,...,...,...
6545,a forest plantation managed by new forests in ...,payments to providers of capital,27
6610,"waste_reduction, recycling business initiative...",business partners,64
6650,"waste_reduction: 71,670 tons/year sugar cane r...",operation costs,75
6708,the amount of fossil-based electricity has bee...,payments to government by country,37


In [15]:
# Persist the sentence -> KPI mapping; the DataFrame index is written as the
# first CSV column because `index` is left at its default.
results.to_csv(path_or_buf="Fuzzy_Token Set Ratio_Generic_KPI_Mapping.csv")