In [1]:
import pandas as pd
import numpy as np

from os import path, makedirs
from json import load

# Read file

In [2]:
# Input selector: True reads the single triples file stored under the
# 'clustered_t80' sub-folder; False reads the five per-set files (setA..setE).
clustered_triples = True

In [3]:
# Resolve which JSON files to read and the folder that contains them.
# Both variants live under the same 'genSRL' output directory; the clustered
# variant sits one level deeper.
genSRL_dir = path.join('..', 'outputs', 'genSRL')

if clustered_triples:
    folder_path = path.join(genSRL_dir, 'clustered_t80')
    file_names = ['triples_wizardLM_filtering.json']
else:
    folder_path = genSRL_dir
    file_names = [
        'triples_wizardLM_filtering_setA.json',
        'triples_wizardLM_filtering_setB.json',
        'triples_wizardLM_filtering_setC.json',
        'triples_wizardLM_filtering_setD.json',
        'triples_wizardLM_filtering_setE.json',
    ]

In [4]:
# Merge the content of every selected JSON file into a single mapping.
# NOTE: dict.update() replaces an existing entry, so if the same top-level key
# appears in more than one file only the last file's value is kept.
triple_data = dict()
for file_name in file_names:
    file_path = path.join(folder_path, file_name)
    # Decode explicitly as UTF-8 so that reading does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding='utf-8') as fp:
        triple_data.update(load(fp))

In [5]:
# Show how many top-level keys (companies) were loaded, then list them
# alphabetically with a 1-based position.
print("COMPANIES:", len(triple_data), '\n')
print('\n'.join(
    f'{position}) {name}'
    for position, name in enumerate(sorted(triple_data), start=1)
))

COMPANIES: 129 

1) 3M Corporation
2) 3i Group plc
3) Activision Blizzard Inc
4) Adecco Group AG
5) Adidas AG
6) Air Canada
7) Air Liquide SA
8) Airbus SE
9) Alcon Inc
10) Alibaba Group Holding Limited
11) Alphabet Inc
12) Aluminum Corporation of China Limited
13) Amazoncom Inc
14) American Electric Power Company Inc
15) Amplifon
16) Apple Inc
17) ArcelorMittal SA
18) Assicurazioni Generali SpA
19) AstraZeneca PLC
20) AstraZeneca plc
21) BPER Banca SpA
22) Baidu Inc
23) Banco Santander SA
24) Bank of America Corp BofA
25) Bayer AG
26) British American Tobacco PLC
27) British American Tobacco plc
28) British Land Co PLC The
29) Broadcom Inc
30) Builders FirstSource Inc
31) CF Industries Holdings Inc
32) Campbell Soup Company
33) Canadian Pacific Railway Limited
34) Canon Inc
35) CarMax Inc
36) China Evergrande Group
37) China Petroleum Chemical Corporation
38) Cisco Systems Inc
39) Coca Cola
40) Commonwealth Bank of Australia
41) Croda International plc
42) Daikin Industries Ltd
43) Del

In [6]:
# Flatten the {company: [triple, ...]} mapping into one record per triple,
# keeping only the ESG category, predicate and object of each triple.
raw_data = []
for companyName, triples in triple_data.items():
    for triple in triples:
        raw_data.append(dict(
            company=companyName,
            category=triple['esg_category'],
            predicate=triple['predicate'],
            object=triple['object'],
        ))

# One row per triple; columns: company, category, predicate, object.
df = pd.DataFrame(raw_data)
display(df)

Unnamed: 0,company,category,predicate,object
0,3M Corporation,Access to Basic Services,Availability of,Education
1,3M Corporation,Access to Basic Services,Continues to provide,Customer education
2,3M Corporation,Accessibility,Approaching,With the understanding that our customers and ...
3,3M Corporation,Air Emissions,Attributed to,"Coating lines, reactors and mixing"
4,3M Corporation,Air Emissions,Contribution to,Unhealthy levels of ozone
...,...,...,...,...
54866,adidas AG,Supply Chain,Percentage of,Raw materials third-party certified to an envi...
54867,adidas AG,Supply Chain,Working with,520 independent supplier facilities (Tier 1)
54868,adidas AG,Water,Setting a goal,80% of applicable suppliers
54869,adidas AG,Workplace Practices,Requires compliance with,AR 2020 - Working Conditions in our Supply Cha...


In [7]:
# Reference triples that the generated triples are compared against below
# (presumably the examples shown in the generation prompt -- confirm).
# Each entry is expanded into a dict with the keys category / predicate / object.
prompt_triples = [
    dict(zip(("category", "predicate", "object"), values))
    for values in (
        ("Water", "Reduction by", "20% in the data center's potable water usage"),
        ("Water", "Decrease of", "The data center's water withdrawal from 3.874.000 litres to 2.367.000 litres"),
        ("Labor Practices", "Introduction of", "An innovative program"),
        ("Employee Development", "Completion of", "Ethics training"),
    )
]

In [8]:
# For every triple field, measure how often a generated value is an exact
# (case- and surrounding-whitespace-insensitive) copy of one of the values in
# `prompt_triples`, both over the whole frame and per company.

# Number of triples per company. The 'company' column is not modified inside
# the loop, so this is computed once.
total = df['company'].value_counts()

for att in ['category', 'predicate', 'object']:
    # Normalise the column once; all comparisons below are then plain equality.
    df[att] = df[att].str.lower().str.strip()

    prompt_strings = {triple[att].lower().strip() for triple in prompt_triples}

    # Boolean mask of rows whose value equals one of the prompt strings.
    # Missing values simply do not match instead of raising.
    is_prompt_copy = df[att].isin(prompt_strings)

    # Count the matches directly; rebuilding the count from summed float
    # fractions and truncating with int() can be off by one.
    num_prompt_triples = int(is_prompt_copy.sum())

    # Share of all rows taken by each prompt string that actually occurs.
    copied_values = df.loc[is_prompt_copy, att]
    check_triples = copied_values.groupby(copied_values).count() / len(df)

    print(f"Prompt {att}s ({len(prompt_strings)}): {prompt_strings}")
    print(f"Prompt triples: {num_prompt_triples} / {len(df)} ({num_prompt_triples / len(df) * 100:.2f}%)")
    display(check_triples.sort_values(ascending = False).round(4).to_frame())

    # Per company: number of copied values, total triples, and the percentage.
    counter = df.loc[is_prompt_copy, 'company'].value_counts()
    normalized_counter = counter / total

    # Scale to percent before rounding so the displayed values carry no
    # floating-point noise; dropna() removes companies without any match.
    validity_df = pd.concat([counter, total, (normalized_counter * 100).round(2)], axis=1, keys=['count', 'total', 'normalized (%)']).dropna()
    validity_df = validity_df.sort_values(by = ['count', 'normalized (%)'], ascending=False).astype({'count': int})

    display(validity_df)
    

#df['action'] = df['category'].str.lower().str.strip() + ' ' + df['predicate'].str.lower().str.strip() + ' ' + df['object'].str.lower().str.strip()

Prompt categorys (3): {'employee development', 'water', 'labor practices'}
Prompt triples: 3008 / 54871 (5.48%)


Unnamed: 0_level_0,category
category,Unnamed: 1_level_1
water,0.0326
employee development,0.0223


Unnamed: 0_level_0,count,total,normalized (%)
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Coca Cola,61,443,13.77
Microsoft Corporation,55,479,11.48
DuPont,49,624,7.85
Sony Corporation,48,806,5.96
Campbell Soup Company,47,493,9.53
...,...,...,...
National Grid PLC,3,38,7.89
adidas AG,1,31,3.23
Alphabet Inc,1,40,2.50
Netflix Inc,1,197,0.51


Prompt predicates (4): {'completion of', 'decrease of', 'introduction of', 'reduction by'}
Prompt triples: 1237 / 54871 (2.25%)


Unnamed: 0_level_0,predicate
predicate,Unnamed: 1_level_1
introduction of,0.0158
completion of,0.006
decrease of,0.0007


Unnamed: 0_level_0,count,total,normalized (%)
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Samsung Electronics Co Ltd,29,711,4.08
ENI SpA,27,514,5.25
Amazoncom Inc,23,621,3.70
Telecom Italia SpA,22,750,2.93
Enel SpA,21,922,2.28
...,...,...,...
Novo Nordisk A S,1,48,2.08
PepsiCo Inc,1,59,1.69
Tesco PLC,1,61,1.64
Iveco Group NV,1,144,0.69


Prompt objects (4): {"20% in the data center's potable water usage", "the data center's water withdrawal from 3.874.000 litres to 2.367.000 litres", 'an innovative program', 'ethics training'}
Prompt triples: 405 / 54871 (0.74%)


Unnamed: 0_level_0,object
object,Unnamed: 1_level_1
ethics training,0.0035
an innovative program,0.0031
20% in the data center's potable water usage,0.0008
the data center's water withdrawal from 3.874.000 litres to 2.367.000 litres,0.0


Unnamed: 0_level_0,count,total,normalized (%)
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Samsung Electronics Co Ltd,14,711,1.97
British American Tobacco plc,11,742,1.48
General Motors Co GM,10,341,2.93
Telecom Italia SpA,10,750,1.33
Walt Disney Co,9,401,2.24
...,...,...,...
Coca Cola,1,443,0.23
NVIDIA Corp,1,468,0.21
Saudi Aramco,1,487,0.21
Cisco Systems Inc,1,509,0.20
