In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from tika import parser
import re
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")

model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

# Create the pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)


Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [2]:
# Create a Class to parse PDF
class PDFParser:
    def __init__(self, file_path):
        self.file_path = file_path
        self.raw = parser.from_file(self.file_path)
        self.text = self.raw['content']

    def get_text(self):
        return self.text

    def get_text_clean(self):
        text = self.text
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def get_text_clean_list(self):
        text = self.get_text_clean()
        text_list = text.split('.')
        return text_list

In [3]:
# Get report from responsibilityreports.com
mcdonalds_url = "https://www.responsibilityreports.com/Click/2534"
pp = PDFParser(mcdonalds_url)
sentences = pp.get_text_clean_list()

print(f"The McDonalds CSR report has {len(sentences):,d} sentences")


2023-06-04 12:59:26,473 [MainThread  ] [INFO ]  Retrieving https://www.responsibilityreports.com/Click/2534 to /tmp/click-2534.


The McDonalds CSR report has 598 sentences


In [4]:
sentences

[' 2021 – 2022 Purpose & Impact Global Progress Summary McDonald’s Corporation This past year showed us what makes McDonald’s unique, once again',
 ' Guided by our core values, we’ve experienced first-hand how our focused actions – both big and small – can translate into meaningful experiences for our customers, bringing our purpose to feed and foster community to life each day',
 ' With the strength of our full System, we’ve worked together to build a more diverse, equitable and inclusive business, source more food responsibly, adopt more sustainable practices, and implement innovative and credible solutions in our ongoing quest to be a good neighbor in the communities where we live, work and serve',
 ' We are proud of the work we do to make a difference – and will continue to help uphold this promise in all of the communities in which we operate',
 ' Showing Up for Our Communities Ray Kroc used to say, “None of us is as good as all of us” – a phrase that serves as a constant reminder

In [10]:
result = classifier(sentences)
df = pd.DataFrame(result)

In [30]:
df.groupby(['label']).mean().sort_values('score', ascending = False)

Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
Waste_And_Hazardous_Materials_Management,0.979885
Critical_Incident_Risk_Management,0.915521
Customer_Privacy,0.890653
Water_And_Wastewater_Management,0.868754
GHG_Emissions,0.808698
Supply_Chain_Management,0.800456
Employee_Engagement_Inclusion_And_Diversity,0.771948
Human_Rights_And_Community_Relations,0.752327
Product_Quality_And_Safety,0.746792
Ecological_Impacts,0.718146


In [31]:
# We can also convert the workflow above into a function and can easily compare the scores with other companies'
def run_classifier(url):
    pp = PDFParser(url)
    sentences = pp.get_text_clean_list()
    print(f"The CSR report has {len(sentences):,d} sentences")
    result = classifier(sentences)
    df = pd.DataFrame(result)
    return(df)

In [37]:
# Let's try to look at Amazon
amzn = run_classifier("https://www.responsibilityreports.com/Click/2015")

2023-06-04 13:32:10,054 [MainThread  ] [INFO ]  Retrieving https://www.responsibilityreports.com/Click/2015 to /tmp/click-2015.


The CSR report has 3,006 sentences


In [38]:
amzn.groupby(['label']).mean().sort_values('score', ascending = False)

Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
Water_And_Wastewater_Management,0.944439
Waste_And_Hazardous_Materials_Management,0.822303
Supply_Chain_Management,0.82229
Energy_Management,0.790863
Physical_Impacts_Of_Climate_Change,0.787895
Business_Ethics,0.744366
Labor_Practices,0.738375
Employee_Engagement_Inclusion_And_Diversity,0.730735
Human_Rights_And_Community_Relations,0.726501
Ecological_Impacts,0.67996


In [39]:
# Let's look at another company from a different sector - Newmont Mining
nm = run_classifier("https://www.responsibilityreports.com/Click/1772")


2023-06-04 13:34:56,616 [MainThread  ] [INFO ]  Retrieving https://www.responsibilityreports.com/Click/1772 to /tmp/click-1772.


The CSR report has 12,897 sentences


In [40]:
nm.groupby(['label']).mean().sort_values('score', ascending = False)

Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
Water_And_Wastewater_Management,0.915708
Air_Quality,0.816752
Employee_Health_And_Safety,0.780301
Physical_Impacts_Of_Climate_Change,0.775978
Human_Rights_And_Community_Relations,0.773394
Supply_Chain_Management,0.709297
Ecological_Impacts,0.699983
GHG_Emissions,0.693242
Labor_Practices,0.644533
Waste_And_Hazardous_Materials_Management,0.636957
