### This notebook generates the initial dataset for the project 

- Processes 60 ESG reports (about 7,000 pages of PDF files)

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to the project-local modules used in
# this notebook take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Parse all esg reports to text using aws textract

import os
import random
from textract import extractTextFromPdf

# Directory holding the source PDF reports, relative to this notebook.
path = os.path.join('..', 'data', 'reports')

# Only the five reports listed below are processed in this run. The commented
# line is the alternative of taking every file found in `path`.
# files = os.listdir(path)
files5 = [
    'AuSol_Memoria_y_Estados_Financieros_2023.pdf',
    'Grupo MR 2022-2023.pdf',
    'Pampa-2022-Reporte-Sustentabilidad.pdf',
    'Telecom Argentina - Infografia_Memoria Anual Integrada 2022.pdf',
    'Central Puerto-Reporte-de-Sustentabilidad-2023-VF.pdf'
]

# Create the output directory up front so a missing folder is not discovered
# only after the extraction call has already been made.
textsDir = os.path.join('..', 'data', 'texts')
os.makedirs(textsDir, exist_ok=True)

for file in files5:
    outputFile = os.path.join(textsDir, file.replace('.pdf', '.txt'))

    # The existing output file acts as a cache: skip anything that is not a
    # PDF or whose text has already been extracted.
    if not file.endswith('.pdf') or os.path.exists(outputFile):
        continue

    # extract text from pdf
    print(f'\n[+] Extracting text from {file}')
    text = extractTextFromPdf(os.path.join(path, file))

    # Save the text to data/texts. Write to a temporary file and rename it
    # afterwards, so a failed write never leaves a partial file that the
    # cache check above would treat as finished. UTF-8 is set explicitly
    # because the platform default encoding may not represent every character.
    # NOTE(review): confirm downstream readers of these .txt files open them
    # as UTF-8 (or as bytes).
    tmpFile = outputFile + '.tmp'
    with open(tmpFile, 'w', encoding='utf-8') as f:
        f.write(text)
    os.replace(tmpFile, outputFile)

In [9]:
# Generate structured summaries 
from summary import getSummary
from getClient import getClient
import json

# Build the API client from the key file stored one level above the notebook.
# NOTE(review): presumably an OpenAI client, judging by the key file name;
# confirm in getClient.
client = getClient(os.path.join('..', 'openai_key.txt'))

# Create the output directory up front so a missing folder is not discovered
# only after the summary has already been generated.
summariesDir = os.path.join('..', 'data', 'summaries')
os.makedirs(summariesDir, exist_ok=True)

for file in files5:
    outputFile = os.path.join(summariesDir, file.replace('.pdf', '.json'))

    # The existing output file acts as a cache: skip anything that is not a
    # PDF or that already has a summary on disk.
    if not file.endswith('.pdf') or os.path.exists(outputFile):
        continue

    # Generate the structured summary from the previously extracted text.
    # NOTE(review): the third argument ("..") looks like the project root
    # expected by getSummary; confirm against its signature.
    print(f'\n[+] Generating summary for {file}')
    textFile = os.path.join('..', 'data', 'texts', file.replace('.pdf', '.txt'))
    summary = getSummary(textFile, client, "..")

    # Save the summary to data/summaries as a JSON file. Write to a temporary
    # file and rename it afterwards, so a serialization error cannot leave a
    # truncated JSON file that the cache check above would treat as finished.
    tmpFile = outputFile + '.tmp'
    with open(tmpFile, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=4)
    os.replace(tmpFile, outputFile)


[+] Generating summary for AuSol_Memoria_y_Estados_Financieros_2023.pdf
[+] ESG report parsed and uploaded successfully
[*] Generating summary...


In [11]:
# Generate guideline feedback
from guidelines import getGuidelineFeedback

# Create the output directory up front so a missing folder is not discovered
# only after the feedback has already been generated.
feedbackDir = os.path.join('..', 'data', 'feedbackGuideline')
os.makedirs(feedbackDir, exist_ok=True)

for file in files5:
    outputFile = os.path.join(feedbackDir, file.replace('.pdf', '.txt'))

    # The existing output file acts as a cache: skip anything that is not a
    # PDF or that already has guideline feedback on disk.
    if not file.endswith('.pdf') or os.path.exists(outputFile):
        continue

    # NOTE(review): the third argument ("..") looks like the project root
    # expected by getGuidelineFeedback; confirm against its signature.
    print(f'\n[+] Generating guideline feedback for {file}')
    textFile = os.path.join('..', 'data', 'texts', file.replace('.pdf', '.txt'))
    feedback = getGuidelineFeedback(textFile, client, "..")

    # Save the feedback to data/feedbackGuideline as a txt file. Write to a
    # temporary file and rename it afterwards, so a failed write never leaves
    # a partial file that the cache check above would treat as finished.
    # UTF-8 is set explicitly because the platform default encoding may not
    # represent every character.
    tmpFile = outputFile + '.tmp'
    with open(tmpFile, 'w', encoding='utf-8') as f:
        f.write(feedback)
    os.replace(tmpFile, outputFile)


[+] Generating guideline feedback for AuSol_Memoria_y_Estados_Financieros_2023.pdf
[+] ESG report uploaded successfully
[*] Generating feedback on guideline...

[+] Generating guideline feedback for Grupo MR 2022-2023.pdf
[+] ESG report uploaded successfully
[*] Generating feedback on guideline...

[+] Generating guideline feedback for Telecom Argentina - Infografia_Memoria Anual Integrada 2022.pdf
[+] ESG report uploaded successfully
[*] Generating feedback on guideline...

[+] Generating guideline feedback for Central Puerto-Reporte-de-Sustentabilidad-2023-VF.pdf
[+] ESG report uploaded successfully
[*] Generating feedback on guideline...


In [14]:
# Generate final conclusions
from conclusions import getConclusion

# Create the output directory up front so a missing folder is not discovered
# only after the conclusions have already been generated.
conclusionsDir = os.path.join('..', 'data', 'conclusions')
os.makedirs(conclusionsDir, exist_ok=True)

for file in files5:
    outputFile = os.path.join(conclusionsDir, file.replace('.pdf', '.txt'))

    # The existing output file acts as a cache: skip anything that is not a
    # PDF or that already has conclusions on disk.
    if not file.endswith('.pdf') or os.path.exists(outputFile):
        continue

    # NOTE(review): the third argument ("..") looks like the project root
    # expected by getConclusion; confirm against its signature.
    print(f'\n[+] Generating conclusions for {file}')
    textFile = os.path.join('..', 'data', 'texts', file.replace('.pdf', '.txt'))
    conclusions = getConclusion(textFile, client, "..")

    # Save the conclusions to data/conclusions as a txt file. Write to a
    # temporary file and rename it afterwards, so a failed write never leaves
    # a partial file that the cache check above would treat as finished.
    # UTF-8 is set explicitly because the platform default encoding may not
    # represent every character.
    tmpFile = outputFile + '.tmp'
    with open(tmpFile, 'w', encoding='utf-8') as f:
        f.write(conclusions)
    os.replace(tmpFile, outputFile)


[+] Generating conclusions for Telecom Argentina - Infografia_Memoria Anual Integrada 2022.pdf
[+] ESG report uploaded successfully
[*] Generating feedback on guideline...

[+] Generating conclusions for Central Puerto-Reporte-de-Sustentabilidad-2023-VF.pdf
[+] ESG report uploaded successfully
[*] Generating feedback on guideline...
