In [None]:
import os
import openai
import sys
import pandas as pd
# NOTE(review): absolute, machine-specific Windows path. Presumably added so the
# local `utils` package imported below resolves -- confirm, and prefer a path
# derived from the notebook location so the notebook runs on other machines.
sys.path.append('d:\\VScodeProject\\ESG-GPT\\notebooks\\')
from dotenv import load_dotenv
from utils import utils
from openai import OpenAI
# Load environment variables via python-dotenv (presumably from a local .env file).
load_dotenv()
# Set the module-level OpenAI API key from the OPENAI_API_KEY environment
# variable; os.environ.get returns None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Which report set to process: 'american' or 'taiwan'.
NATION = 'american'
# Root directory of the source CSV files.
CSV_SOURCE_PATH = '../../data/csv_source'
# Root directory where the labelled CSV files are written.
CSV_OUTPUT_PATH = '../../data/csv_gpt_label'
# Maximum number of tokens per completion. Only a short list of numeric labels
# such as [3, 4, 5] is expected, so 10 tokens is enough, and the cap limits the
# cost if something goes wrong.
MAX_TOKENS = 10

In [None]:
# List the files to label for the configured nation. NATION is used instead of
# a hard-coded 'american' literal so that changing the configuration changes
# both the file listing and the paths the labeller reads from / writes to.
file_list = utils.get_all_file_list(NATION)

In [None]:
class OpenAILabel():
    """Label paragraphs of ESG reports with topic ids via an OpenAI chat model.

    For every entry of ``file_list`` the matching CSV under
    ``{csv_source_path}/{nation}/`` is read, each value of its ``paragraph``
    column is sent to the chat-completions endpoint, and the raw model replies
    are written next to the paragraphs under ``{csv_output_path}/{nation}/``.
    """

    def __init__(self, file_list: list[str], nation: str, csv_source_path: str, csv_output_path: str, max_tokens: int, model: str = "gpt-3.5-turbo") -> None:
        """Store the configuration and create the OpenAI client.

        Args:
            file_list: File names to process (a trailing ``.pdf`` is removed
                to obtain the CSV base name).
            nation: Sub-folder name used under both CSV roots.
            csv_source_path: Root directory of the input CSV files.
            csv_output_path: Root directory of the labelled CSV files.
            max_tokens: ``max_tokens`` passed to every completion request.
            model: Chat model name; defaults to the previously hard-coded one.
        """
        self.client = OpenAI()
        self.file_list = file_list
        self.NATION = nation
        self.CSV_SOURCE_PATH = csv_source_path
        self.CSV_OUTPUT_PATH = csv_output_path
        self.MAX_TOKENS = max_tokens
        self.MODEL = model

    @staticmethod
    def _generate_prompt(paragraph: str):
        """Build the chat message list for one paragraph.

        The prompt text is kept exactly as it was (including its spacing),
        because any change to it changes what the model is asked.
        """
        PROMPT = [
            {"role": "system", "content": "The assistant is a ESG-investing field advisor,choose at most top three related keys of the paragraph. Return the corresponding values each range from 0 to 26 using LDA_table dictionary below."\
            "LDA_table dictionary below constructed by {'key': value},return only the value without key."
            "Do not return the key of the LDA_table dictionary."
            "LDA_table dictionary:\
            {'Business ethics': 0, 'Data security': 1, 'Access_And_Affordability': 2, 'Business_Model_Resilience': 3, 'Competitive_Behavior': 4,\
            'Critical_Incident_Risk_Management': 5, 'Customer_Welfare': 6, 'Director_Removal': 7, 'Employee_Engagement_Inclusion_And_Diversity': 8,\
            'Employee_Health_And_Safety': 9, 'Human_Rights_And_Community_Relations': 10, 'Labor_Practices': 11, 'Management_Of_Legal_And_Regulatory_Framework': 12,\
            'Physical_Impacts_Of_Climate_Change': 13, 'Product_Quality_And_Safety': 14, 'Product_Design_And_Lifecycle_Management': 15,\
            'Selling_Practices_And_Product_Labeling': 16, 'Supply_Chain_Management': 17, 'Systemic_Risk_Management': 18,\
            'Waste_And_Hazardous_Materials_Management': 19, 'Water_And_Wastewater_Management': 20, 'Air_Quality': 21,\
            'Customer_Privacy': 22, 'Ecological_Impacts': 23, 'Energy_Management': 24, 'GHG_Emissions': 25, 'No_sufficient_information': 26}"},
            {"role": "user", "content": f"paragraph: {paragraph}"},
            {"role": "assistant", "content": "[value]"},
        ]
        return PROMPT

    @staticmethod
    def _strip_pdf_suffix(file: str) -> str:
        """Return ``file`` without a trailing ``.pdf``.

        ``str.strip('.pdf')`` must not be used here: it removes any of the
        characters '.', 'p', 'd', 'f' from BOTH ends, so 'ford.pdf' would
        become 'or'.
        """
        suffix = '.pdf'
        return file[:-len(suffix)] if file.endswith(suffix) else file

    def _request_completion(self, paragraph: str):
        """Send one paragraph to the chat-completions endpoint and return the response."""
        return self.client.chat.completions.create(
            model=self.MODEL,
            messages=self._generate_prompt(paragraph),
            max_tokens=self.MAX_TOKENS,
        )

    def loop_folder(self):
        """Label every file in ``self.file_list`` and write one output CSV per file.

        A failure on one file is printed and the loop moves on to the next
        file (best effort), so one bad input does not abort the whole run.
        """
        for file in self.file_list:
            print(file)
            try:
                file_name = self._strip_pdf_suffix(file)
                df = pd.read_csv(f'{self.CSV_SOURCE_PATH}/{self.NATION}/{file_name}.csv')
                df = df['paragraph']
                label, broken = self.get_label(df)
                label = self.fill_fail_sentence(label, broken, df)
                self.gen_output_csv(df, label, file_name)
            except Exception as e:
                print(f'Error on {file}', e)

    def get_label(self, df: pd.Series) -> tuple[list, list]:
        """Request a label for every paragraph in ``df``.

        Args:
            df: The paragraphs to label, accessed by position.

        Returns:
            ``(label, broken)``: ``label`` holds the replies of the successful
            requests in paragraph order (failed positions are simply absent);
            ``broken`` holds the positions whose request raised.
        """
        label = []
        broken = []
        halfway = len(df) // 2
        for i in range(len(df)):
            # Positional access: works whatever index the Series carries.
            paragraph = df.iloc[i]
            if i == halfway:
                print(i)  # coarse progress marker at the midpoint
            try:
                completion = self._request_completion(paragraph)
            except Exception as e:
                broken.append(i)
                print(i, e)
            else:
                label.append(completion.choices[0].message.content)
        return label, broken

    def fill_fail_sentence(self, label: list, broken: list, df: pd.Series, max_retries=None):
        """Retry the failed paragraphs and merge their labels into ``label``.

        Recovered labels are inserted in ascending position order only after
        all retries finished. Inserting them as they arrive would misplace
        labels whenever an earlier position fails again and a later one
        succeeds first, because ``label`` is still missing the earlier entry.

        Args:
            label: Labels of the successful requests, as returned by
                ``get_label``; modified in place.
            broken: Positions that failed, as returned by ``get_label``. On
                return it contains only the positions that were given up on.
            df: The paragraphs, accessed by position.
            max_retries: Maximum failed retry attempts per paragraph. ``None``
                (default) retries until the request succeeds, as before. When
                the limit is hit the position gets the label ``None`` so the
                labels stay aligned with the paragraphs.

        Returns:
            The same ``label`` list, now with one entry per paragraph.
        """
        pending = list(broken)
        recovered = {}
        failures = {}
        given_up = []
        while pending:
            broke_idx = pending.pop(0)
            paragraph = df.iloc[broke_idx]
            try:
                completion = self._request_completion(paragraph)
            except Exception as e:
                print(e)
                failures[broke_idx] = failures.get(broke_idx, 0) + 1
                if max_retries is not None and failures[broke_idx] >= max_retries:
                    print(f'Giving up on paragraph {broke_idx} after {max_retries} retries')
                    recovered[broke_idx] = None
                    given_up.append(broke_idx)
                else:
                    # Re-queue at the end so other paragraphs get their turn first.
                    pending.append(broke_idx)
            else:
                content = completion.choices[0].message.content
                recovered[broke_idx] = content
                print(broke_idx, content)
        # Ascending order guarantees every insert position equals the
        # paragraph position: all smaller positions are already present.
        for idx in sorted(recovered):
            label.insert(idx, recovered[idx])
        broken[:] = given_up
        return label

    def gen_output_csv(self, df: pd.Series, label: list, file_name: str):
        """Write paragraphs and labels side by side to ``{file_name}.csv``.

        The output folder is created when missing, so a finished labelling
        run is not lost to a missing-directory error at the very end.
        """
        output_dir = f'{self.CSV_OUTPUT_PATH}/{self.NATION}'
        os.makedirs(output_dir, exist_ok=True)
        ans = pd.DataFrame({'paragraph': df, 'label': label})
        ans.to_csv(f'{output_dir}/{file_name}.csv', index=False)

In [None]:
# Build the labeller from the notebook configuration and process every listed file.
openai_label = OpenAILabel(
    file_list=file_list,
    nation=NATION,
    csv_source_path=CSV_SOURCE_PATH,
    csv_output_path=CSV_OUTPUT_PATH,
    max_tokens=MAX_TOKENS,
)
openai_label.loop_folder()