# Notebook - Data Augmentation

In [1]:
from openai import OpenAI
import pandas as pd

client = OpenAI()

In [3]:
data_df = pd.read_parquet("../dataset/train_df_all_english.parquet")

In [4]:
data_df.groupby("impact_length_idx").head(5).sort_values("impact_length_idx")

Unnamed: 0,url,Translation,impact_length_idx,language
4,https://www.esgtoday.com/eu-regulators-welcome...,"EU Regulators Welcome, Critique New European S...",0.0,English
16,https://www.esgtoday.com/eni-doubles-sustainab...,Eni Doubles Sustainability-Linked Bond Offerin...,0.0,English
19,https://www.esgtoday.com/bank-of-america-ceo-s...,Bank of America CEO: Sustainable Finance Deman...,0.0,English
21,https://www.esgtoday.com/republican-politician...,"Republican Politicians Target Proxy Firms ISS,...",0.0,English
22,https://www.esgtoday.com/guest-post-esg-isnt-a...,Guest Post: ESG isn’t About Altruism – it’s Ab...,0.0,English
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,1.0,English
1,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,1.0,English
2,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,1.0,English
7,https://www.esgtoday.com/red-states-sue-to-sto...,Red States Sue to Stop Biden Administration Ru...,1.0,English
11,https://www.esgtoday.com/methane-emissions-det...,Methane Emissions Detection Platform Kuva Rais...,1.0,English


In [13]:
data_augmentation_dict = {
    "less than 2 years": [data_df.loc[4]["Translation"], data_df.loc[19]["Translation"]],
    "between 2 and 5 years": [data_df.loc[7]["Translation"], data_df.loc[11]["Translation"]],
    "more than 5 years": [data_df.loc[6]["Translation"], data_df.loc[8]["Translation"]]
}

In [60]:
data_augmentation_messages = []

repeat_n = 2 # call api 2 more times

for impact_duration, text_list in data_augmentation_dict.items():

    template = f"""Give 10 examples of real news related to ESG (Environmental, Social, Governance) that has an impact duration {impact_duration}. Each example should have a news title and news summary, connected with "||". Generate these examples in english language. Two example news:

    "{text_list[0]}"
    
    "{text_list[1]}"
    
    The output should only contain a numbered list of news.
    
    """
    
    conversation = [
            {"role": "system", "content": "You are an ESG analyst, skilled assessing the level and duration an event in the news article might have on the company"},
            {"role": "user", "content": template}
        ]
    
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation
    )
    
    message = completion.choices[0].message.content
    
    print(message)
    
    data_augmentation_messages.append(message)
    
    for i in range(repeat_n):
        assitant_message = {"role": "assistant", "content": message}
        new_user_message = {"role": "user", "content": "Give me 10 more real examples."}
        
        conversation.append(assitant_message)
        conversation.append(new_user_message)
        
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=conversation
        )
        
        message = completion.choices[0].message.content
    
        print(message)
    
        data_augmentation_messages.append(message)


1. "Nestle Accelerates Net Zero Pledge with $3.6 Billion Sustainability Investment || The Swiss company announced it will spend around $3.6 billion over the next five years on efforts that support its aim to zero out its greenhouse gas emissions by 2050, demonstrating an increased focus on operational efficiency and climate-friendly product innovation."

2. "Salesforce’s Work.Com Revamps for Post-Pandemic Work || Salesforce is debuting employee management and productivity tools for a post-pandemic working environment. Their platform, Work.Com, aims to incorporate wellness checks, emergency response management, contact tracing, vaccination verification, and employee training."

3. "Amazon Faces Product Liability Lawsuits After Hoverboard Fires || A US appeals court ruled that Amazon can be held liable for defective goods sold on its marketplace. The decision follows several lawsuits against Amazon from users whose hoverboards caught fire. Amazon has since removed hoverboards from its pl

In [62]:
augmented_dataframes = []

for idx, response in enumerate(data_augmentation_messages):
    augmented_news_list = response.split("\n\n")
    augmented_news_list = [i.split(". ", maxsplit=1)[1] for i in augmented_news_list]
    
    augmented_df = pd.DataFrame(augmented_news_list, columns=["sentence"])
    augmented_df["impact_length_idx"] = idx // 3
    
    augmented_dataframes.append(augmented_df)
    
augmented_df = pd.concat(augmented_dataframes).reset_index(drop=True)

In [63]:
augmented_df["url"] = "ChatGPT augmented with Bing Search"
augmented_df[["title", "content"]] = augmented_df["sentence"].str.split(" \|\| ", expand=True)
augmented_df["language"] = "English"

In [64]:
augmented_df

Unnamed: 0,sentence,impact_length_idx,url,title,content,language
0,"""Nestle Accelerates Net Zero Pledge with $3.6 ...",0,ChatGPT augmented with Bing Search,"""Nestle Accelerates Net Zero Pledge with $3.6 ...",The Swiss company announced it will spend arou...,English
1,"""Salesforce’s Work.Com Revamps for Post-Pandem...",0,ChatGPT augmented with Bing Search,"""Salesforce’s Work.Com Revamps for Post-Pandem...",Salesforce is debuting employee management and...,English
2,"""Amazon Faces Product Liability Lawsuits After...",0,ChatGPT augmented with Bing Search,"""Amazon Faces Product Liability Lawsuits After...",A US appeals court ruled that Amazon can be he...,English
3,"""Future Super Fills Multibillion-Dollar Green ...",0,ChatGPT augmented with Bing Search,"""Future Super Fills Multibillion-Dollar Green ...","Australian financial services provider, Future...",English
4,"""LinkedIn Invests in New Racial Justice Fund |...",0,ChatGPT augmented with Bing Search,"""LinkedIn Invests in New Racial Justice Fund",Microsoft-owned LinkedIn announced a new racia...,English
...,...,...,...,...,...,...
85,"""Uber Pledges to be Fully Electric by 2040 || ...",2,ChatGPT augmented with Bing Search,"""Uber Pledges to be Fully Electric by 2040",Ridesharing company Uber pledges to make its e...,English
86,"""Kellogg Company Targets 50% Reduction in Glob...",2,ChatGPT augmented with Bing Search,"""Kellogg Company Targets 50% Reduction in Glob...",Kellogg Company has set a goal to reduce green...,English
87,"""Tata Motors Aims to be Carbon Neutral by 2039...",2,ChatGPT augmented with Bing Search,"""Tata Motors Aims to be Carbon Neutral by 2039","Tata Motors, one of India's largest auto manuf...",English
88,"""Starbucks Commits to Achieve Resource Positiv...",2,ChatGPT augmented with Bing Search,"""Starbucks Commits to Achieve Resource Positiv...",,English


In [65]:
augmented_df.to_parquet("../dataset/train_df_English_augmented.parquet")

# Use the Augmented df and relabel to ensure data quality

In [3]:
augmented_df = pd.read_parquet("../dataset/train_df_English_augmented.parquet")
 

In [5]:
augmented_df.head()

Unnamed: 0,sentence,impact_length_idx,url,title,content,language
0,"""Nestle Accelerates Net Zero Pledge with $3.6 ...",0,ChatGPT augmented with Bing Search,"""Nestle Accelerates Net Zero Pledge with $3.6 ...",The Swiss company announced it will spend arou...,English
1,"""Salesforce’s Work.Com Revamps for Post-Pandem...",0,ChatGPT augmented with Bing Search,"""Salesforce’s Work.Com Revamps for Post-Pandem...",Salesforce is debuting employee management and...,English
2,"""Amazon Faces Product Liability Lawsuits After...",0,ChatGPT augmented with Bing Search,"""Amazon Faces Product Liability Lawsuits After...",A US appeals court ruled that Amazon can be he...,English
3,"""Future Super Fills Multibillion-Dollar Green ...",0,ChatGPT augmented with Bing Search,"""Future Super Fills Multibillion-Dollar Green ...","Australian financial services provider, Future...",English
4,"""LinkedIn Invests in New Racial Justice Fund |...",0,ChatGPT augmented with Bing Search,"""LinkedIn Invests in New Racial Justice Fund",Microsoft-owned LinkedIn announced a new racia...,English


In [33]:
def generate_gpt_label(sentence):
        
    template = f"""Given the following news, output 0 if the ESG impact duration is below 2 years, 1 if the ESG impact duration is between 2 and 5 year and 2 if the ESG impact duration is more than 5 years. You only need to output the number, and do not need any further explanation.
    
    {sentence}"""
    
    conversation = [
            {"role": "system", "content": "You are an ESG analyst, skilled assessing the level and duration an event in the news article might have on the company"},
            {"role": "user", "content": template}
        ]
    
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation,
        temperature=0.7
    )
    
    message = completion.choices[0].message.content
    
    return int(message)


augmented_df["gpt_label"] = augmented_df["sentence"].apply(generate_gpt_label)

In [34]:
augmented_df.groupby("gpt_label")["gpt_label"].count()

gpt_label
0     3
1    26
2    61
Name: gpt_label, dtype: int64

In [41]:
len(augmented_df[augmented_df["gpt_label"] == augmented_df["impact_length_idx"]])

38

In [43]:
def word_count(sentence):
    if not None:
        return len(sentence.split( ))
    return 0

In [45]:
augmented_df

Unnamed: 0,sentence,impact_length_idx,url,title,content,language,gpt_label
0,"""Nestle Accelerates Net Zero Pledge with $3.6 ...",0,ChatGPT augmented with Bing Search,"""Nestle Accelerates Net Zero Pledge with $3.6 ...",The Swiss company announced it will spend arou...,English,1
1,"""Salesforce’s Work.Com Revamps for Post-Pandem...",0,ChatGPT augmented with Bing Search,"""Salesforce’s Work.Com Revamps for Post-Pandem...",Salesforce is debuting employee management and...,English,1
2,"""Amazon Faces Product Liability Lawsuits After...",0,ChatGPT augmented with Bing Search,"""Amazon Faces Product Liability Lawsuits After...",A US appeals court ruled that Amazon can be he...,English,1
3,"""Future Super Fills Multibillion-Dollar Green ...",0,ChatGPT augmented with Bing Search,"""Future Super Fills Multibillion-Dollar Green ...","Australian financial services provider, Future...",English,1
4,"""LinkedIn Invests in New Racial Justice Fund |...",0,ChatGPT augmented with Bing Search,"""LinkedIn Invests in New Racial Justice Fund",Microsoft-owned LinkedIn announced a new racia...,English,1
...,...,...,...,...,...,...,...
85,"""Uber Pledges to be Fully Electric by 2040 || ...",2,ChatGPT augmented with Bing Search,"""Uber Pledges to be Fully Electric by 2040",Ridesharing company Uber pledges to make its e...,English,2
86,"""Kellogg Company Targets 50% Reduction in Glob...",2,ChatGPT augmented with Bing Search,"""Kellogg Company Targets 50% Reduction in Glob...",Kellogg Company has set a goal to reduce green...,English,2
87,"""Tata Motors Aims to be Carbon Neutral by 2039...",2,ChatGPT augmented with Bing Search,"""Tata Motors Aims to be Carbon Neutral by 2039","Tata Motors, one of India's largest auto manuf...",English,2
88,"""Starbucks Commits to Achieve Resource Positiv...",2,ChatGPT augmented with Bing Search,"""Starbucks Commits to Achieve Resource Positiv...",,English,2


In [44]:
augmented_df["content_wc"] = augmented_df["content"].apply(word_count)

AttributeError: 'NoneType' object has no attribute 'split'