# Notebook - Data Augmentation

In [None]:
from openai import OpenAI
import pandas as pd

# Shared API client used by every chat-completion request in this notebook.
# NOTE(review): presumably picks up its credentials from the environment — confirm.
client = OpenAI()

In [None]:
# Load the training set; its "Translation" and "impact_length_idx" columns
# are the ones used further down.
data_df = pd.read_parquet("../dataset/train_df_all_english.parquet")

In [None]:
# Preview up to five rows per impact_length_idx class, ordered by class.
data_df.groupby("impact_length_idx").head(5).sort_values("impact_length_idx")

In [None]:
# Two seed articles per impact-duration class, looked up by row label in
# data_df. Keep the classes in this order: the generation loop iterates the
# dict in insertion order and the parsing step derives impact_length_idx
# from the order in which the responses were collected.
data_augmentation_dict = {
    duration: [data_df.loc[row, "Translation"] for row in seed_rows]
    for duration, seed_rows in (
        ("less than 2 years", (4, 19)),
        ("between 2 and 5 years", (7, 11)),
        ("more than 5 years", (6, 8)),
    )
}

In [None]:
# Raw model replies, in request order: for every impact-duration class there
# is one initial reply followed by `repeat_n` follow-up replies.
data_augmentation_messages = []

repeat_n = 2 # call api 2 more times

for impact_duration, text_list in data_augmentation_dict.items():

    # Prompt quoting the two seed articles of the current class.
    template = f"""Give 10 examples of real news related to ESG (Environmental, Social, Governance) that has an impact duration {impact_duration}. Each example should have a news title and news summary, connected with "||". Generate these examples in english language. Two example news:

    "{text_list[0]}"
    
    "{text_list[1]}"
    
    The output should only contain a numbered list of news.
    
    """

    conversation = [
        {"role": "system", "content": "You are an ESG analyst, skilled assessing the level and duration an event in the news article might have on the company"},
        {"role": "user", "content": template},
    ]

    print(template)

    # Round 0 is the initial request; each later round first extends the
    # conversation with the previous reply and a request for more examples.
    for round_no in range(repeat_n + 1):
        if round_no > 0:
            conversation.append({"role": "assistant", "content": message})
            conversation.append({"role": "user", "content": "Give me 10 more real examples."})

        completion = client.chat.completions.create(
            model="gpt-4",
            messages=conversation,
        )

        message = completion.choices[0].message.content

        print(message)

        data_augmentation_messages.append(message)


In [None]:
import re

# Every impact-duration class contributes one initial reply plus `repeat_n`
# follow-up replies, so consecutive groups of this many replies share a class.
# Deriving the group size from `repeat_n` keeps the labels correct if the
# number of follow-up requests is changed.
responses_per_class = repeat_n + 1

# Matches the "N. " marker that opens each item of a numbered list.
item_marker = re.compile(r"^\s*\d+\.\s+", flags=re.MULTILINE)

augmented_dataframes = []

for idx, response in enumerate(data_augmentation_messages):
    # Split on the item markers rather than on blank lines so that both
    # single- and double-spaced lists parse. The piece before the first
    # marker is either empty or a preamble, never an example, so drop it.
    pieces = item_marker.split(response)[1:]
    augmented_news_list = [piece.strip() for piece in pieces if piece.strip()]

    augmented_df = pd.DataFrame(augmented_news_list, columns=["sentence"])
    augmented_df["impact_length_idx"] = idx // responses_per_class

    augmented_dataframes.append(augmented_df)

# One row per generated example, with a fresh 0..n-1 index.
augmented_df = pd.concat(augmented_dataframes).reset_index(drop=True)

In [None]:
# Constant tag stored in the "url" column of every generated row.
augmented_df["url"] = "ChatGPT augmented with Bing Search"
# Split "title || summary" at the FIRST separator only (n=1): a summary that
# itself contains " || " would otherwise produce extra columns and break the
# two-column assignment. The raw string avoids the invalid "\|" escape.
augmented_df[["title", "content"]] = augmented_df["sentence"].str.split(r" \|\| ", n=1, expand=True)
augmented_df["language"] = "English"

In [None]:
# Display the assembled augmented DataFrame.
augmented_df

In [None]:
# Persist the augmented examples; the relabelling section reloads this file.
augmented_df.to_parquet("../dataset/train_df_English_augmented.parquet")

# Reload the augmented DataFrame and relabel it to ensure data quality

In [None]:
# Reload the augmented examples written to disk earlier in this notebook.
augmented_df = pd.read_parquet("../dataset/train_df_English_augmented.parquet")
 

In [None]:
def word_count(sentence):
    """Return the number of whitespace-separated words in a text.

    Args:
        sentence: The text to measure, or None for a missing value.

    Returns:
        int: The word count; 0 when ``sentence`` is None.
    """
    if sentence is None:
        # Missing text counts as zero words (this branch used to fall through
        # and return None because the `return` keyword was missing).
        return 0
    # split() without arguments collapses runs of whitespace, so repeated
    # spaces are not counted as empty "words".
    return len(sentence.split())

# Summary statistics of the per-row word count of the "content" column.
augmented_df["content"].apply(word_count).describe()

In [None]:
# Show the first rows of the reloaded DataFrame.
augmented_df.head()

In [None]:
def generate_gpt_label(sentence):
    """Ask the chat model to classify the ESG impact duration of a news item.

    Args:
        sentence: The news text to classify; it is inserted verbatim into
            the prompt.

    Returns:
        int: 0 (below 2 years), 1 (between 2 and 5 years) or 2 (more than
        5 years), as defined by the prompt.

    Raises:
        ValueError: If the reply cannot be read as one of the labels 0, 1, 2.
    """
    template = f"""Given the following news, output 0 if the ESG impact duration is below 2 years, 1 if the ESG impact duration is between 2 and 5 year and 2 if the ESG impact duration is more than 5 years. You only need to output the number, and do not need any further explanation.
    
    {sentence}"""

    conversation = [
            {"role": "system", "content": "You are an ESG analyst, skilled assessing the level and duration an event in the news article might have on the company"},
            {"role": "user", "content": template}
        ]

    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation,
        temperature=0.7
    )

    message = completion.choices[0].message.content

    # Tolerate harmless decoration around the digit (whitespace, quotes, a
    # trailing period) so one slightly off-format reply does not abort a
    # whole DataFrame.apply run. An empty or missing reply falls through to
    # the ValueError below.
    cleaned = (message or "").strip().strip("\"'`.").strip()
    try:
        label = int(cleaned)
    except ValueError:
        raise ValueError(
            f"Could not parse an impact-duration label from model reply: {message!r}"
        ) from None

    # Reject integers outside the label set defined by the prompt.
    if label not in (0, 1, 2):
        raise ValueError(
            f"Model reply is not a valid impact-duration label (0, 1 or 2): {message!r}"
        )
    return label


# Label every generated example with the model; this issues one API request
# per row.
augmented_df["gpt_label"] = augmented_df["sentence"].apply(generate_gpt_label)

In [None]:
# Number of rows assigned to each GPT label.
augmented_df.groupby("gpt_label")["gpt_label"].count()

In [None]:
# Count the rows whose GPT label equals the impact_length_idx value.
len(augmented_df[augmented_df["gpt_label"] == augmented_df["impact_length_idx"]])

In [None]:
def word_count(sentence):
    """Count the whitespace-separated words in ``sentence``.

    Args:
        sentence: The text to measure, or None for a missing value.

    Returns:
        int: The word count; 0 when ``sentence`` is None.
    """
    return 0 if sentence is None else len(sentence.split())

In [None]:
# Store the word count of each "content" value in a new column.
augmented_df["content_wc"] = augmented_df["content"].apply(word_count)

In [None]:
# Summary statistics of the content word counts.
augmented_df["content_wc"].describe()