# Data Augmentation with Reuter News

In [78]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

### Load Dataset

In [2]:
# load dataset
dataset = load_dataset("reuters21578","ModLewis")

In [3]:
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()

df = pd.concat([train_df, test_df]).reset_index(drop=True)
df = df[['text', 'text_type', "title"]]

print(f"The dataset has {len(df)} news.")
df = df.drop_duplicates()
print(f"The dataset has {len(df)} news.")
# cleaning up BRIEF and non-text
df = df[df["text_type"] == '"NORM"'].copy()
print(f"The dataset has {len(df)} news.")

The dataset has 19813 news.
The dataset has 19595 news.
The dataset has 17712 news.


In [4]:
def word_count(sentence):
    if sentence is not None:
        return len(sentence.split(" "))
    return 0

df["text_wc"] = df["text"].apply(word_count)
min_wc = np.percentile(df["text_wc"], 1)
max_wc = np.percentile(df["text_wc"], 99)
df = df[(df["text_wc"] >= min_wc) & (df["text_wc"] <= max_wc)].copy()
print(f"The dataset has {len(df)} news.")
df["text_wc"].describe()

The dataset has 17410 news.


count    17410.000000
mean       131.177714
std        119.089422
min         20.000000
25%         60.000000
50%         90.000000
75%        163.000000
max        713.000000
Name: text_wc, dtype: float64

In [5]:
def clean_data(df, col):

    # change to lower and remove spaces on either side
    df[col] = df[col].apply(lambda x: x.lower().strip())
    # remove ^lt > pattern
    df[col] = df[col].apply(lambda x: re.sub(r'&lt;[^>]+>', '', x))
    # remove extra spaces in between
    df[col] = df[col].apply(lambda x: re.sub(' +', ' ', x))

    return df

df = clean_data(df, "title")
df["text"] = df["text"].str.replace("\n", " ")
df = clean_data(df, "text")

In [6]:
df = df.reset_index(drop=True)

### Filter down to ESG related News

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg',
                                          truncation=True,
                                          padding='max_length',
                                          max_length=512)
esg_pipeline = pipeline("text-classification", 
                        model=finbert, 
                        tokenizer=tokenizer)

In [None]:
batch = 128
for i in range(0, len(df) // batch + 1):
    sentences = list(df.loc[i * batch: (i + 1) * batch]["text"])
    sentences = [s[:512] for s in sentences]
    results = esg_pipeline(sentences)
    df.loc[i * batch: (i + 1) * batch, "esg_label"] = [x["label"] for x in results]
    df.loc[i * batch: (i + 1) * batch, "esg_score"] = [x["score"] for x in results]

In [None]:
df.to_parquet("../dataset/temp_reuter.parquet")

In [100]:
df = pd.read_parquet("../dataset/temp_reuter.parquet")

In [9]:
df.groupby("esg_label")["esg_score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
esg_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Environmental,445.0,0.737755,0.176262,0.330253,0.589691,0.750825,0.907906,0.991086
Governance,89.0,0.522634,0.102706,0.35006,0.455747,0.492664,0.59092,0.830855
,14314.0,0.938402,0.111449,0.33517,0.954603,0.984908,0.99131,0.995978
Social,2562.0,0.753372,0.170418,0.346135,0.611664,0.777065,0.913833,0.988577


In [10]:
# filter down
df = df[(df["esg_label"] != "None") & (df["esg_score"] > 0.5)]
print(f"There are {len(df)} augmented news.")
df.groupby("esg_label")["esg_score"].describe()

There are 2741 augmented news.


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
esg_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Environmental,393.0,0.775998,0.149653,0.504137,0.649648,0.783682,0.919137,0.991086
Governance,40.0,0.611769,0.084351,0.503932,0.54316,0.595629,0.662771,0.830855
Social,2308.0,0.786391,0.145179,0.501986,0.659425,0.811584,0.92244,0.988577


In [None]:
df.to_parquet("../dataset/temp_reuter_filtered.parquet")

### Use GPT 4 to further filter related news

In [None]:
from openai import OpenAI

client = OpenAI()

In [None]:
template = """
Given the following news, output -1 if the news is not related to ESG (environmental, social, and governance) and won't have any ESG impact. Output 0 if the ESG impact duration is below 2 years, 1 if the ESG impact duration is between 2 and 5 year and 2 if the ESG impact duration is more than 5 years. You only need to output the number, and do not need any further explanation.

News:"""

In [None]:
def generate_gpt_label(sentence, model):
    content = template + sentence
    
    conversation = [
            {"role": "system", "content": "You are an ESG analyst, skilled assessing the level and duration an event in the news article might have on the company"},
            {"role": "user", "content": content}
        ]
    
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.7
    )
    
    message = completion.choices[0].message.content
    
    try:
        return int(message)
    except:
        return None

In [None]:
df["gpt_4_lable"] = df["text"].apply(generate_gpt_label)

df.to_parquet("../dataset/temp_reuter_openai_filtered.parquet")

In [12]:
df = pd.read_parquet("../dataset/temp_reuter_openai_filtered.parquet")

In [18]:
print(f"We generate labels for {len(df[df['gpt_4_label'] != -1])} news.")
print(f"And there are {len(df[df['gpt_4_label'] == -1])} news not related to ESG.")
df.groupby(["esg_label","gpt_4_label"])["text"].count()

We generate labels for 847 news.
And there are 1894 news not related to ESG.


esg_label      gpt_4_label
Environmental  -1              149
                0               38
                1               99
                2              107
Governance     -1               28
                1               11
                2                1
Social         -1             1717
                0              128
                1              420
                2               43
Name: text, dtype: int64

In [None]:
df = df[df["gpt_4_label"] != -1].reset_index()

df.to_parquet("../dataset/reuter_gpt4_label.parquet")

### Aggregate with other models

In [144]:
df = pd.read_parquet("../dataset/temp_reuter_openai_filtered.parquet")
df_pythia = pd.read_parquet("../dataset/temp_reuter_pythia_filtered.parquet").reset_index(drop=True)
df_neox = pd.read_parquet("../dataset/temp_reuter_neox_filtered.parquet").reset_index(drop=True)
df_gemini = pd.read_parquet("../dataset/temp_reuter_gemini_filtered.parquet").reset_index(drop=True)

In [145]:
df_pythia["pythia_label"] = df_pythia["pythia_label"].fillna(-1)
df_neox["neox_label"] = df_neox["neox_label"].fillna(-1)
df_gemini["gemini_label"] = df_gemini["gemini_label"].fillna(-1)

df_pythia["pythia_label"] = df_pythia["pythia_label"].astype(int)
df_neox["neox_label"] = df_neox["neox_label"].astype(int)
df_gemini["gemini_label"] = df_gemini["gemini_label"].astype(int)

In [146]:
df_pythia.groupby("pythia_label")["text"].count()

pythia_label
-1     117
 0    1185
 1    1439
Name: text, dtype: int64

In [147]:
df_neox.groupby("neox_label")["text"].count()

neox_label
-1       4
 0     160
 1    1331
 2    1246
Name: text, dtype: int64

In [148]:
df_gemini.groupby("gemini_label")["text"].count()

gemini_label
-1    2447
 0      51
 1      92
 2     151
Name: text, dtype: int64

In [177]:
aggregated_df = df.merge(df_pythia).merge(df_neox).merge(df_gemini)

In [178]:
aggregated_df = aggregated_df[((aggregated_df["gpt_4_label"] == aggregated_df["pythia_label"]) | 
                               (aggregated_df["gpt_4_label"] == aggregated_df["neox_label"]) | 
                               (aggregated_df["gpt_4_label"] == aggregated_df["gemini_label"])) & 
                              (aggregated_df["gpt_4_label"] != -1)].reset_index(drop=True)
print(f"There are {len(aggregated_df)} news with valid label.")

There are 646 news with valid label.


In [179]:
def group_sentences(sentences, sent_size):
    return [sentences[i : i + sent_size] for i in range(0, len(sentences), sent_size)]

def segment_articles(df, sent_size):
    df['sent_tokenize'] = df['text'].apply(sent_tokenize)
    df['new_text'] = df['sent_tokenize'].apply(lambda x: group_sentences(x, sent_size))
    seg_df = df.explode('new_text')
    seg_df["new_text"] = seg_df["new_text"].apply(lambda x: " ".join(x))
    seg_df = seg_df.drop(columns="sent_tokenize")
    return seg_df
    
# segment Chinese and Korean articles
sent_size = 5
aggregated_df = segment_articles(aggregated_df, sent_size)
aggregated_df["text_wc"] = aggregated_df["new_text"].apply(word_count)
aggregated_df = aggregated_df[(aggregated_df["text_wc"] > 10)]
aggregated_df = aggregated_df.reset_index(drop=True)

In [180]:
print(f"There are {len(aggregated_df)} news with valid label and text length.")
aggregated_df.groupby(["gpt_4_label"])["text"].count()

There are 1221 news with valid label and text length.


gpt_4_label
0    186
1    858
2    177
Name: text, dtype: int64

In [181]:
aggregated_df = aggregated_df.rename(columns = {"new_text": "content", 
                                                "gpt_4_label": "label"})
aggregated_df["feature"] = aggregated_df["title"] + " || " + aggregated_df["content"]
aggregated_df["group_indicator"] = -1
aggregated_df["language"] = "English"

columns = ["title", "content", "feature", "label", "group_indicator", "language"]
aggregated_df = aggregated_df[columns]

aggregated_df.to_parquet("../dataset/augmentation_dataset.parquet")