# Data Augmentation with Reuter News

In [1]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from datasets import load_dataset
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer('english')

stop_words = set(stopwords.words('english'))

### Load Dataset

In [5]:
# load dataset
dataset = load_dataset("reuters21578","ModLewis")

Downloading data:   0%|          | 0.00/2.93M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/517k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6188 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/13625 [00:00<?, ? examples/s]

Generating unused split:   0%|          | 0/722 [00:00<?, ? examples/s]

In [93]:
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()

df = pd.concat([train_df, test_df]).reset_index(drop=True)
df = df[['text', 'text_type', "title"]]

print(f"The dataset has {len(df)} news.")
df = df.drop_duplicates()
print(f"The dataset has {len(df)} news.")
# cleaning up BRIEF and non-text
df = df[df["text_type"] == '"NORM"'].copy()
print(f"The dataset has {len(df)} news.")

The dataset has 19813 news.
The dataset has 19595 news.
The dataset has 17712 news.


In [94]:
def word_count(sentence):
    if sentence is not None:
        return len(sentence.split(" "))
    return 0

df["text_wc"] = df["text"].apply(word_count)
min_wc = np.percentile(df["text_wc"], 1)
max_wc = np.percentile(df["text_wc"], 99)
df = df[(df["text_wc"] >= min_wc) & (df["text_wc"] <= max_wc)].copy()
print(f"The dataset has {len(df)} news.")
df["text_wc"].describe()

The dataset has 17410 news.


count    17410.000000
mean       131.177714
std        119.089422
min         20.000000
25%         60.000000
50%         90.000000
75%        163.000000
max        713.000000
Name: text_wc, dtype: float64

In [132]:
def clean_data(df, col):

    # change to lower and remove spaces on either side
    df[col] = df[col].apply(lambda x: x.lower().strip())
    # remove ^lt > pattern
    df[col] = df[col].apply(lambda x: re.sub(r'&lt;[^>]+>', '', x))
    # remove extra spaces in between
    df[col] = df[col].apply(lambda x: re.sub(' +', ' ', x))

    return df

df = clean_data(df, "title")
df["text"] = df["text"].str.replace("\n", " ")
df = clean_data(df, "text")

In [120]:
df = df.reset_index(drop=True)

### Filter down to ESG related News

In [110]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg',
                                          truncation=True,
                                          padding='max_length',
                                          max_length=512)
esg_pipeline = pipeline("text-classification", 
                        model=finbert, 
                        tokenizer=tokenizer)

In [122]:
batch = 128
for i in range(0, len(df) // batch + 1):
    sentences = list(df.loc[i * batch: (i + 1) * batch]["text"])
    sentences = [s[:512] for s in sentences]
    results = esg_pipeline(sentences)
    df.loc[i * batch: (i + 1) * batch, "esg_label"] = [x["label"] for x in results]
    df.loc[i * batch: (i + 1) * batch, "esg_score"] = [x["score"] for x in results]

In [5]:
df.groupby("esg_label")["esg_score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
esg_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Environmental,445.0,0.737755,0.176262,0.330253,0.589691,0.750825,0.907906,0.991086
Governance,89.0,0.522634,0.102706,0.35006,0.455747,0.492664,0.59092,0.830855
,14314.0,0.938402,0.111449,0.33517,0.954603,0.984908,0.99131,0.995978
Social,2562.0,0.753372,0.170418,0.346135,0.611664,0.777065,0.913833,0.988577


In [None]:
df.to_parquet("../dataset/temp_reuter.parquet")

In [139]:
df = pd.read_parquet("../dataset/temp_reuter.parquet")

In [140]:
df

Unnamed: 0,text,text_type,title,text_wc,esg_label,esg_score
0,showers continued throughout the week in the b...,"""NORM""",bahia cocoa review,497,Environmental,0.612098
1,standard oil co and bp north america inc said ...,"""NORM""",standard oil to form financial unit,74,,0.986776
2,texas commerce bancshares inc's texas commerce...,"""NORM""",texas commerce bancshares files plan,58,,0.960852
3,bankamerica corp is not under pressure to act ...,"""NORM""",talking point/bankamerica equity offer,466,,0.989025
4,the u.s. agriculture department reported the f...,"""NORM""",national average prices for farmer-owned reserve,466,,0.963944
...,...,...,...,...,...,...
17405,the japan/india-pakistan-gulf/japan shipping c...,"""NORM""",japan/india conference cuts gulf war risk charges,74,Social,0.476759
17406,the soviet union's industrial output is growin...,"""NORM""",soviet industrial growth/trade slower in 1987,346,,0.991287
17407,six black miners have been killed and two inju...,"""NORM""",six killed in south african gold mine accident,76,,0.740564
17408,the prospect of a dominant alliance of sociali...,"""NORM""",projections show swiss voters want tried parties,188,Social,0.814969


In [137]:
df = df[(df["esg_label"] != "None") & (df["esg_score"] > 0.5)]
print(f"There are {len(df)} augmented news.")
# df = df[~((df["esg_label"] == "Social") & (df["esg_score"] < 0.8))]
df.groupby("esg_label")["esg_score"].describe()

There are 2741 augmented news.


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
esg_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Environmental,393.0,0.775998,0.149653,0.504137,0.649648,0.783682,0.919137,0.991086
Governance,40.0,0.611769,0.084351,0.503932,0.54316,0.595629,0.662771,0.830855
Social,2308.0,0.786391,0.145179,0.501986,0.659425,0.811584,0.92244,0.988577


In [138]:
df.to_parquet("../dataset/temp_reuter_filtered.parquet")

### Use GPT 4 to further filter related news

In [141]:
from openai import OpenAI

client = OpenAI()

In [142]:
template = """
Given the following news, output -1 if the news is not related to ESG (environmental, social, and governance) and won't have any ESG impact. Output 0 if the ESG impact duration is below 2 years, 1 if the ESG impact duration is between 2 and 5 year and 2 if the ESG impact duration is more than 5 years. You only need to output the number, and do not need any further explanation.

News:"""

In [52]:
def generate_gpt_label(sentence, model):
    content = template + sentence
    
    conversation = [
            {"role": "system", "content": "You are an ESG analyst, skilled assessing the level and duration an event in the news article might have on the company"},
            {"role": "user", "content": content}
        ]
    
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.7
    )
    
    message = completion.choices[0].message.content
    
    try:
        return int(message)
    except:
        return None

df["gpt_4_label"] = df["text"].apply(lambda x: generate_gpt_label(x, "gpt-4"))

In [124]:
df.to_parquet("dataset/temp_reuter_openai_filtered.parquet")

OSError: Cannot save file into a non-existent directory: 'dataset'

In [90]:
subset_df.groupby(["esg_label","gpt_4_label"])["text"].count()

esg_label      gpt_4_label
Environmental  -1             10
                0              6
                1             10
                2              2
Governance     -1              1
                0              1
Social         -1             42
                0             14
                1             10
                2              4
Name: text, dtype: int64