# Notebook - Data Preprocessing

In [1]:
import sys
# Extend the module search path with a project directory.
# NOTE(review): this is an absolute, machine-specific path; presumably it is the
# project root so that local modules can be imported — TODO confirm and replace
# it with a relative or configurable path so the notebook runs elsewhere.
sys.path.append('/Users/xinyunrong/Desktop/code/ml-esg-3/')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from nltk.tokenize import sent_tokenize, RegexpTokenizer

# Step 1: Load translated data and clean

In [3]:
# Read the translated dataset, report how many samples it holds, and preview it.
translation_path = "../dataset/Translation_Dataset.parquet"
df_trs = pd.read_parquet(translation_path)
print(f"This file has {len(df_trs)} samples.")
df_trs.head()

This file has 2320 samples.


Unnamed: 0,url,title,content,impact_length_idx,language,title_eng,content_eng
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...
1,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...
2,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: “Personalised portfolios demand th...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: “Personalised portfolios demand th...
3,https://www.esgtoday.com/ukraine-war-inflation...,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...,2.0,English,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...
4,https://www.esgtoday.com/eu-regulators-welcome...,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...,0.0,English,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...


In [4]:
# Tokenizer built from the pattern r'\w+' (runs of word characters).
regexp_tokenizer = RegexpTokenizer(r'\w+')

def word_count(article):
    """Return how many tokens `regexp_tokenizer` extracts from `article`."""
    tokens = regexp_tokenizer.tokenize(article)
    return len(tokens)
    

# Token counts of the English title and the English content of every sample.
df_trs["title_wc"] = df_trs["title_eng"].map(word_count)
df_trs["content_wc"] = df_trs["content_eng"].map(word_count)

# Summary statistics of both word-count columns.
wc_columns = ["title_wc", "content_wc"]
df_trs[wc_columns].describe()

Unnamed: 0,title_wc,content_wc
count,2320.0,2320.0
mean,14.267672,365.278017
std,5.98823,405.737352
min,4.0,8.0
25%,11.0,74.0
50%,13.0,125.0
75%,16.0,577.0
max,48.0,3974.0


In [5]:
# Korean and Chinese articles are too long, indicating the need for further segmentation

# Print the per-language statistics of the title counts first, then the content counts.
for wc_column in ("title_wc", "content_wc"):
    print(df_trs.groupby("language")[wc_column].describe())

          count       mean       std  min   25%   50%   75%   max
language                                                         
Chinese   352.0  24.309659  7.916093  4.0  19.0  24.0  30.0  48.0
English   545.0  11.400000  2.934581  5.0   9.0  11.0  13.0  21.0
French    654.0  13.172783  3.088157  7.0  11.0  13.0  15.0  22.0
Korean    769.0  12.634590  3.111705  4.0  11.0  12.0  15.0  25.0
          count        mean         std    min    25%    50%     75%     max
language                                                                    
Chinese   352.0  925.994318  524.688593  173.0  564.5  764.0  1171.5  3974.0
English   545.0   63.425688   27.222302    8.0   44.0   59.0    80.0   194.0
French    654.0   84.678899   23.265120   16.0   69.0   84.0    98.0   225.0
Korean    769.0  561.180754  236.733538  144.0  388.0  519.0   693.0  1455.0


# Step 2: Segment Chinese and Korean Dataset

In [6]:
# Segment Chinese and Korean articles into chunks of n sentences each, as they are too long

def group_sentences(sentences, sent_size):
    """Split `sentences` into consecutive chunks of at most `sent_size` items.

    Order is preserved; the final chunk is shorter when the number of items
    is not a multiple of `sent_size`. An empty input yields an empty list.
    """
    chunks = []
    for start in range(0, len(sentences), sent_size):
        chunks.append(sentences[start : start + sent_size])
    return chunks

def segment_articles(df, sent_size):
    """Split every article of `df` into segments of `sent_size` sentences.

    The text in the 'content_eng' column is sentence-tokenized, the sentences
    are grouped `sent_size` at a time, and the frame is expanded so that each
    group becomes its own row.

    Parameters
    ----------
    df : DataFrame with a 'content_eng' column holding the article text.
        It is left unmodified; the work happens on a copy.
    sent_size : number of sentences per segment.

    Returns
    -------
    A new DataFrame with one row per segment. All columns of `df` are repeated
    for every segment of an article, and 'content_eng_short' holds the segment
    text (its sentences joined with single spaces). The index values of `df`
    are repeated as well, so callers wanting a unique index must reset it.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns.
    seg_df = df.copy()
    seg_df['content_eng_short'] = seg_df['content_eng'].apply(
        # `or [[]]`: an article without any sentence is kept as one empty
        # segment; otherwise explode() would produce NaN and the join would fail.
        lambda text: group_sentences(sent_tokenize(text), sent_size) or [[]]
    )
    # One row per sentence group.
    seg_df = seg_df.explode('content_eng_short')
    seg_df["content_eng_short"] = seg_df["content_eng_short"].apply(" ".join)
    return seg_df

# Partition the translated data: only Korean and Chinese rows get segmented.
needs_segmenting = df_trs["language"].isin(["Korean", "Chinese"])
chn_kor_trs = df_trs[needs_segmenting].copy()
eng_fre_trs = df_trs[~needs_segmenting].copy()

# Cut the Korean and Chinese articles into segments of `sent_size` sentences
# and recompute the word count on the segment text.
sent_size = 5
chn_kor_seg = segment_articles(chn_kor_trs, sent_size)
chn_kor_seg["content_wc"] = chn_kor_seg["content_eng_short"].apply(word_count)

# English and French articles are kept whole: the segment column is the full content.
eng_fre_seg = eng_fre_trs
eng_fre_seg["content_eng_short"] = eng_fre_seg["content_eng"]

# Stack both parts and renumber the rows from 0.
df_seg = pd.concat([chn_kor_seg, eng_fre_seg], ignore_index=True)


In [7]:
# Report how segmentation changed the sample counts, then summarise the
# per-language word counts of the segmented data.
print(
    f"Segmenting {len(chn_kor_trs)} Korean & Chinese samples into {len(chn_kor_seg)} samples.",
    f"Expanding {len(df_trs)} samples to {len(df_seg)} samples.",
    sep="\n",
)
df_seg.groupby("language")["content_wc"].describe()

Segmenting 1121 Korean & Chinese samples into 5447 samples.
Expanding 2320 samples to 6646 samples.


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Chinese,1924.0,169.412682,64.362227,2.0,130.0,170.0,207.0,425.0
English,545.0,63.425688,27.222302,8.0,44.0,59.0,80.0,194.0
French,654.0,84.678899,23.26512,16.0,69.0,84.0,98.0,225.0
Korean,3523.0,122.494465,41.591435,2.0,99.0,124.0,150.0,336.0


In [8]:
# Prepare dataset for data augmentation: drop the original-language and helper
# columns, give the remaining ones generic names, and build the `feature`
# text as "<title> || <content>".
df_seg = (
    df_seg
    .drop(columns=['url', 'title', 'content', 'content_eng', 'title_wc', 'content_wc'])
    .rename(columns={
        "title_eng": "title",
        "content_eng_short": "content",
        "impact_length_idx": "label",
    })
    .assign(feature=lambda frame: frame["title"] + ' || ' + frame["content"])
)

df_seg.head()

Unnamed: 0,label,language,title,content,feature
0,2.0,Chinese,Is the boss himself a punch card machine? Bill...,Date: 2022-01-04 work-life balance in the mode...,Is the boss himself a punch card machine? Bill...
1,2.0,Chinese,Is the boss himself a punch card machine? Bill...,Microsoft's market capitalization will reach 1...,Is the boss himself a punch card machine? Bill...
2,2.0,Chinese,Is the boss himself a punch card machine? Bill...,"From Bill Gates' words, it can be inferred tha...",Is the boss himself a punch card machine? Bill...
3,0.0,Chinese,"Mercedes-Maybach is going electric too, with t...","Date: 2021-09-06Two years ago, IAA Mobility wa...","Mercedes-Maybach is going electric too, with t..."
4,0.0,Chinese,"Mercedes-Maybach is going electric too, with t...","One is the new EQE, which is positioned under ...","Mercedes-Maybach is going electric too, with t..."


# Step 3: Filter out non-ESG-related samples


In [9]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load the FinBERT-ESG classifier and its tokenizer from the same checkpoint
# and wrap both in a text-classification pipeline.
# NOTE(review): truncation/padding/max_length are handed to `from_pretrained`
# here; they may not be applied when the pipeline encodes text — confirm
# against the transformers documentation.
finbert_checkpoint = 'yiyanghkust/finbert-esg'
tokenizer_options = dict(truncation=True, padding='max_length', max_length=512)

finbert = BertForSequenceClassification.from_pretrained(finbert_checkpoint, num_labels=4)
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint, **tokenizer_options)
esg_pipeline = pipeline("text-classification", model=finbert, tokenizer=tokenizer)


In [10]:
# Classify every segment with the ESG pipeline, `batch` rows at a time, and
# store the predicted label and its score in `esg_label` / `esg_score`.
batch = 64
esg_labels = []
esg_scores = []
# Batches are taken by position with half-open slices, so each row is sent to
# the model exactly once and no empty batch is produced at the end. (Label
# slices with .loc include both endpoints, which would make consecutive
# batches overlap by one row.)
for start in range(0, len(df_seg), batch):
    # Only the first 512 characters of each segment are classified.
    sentences = [s[:512] for s in df_seg["content"].iloc[start:start + batch]]
    results = esg_pipeline(sentences)
    esg_labels.extend(x["label"] for x in results)
    esg_scores.extend(x["score"] for x in results)

# The predictions were collected in row order, one per row.
df_seg["esg_label"] = esg_labels
df_seg["esg_score"] = esg_scores

In [11]:
# Select the segmented Chinese and Korean paragraphs that are not ESG related:
# predicted label "None" with a score above 0.9.
is_none_label = df_seg["esg_label"] == "None"
is_confident = df_seg["esg_score"] > 0.9
is_segmented_language = df_seg["language"].isin(["Chinese", "Korean"])

none_news = df_seg[is_none_label & is_confident & is_segmented_language]
print(f"There are {len(none_news)} samples that are not ESG related with over 0.9 probability.")

There are 531 samples that are not ESG related with over 0.9 probability.


In [12]:
# Remove the rows selected as non-ESG and renumber the remaining rows from 0.
df_seg = df_seg.drop(index=none_news.index).reset_index(drop=True)
print(f"We filter down to {len(df_seg)} samples.")

We filter down to 6115 samples.


In [13]:
# The classifier outputs were only needed for filtering; remove them again.
df_seg = df_seg.drop(["esg_label", "esg_score"], axis="columns")

# Step 4: Train Test Split

In [14]:
# Number of samples per label.
label_groups = df_seg.groupby("label")
label_groups["label"].count()

label
0.0    2143
1.0    1354
2.0    2618
Name: label, dtype: int64

In [15]:
# Give every distinct title an integer id (numbered in order of first
# appearance) so that rows sharing a title can be kept together by the
# train/test split.
groups = list(df_seg["title"].unique())
group_indic_dict = {title: group_id for group_id, title in enumerate(groups)}

df_seg["group_indicator"] = df_seg["title"].map(group_indic_dict)

In [16]:
# Label counts when only the first row of every group is considered.
first_row_per_group = df_seg.groupby("group_indicator").head(1)
first_row_per_group.groupby("label")["label"].count()

label
0.0    562
1.0    353
2.0    656
Name: label, dtype: int64

In [17]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Split the data ensuring the same group is not in both train and test sets.
# With n_splits=1 the splitter yields a single (train, test) pair of
# positional indices, so it is taken directly instead of looping.
train_idx, test_idx = next(gss.split(X=df_seg['feature'],
                                     y=df_seg['label'],
                                     groups=df_seg['group_indicator']))
train_set = df_seg.iloc[train_idx]
valid_set = df_seg.iloc[test_idx]

In [18]:
# Label distribution of the training split, then of the validation split.
for split_df in (train_set, valid_set):
    print(split_df.groupby("label")["label"].count())

label
0.0    1681
1.0    1092
2.0    2104
Name: label, dtype: int64
label
0.0    462
1.0    262
2.0    514
Name: label, dtype: int64


In [19]:
# Number of samples per language in the training split, then in the validation split.
for split_df in (train_set, valid_set):
    print(split_df.groupby("language")["label"].count())

language
Chinese    1421
English     433
French      543
Korean     2480
Name: label, dtype: int64
language
Chinese    341
English    112
French     111
Korean     674
Name: label, dtype: int64


In [20]:
# Write both splits next to the source dataset.
split_outputs = (
    (train_set, "../dataset/training_dataset.parquet"),
    (valid_set, "../dataset/validation_dataset.parquet"),
)
for split_df, out_path in split_outputs:
    split_df.to_parquet(out_path)