# Notebook - Data Preprocessing

In [1]:
import sys
# NOTE(review): this is a machine-specific absolute path, so it will not resolve on other machines.
# Presumably it points at the project root so that project modules can be imported — TODO confirm,
# then replace it with a path relative to the notebook or a configurable setting.
sys.path.append('/Users/xinyunrong/Desktop/code/ml-esg-3/')

In [7]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, RegexpTokenizer

# Step 1: Load translated data and clean

In [32]:
# Location of the translated dataset, relative to the notebook's working directory.
translation_path = "../dataset/Translation_Dataset.parquet"

df_trs = pd.read_parquet(translation_path)
print(f"This file has {len(df_trs)} samples.")

# Preview the first rows as the cell output.
df_trs.head()

This file has 2320 samples.


Unnamed: 0,url,title,content,impact_length_idx,language,title_eng,content_eng
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...
1,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...
2,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: “Personalised portfolios demand th...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: “Personalised portfolios demand th...
3,https://www.esgtoday.com/ukraine-war-inflation...,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...,2.0,English,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...
4,https://www.esgtoday.com/eu-regulators-welcome...,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...,0.0,English,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...


In [33]:
# Shared tokenizer built from the pattern \w+ (runs of word characters).
regexp_tokenizer = RegexpTokenizer(r'\w+')


def word_count(article):
    """Return the number of tokens the shared regexp tokenizer finds in `article`.

    Args:
        article: Text to tokenize.

    Returns:
        int: Length of the token list produced by `regexp_tokenizer`.
    """
    tokens = regexp_tokenizer.tokenize(article)
    return len(tokens)
    

# Add a word-count column for each translated text column.
for text_col, wc_col in (("title_eng", "title_wc"), ("content_eng", "content_wc")):
    df_trs[wc_col] = df_trs[text_col].apply(word_count)

# Summary statistics of both word-count columns.
df_trs[["title_wc", "content_wc"]].describe()

Unnamed: 0,title_wc,content_wc
count,2320.0,2320.0
mean,14.267672,365.278017
std,5.98823,405.737352
min,4.0,8.0
25%,11.0,74.0
50%,13.0,125.0
75%,16.0,577.0
max,48.0,3974.0


In [47]:
# Korean and Chinese articles are far longer than English and French ones,
# indicating the need for further segmentation.

by_language = df_trs.groupby("language")
for wc_col in ("title_wc", "content_wc"):
    print(by_language[wc_col].describe())

          count       mean       std  min   25%   50%   75%   max
language                                                         
Chinese   352.0  24.309659  7.916093  4.0  19.0  24.0  30.0  48.0
English   545.0  11.400000  2.934581  5.0   9.0  11.0  13.0  21.0
French    654.0  13.172783  3.088157  7.0  11.0  13.0  15.0  22.0
Korean    769.0  12.634590  3.111705  4.0  11.0  12.0  15.0  25.0
          count        mean         std    min    25%    50%     75%     max
language                                                                    
Chinese   352.0  925.994318  524.688593  173.0  564.5  764.0  1171.5  3974.0
English   545.0   63.425688   27.222302    8.0   44.0   59.0    80.0   194.0
French    654.0   84.678899   23.265120   16.0   69.0   84.0    98.0   225.0
Korean    769.0  561.180754  236.733538  144.0  388.0  519.0   693.0  1455.0


# Step 2: Segment Chinese and Korean Dataset

In [39]:
# Segment Chinese and Korean articles into chunks of n sentences each, as the full articles are too long

def group_sentences(sentences, sent_size):
    """Split `sentences` into consecutive chunks of at most `sent_size` items.

    Args:
        sentences: Sliceable sequence of sentences.
        sent_size: Number of sentences per chunk; the last chunk may be shorter.

    Returns:
        list: The chunks in their original order (empty when `sentences` is empty).
    """
    chunks = []
    for start in range(0, len(sentences), sent_size):
        chunks.append(sentences[start : start + sent_size])
    return chunks

def segment_articles(df, sent_size):
    """Split each article in `df` into segments of `sent_size` sentences.

    The text in "content_eng" is sentence-tokenized, the sentences are grouped
    into chunks of `sent_size`, and every chunk becomes its own row whose
    "content_eng_short" holds the chunk's sentences joined by single spaces.
    All other columns are repeated for every segment of an article, and the
    original index labels are kept (so they repeat as well).

    Unlike the previous version, the input frame is left untouched, and an
    article that yields no sentences (e.g. an empty string) produces a single
    row with an empty "content_eng_short" instead of raising a TypeError.

    Args:
        df: DataFrame with a string column "content_eng".
        sent_size: Number of sentences per segment.

    Returns:
        A new DataFrame with one row per segment and an added
        "content_eng_short" column.
    """
    # Build the chunk lists as a standalone Series so no helper column is
    # written into the caller's frame.
    chunked = df["content_eng"].apply(
        lambda text: group_sentences(sent_tokenize(text), sent_size)
    )
    seg_df = df.assign(content_eng_short=chunked).explode("content_eng_short")
    # explode() turns an empty list into NaN; map that to an empty string
    # rather than passing it to str.join.
    seg_df["content_eng_short"] = seg_df["content_eng_short"].apply(
        lambda chunk: " ".join(chunk) if isinstance(chunk, list) else ""
    )
    return seg_df

# Split the dataset: Korean/Chinese articles get segmented, English/French are kept as-is
needs_segmentation = df_trs["language"].isin(["Korean", "Chinese"])
chn_kor_trs = df_trs[needs_segmentation].copy()
eng_fre_trs = df_trs[~needs_segmentation].copy()

# Segment Chinese and Korean articles into chunks of `sent_size` sentences
sent_size = 5
chn_kor_seg = segment_articles(chn_kor_trs, sent_size)

# English and French articles are used whole: the short content is the full content
eng_fre_seg = eng_fre_trs
eng_fre_seg["content_eng_short"] = eng_fre_seg["content_eng"]

# Stack both parts into one frame with a fresh index
df_seg = pd.concat([chn_kor_seg, eng_fre_seg]).reset_index(drop=True)


In [46]:
print(f"Segmenting {len(chn_kor_trs)} samples into {len(chn_kor_seg)} samples.")
# Recompute word counts on the segmented text; until here "content_wc" still
# holds the counts of the full, unsegmented articles.
chn_kor_seg["content_wc"] = chn_kor_seg["content_eng_short"].apply(word_count)

# Per-language length statistics of the segments. The cell's saved output shows
# this table, but the expression producing it was missing from the cell.
chn_kor_seg.groupby("language")["content_wc"].describe()



Segmenting 1121 samples into 5381 samples.


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Chinese,1900.0,171.14,64.654063,7.0,131.0,171.0,208.0,425.0
Korean,3481.0,123.406492,41.848859,6.0,100.0,125.0,150.0,336.0


In [None]:
# English and French articles are already segmented.
# `eng_trs` / `fre_trs` are never defined in this notebook, so this cell used to
# fail with a NameError on a fresh kernel. Split `eng_fre_seg` by language
# instead; it already carries the "content_eng_short" column.
eng_seg = eng_fre_seg[eng_fre_seg["language"] == "English"].copy()
fre_seg = eng_fre_seg[eng_fre_seg["language"] == "French"].copy()

# Step 3: Aggregate Segmented Datasets

In [None]:
# Columns kept for the aggregated dataset
columns = ["title_eng", "content_eng_short", "impact_length_idx", "language"]

eng_seg = eng_seg[columns]
fre_seg = fre_seg[columns]
# `kor_seg` / `chn_seg` are not defined by any earlier cell, so take them from
# the segmented Korean/Chinese frame, split by language.
kor_seg = chn_kor_seg.loc[chn_kor_seg["language"] == "Korean", columns]
chn_seg = chn_kor_seg.loc[chn_kor_seg["language"] == "Chinese", columns]

In [None]:
# Stack the four per-language frames and renumber the rows from 0
language_frames = [eng_seg, fre_seg, kor_seg, chn_seg]
train_df = pd.concat(language_frames, ignore_index=True)
print(f"Train dataset for short news article has {len(train_df)} samples.")

In [None]:
# Map the dataset-specific column names to shorter generic ones
final_column_names = {
    "title_eng": "title",
    "content_eng_short": "content",
    "impact_length_idx": "label",
}
train_df = train_df.rename(columns=final_column_names)

In [None]:
# Inspect the word-count statistics of the segmented articles
content_word_counts = train_df["content"].apply(word_count)
content_word_counts.describe()

# Step 4: Augment the dataset