In [None]:
import sys
from pathlib import Path

# Make the project root importable so that the `src` package resolves on any
# machine, instead of relying on one user's absolute path.
# NOTE(review): assumes the notebook is run from a directory directly beneath
# the project root (the same assumption the "../dataset/..." paths make) — confirm.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from src.constant import *

# Step 1: Load translated data and clean

In [None]:
# Load the translated training split of each source language
# (English, French, Korean, Chinese — in that order).
eng_trs, fre_trs, kor_trs, chn_trs = map(
    pd.read_parquet,
    (
        "../dataset/train_df_English_translated.parquet",
        "../dataset/train_df_French_translated.parquet",
        "../dataset/train_df_Korean_translated.parquet",
        "../dataset/train_df_Chinese_translated.parquet",
    ),
)

In [None]:
# ["url", "title", "content", "impact_length_idx", "language", "title_eng", "content_eng"]

In [None]:
# Remove fully duplicated rows from every language split.
# Row counts noted by the original author: French 661 -> 654, Korean 800 -> 771.
eng_trs, fre_trs, kor_trs, chn_trs = [
    split.drop_duplicates() for split in (eng_trs, fre_trs, kor_trs, chn_trs)
]

In [None]:
# The English split needs no translation, so its original title and content
# are reused as the English-language columns.
eng_trs = eng_trs.assign(
    title_eng=eng_trs["title"],
    content_eng=eng_trs["content"],
)

In [None]:
def Chinese_article_preprocess(article):
    """Condense a newline-separated article into a single line of text.

    Lines with five or fewer words are discarded. The first surviving line
    is treated as the title and always kept; every later surviving line is
    kept only if it ends with a period (i.e. looks like a full sentence).

    Parameters
    ----------
    article : str
        Article text with one title/sentence/paragraph per line.

    Returns
    -------
    str
        The title followed by the kept sentences, joined by single spaces.
        An empty string is returned when no line has more than five words
        (previously this case raised ``IndexError``).
    """
    # Strip each line so stray spaces or "\r" neither inflate the word count
    # nor hide a sentence-final period.
    lines = [line.strip() for line in article.split("\n")]
    long_lines = [line for line in lines if len(line.split()) > 5]
    if not long_lines:
        return ""

    title, *body = long_lines
    sentences = [line for line in body if line.endswith(".")]
    return " ".join([title, *sentences])

# Clean every article in the Chinese split's "Translation" column.
# NOTE(review): the segmentation step reads "content_eng", not "Translation" —
# confirm this column exists in the Chinese frame and that the cleaned text is
# the one that should feed the later steps.
chn_trs["Translation"] = chn_trs["Translation"].apply(Chinese_article_preprocess)

# Step 2: Segment Chinese and Korean Dataset

In [None]:
# Split the Chinese and Korean articles into segments of n sentences each, because the full articles are too long

def group_sentences(sentences, sent_size):
    """Split ``sentences`` into consecutive chunks of ``sent_size`` items.

    The last chunk holds the remainder and may be shorter. An empty input
    yields an empty list.
    """
    chunks = []
    for start in range(0, len(sentences), sent_size):
        stop = start + sent_size
        chunks.append(sentences[start:stop])
    return chunks

def segment_articles(df, sent_size):
    """Split each article into segments of at most ``sent_size`` sentences.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``content_eng`` column of article texts.
    sent_size : int
        Maximum number of sentences per segment.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per segment. The segment text is stored in
        ``content_eng_short``; all other columns (and the index value) of the
        source article are repeated on each of its segments. An article that
        yields no sentences is kept as a single row with an empty segment.
        The input frame is left unmodified.
    """
    # Tokenised sentences are kept in a local Series rather than written back
    # into ``df``, so the caller's frame is not mutated.
    sentence_lists = df['content_eng'].apply(sent_tokenize)

    # ``sent_size`` has to be forwarded through ``args``: a bare second
    # positional argument to ``Series.apply`` is not handed to the function.
    grouped = sentence_lists.apply(group_sentences, args=(sent_size,))

    seg_df = df.assign(content_eng_short=grouped).explode('content_eng_short')

    # ``explode`` turns an empty list into NaN; map that to an empty segment
    # instead of letting ``" ".join`` fail on a non-iterable.
    seg_df['content_eng_short'] = seg_df['content_eng_short'].apply(
        lambda chunk: " ".join(chunk) if isinstance(chunk, list) else ""
    )
    return seg_df
    
# Number of sentences per segment.
sent_size = 5

# Only the Chinese and Korean splits are segmented here.
chn_seg, kor_seg = (
    segment_articles(articles, sent_size) for articles in (chn_trs, kor_trs)
)

def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Any run of whitespace (spaces, tabs, newlines) counts as one separator,
    so an empty or whitespace-only string has zero words and repeated spaces
    do not inflate the count.
    """
    return len(text.split())

# Keep only the segments whose word count exceeds 30.
chn_seg = chn_seg.loc[chn_seg["content_eng_short"].map(word_count).gt(30)]
kor_seg = kor_seg.loc[kor_seg["content_eng_short"].map(word_count).gt(30)]

In [None]:
# The English and French splits are not segmented: their full English
# content is used directly as the short content.
eng_trs["content_eng_short"] = eng_trs["content_eng"]
fre_trs["content_eng_short"] = fre_trs["content_eng"]
eng_seg, fre_seg = eng_trs, fre_trs

# Step 3: Aggregate Segmented Datasets

In [None]:
# Columns shared by all four language splits.
columns = ["title_eng", "content_eng_short", "impact_length_idx", "language"]

# Restrict every split to the shared columns, in the same order.
eng_seg, fre_seg, kor_seg, chn_seg = [
    split[columns] for split in (eng_seg, fre_seg, kor_seg, chn_seg)
]

In [None]:
# Stack the four language splits into one frame with a fresh 0..n-1 index.
train_df = pd.concat((eng_seg, fre_seg, kor_seg, chn_seg), ignore_index=True)
print(f"Train dataset for short news article has {len(train_df)} samples.")

In [None]:
# Switch to the generic column names: title / content / label.
train_df = train_df.rename(
    {
        "title_eng": "title",
        "content_eng_short": "content",
        "impact_length_idx": "label",
    },
    axis="columns",
)

In [None]:
# Summary statistics of the per-row word counts of the final content column.
train_df["content"].map(word_count).describe()

# Step 4: Augment the dataset