In [None]:
import json
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Shared output schema: every per-language frame is reduced to exactly these
# columns (in this order) before the frames are concatenated.
COLUMNS = [
    "url",
    "title",
    "content",
    "impact_length_idx",
    "language",
]

# English Dataset

In [None]:
eng_path = "../dataset/ML-ESG-3_Trainset_English.json"
# Decode explicitly as UTF-8 (assumed encoding of the dataset file) so the load
# does not depend on the platform's locale default, e.g. cp1252 on Windows.
with open(eng_path, encoding="utf-8") as f:
  eng_json = json.load(fp=f)
eng_df = pd.DataFrame.from_dict(eng_json)

In [None]:
# Peek at the first raw English record to see which columns are available.
eng_df.head(n=1)

In [None]:
# Number of rows in the English dataset.
print(len(eng_df.index))

In [None]:
# Rows per impact level (count() only counts rows whose URL is non-null).
(
    eng_df
    .groupby(by="impact_level")["URL"]
    .count()
)

In [None]:
# Rows per impact-length label (count() only counts rows whose URL is non-null).
(
    eng_df
    .groupby(by="impact_length")["URL"]
    .count()
)

In [None]:
# Impact-length label -> integer class index.
# Labels missing from this mapping become NaN in impact_length_idx.
label_dict = {
    "Less than 2 years": 0,
    "2 to 5 years": 1,
    "More than 5 years": 2
}

# Rename to the shared column names, add the class-index and language columns,
# then keep only the columns listed in COLUMNS.
eng_df = (
    eng_df
    .rename(columns={
        "URL": "url",
        "news_title": "title",
        "news_content": "content",
    })
    .assign(
        impact_length_idx=lambda frame: frame["impact_length"].map(label_dict),
        language="English",
    )
)[COLUMNS]

In [None]:
# Sanity check: first English record after reduction to the shared schema.
eng_df.head(n=1)

# Chinese Dataset

In [None]:
# Location of the Chinese training-set JSON (relative path, resolved against
# the notebook's working directory).
chn_path = "../dataset/ML-ESG-3_Trainset_Chinese.json"

In [None]:
# Decode explicitly as UTF-8 (assumed encoding of the dataset file) so the
# Chinese text is read correctly regardless of the platform's locale default.
with open(chn_path, encoding="utf-8") as f:
  chn_json = json.load(f)
chn_df = pd.DataFrame.from_dict(chn_json)

In [None]:
# Peek at the first raw Chinese record to see which columns are available.
chn_df.head(n=1)

In [None]:
# Keep only the first entry of each Impact_Duration value (presumably a list of
# labels loaded from JSON -- TODO confirm it never holds more than one).
# The isinstance guard keeps this cell safe to re-run: once the values are
# plain strings, x[0] would silently truncate each label to its first character.
chn_df["Impact_Duration"] = chn_df["Impact_Duration"].apply(
    lambda x: x[0] if isinstance(x, (list, tuple)) else x
)

In [None]:
# Rows per impact-duration label (count() only counts rows whose URL is non-null).
(
    chn_df
    .groupby(by="Impact_Duration")["URL"]
    .count()
)

In [None]:
# Align column names with the shared schema listed in COLUMNS.
chn_df = chn_df.rename(columns={
    "URL": "url",
    "News_Headline": "title",
})
# Impact-duration label -> integer class index. The two "NotRelated..." labels
# map to None so that the corresponding rows are dropped below.
label_dict = {
    "<2": 0,
    "2~5": 1,
    ">5": 2,
    "NotRelatedtoCompany": None,
    "NotRelatedtoESGTopic": None
}
chn_df["impact_length_idx"] = chn_df["Impact_Duration"].map(label_dict)
# Drop rows without a usable label. .copy() yields an independent frame, so the
# column assignments that follow (here and in later cells) do not trigger
# pandas' SettingWithCopyWarning on a filtered slice.
chn_df = chn_df[chn_df["impact_length_idx"].notna()].copy()
chn_df["language"] = "Chinese"

In [None]:
def parse_content(link):
  """Download a news page and return the trimmed text of its article body.

  The page at ``link`` is fetched and parsed with BeautifulSoup. The text of
  the first ``<div class="content_bottom">`` element is extracted, with its
  stripped text pieces joined by newlines. The text is then cut to start at
  the first "日期：" ("Date:") marker and to end just before the first
  "延伸閱讀" ("Further reading") marker. A marker that is absent leaves that
  side of the text untrimmed.

  Args:
    link: URL of the article page.

  Returns:
    The trimmed article text as a string.

  Raises:
    ValueError: If the page contains no ``div.content_bottom`` element.
    Any error raised by ``urllib.request.urlopen`` propagates unchanged.
  """
  # The context manager closes the HTTP response even if reading fails.
  with urllib.request.urlopen(link) as response:
    web_content = response.read()
  soup = BeautifulSoup(web_content, 'html.parser')

  main_content_div = soup.find('div', class_='content_bottom')
  if main_content_div is None:
    # Fail with the offending URL instead of an opaque AttributeError on None.
    raise ValueError(f"No <div class='content_bottom'> found at {link}")
  main_content_text = main_content_div.get_text(separator='\n', strip=True)

  # Discard everything that precedes the date line.
  cut_off_point = main_content_text.find("日期：")
  if cut_off_point != -1:
    main_content_text = main_content_text[cut_off_point:]

  # Discard the trailing "further reading" section; the search runs on the
  # already-trimmed text.
  cut_off_point = main_content_text.find("延伸閱讀")
  if cut_off_point != -1:
    main_content_text = main_content_text[:cut_off_point]
  return main_content_text

# Fetch the article body for every row (one HTTP request per URL, so this cell
# is slow). assign() builds a new frame instead of writing into chn_df, which
# at this point is a filtered slice and would raise SettingWithCopyWarning.
chn_df = chn_df.assign(content=chn_df["url"].apply(parse_content))

In [None]:
# Reduce the Chinese frame to the shared schema, in COLUMNS order.
chn_df = chn_df.loc[:, COLUMNS]

In [None]:
# Sanity check: first Chinese record after reduction to the shared schema.
chn_df.head(n=1)

# French Dataset

In [None]:
fr_path = "../dataset/ML-ESG-3_Trainset_French.json"
# Decode explicitly as UTF-8 (assumed encoding of the dataset file) so accented
# characters are read correctly regardless of the platform's locale default.
with open(fr_path, encoding="utf-8") as f:
  fr_json = json.load(fp=f)
fr_df = pd.DataFrame.from_dict(fr_json)

In [None]:
# Peek at the first raw French record to see which columns are available.
fr_df.head(n=1)

In [None]:
# Number of rows in the French dataset.
print(len(fr_df.index))

In [None]:
# Rows per impact-length label (count() only counts rows whose URL is non-null).
(
    fr_df
    .groupby(by="impact_length")["URL"]
    .count()
)

In [None]:
# Impact-length label -> integer class index.
# Labels missing from this mapping become NaN in impact_length_idx.
label_dict = {
    "Less than 2 years": 0,
    "2 to 5 years": 1,
    "More than 5 years": 2
}

# Rename to the shared column names, add the class-index and language columns,
# then keep only the columns listed in COLUMNS.
fr_df = (
    fr_df
    .rename(columns={
        "URL": "url",
        "news_title": "title",
        "news_content": "content",
    })
    .assign(
        impact_length_idx=lambda frame: frame["impact_length"].map(label_dict),
        language="French",
    )
)[COLUMNS]

In [None]:
# Sanity check: first French record after reduction to the shared schema.
fr_df.head(n=1)

# Japanese Dataset

In [None]:
jp_path = "../dataset/ML-ESG-3_Trainset_Japanese.json"
# Decode explicitly as UTF-8 (assumed encoding of the dataset file) so the
# Japanese text is read correctly regardless of the platform's locale default.
with open(jp_path, encoding="utf-8") as f:
  jp_json = json.load(fp=f)
jp_df = pd.DataFrame.from_dict(jp_json)

In [None]:
# Peek at the first raw Japanese record to see which columns are available.
jp_df.head(n=1)

In [None]:
# Number of rows in the Japanese dataset.
print(len(jp_df.index))

In [None]:
# Rows per impact-duration label (count() only counts rows whose ID is non-null).
(
    jp_df
    .groupby(by="Impact_duration")["ID"]
    .count()
)

In [None]:
# Rows per relevancy value (count() only counts rows whose ID is non-null).
(
    jp_df
    .groupby(by="Relevancy")["ID"]
    .count()
)

In [None]:
# The Japanese dataset is excluded from the aggregated training set because it is too small.

# Korean Dataset

In [None]:
kr_path = "../dataset/ML-ESG-3_Trainset_Korean.json"
# Decode explicitly as UTF-8 (assumed encoding of the dataset file) so the
# Korean text is read correctly regardless of the platform's locale default.
with open(kr_path, encoding="utf-8") as f:
  kr_json = json.load(fp=f)
kr_df = pd.DataFrame.from_dict(kr_json)

In [None]:
# Peek at the first raw Korean record to see which columns are available.
kr_df.head(n=1)

In [None]:
# Number of rows in the Korean dataset.
print(len(kr_df.index))

In [None]:
# Rows per impact-duration label (count() only counts rows whose url is non-null).
(
    kr_df
    .groupby(by="impact_duration")["url"]
    .count()
)

In [None]:
# Rows per impact type (count() only counts rows whose url is non-null).
(
    kr_df
    .groupby(by="impact_type")["url"]
    .count()
)

In [None]:
# Impact-duration label -> integer class index (the keys here are lower-case).
# Labels missing from this mapping become NaN in impact_length_idx.
label_dict = {
    "less than 2 years": 0,
    "2 to 5 years": 1,
    "more than 5 years": 2
}

# Add the class-index and language columns, then keep only the columns listed
# in COLUMNS. No rename step is applied to this frame.
kr_df = kr_df.assign(
    impact_length_idx=lambda frame: frame["impact_duration"].map(label_dict),
    language="Korean",
)[COLUMNS]

In [None]:
# Sanity check: first Korean record after reduction to the shared schema.
kr_df.head(n=1)

# Data Aggregation

In [None]:
# Stack the per-language frames into one table with a fresh 0..n-1 index.
# jp_df is not part of the list.
df = pd.concat(
    [eng_df, chn_df, fr_df, kr_df],
    ignore_index=True,
)

In [None]:
# Non-null value count of every column, per language.
(
    df
    .groupby(by="language")
    .count()
)

In [None]:
# Non-null value count of every column, per class index.
(
    df
    .groupby(by="impact_length_idx")
    .count()
)

In [None]:
# Total number of rows in the aggregated dataset.
print(len(df.index))

In [None]:
# Store the aggregated training set in the same ../dataset/ directory that the
# source JSON files are read from.
# NOTE(review): confirm this is where downstream code expects train_df.parquet.
df.to_parquet("../dataset/train_df.parquet")