In [56]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:
import json
import pandas as pd

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [5]:
%cd drive/MyDrive/github/ml-esg-3

/content/drive/MyDrive/github/ml-esg-3


In [136]:
COLUMNS = ["url", "title", "content", "impact_length_idx", "language"]

# English Dataset

In [142]:
eng_path = "dataset/ML-ESG-3_Trainset_English.json"
with open(eng_path) as f:
  eng_json = json.load(fp=f)
eng_df = pd.DataFrame.from_dict(eng_json)

In [143]:
eng_df.head(1)

Unnamed: 0,URL,news_title,news_content,impact_level,impact_length
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,low,2 to 5 years


In [144]:
print(len(eng_df))

545


In [145]:
eng_df.groupby("impact_level")["URL"].count()

impact_level
high      196
low       106
medium    243
Name: URL, dtype: int64

In [146]:
eng_df.groupby("impact_length")["URL"].count()

impact_length
2 to 5 years         198
Less than 2 years     82
More than 5 years    265
Name: URL, dtype: int64

In [None]:
eng_df = eng_df.rename(columns={"URL": "url",
                                "news_title": "title",
                                "news_content": "content"})

label_dict = {
    "Less than 2 years": 0,
    "2 to 5 years": 1,
    "More than 5 years": 2
}
eng_df["impact_length_idx"] = eng_df["impact_length"].map(label_dict)

eng_df["language"] = "English"
eng_df = eng_df[COLUMNS]

In [148]:
eng_df.head(1)

Unnamed: 0,url,title,content,impact_length_idx,language
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,1,English


# Chinese Dataset

In [53]:
chn_path = "dataset/ML-ESG-3_Trainset_Chinese.json"

In [86]:
with open(chn_path) as f:
  chn_json = json.load(f)
chn_df = pd.DataFrame.from_dict(chn_json)

In [87]:
chn_df.head(1)

Unnamed: 0,pk,URL,News_Headline,Impact_Duration
0,1421,https://esg.businesstoday.com.tw/article/categ...,老闆本人就是打卡機？比爾蓋茲背下員工車牌，計算每人工作時數,[>5]


In [109]:
chn_df["Impact_Duration"] = chn_df["Impact_Duration"].apply(lambda x: x[0])

In [110]:
chn_df.groupby("Impact_Duration")["URL"].count()

Impact_Duration
2~5                      62
<2                       87
>5                      203
NotRelatedtoCompany     593
NotRelatedtoESGTopic     50
Name: URL, dtype: int64

In [134]:
### TODO: load content from url

In [139]:
chn_df = chn_df.rename(columns={
    "URL": "url",
    "News_Headline": "title",
})
label_dict = {
    "<2": 0,
    "2~5": 1,
    ">5": 2,
    "NotRelatedtoCompany": None,
    "NotRelatedtoESGTopic": None
}
chn_df["impact_length_idx"] = chn_df["Impact_Duration"].map(label_dict)
chn_df = chn_df[chn_df["impact_length_idx"].notna()]
chn_df["content"] = None
chn_df["language"] = "Chinese"
chn_df = chn_df[COLUMNS]

In [149]:
chn_df.head(1)

Unnamed: 0,url,title,content,impact_length_idx,language
0,https://esg.businesstoday.com.tw/article/categ...,老闆本人就是打卡機？比爾蓋茲背下員工車牌，計算每人工作時數,,2.0,Chinese


# French Dataset

In [84]:
fr_path = "dataset/ML-ESG-3_Trainset_French.json"
with open(fr_path) as f:
  fr_json = json.load(fp=f)
fr_df = pd.DataFrame.from_dict(fr_json)

In [85]:
fr_df.head(1)

Unnamed: 0,URL,news_title,news_content,impact_level,impact_length
0,https://www.novethic.fr/actualite/energie/tran...,Interdiction à la location des passoires therm...,"Depuis le 1er janvier, les passoires thermique...",low,More than 5 years


In [112]:
print(len(fr_df))

661


In [111]:
fr_df.groupby("impact_length")["URL"].count()

impact_length
2 to 5 years         231
Less than 2 years    131
More than 5 years    299
Name: URL, dtype: int64

In [150]:
fr_df = fr_df.rename(columns={"URL": "url",
                              "news_title": "title",
                              "news_content": "content"})

label_dict = {
    "Less than 2 years": 0,
    "2 to 5 years": 1,
    "More than 5 years": 2
}
fr_df["impact_length_idx"] = fr_df["impact_length"].map(label_dict)

fr_df["language"] = "French"
fr_df = fr_df[COLUMNS]

In [151]:
fr_df.head(1)

Unnamed: 0,url,title,content,impact_length_idx,language
0,https://www.novethic.fr/actualite/energie/tran...,Interdiction à la location des passoires therm...,"Depuis le 1er janvier, les passoires thermique...",2,French


# Japanese Dataset

In [115]:
jp_path = "dataset/ML-ESG-3_Trainset_Japanese.json"
with open(jp_path) as f:
  jp_json = json.load(fp=f)
jp_df = pd.DataFrame.from_dict(jp_json)

In [116]:
jp_df.head(1)

Unnamed: 0,ID,Text,Relevancy,ESG_type,Impact_type,Impact_duration
0,237672_6,「環境レポーティング」、「事業における環境効率」など環境分野に関する取り組みについて高い評価...,Relevant,Environmental,Opportunity,within_2_years


In [117]:
print(len(jp_df))

53


In [118]:
jp_df.groupby("Impact_duration")["ID"].count()

Impact_duration
                          3
between_2_and_5_years     8
longer_than_5_years      15
within_2_years           27
Name: ID, dtype: int64

In [152]:
jp_df.groupby("Relevancy")["ID"].count()

Relevancy
Irrelevant     3
Relevant      50
Name: ID, dtype: int64

In [None]:
# Ignore JP dataset due to small size

# Korean Dataset

In [120]:
kr_path = "dataset/ML-ESG-3_Trainset_Korean.json"
with open(kr_path) as f:
  kr_json = json.load(fp=f)
kr_df = pd.DataFrame.from_dict(kr_json)

In [121]:
kr_df.head(1)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,url,category,title,content,impact_type,impact_duration
0,0,0,https://www.esgeconomy.com/news/articleView.ht...,지속가능경제,액화수소 시대 열린다...연 4만톤 설비 연내 가동,"연내에 두산애너빌리티와 SK E&S, 효성중공업의 액화수소 플랜트가 준공돼 연간 최...",opportunity,2 to 5 years


In [123]:
print(len(kr_df))

800


In [122]:
kr_df.groupby("impact_duration")["url"].count()

impact_duration
2 to 5 years         142
less than 2 years    446
more than 5 years    212
Name: url, dtype: int64

In [125]:
kr_df.groupby("impact_type")["url"].count()

impact_type
cannot distinguish    109
opportunity           462
risk                  229
Name: url, dtype: int64

In [154]:
label_dict = {
    "less than 2 years": 0,
    "2 to 5 years": 1,
    "more than 5 years": 2
}
kr_df["impact_length_idx"] = kr_df["impact_duration"].map(label_dict)
kr_df["language"] = "Korean"
kr_df = kr_df[COLUMNS]

In [155]:
kr_df.head(1)

Unnamed: 0,url,title,content,impact_length_idx,language
0,https://www.esgeconomy.com/news/articleView.ht...,액화수소 시대 열린다...연 4만톤 설비 연내 가동,"연내에 두산애너빌리티와 SK E&S, 효성중공업의 액화수소 플랜트가 준공돼 연간 최...",1,Korean


# Data Aggregation

In [160]:
df = pd.concat([eng_df, chn_df, fr_df, kr_df], ignore_index=True)

In [162]:
df.groupby("language").count()

Unnamed: 0_level_0,url,title,content,impact_length_idx
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese,352,352,0,352
English,545,545,545,545
French,661,661,661,661
Korean,800,800,800,800


In [163]:
df.groupby("impact_length_idx").count()

Unnamed: 0_level_0,url,title,content,language
impact_length_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,746,746,659,746
1.0,633,633,571,633
2.0,979,979,776,979


In [161]:
print(len(df))

2358


In [164]:
df.to_parquet("dataset/train_df.parquet")