# Notebook - Training Dataset Preparation

In [64]:
import pandas as pd

### Load dataset

In [65]:
eng_seg = pd.read_parquet("../dataset/train_df_English_translated.parquet")
fre_seg = pd.read_parquet("../dataset/train_df_French_translated.parquet")
kor_seg = pd.read_parquet("../dataset/train_df_Korean_translated_segmented.parquet")
chn_esg = pd.read_parquet("../dataset/train_df_Chinese_translated_segmented.parquet")

In [66]:
chn_esg["title"] = chn_esg["sentence"].apply(lambda x: x.split("||")[0].strip())
kor_seg["title"] = kor_seg["sentence"].apply(lambda x: x.split("||")[0].strip())
chn_esg = chn_esg[["title", "language", "grouped_sentences","impact_length_idx"]]
kor_seg = kor_seg[["title", "language", "grouped_sentences","impact_length_idx"]]

eng_seg["grouped_sentences"] = eng_seg["sentence"]
fre_seg["grouped_sentences"] = fre_seg["sentence"]
eng_seg = eng_seg[["title", "language", "grouped_sentences","impact_length_idx"]]
fre_seg = fre_seg[["title", "language", "grouped_sentences","impact_length_idx"]]

In [67]:
df = pd.concat([chn_esg, fre_seg, kor_seg, eng_seg])
df["source"] = "normal"

### Load augmented dataset

In [68]:
aug_seg = pd.read_parquet("../dataset/train_df_English_augmented.parquet")

In [69]:
aug_seg["title"] = aug_seg["title"].apply(lambda x: x[1:len(x)])
aug_seg["grouped_sentences"] = aug_seg["sentence"].apply(lambda x: x[1:len(x)-1])
aug_seg["source"] = "gpt"
aug_seg = aug_seg[["title", "language", "grouped_sentences","impact_length_idx","source"]]

In [70]:
df = pd.concat([df, aug_seg])

In [71]:
groups = list(set(df["title"].tolist()))
group_indic_dict = {}
for i,v in enumerate(groups):
    group_indic_dict[v] = i

In [72]:
df["group_indicator"] = df["title"].apply(lambda x: group_indic_dict[x])
df["feature"] = df["grouped_sentences"]
df["label"] = df["impact_length_idx"]
df = df[["source", "group_indicator", "language", "title", "feature", "label"]]
df

Unnamed: 0,source,group_indicator,language,title,feature,label
545,normal,1547,Chinese,老闆本人就是打卡機？比爾蓋茲背下員工車牌，計算每人工作時數,The boss himself is a clock-punching machine? ...,2.0
545,normal,1547,Chinese,老闆本人就是打卡機？比爾蓋茲背下員工車牌，計算每人工作時數,Although Microsoft is now a world-renowned and...,2.0
546,normal,694,Chinese,Mercedes-Maybach也要電動化，全新賓士EQE、AMG首款電動車將齊聚慕尼黑車展！,"Mercedes-Maybach is going electric too, with t...",0.0
546,normal,694,Chinese,Mercedes-Maybach也要電動化，全新賓士EQE、AMG首款電動車將齊聚慕尼黑車展！,"In addition to the already announced EQA, EQB,...",0.0
546,normal,694,Chinese,Mercedes-Maybach也要電動化，全新賓士EQE、AMG首款電動車將齊聚慕尼黑車展！,IAA Mobility Pre-Night on the 5th of September...,0.0
...,...,...,...,...,...,...
85,gpt,652,English,Uber Pledges to be Fully Electric by 2040,Uber Pledges to be Fully Electric by 2040 || R...,2.0
86,gpt,456,English,Kellogg Company Targets 50% Reduction in Globa...,Kellogg Company Targets 50% Reduction in Globa...,2.0
87,gpt,288,English,Tata Motors Aims to be Carbon Neutral by 2039,Tata Motors Aims to be Carbon Neutral by 2039 ...,2.0
88,gpt,1372,English,Starbucks Commits to Achieve Resource Positive...,Starbucks Commits to Achieve Resource Positive...,2.0


In [73]:
df.to_parquet("../dataset/training_dataset.parquet")

In [74]:
eng_seg = pd.read_parquet("../dataset/train_df_English_translated.parquet")

In [75]:
eng_seg

Unnamed: 0,url,title,content,impact_length_idx,language,sentence
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...
1,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...
2,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: \n“Personalised portfolios demand ...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...
3,https://www.esgtoday.com/ukraine-war-inflation...,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...,2.0,English,"Ukraine War, Inflation Reduction Act Driving F..."
4,https://www.esgtoday.com/eu-regulators-welcome...,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...,0.0,English,"EU Regulators Welcome, Critique New European S..."
...,...,...,...,...,...,...
540,https://www.esgtoday.com/methane-emissions-det...,Methane Emissions Detection Platform Kuva Rais...,"Stefan Bokaemper, CEO of Kuva Systems, said: “...",1.0,English,Methane Emissions Detection Platform Kuva Rais...
541,https://www.esgtoday.com/eaton-appoints-harold...,Eaton Appoints Harold Jones as Chief Sustainab...,Eaton Appoints Harold Jones as Chief Sustainab...,1.0,English,Eaton Appoints Harold Jones as Chief Sustainab...
542,https://www.esgtoday.com/ssga-outlines-2021-st...,"SSGA Outlines 2021 Stewardship Priorities, Wil...","In his letter, Taraporevala wrote: “As a signa...",0.0,English,"SSGA Outlines 2021 Stewardship Priorities, Wil..."
543,https://www.esgtoday.com/survey-investors-shif...,Survey: Investors Shifting to Offense on Clima...,O’Brien said: “Investors globally are increasi...,0.0,English,Survey: Investors Shifting to Offense on Clima...


In [76]:
eng_seg_cons = eng_seg.groupby("title").agg({"impact_length_idx":"first",'sentence':"sum","language":"first","content":"sum"})

In [77]:
eng_seg_cons

Unnamed: 0_level_0,impact_length_idx,sentence,language,content
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
$11.4T Investor Coalition Pressures Fast Food Industry on Environmental Record,2.0,$11.4T Investor Coalition Pressures Fast Food ...,English,"A global investor coalition, facilitated by gl..."
100 EU Cities Commit to Reach Climate Neutrality by 2030,2.0,100 EU Cities Commit to Reach Climate Neutrali...,English,The European Commission announced that it has ...
"3M to Invest $1 Billion Towards Achieving New Climate, Water Goals",2.0,3M to Invest $1 Billion Towards Achieving New ...,English,3M’s new goals include achieving carbon neutra...
"Accenture Announces New Sustainability Goals, Appoints Chief Responsibility Officer",2.0,"Accenture Announces New Sustainability Goals, ...",English,The firm’s zero waste initiatives will include...
Accenture Continues Series of ESG Acquisitions with Purchase of Sustainability Consultancy akzente,0.0,Accenture Continues Series of ESG Acquisitions...,English,The deal marks the third recent ESG-focused ac...
...,...,...,...,...
Zero Carbon Building Materials Startup Prometheus Raises $8 Million,2.0,Zero Carbon Building Materials Startup Prometh...,English,"Loren Burnett, Co-Founder, President, and CEO ..."
abrdn Says it Will Vote Against Companies Who Do Not Meet Diversity Expectations,2.0,abrdn Says it Will Vote Against Companies Who ...,English,In a statement announcing the updated DEI requ...
bp Enters Offshore Wind Market in $1.1B Deal and Partnership with Equinor,2.0,bp Enters Offshore Wind Market in $1.1B Deal a...,English,One month after launching its energy transitio...
"dv01 Acquires Pragmic Technologies, to Provide ESG Ratings for Structured Products",1.0,"dv01 Acquires Pragmic Technologies, to Provide...",English,"Charlie Oshman, CEO and Co-Founder of Pragmic ..."
