# Notebook - DeepL Translation

In [1]:
import os
import pandas as pd
import zhconv
from tqdm import tqdm
import time
import deepl

F:\Softwares\Anaconda\envs\Data_Science\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
F:\Softwares\Anaconda\envs\Data_Science\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


### API Connection

In [35]:
# Read the DeepL API key from the DEEPL_TOKEN environment variable
# (raises KeyError if it is not set) so the secret never lives in the notebook.
auth_key = os.environ["DEEPL_TOKEN"]
# Client object reused by the translation cells below.
translator = deepl.Translator(auth_key)
# Smoke test: translate one short Chinese phrase into American English and
# print the returned text to confirm the key and connection work.
result = translator.translate_text("你好", target_lang="EN-US")
print(result.text)

How are you?


### Import dataset

In [61]:
# Load the per-language training frames from the shared dataset folder.
chn_dataset = pd.read_parquet("../dataset_latest/train_df_Chinese.parquet")
kor_dataset = pd.read_parquet("../dataset_latest/train_df_Korean.parquet")
fre_dataset = pd.read_parquet("../dataset_latest/train_df_French.parquet")
eng_dataset = pd.read_parquet("../dataset_latest/train_df.parquet")

# The "sentence" column is removed from the three non-English frames;
# the English frame is left as loaded.
for non_english_frame in (fre_dataset, kor_dataset, chn_dataset):
    del non_english_frame["sentence"]

# For the Chinese frame, keep the text as loaded under "<column>_traditional",
# then overwrite the working column with its zhconv 'zh-hans' conversion.
for text_column in ("title", "content"):
    chn_dataset[text_column + "_traditional"] = chn_dataset[text_column]
    chn_dataset[text_column] = chn_dataset[text_column].apply(
        lambda text: zhconv.convert(text, 'zh-hans')
    )

### Data Cleaning

In [62]:
def _strip_newlines(text):
    """Return text with every newline character removed and outer whitespace trimmed."""
    return text.replace("\n", "").strip()


# Each entry pairs a frame with the text columns that get normalised in it.
cleaning_plan = [
    (chn_dataset, ["title", "title_traditional", "content", "content_traditional"]),
    (kor_dataset, ["title", "content"]),
    (fre_dataset, ["title", "content"]),
    (eng_dataset, ["title", "content"]),
]

# 1 Drop duplicates (in place, so the existing frame objects are updated)
for frame, _ in cleaning_plan:
    frame.drop_duplicates(inplace=True)

# 2 delete "\n", strip text
for frame, text_columns in cleaning_plan:
    for column in text_columns:
        frame[column] = frame[column].apply(_strip_newlines)

### Translations

In [27]:
# Language whose dataset will be translated in the cells below.
data_language = "Korean"
# Accumulator for the translated rows; starts out empty.
translation_lst = list()

In [28]:
# Pick the frame that matches data_language; any other value falls back to the
# English frame. A copy is taken so the source frame is not modified later.
translation_dataset = {
    "Chinese": chn_dataset,
    "French": fre_dataset,
    "Korean": kor_dataset,
}.get(data_language, eng_dataset).copy()

In [42]:
# Start at len(translation_lst) so that rows already present in the list
# (e.g. from an earlier partial run of this cell) are not sent again.
first_pending = len(translation_lst)
for row_pos in tqdm(range(first_pending, len(translation_dataset))):
    record = translation_dataset.iloc[row_pos]

    # translate the title first, then the content, each into American English
    translated_pair = [
        translator.translate_text(str(record[field]), target_lang="EN-US").text
        for field in ('title', 'content')
    ]

    # store the [title, content] translations for this row
    translation_lst.append(translated_pair)

    # pause one second between rows to slow down the request rate
    time.sleep(1)

100%|██████████| 159/159 [06:36<00:00,  2.49s/it]


In [43]:
# Attach the translations to the dataset. Every entry of translation_lst is a
# [title_translation, content_translation] pair appended by the translation loop.
translation_dataset["translation"] = translation_lst
# Split the pair into two string columns: element 0 is the translated title,
# element 1 the translated content. (Previously title_eng received the whole
# pair, so it held lists instead of the translated title string.)
translation_dataset["title_eng"] = translation_dataset["translation"].apply(lambda x: x[0])
translation_dataset["content_eng"] = translation_dataset["translation"].apply(lambda x: x[1])
# save to local disk
translation_dataset.to_parquet("../dataset_latest/Translation_"+data_language+"_Dataset.parquet")

In [63]:
# The English frame needs no translation: the "<column>_eng" columns are
# plain copies of the source columns, giving it the same layout as the
# translated frames before it is written to disk.
for source_col in ("title", "content"):
    eng_dataset[source_col + "_eng"] = eng_dataset[source_col]
eng_dataset.to_parquet("../dataset_latest/Translation_" + "English" + "_Dataset.parquet")

### Load Translation Dataset

In [6]:
# Reload the four saved per-language translation files, in the order
# English, Chinese, Korean, French.
trans_eng, trans_chn, trans_kor, trans_fre = (
    pd.read_parquet(f"../dataset_latest/Translation_{language_name}_Dataset.parquet")
    for language_name in ("English", "Chinese", "Korean", "French")
)

In [7]:
# Columns shared by every per-language frame, in the order they are kept.
shared_columns = [
    "url",
    "title",
    "content",
    "impact_length_idx",
    "language",
    "title_eng",
    "content_eng",
]

trans_chn = trans_chn[shared_columns]
trans_kor = trans_kor[shared_columns]
trans_fre = trans_fre[shared_columns]
# For the English frame, additionally keep only rows labelled "English".
trans_eng = trans_eng.loc[trans_eng["language"].eq("English"), shared_columns]

In [8]:
# Stack the per-language frames (English first) into one frame and save it.
language_frames = [trans_eng, trans_chn, trans_kor, trans_fre]
df = pd.concat(language_frames)
df.to_parquet("../dataset_latest/Translation_Dataset.parquet")

In [9]:
# Render trans_eng as the cell output for visual inspection.
trans_eng



Unnamed: 0,url,title,content,impact_length_idx,language,title_eng,content_eng
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...
1,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...
2,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: “Personalised portfolios demand th...,1.0,English,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: “Personalised portfolios demand th...
3,https://www.esgtoday.com/ukraine-war-inflation...,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...,2.0,English,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...
4,https://www.esgtoday.com/eu-regulators-welcome...,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...,0.0,English,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...
...,...,...,...,...,...,...,...
540,https://www.esgtoday.com/methane-emissions-det...,Methane Emissions Detection Platform Kuva Rais...,"Stefan Bokaemper, CEO of Kuva Systems, said: “...",1.0,English,Methane Emissions Detection Platform Kuva Rais...,"Stefan Bokaemper, CEO of Kuva Systems, said: “..."
541,https://www.esgtoday.com/eaton-appoints-harold...,Eaton Appoints Harold Jones as Chief Sustainab...,Eaton Appoints Harold Jones as Chief Sustainab...,1.0,English,Eaton Appoints Harold Jones as Chief Sustainab...,Eaton Appoints Harold Jones as Chief Sustainab...
542,https://www.esgtoday.com/ssga-outlines-2021-st...,"SSGA Outlines 2021 Stewardship Priorities, Wil...","In his letter, Taraporevala wrote: “As a signa...",0.0,English,"SSGA Outlines 2021 Stewardship Priorities, Wil...","In his letter, Taraporevala wrote: “As a signa..."
543,https://www.esgtoday.com/survey-investors-shif...,Survey: Investors Shifting to Offense on Clima...,O’Brien said: “Investors globally are increasi...,0.0,English,Survey: Investors Shifting to Offense on Clima...,O’Brien said: “Investors globally are increasi...
