In [31]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize


In [32]:
df_English = pd.read_parquet("../dataset/train_df_English_translated.parquet")
df_French = pd.read_parquet("../dataset/train_df_French_translated.parquet")
df_Korean = pd.read_parquet("../dataset/train_df_Korean_translated.parquet")
df_Chinese = pd.read_parquet("../dataset/train_df_Chinese_translated.parquet")

In [33]:
columns = ["sentence", "Translation", "impact_length_idx", "language"]

In [34]:
df_English["Translation"] = df_English["sentence"]
df_English = df_English[columns]
df_Chinese = df_Chinese[columns]
df_Korean = df_Korean[columns]
df_French = df_French[columns]

In [35]:
df = pd.concat([df_English, df_Chinese, df_Korean, df_French]).reset_index(drop=True)

In [36]:
df.to_parquet("../dataset/train_df_all_english.parquet")

# Word Count EDA

In [37]:
def word_count(text):
    return len(text.split(" "))

In [38]:
df["wc"] = df["Translation"].map(word_count)

In [39]:
df.groupby("language").agg({"wc": np.mean})

  df.groupby("language").agg({"wc": np.mean})


Unnamed: 0_level_0,wc
language,Unnamed: 1_level_1
Chinese,903.184659
English,73.86422
French,96.636914
Korean,555.97


# Preprocess Chinese text

In [40]:
# filter very short sentences and non-sentences (defined by no . at the end of the sentence)
def Chinese_article_preprocess(article):
    sentences = article.split("\n")
    filtered_sentences = [text for text in sentences if len(text.split(" ")) > 5 ]
    title = filtered_sentences[0]
    filtered_sentences = filtered_sentences[1:]
    filtered_sentences = [text for text in filtered_sentences if text[-1] == "."]
    article = title + " " + " ".join(filtered_sentences)
    return article

In [41]:
df_Chinese["processed"] = df_Chinese["Translation"].map(Chinese_article_preprocess)

In [42]:
# we reduced the average word count from 900 to 750.
df_Chinese["wc"] = df_Chinese["processed"].map(word_count)
print(np.mean(df_Chinese["wc"]))

749.0795454545455


In [43]:
df_Chinese['sentences'] = df_Chinese['processed'].apply(sent_tokenize)

def group_sentences(sentences):
    return [sentences[i:i+5] for i in range(0, len(sentences), 5)]

df_Chinese['grouped_sentences'] = df_Chinese['sentences'].apply(group_sentences)

new_df_Chinese = df_Chinese.explode('grouped_sentences')

new_df_Chinese["grouped_sentences"] = new_df_Chinese["grouped_sentences"].apply(lambda x: " ".join(x))

In [44]:
print(len(df_Chinese))
print(len(new_df_Chinese))

352
1717


In [45]:
new_df_Chinese["wc"] = new_df_Chinese["grouped_sentences"].apply(word_count)
print(np.mean(new_df_Chinese["wc"]))

153.57483983692487


In [46]:
new_df_Chinese.groupby("impact_length_idx")["grouped_sentences"].count()

impact_length_idx
0.0     337
1.0     315
2.0    1065
Name: grouped_sentences, dtype: int64

# Preprocess Korean

In [47]:
df_Korean['sentences'] = df_Korean['Translation'].apply(sent_tokenize)

def group_sentences(sentences):
    return [sentences[i:i+5] for i in range(0, len(sentences), 5)]

df_Korean['grouped_sentences'] = df_Korean['sentences'].apply(group_sentences)

new_df_Korean = df_Korean.explode('grouped_sentences')

new_df_Korean["grouped_sentences"] = new_df_Korean["grouped_sentences"].apply(lambda x: " ".join(x))

In [48]:
new_df_Korean["wc"] = new_df_Korean["grouped_sentences"].apply(word_count)
print(np.mean(new_df_Korean["wc"]))

121.0954458685574


In [49]:
print(len(df_Korean))
print(len(new_df_Korean))

800
3667


In [50]:
new_df_Korean.groupby("impact_length_idx")["grouped_sentences"].count()

impact_length_idx
0.0    1971
1.0     695
2.0    1001
Name: grouped_sentences, dtype: int64

In [51]:
new_df_Korean.to_parquet("../dataset/train_df_Korean_translated_segmented.parquet")
new_df_Chinese.to_parquet("../dataset/train_df_Chinese_translated_segmented.parquet")

In [57]:
df_English = pd.read_parquet("../dataset/train_df_English_translated.parquet")
df_French = pd.read_parquet("../dataset/train_df_French_translated.parquet")
df_Korean = pd.read_parquet("../dataset/train_df_Korean_translated_segmented.parquet")
df_Chinese = pd.read_parquet("../dataset/train_df_Chinese_translated_segmented.parquet")

In [58]:
columns = ["sentence", "Translation", "impact_length_idx", "language"]
df_English["Translation"] = df_English["sentence"]

In [59]:
df_English = df_English[columns]
df_Chinese = df_Chinese[columns]
df_Korean = df_Korean[columns]
df_French = df_French[columns]

In [60]:
df = pd.concat([df_English, df_Chinese, df_Korean, df_French]).reset_index(drop=True)

## Remove title

In [61]:
def title_remove(x):
    if "||" in x:
        return x.split("||")[1]
    else:
        return x

In [62]:
df["Translation"] = df["Translation"].apply(lambda x:title_remove(x))

In [63]:
df.to_parquet("../dataset/train_df_all_english_segmented.parquet")