In [1]:
from datasets import load_dataset
from deep_translator import GoogleTranslator

In [2]:
raw_dataset = load_dataset("go_emotions", name="raw")
simplified_dataset = load_dataset("go_emotions", name="simplified")

translator = GoogleTranslator(source="en", target="ru")


In [3]:
def translate_samples(samples):
    original_text = samples["text"]
    translated_batch = translator.translate_batch(original_text)

    # If the input text does not cointain any words, like ":(" or ":)"
    # translator will return None. So, I replace Nones with original text
    # as you cannont translate emojis and other stuff like this.
    for i in range(len(translated_batch)):
        if not translated_batch[i]:
            translated_batch[i] = original_text[i]
            print(f"Replaced {original_text[i]} vs {translated_batch[i]}")

    samples["ru_text"] = translated_batch
    return samples


In [4]:
ru_simplified_dataset = simplified_dataset.map(
    translate_samples, batched=True, batch_size=500
)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Replaced :^( vs :^(
Replaced :(( vs :((
Replaced (:) vs (:)


Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [5]:
ru_raw_dataset = raw_dataset.map(translate_samples, batched=True, batch_size=500)


Map:   0%|          | 0/211225 [00:00<?, ? examples/s]

Replaced :^) vs :^)
Replaced (:) vs (:)
Replaced :^( vs :^(
Replaced (:) vs (:)
Replaced 😘 ☂ ☂️☂️ vs 😘 ☂ ☂️☂️
Replaced :(( vs :((
Replaced :^) vs :^)
Replaced :^) vs :^)
Replaced :^( vs :^(
Replaced :^) vs :^)
Replaced :^( vs :^(
Replaced 😘 ☂ ☂️☂️ vs 😘 ☂ ☂️☂️
Replaced :(( vs :((
Replaced (:) vs (:)
Replaced :(( vs :((
Replaced 😘 ☂ ☂️☂️ vs 😘 ☂ ☂️☂️
Replaced :^) vs :^)


In [11]:
ru_simplified_dataset.push_to_hub("ru_go_emotions", config_name="simplified")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

In [12]:
ru_raw_dataset.push_to_hub("ru_go_emotions", config_name="raw")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/212 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [15]:
for split in ["train", "validation", "test"]:
    ru_simplified_dataset[split].to_csv(f"ru-go-emotions-simplified-{split}.csv")


Creating CSV from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

In [17]:
ru_raw_dataset["train"].to_csv("ru-go-emotions-raw.csv")

Creating CSV from Arrow format:   0%|          | 0/212 [00:00<?, ?ba/s]

70995325