# Create back-translations

Create back-translations from a source language to a target language using multinomial sampling.

## Setup environment

Restart the kernel after you have installed packages with `pip install` in the Notebook cell below.

In [1]:
!pip install -q -U sentencepiece transformers datasets sacrebleu lxml sentence-transformers accelerate fastai

[0m

In [2]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): new_session=False is expected to reuse a token that is already saved
# instead of prompting again — confirm against the huggingface_hub documentation.
notebook_login(new_session=False)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Imports, config, etc

In [3]:
import os
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pandas as pd
import json

from utils import cleanup, set_seed
from translation import translate, back_translate

In [4]:
# Seed value passed to set_seed() in the following cell.
RANDOM_SEED = 27  # Set `RANDOM_SEED = None` to run without a seed

In [5]:
# Seed only when a seed is configured; RANDOM_SEED = None skips seeding entirely.
# set_seed comes from the local utils module.
if RANDOM_SEED is not None:
    set_seed(RANDOM_SEED, reproducible=True)

In [6]:
def _save(savedir="./data", df=None, json_data=None, split=""):
    if df is None and json_data is None:
        print("Nothing to save")
        return None

    if not os.path.exists("./data"):
        os.mkdir("data")

    if df is not None:
        _ = df_bt.to_csv(f"data/dataset_bt_{DATA_SPLIT}.txt", sep="|", index=False)

    if json_data is not None:
        with open(f"data/dataset_bt_{DATA_SPLIT}.json", "w") as f:
            json.dump(back_translations_json, f, indent=4)

## Load dataset

In [7]:
# Fetch the parallel corpus; the cells below read its "train", "validation" and "test"
# splits, each row carrying an "ID" and a "translation" dict with "dyu" and "fr" entries.
ds = load_dataset("uvci/Koumankan_mt_dyu_fr")

Downloading readme:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/530k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8065 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1471 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1393 [00:00<?, ? examples/s]

Save the data locally as JSON files for ease of access later:

In [8]:
# Create the output directory in a single call; exist_ok makes this safe to re-run
# and removes the separate exists() check.
os.makedirs("data", exist_ok=True)

for split in ("train", "validation", "test"):
    # Keep only the ID and the two language fields of every row.
    records = [
        {
            "ID": row["ID"],
            "translation": {
                "dyu": row["translation"]["dyu"],
                "fr": row["translation"]["fr"],
            },
        }
        for row in ds[split]
    ]
    # One file per split: data/dataset_<split>.json
    with open(f"data/dataset_{split}.json", "w") as f:
        json.dump({"split": split, "data": records}, f, indent=4)

## Create back-translations from the training data

Create back-translations from a source language (`src_lang`) to a target language (`tgt_lang`) using multinomial sampling. If `sample_src==True` then a source language to source language 'translation' is first generated to create more diverse training examples.

We loop over the `back_translate` function several times to create more diverse translation examples through sampling.

In [9]:
# Settings for this section; the cells below read these names directly.
MODEL_ID = './saved_models/fra-dyu-600M'  # local checkpoint loaded as `model` / `tokenizer`
DATA_SPLIT = "train"  # dataset split passed to back_translate and used in output file names
SRC_LANG = "fra_Latn"  # source language code given to the tokenizer and back_translate
TGT_LANG = "dyu_Latn"  # target language code given to the tokenizer and back_translate
SRC2SRC_SAMPLING = True  # forwarded to back_translate as sample_src
SRC2SRC_MODEL_ID = "facebook/nllb-200-distilled-1.3B"  # checkpoint loaded as `src2src_model` when sampling is on

In [10]:
# Model and tokenizer that produce the back-translations (SRC_LANG -> TGT_LANG).
tokenizer = NllbTokenizer.from_pretrained(MODEL_ID, src_lang=SRC_LANG, tgt_lang=TGT_LANG)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, device_map="cuda", torch_dtype=torch.bfloat16)
print(f"Memory footprint: {model.get_memory_footprint() / 1024**3 :.2f}GB")

# Bind the src2src names unconditionally: the back_translate(...) calls further down
# always reference them, so leaving them undefined when sampling is switched off
# would raise a NameError there.
src2src_tokenizer = None
src2src_model = None

if SRC2SRC_SAMPLING and SRC2SRC_MODEL_ID is not None:
    # Second model, loaded only when source-to-source sampling is enabled.
    src2src_tokenizer = NllbTokenizer.from_pretrained(SRC2SRC_MODEL_ID, src_lang=SRC_LANG, tgt_lang=TGT_LANG)
    src2src_model = AutoModelForSeq2SeqLM.from_pretrained(SRC2SRC_MODEL_ID, device_map="cuda", torch_dtype=torch.bfloat16)
    print(f"Memory footprint: {src2src_model.get_memory_footprint() / 1024**3 :.2f}GB")

Memory footprint: 0.68GB


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.48G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Memory footprint: 2.56GB


In [11]:
# Accumulators filled across all sampling rounds.
back_translations, src_translations = [], []

# Every round runs back_translate over the whole split again; the two returned
# sequences are appended to the running lists in the same order.
for _ in range(15):
    dyu, fra = back_translate(
        ds,
        model,
        tokenizer,
        split=DATA_SPLIT,
        batch_size=64,
        src_lang=SRC_LANG,
        tgt_lang=TGT_LANG,
        sample_src=SRC2SRC_SAMPLING,
        src2src_model=src2src_model,
        src2src_tokenizer=src2src_tokenizer,
    )
    back_translations.extend(dyu)
    src_translations.extend(fra)

  0%|          | 0/8065 [00:00<?, ?it/s]

2024-08-29 21:59:43.033859: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-29 21:59:43.033920: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-29 21:59:43.035068: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-29 21:59:43.041022: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

  0%|          | 0/8065 [00:00<?, ?it/s]

In [12]:
# The two lists become the columns of one DataFrame in the next cell, so they must line up.
assert len(back_translations) == len(src_translations)

In [13]:
# Pair each back-translation with its source sentence and drop repeated (dyu, fr) rows.
df_bt = (
    pd.DataFrame({"dyu": back_translations, "fr": src_translations})
    .drop_duplicates()
)
df_bt

Unnamed: 0,dyu,fr
0,a bi ji min na,il boit de l'eau
1,a bi dalakolora lonbɛ,il se plaint toujours
2,ko komi fɛn,que comme quelque
3,bɛɛ bi bɔ gubeta kɔ,tout le monde sort sauf gubetta
4,ale le bi mankan kɛra,c'est lui il sonne
...,...,...
120962,nin maɲi,c'est pas bien cela
120964,kamele belebeleba ni barakamanw,des mecs grands et forts
120968,sira binani ni binani,mille sept cent soixante sept de deyrier route...
120969,bi zan jara kalanso ra,aujourd'hui à l'école jean jaures


In [14]:
# Same layout as the dataset JSON files: a split name plus a list of records.
# Every record carries the constant ID 0.
back_translations_json = {
    "split": DATA_SPLIT,
    "data": [
        {"ID": 0, "translation": {"dyu": dyu_text, "fr": fr_text}}
        for dyu_text, fr_text in zip(df_bt["dyu"], df_bt["fr"])
    ],
}

In [15]:
# Persist the de-duplicated pairs twice: as a "|"-separated .txt file and as a .json file.
_save(savedir="./data", df=df_bt, json_data=back_translations_json, split=DATA_SPLIT)

In [16]:
# cleanup() comes from the local utils module.
# NOTE(review): presumably releases cached memory between sections — confirm in utils.
cleanup()

## Create back-translations from the validation data

Create back-translations from a source language (`src_lang`) to a target language (`tgt_lang`) using multinomial sampling. If `sample_src==True` then a source language to source language 'translation' is first generated to create more diverse training examples.

We loop over the `back_translate` function several times to create more diverse translation examples through sampling.

In [17]:
# Only the split changes here; the model, tokenizer and language settings set
# earlier in the notebook are reused as they are.
DATA_SPLIT = "validation"

In [18]:
# Start from empty accumulators so results from the previous split are not mixed in.
back_translations, src_translations = [], []

# Ten sampling rounds over the split, with a smaller batch size than the training run.
for _ in range(10):
    dyu, fra = back_translate(
        ds,
        model,
        tokenizer,
        split=DATA_SPLIT,
        batch_size=16,
        src_lang=SRC_LANG,
        tgt_lang=TGT_LANG,
        sample_src=SRC2SRC_SAMPLING,
        src2src_model=src2src_model,
        src2src_tokenizer=src2src_tokenizer,
    )
    back_translations.extend(dyu)
    src_translations.extend(fra)

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

  0%|          | 0/1471 [00:00<?, ?it/s]

In [19]:
# The two lists become the columns of one DataFrame in the next cell, so they must line up.
assert len(back_translations) == len(src_translations)

In [20]:
# Pair each back-translation with its source sentence and drop repeated (dyu, fr) rows.
df_bt = (
    pd.DataFrame({"dyu": back_translations, "fr": src_translations})
    .drop_duplicates()
)
df_bt

Unnamed: 0,dyu,fr
0,i bi tɔgɔ diman le fɛ,tu porte un nom de fantaisie
1,ka taga ɲafɛ nin na saba,à trois points d'avance
2,tile bi wolola,le soleil est couché
3,a bi taga ni cogo kelen ye,dans le même mouvement
4,n ma dumini kɛ fɔlɔ,je n'ai pas encore déjeuné
...,...,...
14703,a tun ti yi kalo kelen kɔnɔ tuma dɔra,il était absent une semaine de temps en temps
14704,i ka sara ye juma le ye,quel est ton salaire
14706,gestɛl ka sira,rue de gestel
14708,a ye fo ka fo john fo n ye,dites au revoir à john de ma part


In [21]:
# Same layout as the dataset JSON files: a split name plus a list of records.
# Every record carries the constant ID 0.
back_translations_json = {
    "split": DATA_SPLIT,
    "data": [
        {"ID": 0, "translation": {"dyu": dyu_text, "fr": fr_text}}
        for dyu_text, fr_text in zip(df_bt["dyu"], df_bt["fr"])
    ],
}

In [22]:
# Persist the de-duplicated pairs twice: as a "|"-separated .txt file and as a .json file.
_save(savedir="./data", df=df_bt, json_data=back_translations_json, split=DATA_SPLIT)

In [23]:
# cleanup() comes from the local utils module.
# NOTE(review): presumably releases cached memory between sections — confirm in utils.
cleanup()

## Create back-translations from the test data

Create back-translations from a source language (`src_lang`) to a target language (`tgt_lang`) using multinomial sampling. If `sample_src==True` then a source language to source language 'translation' is first generated to create more diverse training examples.

We loop over the `back_translate` function several times to create more diverse translation examples through sampling.

In [24]:
# Settings for the test split: the translation direction is reversed relative to the
# earlier sections (source is now dyu_Latn, target is fra_Latn).
MODEL_ID = './saved_models/dyu-fra-600M'  # local checkpoint loaded as `model` / `tokenizer`
DATA_SPLIT = "test"  # dataset split passed to back_translate and used in output file names
SRC_LANG = "dyu_Latn"  # source language code given to the tokenizer and back_translate
TGT_LANG = "fra_Latn"  # target language code given to the tokenizer and back_translate
SRC2SRC_SAMPLING = False  # forwarded to back_translate as sample_src
SRC2SRC_MODEL_ID = None  # no source-to-source checkpoint is configured for this section

In [25]:
# Rebind `tokenizer` and `model` to the checkpoint configured in MODEL_ID.
# NOTE(review): src2src_model / src2src_tokenizer are not touched here, so the next
# cell still passes whatever objects an earlier cell bound to those names.
tokenizer = NllbTokenizer.from_pretrained(MODEL_ID, src_lang=SRC_LANG, tgt_lang=TGT_LANG)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Memory footprint: {footprint_gb :.2f}GB")

Memory footprint: 0.68GB


In [26]:
# Start from empty accumulators so results from the previous split are not mixed in.
back_translations, src_translations = [], []

# Five sampling rounds. The loop variables are named by role rather than by language:
# in this section the source is dyu_Latn, so the first returned sequence (collected as
# back-translations) ends up in the "fr" column and the second in the "dyu" column.
for _ in range(5):
    tgt_texts, src_texts = back_translate(
        ds,
        model,
        tokenizer,
        split=DATA_SPLIT,
        batch_size=64,
        src_lang=SRC_LANG,
        tgt_lang=TGT_LANG,
        sample_src=SRC2SRC_SAMPLING,
        src2src_model=src2src_model,
        src2src_tokenizer=src2src_tokenizer,
    )
    back_translations.extend(tgt_texts)
    src_translations.extend(src_texts)

  0%|          | 0/1393 [00:00<?, ?it/s]

  0%|          | 0/1393 [00:00<?, ?it/s]

  0%|          | 0/1393 [00:00<?, ?it/s]

  0%|          | 0/1393 [00:00<?, ?it/s]

  0%|          | 0/1393 [00:00<?, ?it/s]

In [27]:
# The two lists become the columns of one DataFrame in the next cell, so they must line up.
assert len(back_translations) == len(src_translations)

In [28]:
# Column assignment is swapped relative to the earlier sections: here the source
# texts fill "dyu" and the back-translations fill "fr". Repeated rows are dropped.
df_bt = (
    pd.DataFrame({"dyu": src_translations, "fr": back_translations})
    .drop_duplicates()
)
df_bt

Unnamed: 0,dyu,fr
0,an kelen duron le tun be yi,nous étions seuls
1,o ka papiye farana,leurs papiers sont cassés
2,n tɔrɔla kɔ tuguni,je souffre encore
3,i tun b'a daminɛ tan kɛ,tu débutais à dix
4,a kɛra ka ban,il est arrivé
...,...,...
6949,bi saba,trente et un livres
6952,a bi dhii tcjaman min ka sôrô ka la,il est le dernier de ces trois espèces
6955,en bè kountara dougou konnon,on l’achèterait tous à la ville
6956,an bina daminè tougou,on recommence par là


In [29]:
# Same layout as the dataset JSON files: a split name plus a list of records.
# Every record carries the constant ID 0.
back_translations_json = {
    "split": DATA_SPLIT,
    "data": [
        {"ID": 0, "translation": {"dyu": dyu_text, "fr": fr_text}}
        for dyu_text, fr_text in zip(df_bt["dyu"], df_bt["fr"])
    ],
}

In [30]:
# Persist the de-duplicated pairs twice: as a "|"-separated .txt file and as a .json file.
_save(savedir="./data", df=df_bt, json_data=back_translations_json, split=DATA_SPLIT)

In [31]:
# cleanup() comes from the local utils module.
# NOTE(review): presumably releases cached memory between sections — confirm in utils.
cleanup()