In [1]:
import json 
# Load the two parallel corpora (French–Bambara and English–Bambara).
# Context managers close the file handles deterministically, and the explicit
# UTF-8 encoding keeps decoding independent of the platform's default locale.
with open("mt/all_bm_fr.json", "r", encoding="utf-8") as fr_file:
    bm_fr = json.load(fr_file)
with open("mt/all_bm_en.json", "r", encoding="utf-8") as en_file:
    bm_en = json.load(en_file)

In [2]:
# Inspect the first French–Bambara record to see which keys each sample carries.
bm_fr[0]

{'fr': 'Parmi ces points, figurent la révision des réformes institutionnelles découlant de l’accord pour la paix, les questions sécuritaires et de développement.',
 'bm': 'O kɔnɔko yɔrɔw cɛla, fangabulonkow yɛlɛmaniw kɛli, minnu bɛ bɔ bɛnkansɛbɛn na bɛn kama, lakana ani yiriwali koɲɛw b’olu la.',
 'source': 'lafand'}

In [3]:
# Inspect the first English–Bambara record to see which keys each sample carries.
bm_en[0]

{'en': 'I', 'bm': 'Ne', 'source': 'google_smol_gatitos'}

In [4]:
def forme_instruction(dataset: list[dict], source_key: str, target_key: str):
    """Turn parallel sentences into chat-style translation examples, both ways.

    Every sample produces one conversation per prompt wording (Bambara, French,
    English — in that order) asking to translate source -> target, and the same
    number asking to translate target -> source.

    Args:
        dataset: records holding text under ``source_key`` and ``target_key``.
        source_key: language code of the source side ("en", "bm" or "fr").
        target_key: language code of the target side ("en", "bm" or "fr").

    Returns:
        A ``(instruction_forward, instruction_reverse)`` tuple of lists whose
        items look like ``{"messages": [user_turn, assistant_turn]}``.

    Raises:
        KeyError: if a key is not one of the known language codes, or a sample
            lacks one of the two keys.
    """
    display_names = {"en": "English", "bm": "Bambara", "fr": "French"}

    # Dict insertion order (bm, fr, en) determines the order of the examples
    # generated for each sample.
    templates = {
        "bm": "Nin bayɛlɛma {target_lang} la:\n\n{text_to_translate}",
        "fr": "Traduis ce texte en {target_lang} :\n\n{text_to_translate}",
        "en": "Translate this text to {target_lang} :\n\n{text_to_translate}"
    }

    def as_conversation(template, wanted_language, question, answer):
        # One user turn (the filled-in prompt) followed by the reference translation.
        filled = template.format(
            target_lang=wanted_language,
            text_to_translate=question
        )
        return {
            "messages": [
                {"role": "user", "content": filled},
                {"role": "assistant", "content": answer}
            ]
        }

    # Resolved before iterating so an unknown code fails even on an empty dataset.
    source_label = display_names[source_key]
    target_label = display_names[target_key]

    forward_examples = []
    reverse_examples = []

    for record in dataset:
        # source -> target
        forward_examples.extend(
            as_conversation(template, target_label, record[source_key], record[target_key])
            for template in templates.values()
        )
        # target -> source
        reverse_examples.extend(
            as_conversation(template, source_label, record[target_key], record[source_key])
            for template in templates.values()
        )

    return forward_examples, reverse_examples

In [None]:
def forme_instruction_v2(dataset: list[dict], source_key: str, target_key: str):
    """Build chat-format translation examples in both directions, with text cleaning.

    Same output layout as ``forme_instruction``, but every text field is first
    normalised to a stripped ``str`` (the stated goal is to avoid PyArrow
    issues when the result is converted to a dataset), and samples whose
    source or target text is missing or empty after cleaning are skipped.

    Args:
        dataset: records holding text under ``source_key`` and ``target_key``.
        source_key: language code of the source side ("en", "bm" or "fr").
        target_key: language code of the target side ("en", "bm" or "fr").

    Returns:
        A ``(instruction_forward, instruction_reverse)`` tuple of lists whose
        items look like ``{"messages": [user_turn, assistant_turn]}``. For each
        kept sample there is one item per prompt wording (bm, fr, en order).

    Raises:
        KeyError: if ``source_key`` or ``target_key`` is not a known language code.
    """
    lang_names = {
        "en": "English",
        "bm": "Bambara",
        "fr": "French"
    }

    user_instruction = {
        "bm": "Nin bayɛlɛma {target_lang} la:\n\n{text_to_translate}",
        "fr": "Traduis ce texte en {target_lang} :\n\n{text_to_translate}",
        "en": "Translate this text to {target_lang} :\n\n{text_to_translate}"
    }

    instruction_forward = []
    instruction_reverse = []

    source_lang_name = lang_names[source_key]
    target_lang_name = lang_names[target_key]

    def clean_text(text):
        """Return ``text`` as a stripped ``str``; ``None`` becomes the empty string."""
        if text is None:
            return ""

        # Decode bytes BEFORE calling str(): str(b"abc") yields the repr
        # "b'abc'", which previously made this branch unreachable and leaked
        # the b'...' wrapper into the generated examples.
        if isinstance(text, (bytes, bytearray)):
            text = text.decode('utf-8', errors='ignore')

        return str(text).strip()

    for sample in dataset:

        source_text = clean_text(sample.get(source_key, ""))
        target_text = clean_text(sample.get(target_key, ""))

        # Drop pairs where either side is missing or blank.
        if not source_text or not target_text:
            continue

        # source -> target, one example per prompt wording.
        for instr_lang in user_instruction:
            prompt = user_instruction[instr_lang].format(
                target_lang=target_lang_name,
                text_to_translate=source_text
            )
            instruction_forward.append({
                "messages": [
                    {"role": "user", "content": clean_text(prompt)},
                    {"role": "assistant", "content": target_text}
                ]
            })

        # target -> source, one example per prompt wording.
        for instr_lang in user_instruction:
            prompt = user_instruction[instr_lang].format(
                target_lang=source_lang_name,
                text_to_translate=target_text
            )
            instruction_reverse.append({
                "messages": [
                    {"role": "user", "content": clean_text(prompt)},
                    {"role": "assistant", "content": source_text}
                ]
            })

    return instruction_forward, instruction_reverse

In [23]:
# Generate both translation directions for each corpus using the cleaning variant.
fr_instruction_forward, fr_instruction_reverse = forme_instruction_v2(bm_fr, "fr", "bm")
en_instruction_forward, en_instruction_reverse = forme_instruction_v2(bm_en, "en", "bm")

# Merge everything into a single list, keeping the order:
# en reverse, en forward, fr forward, fr reverse.
all_dataset = [
    *en_instruction_reverse,
    *en_instruction_forward,
    *fr_instruction_forward,
    *fr_instruction_reverse,
]

In [26]:
# `Dataset` is not imported in any earlier cell shown, so import it here to
# keep this cell runnable on a fresh kernel.
from datasets import Dataset

# Convert the list of {"messages": [...]} dicts into a Hugging Face Dataset.
dataset_hg = Dataset.from_list(all_dataset)

In [30]:
# Authenticate against the Hugging Face Hub before uploading; called without
# arguments, login() renders the interactive widget shown in the output below.
from huggingface_hub import login 
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
# Upload the dataset to the Hub repository. The second positional argument is
# presumably the config (subset) name — confirm against the installed
# `datasets` version, since the positional order is version-dependent.
dataset_hg.push_to_hub("sudoping01/bambara-instruct-dataset", "mt-instruction")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/560 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sudoping01/bambara-instruct-dataset/commit/74a8845911dc4de559266a429b53e98411c2401d', commit_message='Upload dataset', commit_description='', oid='74a8845911dc4de559266a429b53e98411c2401d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/sudoping01/bambara-instruct-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='sudoping01/bambara-instruct-dataset'), pr_revision=None, pr_num=None)

In [24]:
# Number of conversations in the combined list.
len(all_dataset)

559248

In [5]:
# Build the French<->Bambara examples with the non-cleaning variant.
# NOTE(review): this rebinds the same names the forme_instruction_v2 cell assigns.
fr_instruction_forward, fr_instruction_reverse = forme_instruction(dataset=bm_fr, source_key="fr", target_key="bm")

In [7]:
# Spot-check: first forward example (Bambara-worded prompt, French -> Bambara).
fr_instruction_forward[0]

{'messages': [{'role': 'user',
   'content': 'Nin bayɛlɛma Bambara la:\n\nParmi ces points, figurent la révision des réformes institutionnelles découlant de l’accord pour la paix, les questions sécuritaires et de développement.'},
  {'role': 'assistant',
   'content': 'O kɔnɔko yɔrɔw cɛla, fangabulonkow yɛlɛmaniw kɛli, minnu bɛ bɔ bɛnkansɛbɛn na bɛn kama, lakana ani yiriwali koɲɛw b’olu la.'}]}

In [11]:
# Spot-check: same sample with the French-worded prompt.
fr_instruction_forward[1]

{'messages': [{'role': 'user',
   'content': 'Traduis ce texte en Bambara :\n\nParmi ces points, figurent la révision des réformes institutionnelles découlant de l’accord pour la paix, les questions sécuritaires et de développement.'},
  {'role': 'assistant',
   'content': 'O kɔnɔko yɔrɔw cɛla, fangabulonkow yɛlɛmaniw kɛli, minnu bɛ bɔ bɛnkansɛbɛn na bɛn kama, lakana ani yiriwali koɲɛw b’olu la.'}]}

In [12]:
# Spot-check: same sample with the English-worded prompt.
fr_instruction_forward[2]

{'messages': [{'role': 'user',
   'content': 'Translate this text to Bambara :\n\nParmi ces points, figurent la révision des réformes institutionnelles découlant de l’accord pour la paix, les questions sécuritaires et de développement.'},
  {'role': 'assistant',
   'content': 'O kɔnɔko yɔrɔw cɛla, fangabulonkow yɛlɛmaniw kɛli, minnu bɛ bɔ bɛnkansɛbɛn na bɛn kama, lakana ani yiriwali koɲɛw b’olu la.'}]}

In [13]:
# Spot-check: first reverse example (Bambara-worded prompt, Bambara -> French).
fr_instruction_reverse[0]

{'messages': [{'role': 'user',
   'content': 'Nin bayɛlɛma French la:\n\nO kɔnɔko yɔrɔw cɛla, fangabulonkow yɛlɛmaniw kɛli, minnu bɛ bɔ bɛnkansɛbɛn na bɛn kama, lakana ani yiriwali koɲɛw b’olu la.'},
  {'role': 'assistant',
   'content': 'Parmi ces points, figurent la révision des réformes institutionnelles découlant de l’accord pour la paix, les questions sécuritaires et de développement.'}]}

In [14]:
# Spot-check: same reverse sample with the French-worded prompt.
fr_instruction_reverse[1]

{'messages': [{'role': 'user',
   'content': 'Traduis ce texte en French :\n\nO kɔnɔko yɔrɔw cɛla, fangabulonkow yɛlɛmaniw kɛli, minnu bɛ bɔ bɛnkansɛbɛn na bɛn kama, lakana ani yiriwali koɲɛw b’olu la.'},
  {'role': 'assistant',
   'content': 'Parmi ces points, figurent la révision des réformes institutionnelles découlant de l’accord pour la paix, les questions sécuritaires et de développement.'}]}

In [8]:
# Build the English<->Bambara examples with the non-cleaning variant.
# NOTE(review): this rebinds the same names the forme_instruction_v2 cell assigns.
en_instruction_forward, en_instruction_reverse = forme_instruction(dataset=bm_en, source_key="en", target_key="bm")

In [9]:
# Spot-check: the three prompt wordings generated for the first English -> Bambara sample.
en_instruction_forward[:3]

[{'messages': [{'role': 'user', 'content': 'Nin bayɛlɛma Bambara la:\n\nI'},
   {'role': 'assistant', 'content': 'Ne'}]},
 {'messages': [{'role': 'user',
    'content': 'Traduis ce texte en Bambara :\n\nI'},
   {'role': 'assistant', 'content': 'Ne'}]},
 {'messages': [{'role': 'user',
    'content': 'Translate this text to Bambara :\n\nI'},
   {'role': 'assistant', 'content': 'Ne'}]}]

In [10]:
# Spot-check: the three prompt wordings generated for the first Bambara -> English sample.
en_instruction_reverse[:3]

[{'messages': [{'role': 'user', 'content': 'Nin bayɛlɛma English la:\n\nNe'},
   {'role': 'assistant', 'content': 'I'}]},
 {'messages': [{'role': 'user',
    'content': 'Traduis ce texte en English :\n\nNe'},
   {'role': 'assistant', 'content': 'I'}]},
 {'messages': [{'role': 'user',
    'content': 'Translate this text to English :\n\nNe'},
   {'role': 'assistant', 'content': 'I'}]}]

In [12]:
# Combine all four lists (en reverse, en forward, fr forward, fr reverse).
# NOTE(review): this rebinds `all_dataset`. When cells are run top to bottom it
# replaces the list built from forme_instruction_v2, so the export cells below
# would write the output of the non-cleaning forme_instruction instead.
all_dataset = en_instruction_reverse + en_instruction_forward + fr_instruction_forward + fr_instruction_reverse

In [13]:
# Number of conversations in the combined list.
len(all_dataset)

559392

In [None]:
# Write all conversations as a single JSON array.
# An explicit UTF-8 encoding together with ensure_ascii=False matches the JSONL
# export and stores French/Bambara characters as-is instead of \uXXXX escapes
# (the parsed content is unchanged).
with open("mt/mt_instructions.json", "w", encoding="utf-8") as file:
    json.dump(all_dataset, file, ensure_ascii=False)


In [None]:
# Export in JSON Lines form: one conversation object per line, with non-ASCII
# characters written unescaped.
with open("mt/mt_instruction.jsonl", "w", encoding="utf-8") as file:
    for conv in all_dataset:
        serialized = json.dumps(conv, ensure_ascii=False)
        file.write(serialized + "\n")