Code to take preprocessed data from silnlp and format it as JSON Lines (.jsonl) for LLMs

In [None]:
import json

# Prepare data for a causal LLM
#
# src_file_path (string): location of src file
# trg_file_path (string): location of trg file
# output_file_path (string): location to save prepared json file
# src (string): language/script tag (e.g., eng_Latn) for src data
# trg (string): language/script tag (e.g., eng_Latn) for trg data
#
def prepare_data_for_causal_llm(src_file_path, trg_file_path, output_file_path, src, trg):
    """Convert a pair of line-aligned text files into a JSON Lines file.

    Line i of the source file is paired with line i of the target file. For
    each pair, one JSON object is written on its own line of the output file:

        {"model_inputs": "translate <src> to <trg>: <source line>",
         "completion": "<target line>\\r\\n"}

    Leading and trailing whitespace is stripped from every source and target
    line before it is used.

    Args:
        src_file_path: Location of the UTF-8 source-side text file.
        trg_file_path: Location of the UTF-8 target-side text file.
        output_file_path: Location where the JSON Lines file is written
            (overwritten if it already exists).
        src: Language/script tag (e.g., eng_Latn) inserted in the prompt for
            the source data.
        trg: Language/script tag (e.g., eng_Latn) inserted in the prompt for
            the target data.

    Raises:
        ValueError: If the two input files do not have the same number of
            lines. The output file is not created in that case.
    """
    # Read both sides fully up front so the inputs are closed before writing.
    with open(src_file_path, 'r', encoding='utf-8') as src_file, \
            open(trg_file_path, 'r', encoding='utf-8') as trg_file:
        src_sentences = src_file.readlines()
        trg_sentences = trg_file.readlines()

    # Explicit check rather than `assert`: assertions are removed under
    # `python -O`, and zip() below would then silently drop unmatched lines.
    if len(src_sentences) != len(trg_sentences):
        raise ValueError(
            "Files must have the same number of lines "
            f"({len(src_sentences)} source vs {len(trg_sentences)} target)"
        )

    # The instruction prefix is the same for every example in the file.
    prompt_prefix = f"translate {src} to {trg}: "

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for src_sentence, trg_sentence in zip(src_sentences, trg_sentences):
            data = {
                "model_inputs": prompt_prefix + src_sentence.strip(),
                # NOTE(review): every completion ends with "\r\n" — presumably
                # an end marker expected by the downstream training format;
                # confirm before changing.
                "completion": f"{trg_sentence.strip()}\r\n",
            }
            json.dump(data, output_file)
            output_file.write("\n")

In [None]:
# Benchmark languages 

# List of all languages
# each language: [language name (folder name), source language, target language]
language_data = [
    ['balti', 'urd_Arab', 'bft_Arab'],
    ['bana', 'fra_Latn', 'bcw_Latn'],
    ['bantawa', 'npi_Deva', 'bap_Deva'],
    ['borong', 'tpi_Latn', 'ksr_Latn'],
    ['gutob_gadaba', 'ory_Odia', 'gbj_Odia'],
    ['hejazi', 'arb_Arab', 'acw_Arab'],
    ['kisar', 'ind_Latn', 'kje_Latn'],
    ['konda_dora', 'tel_Telu', 'kfc_Telu'],
    ['kuvi', 'eng_Latn', 'kxv_Telu'],
    ['kwaraae', 'eng_Latn', 'kwf_Latn'],
    ['limbum', 'eng_Latn', 'lmp_Latn'],
    ['mbugwe', 'swh_Latn', 'mgz_Latn'],
    ['naxi', 'cmn_Latn', 'nxq_Latn'],
    ['rajbanshi', 'npi_Deva', 'rjs_Deva'],
    ['siddi', 'kan_Knda', 'mis_Knda'],
    ['tai_nua', 'shn_Mymr', 'tdd_Mymr'],
    ['waima', 'eng_Latn', 'rro_Latn'],
    ['western_chawma', 'khm_Khmr', 'cja_Othr'],  # cja_Cham
]

# Convert every available split of every benchmark language to JSON Lines.
for item in language_data:
    # Each entry is [folder name, source tag, target tag].
    language, src, trg = item

    for label in ['train', 'val', 'test']:
        print(language, label)
        # mbugwe has no test data, so that one combination is skipped.
        if language != 'mbugwe' or label != 'test':
            prepare_data_for_causal_llm(
                f"{language}/{label}.src.detok.txt",
                f"{language}/{label}.trg.detok.txt",
                f"/Users/laura/llmResearch/all_llm_data/{language}_{label}_data.jsonl",
                src,
                trg,
            )

In [None]:
# Backtranslation experiments
# Experiment folder name; also used as the prefix of the output file name.
language = "saj-Sahu_2024_12_11"
path = f"/Users/laura/S/MT/experiments/Demo_Laura/bt_experiments/{language}/"

# Convert the training split of the backtranslation experiment.
prepare_data_for_causal_llm(
    f"{path}train.src.detok.txt",
    f"{path}train.trg.detok.txt",
    f"/Users/laura/silnlp/scripts/llms/data/bt_experiments/{language}_train_data.jsonl",
    "saj_Latn",
    "eng_Latn",
)

In [3]:
# Convert the test split; relies on `path` and `language` set in an earlier cell.
prepare_data_for_causal_llm(
    f"{path}test.src.detok.txt",
    f"{path}test.trg.detok.txt",
    f"/Users/laura/silnlp/scripts/llms/data/bt_experiments/{language}_test_data.jsonl",
    "saj_Latn",
    "eng_Latn",
)