In [None]:
# ORU GPUs
# on command line: "module load git"
# Installation cell for the ORU machines: puts a specific git build on PATH
# (pip needs git for the "git+https://" requirement below), then installs unsloth
# from source and the training dependencies with version caps on xformers and trl.
import os
os.environ['PATH'] = '/opt/software/git/2.41.0-GCCcore-12.3.0-nodocs/bin:' + os.environ['PATH']

%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install exactly these packages without pulling in their dependencies.
%pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# Restart the kernel after this cell, before running the import cell.

In [None]:
# SIL Dallas GPUs
# Installation cell for the SIL Dallas machines: upgrades pip, installs unsloth from
# source, then the libraries imported by this notebook.
%pip install --upgrade pip
%pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install torch torchvision torchaudio
%pip install datasets
%pip install transformers
%pip install rich
%pip install clearml
%pip install boto3
%pip install nbconvert # added so clearml can read jupyter notebook
%pip install bitsandbytes
# trl and xformers are force-reinstalled under explicit version caps; the order of
# these installs is kept as-is because later installs can replace earlier versions.
%pip install --force-reinstall "trl<0.9.0"
%pip install --force-reinstall "xformers<0.0.27"
%pip install peft

# Restart the kernel after this cell, before running the import cell.

In [1]:
import torch
import json
from datasets import Dataset, DatasetDict
import os
from clearml import Task
from trl import SFTTrainer
from transformers import TrainingArguments
import boto3
from botocore.exceptions import ClientError
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import json

In [2]:
class LanguageModel:
  """Wrapper around a causal LLM for translation fine-tuning experiments.

  Bundles credential loading (ClearML, s3, huggingface), model loading (optionally
  through unsloth), dataset preparation, supervised fine-tuning, batched generation
  and upload of results/models to an s3 bucket.
  """

  # dtype = None for auto detection (unsloth)
  def __init__(self, experiment_name, language, oru=False, max_seq_length=2048, unsloth=True, dtype=None, load_in_4bit=True):
    """Read credentials.json and store the experiment configuration.

    experiment_name (string): used as the clearml task name, the local model folder and the s3 model folder
    language (string): language label stored on the instance
    oru (bool): True to use paths relative to the working directory (ORU GPUs), False to use /root/ (SIL Dallas GPUs)
    max_seq_length (int): sequence length used for loading and training; also used as max_new_tokens in generate
    unsloth (bool): whether to load and train the model through unsloth
    dtype: passed to unsloth when loading the model (None for auto detection)
    load_in_4bit (bool): passed to unsloth when loading the model
    """
    if oru:
      self.data_path = 'all_llm_data/'
      self.output_path = ''
    else: # not oru (SIL Dallas GPUs)
      self.data_path = '/root/all_llm_data/'
      self.output_path = '/root/'

    # Read in credentials
    with open(self.output_path + 'credentials.json', 'r', encoding='utf-8') as file:
      self.credentials = json.load(file)

    # Set up clearML environment variables (same key names in the credentials file and the environment)
    clearml_credentials = self.credentials['clearml']
    for variable in ("CLEARML_API_HOST", "CLEARML_API_ACCESS_KEY", "CLEARML_API_SECRET_KEY",
                     "CLEARML_WEB_HOST", "CLEARML_FILES_HOST"):
      os.environ[variable] = clearml_credentials[variable]

    # Set up s3 bucket
    self.s3 = boto3.client('s3',
      aws_access_key_id=self.credentials['s3']['aws_access_key_id'],
      aws_secret_access_key=self.credentials['s3']['aws_secret_access_key'],
    )

    # set up access_token
    self.access_token = self.credentials['huggingface']['access_token']

    # set up parameters for experiment
    self.experiment_name = experiment_name
    self.language = language
    self.max_seq_length = max_seq_length
    self.unsloth = unsloth
    self.dtype = dtype
    self.load_in_4bit = load_in_4bit

    # NOTE: the continuation lines of this literal carry four leading spaces that are
    # part of the prompt text. They are kept byte-for-byte because training and
    # generation must use the same prompt.
    self.prompt = """### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""

    self.s3_path = "MT/experiments/Demo_Laura/trained_models/"


  def download_file(self, object_name, file_name):
    """Download one object from the configured s3 bucket.

    object_name (string): key of the object in the bucket
    file_name (string): local path to write to

    Returns True on success; prints the error and returns False on a ClientError.
    """
    bucket = self.credentials['s3']['bucket']

    try:
      self.s3.download_file(bucket, object_name, file_name)
    except ClientError as e:
      print(e)
      return False
    print("Success!")
    return True


  def upload_file(self, file_name, object_name=None):
    """Upload one local file to the configured s3 bucket.

    file_name (string): local path to upload
    object_name (string): key to store the file under; defaults to file_name

    Returns True on success; prints the error and returns False on a ClientError.
    """
    bucket = self.credentials['s3']['bucket']

    if object_name is None:
      object_name = file_name
    try:
      self.s3.upload_file(file_name, bucket, Key=object_name)
    except ClientError as e:
      print(e)
      return False
    print("Success!")
    return True


  # Load model from s3 bucket - downloads model and loads it for use
  #
  # verbose (bool): whether to output extra information
  #
  def load_model_s3(self, verbose=True):
    """Download every file under <s3_path><experiment_name>/ and load the model from the local copy."""
    # make the directory to save the files in
    folderPath = self.output_path + self.experiment_name
    os.makedirs(folderPath, exist_ok=True)

    # Loop through all files in model folder and download them
    bucket = self.credentials['s3']['bucket']
    response = self.s3.list_objects(Bucket=bucket, Prefix=self.s3_path + self.experiment_name + "/")
    for content in response.get('Contents', []):
      full_path = content.get('Key')
      file = full_path.split('/')[-1] # just the file name; not the path to it
      if not file:
        # A key ending in "/" has no file name; downloading it would target the folder itself.
        continue
      if verbose:
        print(file)
      self.download_file(full_path, folderPath + '/' + file)

    # Load it for use
    self.load_model_file(verbose)


  # Load model from huggingface
  #
  # hf_model_name (string): name of huggingface model
  # verbose (bool): whether to output extra information
  #
  def load_model_hf(self, hf_model_name, verbose=True):
    """Load the named huggingface model into self.model / self.tokenizer."""
    self.load_model_internal(False, hf_model_name, verbose)


  # Load model from file path
  #
  # verbose (bool): whether to output extra information
  #
  def load_model_file(self, verbose=True):
    """Load the model stored in the local folder <output_path><experiment_name>."""
    folderPath = self.output_path + self.experiment_name
    self.load_model_internal(True, folderPath, verbose)


  # For internal use only - load model given specified parameters
  #
  # from_file (bool): True if loading from file path, False if loading from huggingface
  # name (string): name of filepath if from_path, name of huggingface model if not from_file
  # verbose (bool): whether to output extra information
  #
  def load_model_internal(self, from_file, name, verbose=True):
    """Load model and tokenizer, add LoRA adapters when using unsloth, and record the EOS token.

    from_file is kept for interface compatibility: both sources are loaded through the
    same from_pretrained call, which accepts a local folder or a huggingface model name.
    """
    start = time.time()

    if self.unsloth:
      from unsloth import FastLanguageModel # don't load this unless we're using unsloth

      self.model, self.tokenizer = FastLanguageModel.from_pretrained(
        model_name = name,
        max_seq_length = self.max_seq_length,
        dtype = self.dtype,
        load_in_4bit = self.load_in_4bit,
        token = self.access_token,
        device_map={"":0}
      )

      self.model = FastLanguageModel.get_peft_model(
        self.model,
        r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
      )

    else: # not unsloth
      self.model = AutoModelForCausalLM.from_pretrained(name, token = self.access_token, device_map={"":0}) #, attn_implementation='eager')
      self.tokenizer = AutoTokenizer.from_pretrained(name, token=self.access_token)

    self.EOS_TOKEN = self.tokenizer.eos_token # Must add EOS_TOKEN

    end = time.time()
    if verbose:
      print("Time to load model",end-start,"seconds")


  # Format input to LLM
  def formatting_prompts_func(self,examples):
    """Build training texts: the prompt filled with instruction, input and output, followed by the EOS token.

    examples (dict): batch with parallel lists under "instruction", "input" and "output"

    Returns a dict with the formatted strings under "text".
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    texts = [self.prompt.format(instruction, source, target) + self.EOS_TOKEN
             for instruction, source, target in rows]
    return { "text" : texts, }


  # Format input to LLM
  def formatting_prompts_func_generation(self,examples):
    """Build generation prompts: the prompt filled with instruction and input and an empty response.

    examples (dict): batch with parallel lists under "instruction" and "input"

    Returns a dict with the formatted strings under "text".
    """
    rows = zip(examples["instruction"], examples["input"])
    # empty output for generation:
    texts = [self.prompt.format(instruction, source, "") for instruction, source in rows]
    return { "text" : texts, }


  def _to_formatted_dataset(self, dataset_dict):
    """Convert a dict of parallel lists into a `Dataset` with the training "text" column added."""
    dataset = Dataset.from_dict(dataset_dict)
    return dataset.map(self.formatting_prompts_func, batched = True,)


  @staticmethod
  def _read_lines(path):
    """Return the lines of a UTF-8 text file without their trailing newline."""
    with open(path, 'r', encoding='utf-8') as file:
      # rstrip only removes a newline that is actually there, so a final line
      # without a trailing newline keeps its last character.
      return [line.rstrip('\n') for line in file]


  @staticmethod
  def _extract_response(text, eos_token="<|end_of_text|>"):
    """Return what follows the first "Response:" marker (and the character after it), without a trailing EOS marker."""
    marker = "Response:"
    marker_start = text.find(marker)
    if marker_start != -1:
      text = text[marker_start + len(marker) + 1:]
    # Only strip the EOS marker when it is really present (generation may stop at the token limit instead).
    if eos_token and text.endswith(eos_token):
      text = text[:-len(eos_token)]
    return text


  def read_data(self,dataset_label,language,limit=None):
    """Read <data_path><language>_<dataset_label>_data.jsonl into a formatted `Dataset`.

    dataset_label (string): part of the file name, e.g. "train", "val" or "test"
    language (string): part of the file name
    limit (int): if truthy, keep only the first `limit` examples

    Each line is a JSON object; 'model_inputs' is split by fixed character offsets into
    instruction ([:30]) and input ([32:]), and the last two characters of 'completion' are dropped.
    """
    # Initialize a dictionary to hold the lists for each field
    dataset_dict = {'input': [], 'output': [], 'instruction': []}

    # Open the file and read line by line
    with open(self.data_path + language + '_' + dataset_label + '_data.jsonl', 'r', encoding='utf-8') as file:
      for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        model_inputs = json_object.get('model_inputs', '')
        # Append each field to the appropriate list
        dataset_dict['input'].append(model_inputs[32:]) #remove prompt from input
        # NOTE(review): assumes every completion ends with \r\n - confirm against the data files
        dataset_dict['output'].append(json_object.get('completion', '')[:-2]) #remove \r\n from end of output
        dataset_dict['instruction'].append(model_inputs[:30])

    if limit:
      for field in ('input', 'output', 'instruction'):
        dataset_dict[field] = dataset_dict[field][:limit]

    return self._to_formatted_dataset(dataset_dict)


  def read_data_multilingual(self,dataset_label,path):
    """Read a jsonl file with 'input', 'output' and 'instruction' fields into a formatted `Dataset`.

    dataset_label (string): not used by this method; kept so existing calls keep working
    path (string): file name relative to data_path
    """
    # Initialize a dictionary to hold the lists for each field
    dataset_dict = {'input': [], 'output': [], 'instruction': []}

    # Open the file and read line by line
    with open(self.data_path + path, 'r', encoding='utf-8') as file:
      for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        # Append each field to the appropriate list
        for field in ('input', 'output', 'instruction'):
          dataset_dict[field].append(json_object.get(field, ''))

    return self._to_formatted_dataset(dataset_dict)


  def read_data_xri(self,dataset_label,language1_path,language2_path,instruction="Translate swh_Latn to kcz_Latn"):
    """Read two line-aligned text files into a formatted `Dataset`.

    dataset_label (string): part of the file names, e.g. "train" or "dev"
    language1_path (string): file prefix (relative to data_path) of the input side
    language2_path (string): file prefix (relative to data_path) of the output side
    instruction (string): instruction attached to every example

    The files read are <data_path><prefix>.<dataset_label>.txt, one example per line.
    """
    inputs = self._read_lines(self.data_path + language1_path + '.' + dataset_label + '.txt')
    outputs = self._read_lines(self.data_path + language2_path + '.' + dataset_label + '.txt')

    dataset_dict = {
      'input': inputs,
      'output': outputs,
      'instruction': [instruction] * len(inputs), # same instruction for all input
    }

    return self._to_formatted_dataset(dataset_dict)


  def train(self,dataset_train,num_train_epochs=1):
    """Fine-tune self.model on dataset_train with SFTTrainer, logging to a clearml task.

    dataset_train (Dataset): dataset with a "text" column
    num_train_epochs (int): number of passes over the training data

    Prints GPU memory statistics before and after training.
    """
    task = Task.init(project_name="HuggingFace Transformers",
                 task_name=self.experiment_name,
                 output_uri=False) # don't save any of the models to clearml
    try:
      task.set_parameters_as_dict({ # don't save any checkpoints to clearml
        'save_checkpoints': False
      })

      if self.unsloth:
        from unsloth import is_bfloat16_supported # don't import this unless we're using unsloth

        optim = "adamw_8bit"
        bf16 = is_bfloat16_supported()
        fp16 = not bf16
      else: # not unsloth:
        optim = "adamw_hf"
        fp16 = False
        bf16 = False

      training_arguments = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = num_train_epochs, # Set this for 1 full training run.
        learning_rate = 2e-4,
        fp16 = fp16,
        bf16 = bf16,
        logging_steps = 1,
        optim = optim,
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # Follows output_path so checkpoints also land in a valid place when oru=True
        # ("/root/checkpoints" for the SIL Dallas GPUs, as before).
        output_dir = self.output_path + "checkpoints", # Directory to save checkpoints.
        save_steps=500,              # Save a checkpoint every 500 steps.
        save_total_limit=1,          # Keep only the 1 most recent checkpoints.
        save_strategy="steps",       # Save checkpoints based on steps.
      )

      trainer = SFTTrainer(
        model = self.model,
        tokenizer = self.tokenizer,
        train_dataset = dataset_train,
        dataset_text_field = "text",
        max_seq_length = self.max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        args = training_arguments,
      )

      #@title Show current memory stats
      bytes_per_gb = 1024 * 1024 * 1024
      gpu_stats = torch.cuda.get_device_properties(0)
      start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
      max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
      print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
      print(f"{start_gpu_memory} GB of memory reserved.")

      trainer_stats = trainer.train()

      #@title Show final memory and time stats
      used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
      used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
      used_percentage = round(used_memory         /max_memory*100, 3)
      lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
      print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
      print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
      print(f"Peak reserved memory = {used_memory} GB.")
      print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
      print(f"Peak reserved memory % of max memory = {used_percentage} %.")
      print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
    finally:
      task.close() # close clearml task, also when training fails


  def generate(self,dataset,dataset_label,language,num_inputs=10,verbose=True):
    """Generate a response for each example, write the results to a text file and upload it to s3.

    dataset (Dataset): dataset with "instruction" and "input" columns
    dataset_label (string): used in the output file name; "train" limits generation to the first 250 examples
    language (string): used in the output file name
    num_inputs (int): number of prompts generated per batch (to avoid running out of memory)
    verbose (bool): whether to print the starting index of each batch

    Returns the list of generated strings, one per processed example.
    Raises ValueError if num_inputs is smaller than 1.
    """
    if num_inputs < 1:
      raise ValueError("num_inputs must be at least 1")

    dataset_inference = dataset.map(self.formatting_prompts_func_generation, batched = True,)
    # Read the prompt column once instead of once per batch.
    prompts = list(dataset_inference['text'])

    max_num = len(prompts)
    if dataset_label == "train":
      max_num = min(max_num, 250) # only do (at most) 250 training examples

    if self.unsloth:
      from unsloth import FastLanguageModel # don't load this unless we're using unsloth

      FastLanguageModel.for_inference(self.model) # Enable native 2x faster inference

    # Prompts in a batch have different lengths, so they must be padded to form one
    # tensor. Decoder-only models continue from the last position, so the padding
    # goes on the left; the tokenizer's own setting is restored afterwards.
    if self.tokenizer.pad_token is None:
      self.tokenizer.pad_token = self.tokenizer.eos_token
    previous_padding_side = self.tokenizer.padding_side
    self.tokenizer.padding_side = "left"

    # generate in batches (to avoid running out of memory)
    generated = []
    try:
      for starting_num in range(0, max_num, num_inputs):
        torch.cuda.empty_cache()
        if verbose:
          print(starting_num)

        ending_num = min(starting_num + num_inputs, max_num)
        # Pass the flat list of prompts: one batch row per prompt.
        inputs = self.tokenizer(
          prompts[starting_num:ending_num],
          return_tensors = "pt",
          padding = True,
        ).to("cuda")

        assert len(inputs['input_ids']) <= num_inputs

        outputs = self.model.generate(**inputs, max_new_tokens = self.max_seq_length, use_cache = True)
        # Special tokens (padding, begin/end of text) are dropped while decoding.
        generated += self.tokenizer.batch_decode(outputs, skip_special_tokens = True)
    finally:
      self.tokenizer.padding_side = previous_padding_side

    generated = [self._extract_response(text) for text in generated]

    file_name = self.experiment_name + "_" + dataset_label + "_" + language + "_generated.txt"
    with open(self.output_path + file_name,"w",encoding="utf-8") as file:
      for sentence in generated:
        file.write(sentence.replace("\n"," ")+"\n") # only new lines should be after generation output

    self.upload_file(self.output_path + file_name, self.s3_path + file_name)

    return generated

  # Save model locally and to s3 bucket
  #
  # verbose (bool): whether to print out extra information
  #
  def save_model(self, verbose=True):
    """Save model and tokenizer to <output_path><experiment_name> and upload every file in that folder to s3."""
    path = self.output_path + self.experiment_name

    # Save model locally
    if hasattr(self.model, "save_pretrained_merged"):
      self.model.save_pretrained_merged(path, self.tokenizer, save_method = "merged_16bit",)
    else:
      # Models loaded without unsloth do not have save_pretrained_merged.
      self.model.save_pretrained(path)
      self.tokenizer.save_pretrained(path)

    # Upload model to s3 bucket
    for file in os.listdir(path):
      local_file = os.path.join(path, file)
      if not os.path.isfile(local_file):
        continue
      if verbose:
        print(file)
      self.upload_file(path + '/' + file, self.s3_path + self.experiment_name + "/" + file)

In [None]:
# XRI data
# Fine-tune the base Llama 3.1 8B model on the swh -> kcz XRI parallel text.
torch.cuda.empty_cache()

language = 'kcz'
# LanguageModel takes (experiment_name, language, ...); the huggingface model name
# is given to load_model_hf below, not to the constructor.
lm = LanguageModel('llama3.1-8b_xri_'+language, language, oru=False, max_seq_length=2048, unsloth=True)

print("Loading model")
lm.load_model_hf("meta-llama/Meta-Llama-3.1-8B")

print("Reading training data")
dataset_train = lm.read_data_xri("train","swh-XriKonongo.2024_08_12","kcz-XriKonongo.2024_08_12")
print("Reading validation data")
# Kept in its own variable so the dev set does not replace the training set.
dataset_val = lm.read_data_xri("dev","swh-XriKonongo.2024_08_12","kcz-XriKonongo.2024_08_12")

print("Training")
lm.train(dataset_train,num_train_epochs=5)

In [3]:
# Create the LanguageModel wrapper for the multilingual experiment. No model is
# loaded here; run one of the loading cells below (lm.load_model_hf or lm.load_model_file).
torch.cuda.empty_cache()

# If you are loading a model previously trained with unsloth from a file, need to set unsloth=False (maybe I'll fix this in later iterations of the code)
# max_seq_length values used so far:
#   103 languages model - 10500; 300 languages model - 11500
#   Afro-Asiatic model - 7000
lm = LanguageModel('llama3.1-8b_unsloth_austro_asiatic_languages_gutob_gadaba_1epoch', "multilingual", oru=False, max_seq_length=12500, unsloth=True)

# NOTE: LanguageModel has no load_model method; the commented call below is stale.
#print("Loading model")
#lm.load_model()

In [None]:
# Download the multilingual training file from s3 into the data folder.
# Takes less than a minute for 300 languages!
# The target uses lm.data_path so the file lands where read_data_multilingual looks for it.
lm.download_file("MT/experiments/Demo_Laura/multilingual_llm/austro_asiatic_languages.jsonl", lm.data_path + "austro_asiatic_languages.jsonl")

In [None]:
# Option 1: start from the pre-quantized base model on huggingface.
lm.load_model_hf("unsloth/Meta-Llama-3.1-8B-bnb-4bit")

In [None]:
# Option 2: load the model saved locally under <output_path><experiment_name>.
lm.load_model_file()

In [None]:
# Read the multilingual training file (relative to lm.data_path) into a formatted dataset.
print("Reading training data")
# Single-language alternative; note read_data also needs the language name as second argument:
#dataset_train = lm.read_data("train")
dataset_train = lm.read_data_multilingual("train","austro_asiatic_languages.jsonl") # takes less than a minute for 103 languages - if way longer, may need to restart GPU

In [None]:
# Longest training example, measured in tokens (useful for choosing max_seq_length).
tokens = lm.tokenizer(dataset_train['text']) # takes less than 2 minutes for 103 languages dataset
print(max(map(len, tokens['input_ids'])))

In [None]:
# Add language and script tokens to tokenizer
# Only need to do this when training

# tokens for 300 languages model
#tokens = ['luo_', 'mjg_', 'nhe_', 'hwc_', 'qwh_', 'sus_', 'sag_', 'knj_', 'cab_', 'mgc_', 'azj_', 'khm_', 'Tibt', 'tob_', 'dyi_', 'uzn_', 'che_', 'nob_', 'ara_', 'pdc_', 'kik_', 'Mlym', 'lns_', 'tzo_', 'jav_', 'rus_', 'smo_', 'new_', 'ded_', 'asm_', 'mri_', 'gux_', 'xpe_', 'ctd_', 'hin_', 'taj_', 'lat_', 'kmg_', 'mps_', 'ikk_', 'tgl_', 'nya_', 'quc_', 'peo_', 'llb_', 'urd_', 'ell_', 'tuv_', 'awa_', 'kak_', 'tyv_', 'ibo_', 'tzh_', 'uru_', 'khk_', 'hui_', 'btx_', 'Hang', 'spa_', 'qub_', 'nia_', 'zsm_', 'nep_', 'Kali', 'bis_', 'ifa_', 'wal_', 'skr_', 'pci_', 'mam_', 'mar_', 'mdy_', 'bbc_', 'pcm_', 'mbt_', 'hch_', 'urp_', 'nld_', 'tby_', 'bru_', 'swk_', '</VERSE>', 'yor_', 'Cyrl', 'nhw_', 'shu_', 'hre_', 'npi_', 'pan_', 'eng_', 'cym_', 'aze_', 'lug_', 'gez_', 'ilo_', 'ayr_', 'kpg_', 'chu_', 'gug_', 'sur_', 'cfm_', 'shn_', 'ota_', 'wsg_', 'cak_', 'ydd_', 'tpi_', 'qxn_', 'lbm_', 'Telu', 'cmn_', 'cat_', 'gri_', 'bud_', 'Mymr', 'ewe_', 'bza_', 'pxm_', 'qve_', 'cnh_', 'mtr_', 'ory_', 'blx_', 'sid_', 'bim_', 'cmo_', 'poh_', 'gla_', 'vag_', 'kyu_', 'hne_', 'cms_', 'uzb_', 'srp_', 'dan_', 'tke_', 'yua_', 'syc_', 'hau_', 'por_', 'zho_', 'tha_', 'qxo_', 'bdq_', 'kzn_', 'Orya', 'mya_', 'rar_', 'Guru', 'hmo_', 'mzw_', 'tel_', 'ind_', 'mbb_', 'gjn_', 'yao_', 'mal_', 'fas_', 'crh_', 'epo_', 'kss_', 'apb_', 'jiv_', 'nuj_', 'Beng', 'nlg_', 'gaz_', 'ita_', 'oro_', 'pua_', 'mzh_', 'mro_', 'alp_', 'ngl_', 'Thai', 'ceb_', 'lif_', 'esn_', 'Gujr', 'Taml', 'tur_', 'enx_', 'ckb_', 'wls_', 'kek_', 'ces_', 'fao_', 'bsq_', 'qvw_', 'lin_', 'ifb_', 'kfb_', 'mnk_', 'tat_', 'vmk_', 'lao_', 'wol_', 'swh_', 'rmc_', 'Cans', 'Syrc', 'arb_', 'pbu_', 'izz_', 'twi_', 'chv_', 'zyb_', 'flr_', 'slr_', 'ron_', 'apy_', 'rad_', 'cac_', 'bmu_', 'sgw_', 'cmt_', 'Laoo', 'quy_', 'ifk_', 'bts_', 'mvf_', 'tir_', 'azb_', 'tuk_', 'cuk_', 'pol_', 'kpz_', 'fij_', 'nde_', 'enn_', 'Hira', 'dyu_', 'sda_', 'lzh_', 'qvh_', 'nch_', 'cag_', 'tam_', 'lvs_', 'glv_', 'sun_', 'pis_', 'msy_', 'tem_', 'som_', 'bod_', 'eza_', 'hak_', 
'kac_', 'hea_', 'hlt_', 'amh_', 'tgk_', 'mpe_', 'dig_', 'hbo_', 'ctu_', 'lcp_', 'deu_', 'kij_', 'hil_', 'Grek', 'loz_', 'Khmr', 'ton_', 'qxr_', 'guj_', 'ben_', 'sna_', 'fra_', 'eus_', 'yal_', 'bel_', 'Ethi', 'plt_', 'sdo_', 'wuv_', 'nno_', 'yap_', 'pov_', 'iba_', 'kmr_', 'rug_', 'Latn', 'quz_', 'gui_', 'swe_', 'jpn_', 'raw_', 'mif_', 'enl_', 'anu_', 'trv_', 'mfz_', 'klu_', 'Arab', 'gun_', 'tdx_', 'gej_', 'heb_', 'pes_', 'kjb_', 'vas_', 'kin_', 'beo_', 'Hani', 'vie_', 'ban_', 'jra_', 'ike_', 'fin_', 'xon_', 'gle_', 'prs_', 'ukr_', 'rop_', 'qvm_', 'slk_', 'kor_', 'crk_', 'Deva', 'zlm_', 'rml_', 'mos_', 'bns_']

# tokens for Afro-Asiatic model
# Language tags ('xxx_'), script tags ('Latn', 'Arab', ...) and the '</VERSE>' separator.
tokens = ['Arab', 'Hira', 'arq_', 'kqy_', 'ttq_', 'gez_', 'arb_', 'bst_', 'tsb_', 'Ethi', 'thv_', 'apc_', 'syc_', 'mfi_', 'gde_', 'mmy_', 'hau_', 'taq_', 'bds_', 'wal_', 'mdx_', 'sur_', 'gax_', 'heb_', 'mqb_', 'ayh_', 'aii_', 'rel_', 'ars_', 'som_', 'sid_', 'gaz_', 'sgw_', 'Latn', 'xan_', 'rif_', 'tir_', 'arz_', '</VERSE>', 'shu_', 'kab_', 'aeb_', 'afb_', 'mfh_', 'moy_', 'amh_', 'mif_', 'xed_', 'daa_', 'Syrc', 'dwr_', 'kqp_', 'meq_', 'hbo_', 'mdy_', 'dsh_', 'apd_']

# Register the tokens with the tokenizer, then resize the model's embedding table
# to the new vocabulary size.
lm.tokenizer.add_tokens(tokens)
lm.model.resize_token_embeddings(len(lm.tokenizer))

In [None]:
# Train on the multilingual data for one epoch, then save the model locally and to s3.
# The commented-out lines are an older single-language workflow; they are stale
# (read_data and generate now also take the language name).
#print("Reading validation data")
#dataset_val = lm.read_data("val")
#if language != "mbugwe": # no test data for mbugwe
#    print("Reading test data")
#    dataset_test = lm.read_data("test")

print("Training")
lm.train(dataset_train,num_train_epochs=1)
lm.save_model()

#print("Generating results for validation data")
#generated = lm.generate(dataset_val,"val")

In [None]:
# Validate on all languages - need to update this code to finetune on the language first before inference
languages = ['balti','bana','bantawa','borong','gutob_gadaba','hejazi','kisar','konda_dora','kuvi','kwaraae','limbum','naxi','rajbanshi','siddi','tai_nua','waima','western_chawma']
# Keep every language's output in memory; `generated` alone only holds the last one.
generated_by_language = {}
for language in languages:
    dataset_val = lm.read_data("val",language)
    generated = lm.generate(dataset_val,"val",language)
    generated_by_language[language] = generated

In [None]:
# Training data for the single language "bana" (file bana_train_data.jsonl under lm.data_path).
bana_train = lm.read_data("train","bana")

In [None]:
# Continue training the currently loaded model on the bana data for 4 epochs.
lm.train(bana_train,num_train_epochs=4)

In [None]:
# Rename the experiment before saving: save_model uses experiment_name for both
# the local folder and the s3 folder.
lm.experiment_name = "llama3.1-8b_unsloth_afro_asiatic_languages_bana_5epochs"
lm.save_model()

In [9]:
# Generate output for the gutob_gadaba validation set; generate also writes the
# results to a text file and uploads it to s3.
dataset_val = lm.read_data("val","gutob_gadaba")
generated_val = lm.generate(dataset_val,"val","gutob_gadaba")

In [None]:
# Sanity check: number of validation examples vs. number of generated outputs.
print(len(dataset_val))
print(len(generated_val))

In [None]:
print(len(dataset_test))
print(len(generated_test))

In [None]:
dataset_test = lm.read_data("test","gutob_gadaba",limit=250)

In [None]:

generated_test = lm.generate(dataset_test,"test","gutob_gadaba")

In [None]:
# Try generated validation output 10 verses at a time, same as it was trained on

# Initialize a dictionary to hold the lists for each field
dataset_dict = {'input': [], 'output': [], 'instruction': []}
    
# Open the file and read line by line
with open(lm.data_path + 'waima' + '_' + 'val' + '_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        # Append each field to the appropriate list
        instruction = json_object.get('model_inputs', '')[:30]
        dataset_dict['input'].append(json_object.get('model_inputs', '')[32:]) #remove prompt from input  
        dataset_dict['output'].append(json_object.get('completion', '')[:-2]) #remove \r\n from end of output
        dataset_dict['instruction'].append(instruction)

In [None]:
new_dataset_dict = {'input': [], 'output': [], 'instruction': []}

instruction = dataset_dict['instruction'][0]
starting = 0
while starting < len(dataset_dict['input']):
    ending = starting + 10
    if ending > len(dataset_dict['input']):
        ending = dataset_dict['input']
    new_dataset_dict['input'].append(' </VERSE> '.join(dataset_dict['input'][starting:ending]) + ' </VERSE>')
    new_dataset_dict['output'].append(' </VERSE> '.join(dataset_dict['output'][starting:ending]) + ' </VERSE>')
    new_dataset_dict['instruction'].append(instruction)
    starting = ending

In [None]:
dataset_val = Dataset.from_dict(new_dataset_dict)
dataset_val = dataset_val.map(lm.formatting_prompts_func, batched = True,)

In [None]:
# Initialize a dictionary to hold the lists for each field
dataset_dict = {'input': [], 'output': [], 'instruction': []}
    
# Open the file and read line by line
with open(lm.data_path + 'waima' + '_' + 'train' + '_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        # Append each field to the appropriate list
        instruction = json_object.get('model_inputs', '')[:30]
        dataset_dict['input'].append(json_object.get('model_inputs', '')[32:]) #remove prompt from input  
        dataset_dict['output'].append(json_object.get('completion', '')[:-2]) #remove \r\n from end of output
        dataset_dict['instruction'].append(instruction)

new_dataset_dict = {'input': [], 'output': [], 'instruction': []}

instruction = dataset_dict['instruction'][0]
starting = 0
while starting < len(dataset_dict['input']):
    ending = starting + 10
    if ending > len(dataset_dict['input']):
        ending = len(dataset_dict['input'])
    new_dataset_dict['input'].append(' </VERSE> '.join(dataset_dict['input'][starting:ending]) + ' </VERSE>')
    new_dataset_dict['output'].append(' </VERSE> '.join(dataset_dict['output'][starting:ending]) + ' </VERSE>')
    new_dataset_dict['instruction'].append(instruction)
    starting = ending

In [None]:
dataset_train = Dataset.from_dict(new_dataset_dict)
dataset_train = dataset_train.map(lm.formatting_prompts_func, batched = True,)

In [None]:
# Fine-tune the currently loaded model on the grouped waima training data for one epoch.
lm.train(dataset_train,num_train_epochs=1)

In [None]:
# Generate output for the grouped waima validation data (also written to file and uploaded to s3).
generated = lm.generate(dataset_val,"val","waima")