In [None]:
# ORU GPUs
# on command line: "module load git"
import os
os.environ['PATH'] = '/opt/software/git/2.41.0-GCCcore-12.3.0-nodocs/bin:' + os.environ['PATH']

%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# restart kernel

In [None]:
# SIL Dallas GPUs
%pip install unsloth
%pip install torch torchvision torchaudio
%pip install datasets
%pip install transformers
%pip install rich
%pip install clearml
%pip install boto3
%pip install nbconvert # added so clearml can read jupyter notebook
%pip install bitsandbytes
%pip install --force-reinstall "trl<0.9.0"
%pip install --force-reinstall "xformers<0.0.27"
%pip install peft

# restart kernel

In [1]:
import torch
import json
from datasets import Dataset, DatasetDict
import os
from clearml import Task
from trl import SFTTrainer
from transformers import TrainingArguments
import boto3
from botocore.exceptions import ClientError
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import json

In [12]:
class LanguageModel:

  # dtype = None for auto detection (unsloth)
  def __init__(self, experiment_name, hf_model_name, language, oru=False, max_seq_length=2048, unsloth=True, dtype=None, load_in_4bit=True):

    if oru:
      self.data_path = 'all_llm_data/'
      self.output_path = ''
    else: # not oru (SIL Dallas GPUs)
      # SIL Dallas GPUs
      self.data_path = '/root/all_llm_data/'
      self.output_path = '/root/'

    # Read in credentials
    with open(self.output_path + 'credentials.json','r') as file:
      self.credentials = json.load(file)

    # Set up clearML environment variables
    os.environ["CLEARML_API_HOST"] = self.credentials['clearml']['CLEARML_API_HOST']
    os.environ["CLEARML_API_ACCESS_KEY"] = self.credentials['clearml']['CLEARML_API_ACCESS_KEY']
    os.environ["CLEARML_API_SECRET_KEY"] = self.credentials['clearml']['CLEARML_API_SECRET_KEY']
    os.environ["CLEARML_WEB_HOST"] = self.credentials['clearml']['CLEARML_WEB_HOST']
    os.environ["CLEARML_FILES_HOST"] = self.credentials['clearml']['CLEARML_FILES_HOST']

    # Set up s3 bucket
    self.s3 = boto3.client('s3',
      aws_access_key_id=self.credentials['s3']['aws_access_key_id'],
      aws_secret_access_key=self.credentials['s3']['aws_secret_access_key'],
    )

    # set up access_token
    self.access_token = self.credentials['huggingface']['access_token']

    # set up parameters for experiment
    self.experiment_name = experiment_name
    self.hf_model_name = hf_model_name
    self.language = language
    self.max_seq_length = max_seq_length
    self.unsloth = unsloth
    self.dtype = dtype
    self.load_in_4bit = load_in_4bit

    self.prompt = """### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""


  def download_file(self, object_name, file_name):
    bucket = self.credentials['s3']['bucket']

    try:
      self.s3.download_file(bucket, object_name, file_name)
    except ClientError as e:
      print(e)
      return False
    print("Success!")
    return True

  
  
  def upload_file(self, file_name, object_name):
    bucket = self.credentials['s3']['bucket']

    if object_name is None:
      object_name = file_name
    try:
      self.s3.upload_file(file_name, bucket, Key=object_name)
    except ClientError as e:
      print(e)
      return False
    print("Success!")
    return True

  
  def load_model(self, verbose=True):
    start = time.time()
    
    if self.unsloth:
      from unsloth import FastLanguageModel # don't load this unless we're using unsloth

      self.model, self.tokenizer = FastLanguageModel.from_pretrained(
        model_name = self.hf_model_name,
        max_seq_length = self.max_seq_length,
        dtype = self.dtype,
        load_in_4bit = self.load_in_4bit,
        token = self.access_token
      )

      self.model = FastLanguageModel.get_peft_model(
        self.model,
        r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
      )

    else: # not unsloth
      self.model = AutoModelForCausalLM.from_pretrained(self.hf_model_name, token = self.access_token, device_map={"":0}) #, attn_implementation='eager')
      self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model_name, token=self.access_token)

    self.EOS_TOKEN = self.tokenizer.eos_token # Must add EOS_TOKEN

    end = time.time()
    if verbose:
      print("Time to load model",end-start,"seconds")


  # Format input to LLM
  def formatting_prompts_func(self,examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
      # Must add EOS_TOKEN, otherwise your generation will go on forever!
      text = self.prompt.format(instruction, input, output) + self.EOS_TOKEN
      texts.append(text)
    return { "text" : texts, }


  # Format input to LLM
  def formatting_prompts_func_generation(self,examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    texts = []
    for instruction, input in zip(instructions, inputs):
      # empty output for generation:
      text = self.prompt.format(instruction, input, "")
      texts.append(text)
    return { "text" : texts, }


  def read_data(self,dataset_label,language):
    # Initialize a dictionary to hold the lists for each field
    dataset_dict = {'input': [], 'output': [], 'instruction': []}
        
    # Open the file and read line by line
    with open(self.data_path + language + '_' + dataset_label + '_data.jsonl', 'r', encoding='utf-8') as file:
      for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        # Append each field to the appropriate list
        instruction = json_object.get('model_inputs', '')[:30]
        dataset_dict['input'].append(json_object.get('model_inputs', '')[32:]) #remove prompt from input  
        dataset_dict['output'].append(json_object.get('completion', '')[:-2]) #remove \r\n from end of output
        dataset_dict['instruction'].append(instruction)
    
    # Convert the dictionary of lists into a `Dataset`
    dataset = Dataset.from_dict(dataset_dict)

    return dataset.map(self.formatting_prompts_func, batched = True,)


  def read_data_multilingual(self,dataset_label,path):
    # Initialize a dictionary to hold the lists for each field
    dataset_dict = {'input': [], 'output': [], 'instruction': []}
        
    # Open the file and read line by line
    with open(self.data_path + path, 'r', encoding='utf-8') as file:
      for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        # Append each field to the appropriate list
        dataset_dict['input'].append(json_object.get('input', ''))
        dataset_dict['output'].append(json_object.get('output', ''))
        dataset_dict['instruction'].append(json_object.get('instruction', ''))
    
    # Convert the dictionary of lists into a `Dataset`
    dataset = Dataset.from_dict(dataset_dict)

    return dataset.map(self.formatting_prompts_func, batched = True,)


  def read_data_xri(self,dataset_label,language1_path,language2_path):
    # Initialize a dictionary to hold the lists for each field
    dataset_dict = {'input': [], 'output': [], 'instruction': []}

    with open(self.data_path + language1_path + '.' + dataset_label + '.txt', 'r', encoding='utf-8') as file:
      dataset_dict['input'] = [i[:-1] for i in file.readlines()] # remove newline from each line
    with open(self.data_path + language2_path + '.' + dataset_label + '.txt', 'r', encoding='utf-8') as file:
      dataset_dict['output'] = [i[:-1] for i in file.readlines()] # remove newline from each line

    for i in dataset_dict['input']: # same instruction for all input
      dataset_dict['instruction'].append("Translate swh_Latn to kcz_Latn")
        
    # Convert the dictionary of lists into a `Dataset`
    dataset = Dataset.from_dict(dataset_dict)

    return dataset.map(self.formatting_prompts_func, batched = True,)
  

  def train(self,dataset_train,num_train_epochs=1):
    task = Task.init(project_name="HuggingFace Transformers",
                 task_name=self.experiment_name,
                 output_uri=False) # don't save any of the models to clearml
    task.set_parameters_as_dict({ # don't save any checkpoints to clearml
      'save_checkpoints': False
    })

    if self.unsloth:
      from unsloth import is_bfloat16_supported # don't import this unless we're using unsloth

      optim = "adamw_8bit"
      fp16 = not is_bfloat16_supported()
      bf16 = is_bfloat16_supported()
    else: # not unsloth:
      optim = "adamw_hf"
      fp16 = False
      bf16 = False

    training_arguments = TrainingArguments(
      per_device_train_batch_size = 2,
      gradient_accumulation_steps = 4,
      warmup_steps = 5,
      num_train_epochs = num_train_epochs, # Set this for 1 full training run.
      learning_rate = 2e-4,
      fp16 = fp16,
      bf16 = bf16,
      logging_steps = 1,
      optim = optim,
      weight_decay = 0.01,
      lr_scheduler_type = "linear",
      seed = 3407,
      output_dir = "outputs",
    )

    trainer = SFTTrainer(
      model = self.model,
      tokenizer = self.tokenizer,
      train_dataset = dataset_train,
      dataset_text_field = "text",
      max_seq_length = self.max_seq_length,
      dataset_num_proc = 2,
      packing = False, # Can make training 5x faster for short sequences.
      args = training_arguments,
    )

    #@title Show current memory stats
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    #@title Show final memory and time stats
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory         /max_memory*100, 3)
    lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    task.close() # close clearml task


  def generate(self,dataset,dataset_label,language,max_new_tokens=128):
    dataset_inference = dataset.map(self.formatting_prompts_func_generation, batched = True,)

    max_num = len(dataset_inference)
    if dataset_label == "train":
      max_num = 250 # only do 250 training examples
    
    if self.unsloth:
      from unsloth import FastLanguageModel # don't load this unless we're using unsloth
      
      FastLanguageModel.for_inference(self.model) # Enable native 2x faster inference
    generated = []
    for i in range(max_num):
      if i%20==0:
        print(i)
      inputs = self.tokenizer(
      [
        dataset_inference['text'][i]
      ], return_tensors = "pt").to("cuda")
      outputs = self.model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)
      generated += self.tokenizer.batch_decode(outputs)

    generated = [i[i.find("Response:")+len("Response:")+1:-len("<|end_of_text|>")] for i in generated]

    file_name = self.experiment_name + "_" + dataset_label + "_" + language + "_generated.txt"
    with open(self.output_path + file_name,"w",encoding="utf-8") as file:
      for sentence in generated:
        file.write(sentence+"\n")

    self.upload_file(self.output_path + file_name, \
      "MT/experiments/Demo_Laura/trained_models/" + file_name)

    return generated

  
  def save_model(self):
    self.model.save_pretrained_merged(self.output_path + self.experiment_name, self.tokenizer, save_method = "merged_16bit",)

    files = ['config.json',
      'generation_config.json',
      'model-00001-of-00004.safetensors',
      'model-00002-of-00004.safetensors',
      'model-00003-of-00004.safetensors',
      'model-00004-of-00004.safetensors',
      'model.safetensors.index.json',
      'special_tokens_map.json',
      'tokenizer_config.json',
      'tokenizer.json']
    
    for file in files:
      print(file)
      self.upload_file(self.output_path + self.experiment_name + '/' + file, \
        "MT/experiments/Demo_Laura/trained_models/" + self.experiment_name + \
        "/" + file)


  def load_model_s3(self):
    # make the directory to save the files in
    folderPath = self.output_path + self.experiment_name
    if not os.path.exists(folderPath):
      os.makedirs(folderPath)

    files = ['config.json',
      'generation_config.json',
      'model-00001-of-00004.safetensors',
      'model-00002-of-00004.safetensors',
      'model-00003-of-00004.safetensors',
      'model-00004-of-00004.safetensors',
      'model.safetensors.index.json',
      'special_tokens_map.json',
      'tokenizer_config.json',
      'tokenizer.json']

    for file in files:
      print(file)
      self.download_file("MT/experiments/Demo_Laura/trained_models/" + self.experiment_name + \
        "/" + file, folderPath + '/' + file)

In [None]:
# XRI data: fine-tune Llama 3.1 8B on the swh -> kcz parallel corpus.
torch.cuda.empty_cache()

language = 'kcz'
lm = LanguageModel('llama3.1-8b_xri_'+language, "meta-llama/Meta-Llama-3.1-8B", language, oru=False, max_seq_length=2048, unsloth=True)

print("Loading model")
lm.load_model()

print("Reading training data")
dataset_train = lm.read_data_xri("train","swh-XriKonongo.2024_08_12","kcz-XriKonongo.2024_08_12")
print("Reading validation data")
# The dev split gets its own name. It used to be assigned to `dataset_train`,
# which silently replaced the training data, so the model was trained on dev.
dataset_val = lm.read_data_xri("dev","swh-XriKonongo.2024_08_12","kcz-XriKonongo.2024_08_12")

print("Training")
lm.train(dataset_train,num_train_epochs=5)

In [13]:
# Fresh LanguageModel wrapper for the 103-language experiment.
torch.cuda.empty_cache()

lm = LanguageModel(
    experiment_name='llama3.1-8b_unsloth_103languages',
    hf_model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    language="multilingual",
    oru=False,
    max_seq_length=10500,
    unsloth=True,
)

# load_model() is left disabled here; cells that use lm.model or
# lm.tokenizer need it to have been run first.
#print("Loading model")
#lm.load_model()

In [None]:
# Download the saved model files for this experiment from S3 into
# lm.output_path + lm.experiment_name. This only fetches files; it does not
# set lm.model or lm.tokenizer.
lm.load_model_s3()

In [None]:
print("Reading training data")
# The multilingual file already carries instruction/input/output fields, so
# it is read with read_data_multilingual rather than read_data.
dataset_train = lm.read_data_multilingual("train","103languages.jsonl") # takes less than a minute - if it takes much longer, the GPU may need a restart

In [None]:
# Longest training example, measured in tokenizer tokens.
tokens = lm.tokenizer(dataset_train['text'])  # takes less than 2 minutes
print(max(len(token_ids) for token_ids in tokens['input_ids']))

In [None]:
# Add language and script tokens to tokenizer
tokens = ['eus_', 'ded_', 'tzo_', 'ydd_', 'eng_', 'gle_', 'mbt_', 'qxr_', 'Grek', 'cym_', 'rmc_', 'btx_', 'peo_', 'flr_', 'Guru', 'heb_', 'qvw_', 'gez_', 'pes_', 'qwh_', 'pbu_', 'bsq_', 'rml_', 'tel_', 'mbb_', 'fij_', 'ara_', 'hch_', 'Arab', 'tdx_', 'ron_', 'Taml', 'vag_', 'quy_', 'pis_', 'yap_', 'tir_', 'jav_', 'dyi_', 'wal_', 'ilo_', 'chv_', 'gri_', 'qxo_', 'ceb_', 'glv_', 'mro_', 'tgk_', 'deu_', 'srp_', 'hui_', 'ifa_', 'kmr_', 'sun_', 'esn_', 'ban_', 'bim_', 'hre_', 'pxm_', 'mtr_', 'ind_', 'mya_', 'crk_', 'ike_', 'mri_', 'ory_', 'qub_', 'cac_', 'kor_', 'kek_', 'tur_', 'azj_', 'nhe_', 'nep_', 'lzh_', 'Orya', 'sur_', 'gla_', 'quz_', 'uru_', 'zho_', 'kzn_', 'urp_', 'fra_', 'bns_', 'hmo_', 'pci_', 'awa_', 'raw_', 'tby_', 'bza_', 'tke_', 'prs_', 'Kali', 'sna_', 'fin_', 'kyu_', 'mal_', 'Thai', 'blx_', 'cms_', 'nuj_', 'pol_', 'ben_', 'ewe_', 'mzw_', 'lbm_', 'nia_', 'uzn_', 'yor_', 'mps_', 'tyv_', 'mnk_', 'ota_', 'zlm_', 'lvs_', 'nno_', 'sdo_', 'bis_', 'npi_', 'hak_', 'pua_', 'shn_', 'bmu_', 'kss_', 'syc_', 'taj_', 'kjb_', 'kac_', 'gui_', 'sid_', 'gun_', 'nld_', 'zsm_', 'zyb_', 'knj_', 'guj_', 'bbc_', 'fao_', '</VERSE>', 'jpn_', 'qve_', 'wsg_', 'Deva', 'ibo_', 'Cans', 'apy_', 'cak_', 'arb_', 'hil_', 'qxn_', 'apb_', 'Hang', 'vmk_', 'nlg_', 'ngl_', 'gug_', 'lug_', 'new_', 'tat_', 'ayr_', 'che_', 'Cyrl', 'khk_', 'bud_', 'nya_', 'aze_', 'dig_', 'ell_', 'ckb_', 'ukr_', 'fas_', 'tuk_', 'rug_', 'khm_', 'bod_', 'bts_', 'swe_', 'mar_', 'quc_', 'hlt_', 'mgc_', 'epo_', 'kin_', 'Gujr', 'msy_', 'rus_', 'uzb_', 'bdq_', 'dan_', 'Mlym', 'vie_', 'yal_', 'beo_', 'llb_', 'cuk_', 'pan_', 'eza_', 'skr_', 'cab_', 'kik_', 'asm_', 'Laoo', 'tuv_', 'xpe_', 'klu_', 'nch_', 'ifb_', 'pcm_', 'cmo_', 'mjg_', 'crh_', 'hne_', 'qvm_', 'plt_', 'kpg_', 'por_', 'hau_', 'Ethi', 'sda_', 'Hira', 'Mymr', 'cmt_', 'lns_', 'lao_', 'ifk_', 'bru_', 'mfz_', 'nhw_', 'dyu_', 'Syrc', 'kfb_', 'spa_', 'mpe_', 'kpz_', 'mam_', 'jiv_', 'tam_', 'wol_', 'wls_', 'tpi_', 'pdc_', 'lat_', 'enx_', 'enl_', 'amh_', 'slk_', 'ces_', 
'rad_', 'gej_', 'nde_', 'sag_', 'xon_', 'lif_', 'oro_', 'kij_', 'cat_', 'Latn', 'iba_', 'gux_', 'hea_', 'bel_', 'ita_', 'hwc_', 'alp_', 'qvh_', 'cag_', 'wuv_', 'Khmr', 'tob_', 'mif_', 'vas_', 'swh_', 'Beng', 'Hani', 'urd_', 'kak_', 'ikk_', 'som_', 'hin_', 'anu_', 'tha_', 'swk_', 'mdy_', 'ctd_', 'hbo_', 'tgl_', 'chu_', 'mvf_', 'jra_', 'kmg_', 'luo_', 'azb_', 'cnh_', 'sgw_', 'mos_', 'izz_', 'poh_', 'lin_', 'sus_', 'yua_', 'ctu_', 'tem_', 'cmn_', 'Tibt', 'gjn_', 'twi_', 'rar_', 'ton_', 'tzh_', 'smo_', 'pov_', 'shu_', 'trv_', 'nob_', 'Telu', 'cfm_', 'yao_', 'lcp_', 'mzh_', 'slr_', 'loz_', 'gaz_', 'rop_', 'enn_']

# Register the language and script tags listed above with the tokenizer, then
# resize the model's token embeddings to the tokenizer's new length.
lm.tokenizer.add_tokens(tokens)
lm.model.resize_token_embeddings(len(lm.tokenizer))

In [None]:
# Disabled: reading the validation/test splits. These calls predate the
# current read_data(dataset_label, language) signature.
#print("Reading validation data")
#dataset_val = lm.read_data("val")
#if language != "mbugwe": # no test data for mbugwe
#    print("Reading test data")
#    dataset_test = lm.read_data("test")

# Fine-tune for five epochs on the multilingual data, then save the merged
# model and upload it to S3.
print("Training")
lm.train(dataset_train,num_train_epochs=5)
lm.save_model()

# Disabled: generation on the validation split.
#print("Generating results for validation data")
#generated = lm.generate(dataset_val,"val")

In [None]:
# Validate on all languages.
# TODO: fine-tune on each language before running inference on it.
languages = [
    'balti',
    'bana',
    'bantawa',
    'borong',
    'gutob_gadaba',
    'hejazi',
    'kisar',
    'konda_dora',
    'kuvi',
    'kwaraae',
    'limbum',
    'naxi',
    'rajbanshi',
    'siddi',
    'tai_nua',
    'waima',
    'western_chawma',
]
for language in languages:
    dataset_val = lm.read_data(dataset_label="val", language=language)
    generated = lm.generate(dataset=dataset_val, dataset_label="val", language=language)

In [None]:
# Load the waima training split (reads waima_train_data.jsonl under lm.data_path).
waima_train = lm.read_data("train","waima")

In [None]:
# One epoch of fine-tuning on the waima training data.
lm.train(waima_train,num_train_epochs=1)

In [None]:
# Generate responses for the waima validation split; generate() also writes
# them to a text file and uploads that file to S3.
dataset_val = lm.read_data("val","waima")
generated = lm.generate(dataset_val,"val","waima")

In [None]:
# Save the merged model locally and upload its files to S3.
lm.save_model()

In [None]:
# Try generated validation output 10 verses at a time, same as it was trained on

# Column lists for the raw (one verse per row) waima validation data.
dataset_dict = {'input': [], 'output': [], 'instruction': []}

# Every line of the file is one complete JSON object.
with open(lm.data_path + 'waima' + '_' + 'val' + '_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        json_object = json.loads(line)
        model_inputs = json_object.get('model_inputs', '')
        # The first 30 characters are the instruction; the input starts at character 32.
        instruction = model_inputs[:30]
        dataset_dict['instruction'].append(instruction)
        dataset_dict['input'].append(model_inputs[32:]) #remove prompt from input
        dataset_dict['output'].append(json_object.get('completion', '')[:-2]) #remove \r\n from end of output

In [None]:
# Join the validation verses into groups of up to 10, each verse terminated by
# a ' </VERSE>' tag, reusing the first row's instruction for every group.
new_dataset_dict = {'input': [], 'output': [], 'instruction': []}

# Tolerate an empty split instead of failing on the [0] lookup.
instruction = dataset_dict['instruction'][0] if dataset_dict['instruction'] else ''
starting = 0
while starting < len(dataset_dict['input']):
    # Clamp the last group to the end of the data. This used to assign the
    # list itself instead of its length, which made the slices below raise
    # TypeError on the final partial group.
    ending = min(starting + 10, len(dataset_dict['input']))
    new_dataset_dict['input'].append(' </VERSE> '.join(dataset_dict['input'][starting:ending]) + ' </VERSE>')
    new_dataset_dict['output'].append(' </VERSE> '.join(dataset_dict['output'][starting:ending]) + ' </VERSE>')
    new_dataset_dict['instruction'].append(instruction)
    starting = ending

In [None]:
# Turn the grouped validation columns into a Dataset and add the formatted
# "text" column via lm.formatting_prompts_func.
dataset_val = Dataset.from_dict(new_dataset_dict)
dataset_val = dataset_val.map(lm.formatting_prompts_func, batched = True,)

In [None]:
# Column lists for the raw (one verse per row) waima training data.
dataset_dict = {'input': [], 'output': [], 'instruction': []}

# Every line of the file is one complete JSON object.
with open(lm.data_path + 'waima' + '_' + 'train' + '_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        json_object = json.loads(line)
        model_inputs = json_object.get('model_inputs', '')
        # The first 30 characters are the instruction; the input starts at character 32.
        instruction = model_inputs[:30]
        dataset_dict['instruction'].append(instruction)
        dataset_dict['input'].append(model_inputs[32:]) #remove prompt from input
        dataset_dict['output'].append(json_object.get('completion', '')[:-2]) #remove \r\n from end of output

# Join the verses into groups of up to 10, each verse terminated by a
# ' </VERSE>' tag, reusing the first row's instruction for every group.
new_dataset_dict = {'input': [], 'output': [], 'instruction': []}

instruction = dataset_dict['instruction'][0]
starting = 0
while starting < len(dataset_dict['input']):
    # The last group may hold fewer than 10 verses.
    ending = min(starting + 10, len(dataset_dict['input']))
    verse_inputs = dataset_dict['input'][starting:ending]
    verse_outputs = dataset_dict['output'][starting:ending]
    new_dataset_dict['input'].append(' </VERSE> '.join(verse_inputs) + ' </VERSE>')
    new_dataset_dict['output'].append(' </VERSE> '.join(verse_outputs) + ' </VERSE>')
    new_dataset_dict['instruction'].append(instruction)
    starting = ending

In [None]:
# Turn the grouped training columns into a Dataset and add the formatted
# "text" column via lm.formatting_prompts_func.
dataset_train = Dataset.from_dict(new_dataset_dict)
dataset_train = dataset_train.map(lm.formatting_prompts_func, batched = True,)