In [None]:
# Install the fine-tuning stack into the running kernel's environment.
%pip install -q -U bitsandbytes
# NOTE(review): the 4.41 pin below appears to be superseded right away by the
# install from the transformers git repository on the following line (-U
# upgrades in place) - confirm which of the two versions is intended.
%pip install -q -U transformers==4.41
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U datasets 
%pip install -q -U trl
# Previously tried pin for datasets, kept for reference:
#%pip install -q -U datasets==2.15.0  #     Uninstalling datasets-2.7.1:

# Restart the kernel after installing so the cells below import the upgraded packages.

In [None]:
import subprocess

# Ask the NVIDIA driver for the current GPU state and echo its raw report.
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
gpu_report = result.stdout.decode()
print(gpu_report)

In [None]:
import torch
# Release cached GPU memory that PyTorch's allocator is holding but not using,
# so the model loads below start from as much free memory as possible.
torch.cuda.empty_cache()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import tqdm as notebook_tqdm

# Hugging Face access token used to authenticate every Hub download in this
# notebook. It is read from the HF_TOKEN environment variable, with an
# interactive prompt as fallback, so the secret is never stored in the notebook.
import os
from getpass import getpass

access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
pretrained_model = "meta-llama/Llama-2-7b-hf"

# Load the base model without quantization. No device_map is passed, so the
# library's default placement is used.
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model, token=access_token)#, device_map={"":0})

# 4.5 GB

In [None]:
# Resize the base model's token embeddings to 260164 entries before the adapter is attached.
# NOTE(review): presumably the vocabulary size expected by the MaLA-500 adapter
# loaded in the next cell - confirm against that checkpoint.
base_model.resize_token_embeddings(260164)

In [None]:
from peft import PeftModel
# The tokenizer and the PEFT adapter both come from the same Hub repository.
mala_repo = 'MaLA-LM/mala-500-10b'
tokenizer = AutoTokenizer.from_pretrained(mala_repo, token=access_token)
# Wrap the resized base model with the adapter weights.
model = PeftModel.from_pretrained(
    base_model,
    mala_repo,
    token=access_token,
    is_trainable=True,
)

In [None]:
# Fold the MaLA adapter weights into the base model, leaving a plain model
# with no adapter attached.
merged_model = model.merge_and_unload()

# Register two extra special tokens with the tokenizer, then grow the model's
# embedding matrix so it matches the enlarged vocabulary.
special_tokens = dict(additional_special_tokens=['khm_Khmr', 'cja_Othr'])
tokenizer.add_special_tokens(special_tokens)
merged_model.resize_token_embeddings(len(tokenizer))

In [None]:
# Persist the merged model and its tokenizer side by side in one directory.
merged_dir = "/root/merged_adapters"
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

In [None]:
# Reload from the merged checkpoint directory (always load the merged model
# before fine-tuning via SFTTrainer). Only the tokenizer is loaded in this
# cell; the model load on the last line is commented out.
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('/root/merged_adapters', token=access_token, legacy=False)
# NOTE(review): this sets the attribute on an already-constructed tokenizer -
# confirm it actually changes tokenization rather than needing to be passed
# to from_pretrained.
tokenizer.add_prefix_space = True
#model = AutoModelForCausalLM.from_pretrained('/root/merged_adapters', token=access_token)

# 17 GB

In [None]:
# Load the merged model onto GPU 0 for fine-tuning.
# A 4-bit quantization config is defined below, but it is NOT applied: the
# quantization_config argument is commented out in the from_pretrained call.
# Re-enable that argument to load the model quantized.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import tqdm as notebook_tqdm
#access_token = cry_id
# NOTE: this rebinds `merged_model` from the merged model object created
# earlier to the path string of the saved checkpoint.
merged_model = '/root/merged_adapters'     # path of the saved merged checkpoint (base model + adapter)

# 4-bit NF4 settings with double quantization and float16 compute; unused
# unless quantization_config is passed below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# device_map={"":0} places the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(merged_model, token=access_token, device_map={"":0})#, quantization_config=bnb_config)

In [None]:
# Configure LoraConfig for model fine tuning 
from peft import LoraConfig, get_peft_model

# LoRA settings for fine-tuning the merged model: rank-8 adapters on the
# attention and MLP projections, with the embedding and output layers trained
# in full through modules_to_save.
config = LoraConfig(
    r=8,  #16
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # trainable%: 0.2657765331302975
    modules_to_save=["embed_tokens", "lm_head"], # trainable%: 28.60409045333961
    #modules_to_save=["lm_head"], # trainable%: 14.434933493234952
    lora_dropout=0.05,
    bias="none",
    # The task type must be the enum value itself ("CAUSAL_LM"). The previous
    # literal "TaskType.CAUSAL_LM" is not a recognised task type, so PEFT would
    # not select its causal-LM model wrapper for it.
    task_type="CAUSAL_LM",
)

# Wrap the model with the LoRA adapters and report the trainable parameter share.
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [None]:
# SKIPPED - this cell is marked as not run in the current workflow.
# NOTE(review): presumably only needed when the model is loaded with the
# 4-bit quantization config; confirm before enabling, and confirm whether it
# should run before get_peft_model rather than after it.
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# Release cached, unused GPU memory before the data preparation and training cells.
torch.cuda.empty_cache()

In [None]:
# Read in training data
import json
from datasets import Dataset, DatasetDict

# Column store for the training split: one list entry per JSONL record.
train = {'model_inputs': [], 'completion': []}

# Language code used to build the data file names and, later, the output paths.
language = 'waima'

# Read the training file one line at a time; each non-blank line is one JSON object.
with open('/root/all_llm_data/'+language+'_train_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line (for example a trailing newline at the end of the file)
        # is not valid JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        json_object = json.loads(line)
        # A missing field is stored as an empty string so both columns stay aligned.
        train['model_inputs'].append(json_object.get('model_inputs', ''))
        train['completion'].append(json_object.get('completion', ''))

# Convert the dictionary of lists into a `Dataset`
dataset_train = Dataset.from_dict(train)

In [None]:
# Column store for the validation split: one list entry per JSONL record.
val = {'model_inputs': [], 'completion': []}

# Read the validation file one line at a time; each non-blank line is one JSON object.
with open('/root/all_llm_data/'+language+'_val_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line (for example a trailing newline at the end of the file)
        # is not valid JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        json_object = json.loads(line)
        # A missing field is stored as an empty string so both columns stay aligned.
        val['model_inputs'].append(json_object.get('model_inputs', ''))
        val['completion'].append(json_object.get('completion', ''))

# Convert the dictionary of lists into a `Dataset`
dataset_val = Dataset.from_dict(val)

In [None]:
# Bundle the two splits into a single `DatasetDict`.
dataset_dict = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
})

# Show the first record of each split as a quick structure check.
for split_name in ('train', 'val'):
    print(dataset_dict[split_name][0])

In [None]:
############### not used for SFTTrainer ###################
# The only place this collator would be used is the data_collator argument of
# SFTTrainer, and that argument is commented out in the trainer cell.
from transformers import DataCollatorForLanguageModeling
# Collator that pads each batch dynamically. With mlm=False it is configured
# for causal language modeling rather than masked language modeling.
# NOTE(review): mlm_probability presumably has no effect when mlm=False - confirm.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, mlm_probability=0.15)  # causal language modeling

In [None]:
# Padding to a fixed length (done in tokenize_function below) needs a pad
# token. If the tokenizer has none, reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Left padding is left disabled here, so the tokenizer's own padding side applies.
#tokenizer.padding_side = "left"  # Llama model 

def tokenize_function(examples):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    Args:
        examples: A batch mapping with two parallel lists of strings,
            ``"model_inputs"`` (the prompts) and ``"completion"`` (the targets).

    Returns:
        The tokenizer output for ``prompt + " " + completion``, padded or
        truncated to 256 tokens, with an added ``"labels"`` entry. Labels are
        a copy of ``input_ids`` in which every padding position is replaced by
        -100, the index the Hugging Face loss ignores.
    """
    scr_texts = examples["model_inputs"]  # A list of strings
    trg_texts = examples["completion"]  # A list of strings

    # Concatenate each source text with its target text
    combined_texts = [scr + " " + trg for scr, trg in zip(scr_texts, trg_texts)]

    # Request the attention mask explicitly: it is the only reliable way to
    # tell padding from content when the pad token is the same as EOS.
    model_inputs = tokenizer(
        combined_texts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_attention_mask=True,
    )

    # Build labels as fresh lists (never an alias of input_ids) and mask the
    # padding positions so they do not contribute to the training loss.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# dataset_dict is expected to already hold the 'train' and 'val' splits.
# Tokenize both splits in batches, training split first.
tokenized_train, tokenized_val = (
    dataset_dict[split].map(tokenize_function, batched=True)
    for split in ("train", "val")
)

In [None]:
import transformers
from trl import SFTTrainer

# Build the SFTTrainer that fine-tunes the LoRA-wrapped model on the tokenized splits.
# NOTE(review): the datasets passed below were already tokenized and padded to
# 256 tokens by tokenize_function, so it is unclear how max_seq_length=2048
# and packing=True interact with them - confirm against the installed trl version.
max_seq_length = 2048
trainer = SFTTrainer(
    model = model,
    max_seq_length = max_seq_length, 
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer = tokenizer,
    packing=True, 
    args=transformers.TrainingArguments(
        # 8 sequences per device x 8 accumulation steps = 64 sequences per
        # optimizer step on each device.
        per_device_train_batch_size=8, # reduce if loss is too high
        per_device_eval_batch_size=8,  
        gradient_accumulation_steps=8,  
        lr_scheduler_type="cosine",   
        num_train_epochs=10,
        warmup_steps=20,
        #max_steps=180,
        learning_rate=1e-4, # reduce if loss is too high
        gradient_checkpointing=True,
        #bf16=True,
        logging_steps=20,
        output_dir="/root/outputs",
        optim="adamw_hf"
        #optim="paged_adamw_8bit",
    )
    # The custom collator is left disabled; per the original author's note,
    # enabling it raised the training loss about 20-fold.
    #data_collator=data_collator,    # increase training loss by 20 fold 
)

In [None]:
# Release cached, unused GPU memory right before training starts.
torch.cuda.empty_cache()

In [None]:
from clearml import Task
# Register this run with ClearML before training starts. output_uri=False
# keeps model files from being stored in ClearML.
task = Task.init(project_name="HuggingFace Transformers",
                 task_name="noQuant_allTrainingData",
                 output_uri=False) # don't save any of the models to clearml

# Train with no hyperparameter search, using the arguments set on the trainer.
trainer.train()

In [None]:
# Write the fine-tuned model and its tokenizer to a per-language output directory.
trained_dir = '/root/trained_model_'+language
model.save_pretrained(trained_dir)
tokenizer.save_pretrained(trained_dir)

In [None]:
# Restrain output to only a certain character set (in this case Latin script)
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, LogitsProcessorList, LogitsProcessor

class LatinScriptLogitsProcessor(LogitsProcessor):
    """Restrict generation to tokens spelled only with an allowed character set.

    The allowed token ids are found by scanning the tokenizer's vocabulary and
    keeping every token whose characters all belong to ``ALLOWED_CHARS``.
    Tokenizing the alphabet as one string, as done previously, only yields the
    few subword pieces that string happens to split into. The EOS token is
    always allowed so that generation can still terminate.

    Attributes:
        allowed_tokens: Set of vocabulary ids that may be generated.
    """

    # Characters a token may consist of: basic Latin letters, digits, space
    # and a small set of punctuation marks.
    ALLOWED_CHARS = frozenset('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "-:;().,?!')

    def __init__(self, tokenizer):
        """Collect the allowed token ids from ``tokenizer``'s vocabulary."""
        allowed = set()
        for token, token_id in tokenizer.get_vocab().items():
            # U+2581 is the SentencePiece word-boundary marker; treat it as
            # the space it stands for.
            # NOTE(review): assumes a SentencePiece-style vocabulary - confirm
            # for the tokenizer in use.
            text = token.replace("\u2581", " ")
            if text and set(text) <= self.ALLOWED_CHARS:
                allowed.add(token_id)
        # Without EOS the model could never stop before max_new_tokens.
        if tokenizer.eos_token_id is not None:
            allowed.add(tokenizer.eos_token_id)
        self.allowed_tokens = allowed
        # Blocked-token masks keyed by (vocab_size, device), built once and
        # reused on every generation step.
        self._blocked_masks = {}

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores`` with every disallowed token set to -inf.

        Args:
            input_ids: Token ids generated so far (unused).
            scores: Next-token scores whose last dimension is the vocabulary.

        Returns:
            A tensor shaped like ``scores`` in which only allowed tokens keep
            their original score.
        """
        vocab_size = scores.size(-1)
        cache_key = (vocab_size, scores.device)
        blocked = self._blocked_masks.get(cache_key)
        if blocked is None:
            # Start with everything blocked, then open up the allowed ids that
            # actually exist in this score vector.
            blocked = torch.ones(vocab_size, dtype=torch.bool, device=scores.device)
            valid_ids = [i for i in self.allowed_tokens if 0 <= i < vocab_size]
            if valid_ids:
                index = torch.tensor(valid_ids, dtype=torch.long, device=scores.device)
                blocked[index] = False
            self._blocked_masks[cache_key] = blocked

        return scores.masked_fill(blocked, -float("inf"))

# Create the logits processor from the current tokenizer and wrap it in the
# list type expected by the logits_processor argument of generate().
# Note: the generate(...) call made later in this notebook passes
# logits_processor=None, so this list is not applied unless passed explicitly.
logits_processor = LatinScriptLogitsProcessor(tokenizer)
logits_processor_list = LogitsProcessorList([logits_processor])

In [None]:
# diversity_penalty needs to be a float (not int) - e.g., 1.0 rather than 1
def generate(model_input,max_new_tokens=200,temperature=0.5,top_p=0.9,top_k=50,repetition_penalty=1,renormalize_logits=False,logits_processor=None,
    num_beams=1,do_sample=True,penalty_alpha=0,no_repeat_ngram_size=0,diversity_penalty=0.0):
    """Generate text for a single prompt with the notebook's model and tokenizer.

    Args:
        model_input: The prompt string.
        max_new_tokens, temperature, top_p, top_k, repetition_penalty,
        renormalize_logits, logits_processor, num_beams, do_sample,
        penalty_alpha, no_repeat_ngram_size, diversity_penalty: Forwarded
            unchanged to ``model.generate``. ``diversity_penalty`` must be a
            float (e.g. 1.0 rather than 1).

    Returns:
        A one-element list with the decoded output sequence, special tokens
        removed. The decoded sequence is whatever ``model.generate`` returns.
    """
    # Padding needs a pad token; fall back to EOS only when none is set, so an
    # existing pad token on the shared tokenizer is not overwritten.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the prompt as a batch of one and move it to the device the
    # model lives on, rather than assuming "cuda".
    inputs = tokenizer([model_input], padding = True, return_tensors="pt").to(model.device)

    # Run generation with the requested decoding settings.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        renormalize_logits=renormalize_logits,
        logits_processor=logits_processor,
        num_beams=num_beams,
        penalty_alpha=penalty_alpha,
        no_repeat_ngram_size=no_repeat_ngram_size,
        diversity_penalty=diversity_penalty
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Generate a completion for the first training prompt. The row is indexed
# first so only that one record is read instead of the whole column.
generated = generate(
    tokenized_train[0]['model_inputs'],
    max_new_tokens=150,
    temperature=0.5,
    repetition_penalty=1.2,
    renormalize_logits=False,
    logits_processor=None,
    top_k=50,
    no_repeat_ngram_size=4,
)

In [None]:
# Compare the prompt, its reference completion and the model output for the
# first training example - the example the generation was run on - instead
# of printing every record in the training set.
first_example = tokenized_train[0]
print(first_example['model_inputs'])
print(first_example['completion'])
print(generated)

In [None]:
import boto3
from botocore.exceptions import ClientError

# S3 client used by upload_file. Credentials are read from the standard AWS
# environment variables instead of being written into the notebook. When a
# variable is unset, None is passed and boto3 falls back to its default
# credential lookup.
import os

s3 = boto3.client('s3',
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

def upload_file(file_name, bucket, object_name=None):
    """Upload a local file to an S3 bucket using the module-level ``s3`` client.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Destination key inside the bucket. Defaults to
            ``file_name`` when omitted or None.

    Returns:
        True if the upload succeeded, False if the client raised a
        ``ClientError`` (the error is printed).
    """
    # Use the local path as the object key when no key is given.
    if object_name is None:
        object_name = file_name
    try:
        s3.upload_file(file_name, bucket, Key=object_name)
    except ClientError as e:
        # Best effort: report the failure and let the caller carry on.
        print(e)
        return False
    print("Success!")
    return True

In [None]:
# Files to copy from the trained-model directory to S3.
# NOTE(review): presumably the artifacts written by save_pretrained for the
# adapter and tokenizer - confirm the list matches the directory contents.
files = [
    'tokenizer_config.json',
    'special_tokens_map.json',
    'tokenizer.model',
    'tokenizer.json',
    'README.md',
    'adapter_model.safetensors',
    'adapter_config.json',
]

# The local directory and the remote key prefix share the same per-language folder name.
local_dir = "/root/trained_model_" + language + "/"
remote_prefix = "MT/experiments/Demo_Laura/trained_models/trained_model_" + language + "/"

for file in files:
    print(file)
    upload_file(local_dir + file, "aqua-ml-data", remote_prefix + file)

In [None]:
# Release cached, unused GPU memory now that training and upload are finished.
torch.cuda.empty_cache()