In [1]:
import os
import torch
from datasets import load_dataset, Dataset
import pandas as pd
import transformers
from transformers import AutoTokenizer
from trl import SFTTrainer
import transformers



In [2]:
# Hugging Face id of the base checkpoint that is loaded and fine-tuned in this notebook.
base_model = 'mistralai/Mixtral-8x7B-Instruct-v0.1'
# Local directory used as the training output_dir and for the saved LoRA adapter.
lora_output = 'HajjLLM_Mixtal_Lora'
# Local directory the merged model and tokenizer are saved to (also read by the llama.cpp steps).
full_output = 'HajjLLM_Mixtal_Merged'
# NOTE(review): not referenced anywhere in the visible notebook, and named after Zephyr
# although base_model is Mixtral — confirm whether it is still needed.
gguf_output = 'KUETLLM_Zephyr7b_gguf'
# Hub repo the quantized GGUF file is uploaded to.
# NOTE(review): repo name refers to KUET/Zephyr while the local outputs are named HajjLLM — confirm.
gguf_repo = "shahidul034/KUETLLM_Zephyr7b_gguf"
# Hub repo the merged model and tokenizer are pushed to.
merged_repo = "shahidul034/KUETLLM_zephyr_base"
# NOTE(review): not referenced in the visible notebook; the model load uses device_map='auto'.
DEVICE = 'cuda'

In [3]:
from huggingface_hub import login

# Never paste access tokens into a notebook: they end up in version control and in
# shared copies. The token is read from the HF_TOKEN environment variable; when it is
# not set, token=None makes login() fall back to its interactive prompt.
# NOTE(review): the tokens that were previously hard-coded in this cell should be
# treated as leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rtx3090/.cache/huggingface/token
Login successful


In [4]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# The data collator created later pads batches with this tokenizer, which fails when
# no padding token is defined. Fall back to the EOS token in that case; a tokenizer
# that already has a pad token is left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [5]:
### huggingface dataset with Prompt, Answer pair

# data = load_dataset("arbitropy/kuetdata", split="train")
# data_df = data.to_pandas()

### Read the CSV with Prompt, Answer pairs.
# The location can be overridden with the HAJJ_DATA_CSV environment variable so the
# notebook is not tied to one machine's absolute path (the recorded run failed with
# FileNotFoundError on this path). The original path remains the default.
data_location = os.environ.get("HAJJ_DATA_CSV", r"/Data/Hajj.csv")
data_df = pd.read_csv(data_location, encoding='unicode_escape')

### Formatting helper: renders one Prompt/Answer row through the tokenizer's chat template
def formatted_text(x):
    """Return one training example as a single chat-formatted string.

    x is indexed with the keys "Prompt" and "Answer". The conversation consists of a
    fixed system message, the prompt as the user turn and the answer as the assistant
    turn. The tokenizer's chat template renders it as text (tokenize=False) without a
    trailing generation prompt.

    NOTE(review): the system message refers to KUET while the data file is Hajj.csv —
    confirm the wording is intended.
    """
    system_turn = {"role": "system", "content": "You are a KUET authority managed chatbot, help users by answering their queries about KUET."}
    user_turn = {"role": "user", "content": x["Prompt"]}
    assistant_turn = {"role": "assistant", "content": x["Answer"]}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )

### Add the chat-formatted "text" column, show the first row, wrap the frame as a Dataset
data_df["text"] = data_df[["Prompt", "Answer"]].apply(formatted_text, axis=1)
print(data_df.iloc[0])
dataset = Dataset.from_pandas(data_df)

FileNotFoundError: [Errno 2] No such file or directory: '/Data/Hajj.csv'



* Quantize base model

* Fine-tune adapters

* Merge Adapters to dequantized model

In [7]:

# Load the base model quantized to 4-bit with bitsandbytes.
# The quantization settings are passed as an explicit BitsAndBytesConfig; handing
# load_in_4bit / bnb_4bit_* directly to from_pretrained is the deprecated form.
# The settings themselves are unchanged (4-bit weights, float16 compute).
# NOTE(review): the recorded run of this cell failed because device_map='auto' had to
# place some modules on the CPU or disk, i.e. the quantized model did not fit in GPU
# memory on that machine.
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    device_map='auto',
)

ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

In [None]:
# Print the module tree so the layer names usable as LoRA target_modules can be read off.
print(model)

### Setup model with adapters

In [None]:
# Set PEFT adapter config (r=16, lora_alpha=32)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# target_modules lists the layer names that receive LoRA adapters; names that do not
# occur in the loaded model simply match nothing. The previous list only carried the
# Zephyr/Mistral MLP names (gate_proj/up_proj/down_proj), so on a Mixtral base model
# only the attention projections would have been adapted. The Mixtral expert layers
# (w1/w2/w3) are added so the "all linear layers" intent holds for both model families.
# NOTE(review): verify the layer names against the print(model) output for the
# installed transformers version.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections (Zephyr/Mistral naming)
        "w1", "w2", "w3",                         # expert MLP projections (Mixtral naming)
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM")

In [None]:
# Prepare the quantized model for k-bit training (stabilize output layer and layernorms).
# The second parameter of prepare_model_for_kbit_training is the
# use_gradient_checkpointing flag, not a bit width: the value 8 that used to be passed
# positionally only acted as a truthy flag. It is now passed by name.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [None]:
# Wrap the prepared model with the LoRA adapters described by `config` (last setup step).
# `model` is rebound to the wrapped model, so this cell is meant to be run once per load.
model = get_peft_model(model, config)

### Finetune

In [None]:
# Set Hyperparameters
# BATCH_SIZE, GRAD_ACC, OPTIMIZER and LR are passed to TrainingArguments further down.
MAXLEN=512                    # maximum example length — NOTE(review): presumably counted in tokens; confirm
BATCH_SIZE=12                 # per-device train batch size
GRAD_ACC=1                    # gradient accumulation steps
OPTIMIZER='paged_adamw_8bit' # save memory
LR=5e-06                      # learning rate. NOTE(review): the original remark called this "close to LoRA standard"; LoRA recipes commonly use far larger rates — confirm the value is intended

In [None]:
# Setup Callbacks 
# early_stop = transformers.EarlyStoppingCallback(10, 1.15)      # hard to fine-tune further on general tasks like assistance

In [None]:
# Training arguments for the fine-tuning run; checkpoints go to lora_output.
training_config = transformers.TrainingArguments(
    output_dir=lora_output,
    num_train_epochs=2,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    learning_rate=LR,
    optim=OPTIMIZER,
    fp16=True,  # consider compatibility when using bf16
    logging_steps=10,
    remove_unused_columns=False,
)

In [None]:
# Collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Setup trainer
# tokenizer: pass the same tokenizer the collator uses instead of letting the trainer
#            load a second copy on its own.
# max_seq_length: MAXLEN was defined with the hyperparameters but never applied, so the
#            trainer silently used its own default sequence length.
trainer = SFTTrainer(
    model=model,
    args=training_config,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_text_field="text",
    max_seq_length=MAXLEN,
    # callbacks=[early_stop],  # need to learn, lora easily overfits
)

In [None]:
# Turn the model's cache off for training, then run the fine-tuning loop.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Save the trained model to lora_output; the merge section loads the adapter config
# and adapter weights back from this directory.
trainer.save_model(lora_output)


## Merge Adapters to dequantized model

In [None]:
# Get peft config
# Reads the adapter configuration saved in lora_output.
# NOTE: this rebinds `config`, which previously held the LoraConfig used for training;
# the following cells read config.base_model_name_or_path from it.
from peft import PeftConfig
config = PeftConfig.from_pretrained(lora_output)

In [None]:
# Reload the base checkpoint named in the adapter config. No quantization arguments
# are passed here, unlike the load used for training.
model = transformers.AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
)

In [None]:
# Load the Lora model
# Attaches the adapter saved in lora_output to the freshly loaded base model.
from peft import PeftModel
model = PeftModel.from_pretrained(model, lora_output)

In [None]:
# Get tokenizer
# Loaded from the same base checkpoint the adapter config points at; it is saved and
# pushed together with the merged model below.
tokenizer = transformers.AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [None]:
# Merge the LoRA adapter into the base model and unload the adapter wrappers;
# merged_model is what gets saved and pushed in the following cells.
merged_model = model.merge_and_unload()

In [None]:
# Write the merged model and its tokenizer to full_output; the llama.cpp conversion
# step reads from this directory.
merged_model.save_pretrained(full_output)
tokenizer.save_pretrained(full_output)

In [None]:
# ! huggingface-cli download arbitropy/unq_zephyer_kuetllm_lora_merged --local-dir KUETLLM_zephyr7b_unqantized

In [None]:
# Run llama.cpp's convert.py on the merged model directory (full_output is interpolated
# into the shell command by IPython). Assumes a llama.cpp checkout in the working directory.
! python ./llama.cpp/convert.py ./{full_output}

In [None]:
# Quantize the converted f32 GGUF file to q4_k_m with llama.cpp's quantize binary.
# change model name
# NOTE(review): the output file name still refers to zephyr/kuetllm; the upload cell
# reads the same file name, so rename both together.
! ./llama.cpp/quantize ./{full_output}/ggml-model-f32.gguf zephyr_q4km_kuetllm.gguf q4_k_m

In [27]:
# push model to hub
# Uploads the merged model and then the tokenizer to merged_repo.
merged_model.push_to_hub(merged_repo)
tokenizer.push_to_hub(merged_repo)

model-00004-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.25G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shahidul034/KUETLLM_zephyr_base/commit/1a2ff18bbf101341babc28a3e39dc942052c2ce9', commit_message='Upload tokenizer', commit_description='', oid='1a2ff18bbf101341babc28a3e39dc942052c2ce9', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
# Upload the quantized GGUF file to gguf_repo under the same file name.
# Author's note: doesn't work due to slow network?
from huggingface_hub import HfApi

api = HfApi()

# api.create_repo(model_id, exist_ok=True, repo_type="model")
api.upload_file(
    path_or_fileobj="zephyr_q4km_kuetllm.gguf",
    path_in_repo="zephyr_q4km_kuetllm.gguf",
    repo_id=gguf_repo,
)

zephyr_q4km_kuetllm.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

'https://huggingface.co/shahidul034/KUETLLM_Zephyr7b_gguf/blob/main/zephyr_q4km_kuetllm.gguf'