# QLoRA 4-bit quantization and fine-tuning of OpenLLaMA 7B v2

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q sentencepiece
!pip install -q huggingface_hub
!pip install einops

First, let's load OpenLLaMA 7B v2 in 4-bit precision.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,LlamaForCausalLM, LlamaTokenizer, GenerationConfig

# Hugging Face Hub repository that both the tokenizer and the model weights are loaded from.
model_id = "openlm-research/open_llama_7b_v2"

# Quantization settings passed to `from_pretrained`: 4-bit loading, double quantization,
# the "nf4" quantization type, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# The OpenLLaMA model card advises against the auto-converted fast tokenizer because it can
# produce incorrect tokenizations, so explicitly request the slow (sentencepiece) tokenizer.
# The tokenizer is loaded from `model_id` so tokenizer and model cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

# device_map={"": 0} assigns the root module, and with it the whole model, to device 0.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

In [None]:
from peft import prepare_model_for_kbit_training
# Run the quantized model through PEFT's k-bit training preparation helper and rebind
# `model` to the result.
# The cell feeds `model` back into itself, so it is not idempotent: re-running it applies the
# helper to an already-prepared model. Re-run the loading cell first for a clean start.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, then prints both
    totals and the trainable share as a percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for a causal-LM task. Adapters target only the modules named
# "q_proj" and "v_proj"; see the PEFT LoraConfig documentation for the meaning of
# r / lora_alpha / lora_dropout / bias.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapter configuration (rebinding `model`), then report how many
# parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
import transformers

# The data collator needs a padding token (presumably the Llama tokenizer ships without one).
# Reuse the existing unknown token instead of adding a new '[PAD]' entry: a newly added token
# gets an id one past the end of the model's embedding matrix, which is never resized in this
# notebook, so any batch that actually contains padding would index out of range.
# Using the EOS token as padding would also avoid the new id, but the language-modeling
# collator masks pad positions out of the labels, which would hide every EOS from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token

trainer = transformers.Trainer(
    model=model,
    # NOTE(review): `data` is not defined in any visible cell; it must be loaded and tokenized
    # (with a 'train' split) before this cell can run on a fresh kernel.
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size` replaces it.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=12,
        warmup_steps=2,
        num_train_epochs=12,
        learning_rate=1e-5,
        # NOTE(review): the quantization config uses bfloat16 as compute dtype; consider
        # bf16=True instead of fp16=True on hardware that supports bfloat16.
        fp16=True,
        logging_steps=2,
        save_total_limit=3,
        output_dir="./checkpoints",
        optim="paged_adamw_8bit",
    ),
    # mlm=False selects causal (next-token) language modeling rather than masked LM.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()



In [None]:
# save model
# Writes the trainer's model to ./directory. The reload cell further down passes
# './directory/' to PeftModel.from_pretrained, so keep the two paths in sync.
trainer.save_model("./directory")

In [None]:
# Evaluate Model
# Baseline: generate a completion for the prompt using the base weights only.
import contextlib

prompt = """
Extract the Brithdate in the text and output it in ISO8601 Format, show only the date of birth: text

Output:
"""

input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

# The Trainer leaves the model in training mode; switch to eval mode so dropout is inactive.
model.eval()

# By this point `model` has been wrapped with LoRA adapters (and possibly fine-tuned), so the
# adapters must be switched off for the output to really be the original model's.
# If `model` is not a PEFT model, fall back to a no-op context.
adapters_off = model.disable_adapter() if hasattr(model, "disable_adapter") else contextlib.nullcontext()
with adapters_off:
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=11, num_beams=1), pad_token_id=tokenizer.eos_token_id)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)


# Separator line of 99 dashes (same length the original join over 100 empty strings produced).
dash_line = '-' * 99
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,LlamaForCausalLM, LlamaTokenizer, GenerationConfig
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig
model_id = "openlm-research/open_llama_7b_v2"

# Reload the base model unquantized in bfloat16 and move it to the GPU.
peft_model_base = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to('cuda')

# The tokenizer is loaded from the Hub repo id; AutoTokenizer.from_pretrained expects a
# name or path, not the model object. The OpenLLaMA model card advises against the
# auto-converted fast tokenizer, so request the slow one explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

# Attach the LoRA adapter saved under ./directory to the base model, for inference only.
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       './directory/',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [None]:
# Evaluate Model
# Generate a completion with the reloaded base model plus the saved LoRA adapter, then print it.
prompt = """
Extract the Brithdate in the text and output it in ISO8601 Format: text

Output:
"""

# Tokenize the prompt and move the resulting tensors to the GPU.
input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

# Greedy decoding (a single beam), capped at 11 new tokens; EOS doubles as the pad id.
generation_settings = GenerationConfig(max_new_tokens=11, num_beams=1)
original_model_outputs = peft_model.generate(
    input_ids=input_ids,
    generation_config=generation_settings,
    pad_token_id=tokenizer.eos_token_id,
)
original_model_text_output = tokenizer.decode(
    original_model_outputs[0],
    skip_special_tokens=True,
)

# Separator line of 99 dashes.
dash_line = '-' * 99
print(dash_line)
print(f'PEFT MODEL:\n{original_model_text_output}')