In [None]:
from datetime import datetime

import torch
import transformers
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

In [None]:
# Hugging Face model id of the base checkpoint that gets fine-tuned.
model_path = "BUT-FIT/CSTinyLlama-1.2B"

# Tokenizer capped at 512 tokens; padding reuses the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512)
tokenizer.pad_token = tokenizer.eos_token

# Device used for tensors that are moved by hand (the model itself is placed
# by device_map="auto" below).
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantized base model.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

In [None]:
# Fixed Czech prompt fragments. eval_prompt is assembled as
# context_start + question + context_end, and context_end is also the marker
# used to cut the answer out of the generated text.
# Roughly: "Answer the tax question as an expert. Question: " / " Answer: ".
context_start = 'Odpověz na daňovou otázku jako odborník. Otázka: '
context_end = ' Odpověď: '

In [None]:
# Load the 'train' split of the dataset stored under ../data/dataset/.
# Later cells read the "question" and "answer" fields of each record.
train_dataset = load_dataset('../data/dataset/', split='train')

In [None]:
def tokenize(prompt):
    """Encode ``prompt`` into a fixed 512-token example with labels.

    The notebook-level ``tokenizer`` truncates or pads the text to 512 tokens.
    ``labels`` is a copy of ``input_ids`` (causal-LM setup: inputs and targets
    are the same sequence).

    Args:
        prompt: text to encode.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry.
    """
    encoded = tokenizer(prompt, padding="max_length", truncation=True, max_length=512)
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Build the training text for one dataset record and tokenize it.

    The text is ``context_start + question + context_end + answer``, i.e. the
    exact prompt layout used at inference time followed by the reference
    answer. Training and inference must share one template: the generation
    cell cuts the answer out of the output by splitting on ``context_end``,
    so that marker has to be what the model learned to continue from.
    Building the text on a single line also keeps source-code indentation and
    trailing spaces out of the training examples.

    Args:
        data_point: mapping with ``"question"`` and ``"answer"`` fields.

    Returns:
        The result of ``tokenize`` for the assembled text.
    """
    question = data_point["question"]
    answer = data_point["answer"]
    full_prompt = f"{context_start}{question}{context_end}{answer}"
    return tokenize(full_prompt)

In [None]:
# Run the prompt builder over every training record; the values it returns
# (tokenizer outputs plus "labels") become the tokenized training set.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)

In [None]:
### Evaluate a prompt

In [None]:
# Sanity-check prompt: the first training question wrapped in the fixed prompt
# fragments, with the answer left for the model to generate.
eval_prompt = context_start + train_dataset[0]['question'] + context_end

In [None]:
# Display the assembled prompt as the cell output.
eval_prompt

In [None]:
# Baseline: sample an answer from the base (not yet fine-tuned) model.

# Tokenized prompt, kept for the direct `generate` call on the fine-tuned
# model at the end of the notebook.
model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)
model.eval()
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)

with torch.autocast('cuda', dtype=torch.float16):
    generated_text = pipe(
        eval_prompt,
        max_new_tokens=256,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        use_cache=True,
    )[0]['generated_text']

# Split on the FIRST context_end only (the one that closes the prompt): an
# unbounded split would silently drop everything after a second " Odpověď: "
# if the model happens to generate the marker again.
print(generated_text.split(context_end, 1)[1])

In [None]:
## Fine-tuning the model

In [None]:
# Turn on gradient checkpointing, then let PEFT prepare the quantized model
# for training.
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing by default, which would make the first call redundant —
# confirm against the installed PEFT version.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """Report how many of ``model``'s parameters require gradients.

    Prints, on one line, the number of trainable elements, the total number
    of elements, and the trainable share in percent.
    """
    # (element count, requires_grad) for every named parameter.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
# LoRA adapter settings: rank 8, alpha 32, dropout 0.1, no bias training.
# Adapters are attached to the seven *_proj modules and to lm_head.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        *(f"{prefix}_proj" for prefix in ("q", "k", "v", "o", "gate", "up", "down")),
        "lm_head",
    ],
)

# Wrap the prepared model with the adapters and report what will be trained.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
# Run naming and checkpoint location.
project = "finance-finetune"
base_model_name = "cs-tiny-llama"
run_name = f"{base_model_name}-{project}"
output_dir = f"../output/{run_name}"

# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Optimisation settings: 2 samples per device x 4 accumulation steps per
# optimizer update, fp16 mixed precision, paged 8-bit AdamW.
training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    run_name=run_name + "-" + datetime.now().strftime('%Y-%m-%d-%H-%M'),  # timestamped run label
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2.5e-5,
    warmup_steps=5,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_dir="./logs",   # where training logs are written
    logging_steps=50,       # log every 50 steps
    save_strategy="steps",
    save_steps=50,          # write a checkpoint every 50 steps
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    # Causal-LM collation (no masked-language-modelling).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Silences the cache warnings during training; re-enable for inference.
model.config.use_cache = False

In [None]:
# Run fine-tuning; checkpoints are saved under output_dir every 50 steps
# (see save_steps in the training arguments).
trainer.train()

In [None]:
## Load the fine-tuned model and evaluate the prompt

In [None]:
# Pick up the newest checkpoint the Trainer wrote for THIS run, instead of a
# hard-coded path that belongs to a different project.
checkpoint_dir = transformers.trainer_utils.get_last_checkpoint(output_dir)
if checkpoint_dir is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found in {output_dir!r}; run the training cell first."
    )

# Reload a clean quantized base model: `model` has already been wrapped with
# LoRA adapters by get_peft_model, so the saved adapter is attached to a fresh
# copy rather than stacked on top of the training instance.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
ft_model = PeftModel.from_pretrained(base_model, checkpoint_dir)
ft_model.eval()

# Re-tokenize here so the cell does not rely on tensors created much earlier.
model_input = tokenizer(eval_prompt, return_tensors="pt").to(base_model.device)

with torch.no_grad():
    # Sampling controls belong to generate(); passed to tokenizer.decode() they
    # have no effect on generation. temperature only applies when sampling, so
    # do_sample=True is set explicitly (output is therefore stochastic).
    output_ids = ft_model.generate(
        **model_input,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.2,
        repetition_penalty=1.5,
        pad_token_id=tokenizer.pad_token_id,  # taken from the tokenizer, not a magic id
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))