# Expert Fine-tuning
This notebook is used to fine-tune individual expert models.

In [1]:
import torch
import pandas as pd
from functools import partial
from pathlib import Path
from huggingface_hub import constants as hub_c
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from transformers import Trainer, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling
from transformers.models.llama.modeling_llama import LlamaForCausalLM

from evals import evaluate, load_eval_dataset
from utils import DatasetArgs, get_dataset_args

# --- Global notebook setup -------------------------------------------------
# Everything defined in this cell (device, seed, model_id, tokenizer, args,
# dataset_id, base_model) is read as a global by the cells that follow.

# Fail fast: the rest of the notebook moves models to the GPU.
assert torch.cuda.is_available(), "CUDA not available"
device = torch.device("cuda")

# Single seed shared by torch's RNG here and by the train/test split later on.
seed = 42
torch.manual_seed(seed)

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Batching needs a padding token; fall back to EOS when the tokenizer
# does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Per-dataset settings container; later cells index its fields (paths,
# prompt_templates, prompt_args, id2labels, columns, token_opts, ...) by
# dataset id. NOTE(review): presumably datasets are resolved relative to the
# HF hub cache directory passed here - confirm in utils.get_dataset_args.
args = get_dataset_args(tokenizer, Path(hub_c.HF_HUB_CACHE))
# Dataset the expert is fine-tuned on; also used in the output directory name.
dataset_id = "Topics"

# Base weights are loaded in half precision.
base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

## Dataset preprocessing

In [3]:
def train_preprocess(args: DatasetArgs, example: dict):
    """Turn one raw dataset row into a tokenized training sequence.

    The prompt template of the currently selected dataset (module-level
    ``dataset_id``) is filled with the row's prompt fields, the textual label
    is appended directly after it, and the concatenation is tokenized with the
    module-level ``tokenizer``.

    Args:
        args: Per-dataset settings; ``prompt_args``, ``prompt_templates`` and
            ``id2labels`` are looked up by ``dataset_id``.
        example: A single row; must contain every prompt field plus ``"label"``.

    Returns:
        The tokenizer output for ``prompt + target``. Truncation is disabled,
        so the result may be arbitrarily long.
    """
    # Collect the template arguments in the order the template expects them.
    field_values = tuple(example[name] for name in args.prompt_args[dataset_id])
    template = args.prompt_templates[dataset_id]
    prompt_text = template.format(*field_values)

    # Map the label id of this row to its textual form.
    label_text = args.id2labels[dataset_id][example["label"]]

    return tokenizer(prompt_text + label_text, truncation=False)

In [None]:
# Read the raw training split; delimiter and column names are dataset-specific.
dataset_path = args.paths[dataset_id]
df_dataset = pd.read_csv(
    dataset_path / "train.csv",
    delimiter=args.del_mapping[dataset_id],
    names=args.names_mapping[dataset_id],
)

# Bind the settings object so the mapper only receives the example row.
preprocess_func = partial(train_preprocess, args)

# Tokenize row by row and drop the original columns listed for this dataset.
dataset = Dataset.from_pandas(df_dataset)
dataset = dataset.map(
    preprocess_func,
    batched=False,
    remove_columns=args.columns[dataset_id],
)

# Tokenization does not truncate, so over-long samples are discarded here.
dataset = dataset.filter(
    lambda row: len(row["input_ids"]) <= args.max_length
)

# Hold out 10% of the remaining samples for evaluation during training.
dataset = dataset.train_test_split(test_size=0.1, seed=seed)

## LoRA Setup

In [None]:
# LoRA adapter settings for causal language modelling; adapters are attached
# to the modules named "q_proj" and "v_proj" only.
peft_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the adapters and report how many parameters train.
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

## Trainer setup

In [None]:
# Collator for causal (non-masked) language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Checkpoints for this expert are written below this directory.
out_dir = Path(rf"D:/models/expert-Llama-3_2-1B-{dataset_id}")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=12,
    # 8 samples x 4 accumulation steps = 32 samples per optimizer step
    # on each device.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    weight_decay=0.01,
    warmup_steps=128,
    logging_steps=32,
    # Saving and evaluation share the same cadence so that every saved
    # checkpoint has an evaluation loss attached to it.
    save_steps=128,
    save_strategy="steps",
    eval_steps=128,
    eval_strategy="steps",
    # Tie the trainer's RNG to the notebook-wide seed instead of relying on
    # the library default happening to match it.
    seed=seed,
    # Track the checkpoint with the lowest evaluation loss and restore it
    # when training finishes; its path is then available as
    # trainer.state.best_model_checkpoint.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
# Checkpoints are written under the trainer's output_dir every save_steps
# optimizer steps; the returned training summary is displayed as cell output.
trainer.train()

## Evaluation

In [None]:
# Adapter checkpoint to evaluate.
# NOTE(review): no visible cell creates a "checkpoint-best" directory;
# presumably the preferred checkpoint is copied or renamed by hand - confirm.
expert_dir = Path(rf"D:/models/expert-Llama-3_2-1B-{dataset_id}")
ckpt_path = expert_dir / "checkpoint-best"

# Reload the base weights from scratch and switch to inference mode.
base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")
base_model = base_model.eval()

# Attach the trained adapter, then move the combined model to the GPU.
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16")
expert_model = expert_model.eval()
expert_model = expert_model.to(device)

In [None]:
# Benchmark to evaluate on. This rebinds the notebook-wide dataset_id, so
# cells run afterwards will see "FPB".
dataset_id = "FPB"

# Build the evaluation set for that benchmark.
testset = load_eval_dataset(tokenizer, dataset_id, args)

# Score the expert; the per-dataset token options are passed alongside.
token_options = args.token_opts[dataset_id]
results = evaluate(expert_model, tokenizer, testset, token_options)