In [None]:
%%capture
# Environment bootstrap: system packages first, then the Python training stack.
# All output of this cell is suppressed by the %%capture magic above.
!sudo apt-get -y update
!sudo apt-get -y install build-essential
# NOTE(review): unsloth is installed three times in this cell — from PyPI (next line),
# from the GitHub main branch without dependencies (line after), and finally pinned to
# 2025.1.1 further down. The last install presumably determines the version in use, so the
# first two look redundant — confirm before removing, since they may pull in dependencies.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install wandb
# PyTorch + xformers wheels are taken from the CUDA 12.1 wheel index.
!pip install -q torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install -q 'unsloth==2025.1.1'
# transformers is removed and re-installed at an exact version.
!pip uninstall -q transformers -y
!pip install -q 'transformers==4.47.1'
# trl is force-reinstalled without its dependencies and capped below 0.15.0.
!pip uninstall trl -y && pip install --no-cache-dir --force-reinstall --no-deps "trl<0.15.0"

In [None]:
# Experiment configuration shared by the cells below.
MODEL_ID = "unsloth/Qwen2.5-14B-Instruct"  # Hub repo id of the base model
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]   # repo name with the org prefix stripped
PROJECT = "NER_SFT"                        # W&B project the run is logged to
R = 16                                     # LoRA rank handed to train_and_evaluate

In [None]:
import os
from getpass import getpass

# Credentials are never stored in the notebook: they are taken from the kernel's
# environment, and the user is prompted (input hidden) for any that are missing.
# NOTE(review): earlier revisions of this notebook embedded live Hugging Face and W&B
# tokens; treat those as leaked and rotate them.
#
# Values are written to os.environ rather than set with `!export ...`, because each `!`
# line runs in its own subshell and its exports never reach the kernel process.
for _secret_name in ("HF_TOKEN", "WANDB_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

# Kept as a module-level name because later cells pass it explicitly as `token=`.
HF_TOKEN = os.environ["HF_TOKEN"]

# Tell the W&B integration not to upload model checkpoints as artifacts.
os.environ["WANDB_LOG_MODEL"] = "false"

In [None]:
import gc
import os

import pandas as pd
import wandb
import torch

from tqdm import tqdm
from datasets import Dataset
from transformers import TrainingArguments, EarlyStoppingCallback
from unsloth import is_bfloat16_supported, FastLanguageModel
from trl import SFTTrainer

In [3]:
from datasets import load_dataset

# Fetch the training/validation data and the held-out test data from the Hub.
train_datasets = load_dataset("pofce/qwen-ukrainian-ner-train-dataset")
test_datasets = load_dataset("pofce/ukrainian-ner-test-dataset")

# Pick out the individual splits used for fine-tuning, early stopping and generation.
train_dataset = train_datasets["train"]
val_dataset = train_datasets["validation"]
test_dataset = test_datasets["test"]

README.md:   0%|          | 0.00/580 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/574k [00:00<?, ?B/s]

full_train-00000-of-00001.parquet:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8879 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2282 [00:00<?, ? examples/s]

Generating full_train split:   0%|          | 0/11161 [00:00<?, ? examples/s]

In [None]:
def _generate_answers(model, tokenizer, prompts):
    """Generate one completion per prompt and collect prompt/answer records.

    Args:
        model: Model to generate with; the caller switches it to inference mode first.
        tokenizer: Tokenizer paired with ``model``.
        prompts: Iterable of prompt strings.

    Returns:
        list[dict]: One ``{"prompt": ..., "generated_answer": ...}`` record per prompt.
    """
    records = []
    for prompt in tqdm(prompts):
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=512)
        # NOTE(review): the whole output sequence is decoded, so for a decoder-only model
        # `generated_answer` presumably contains the prompt text followed by the
        # continuation — confirm that downstream parsing expects this.
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        records.append({"prompt": prompt, "generated_answer": generated_text})
    return records


def train_and_evaluate(r_value, max_seq_length=2048):
    """Fine-tune ``MODEL_ID`` with a LoRA adapter of rank ``r_value`` and answer the test prompts.

    The run is logged to the W&B project ``PROJECT``. Training uses the module-level
    ``train_dataset`` / ``val_dataset`` with per-epoch evaluation and early stopping, the
    result is pushed to the Hub with ``HF_TOKEN``, and finally one completion is generated
    for every entry of ``test_dataset["prompts"]``.

    Args:
        r_value: LoRA rank passed to ``FastLanguageModel.get_peft_model``.
        max_seq_length: Maximum sequence length used for both model loading and the trainer.

    Returns:
        pandas.DataFrame: Columns ``prompt`` and ``generated_answer``, one row per test prompt.

    Raises:
        Any exception raised while loading, training, pushing or generating is propagated
        after the W&B run has been closed as failed and model references were released.
    """
    # The API key is read from the environment instead of being embedded in the notebook.
    # With no key set, wandb falls back to its own credential lookup.
    wandb.login(key=os.environ.get("WANDB_API_KEY"))
    wandb.init(project=PROJECT, config={"model_id": MODEL_NAME, "r": r_value})

    model = tokenizer = trainer = None
    failed = True
    try:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = MODEL_ID,
            max_seq_length = max_seq_length
        )

        model = FastLanguageModel.get_peft_model(
            model,
            r = r_value,
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
            lora_alpha = 16,
            lora_dropout = 0,
            bias = "none",
            use_gradient_checkpointing = "unsloth",
            random_state = 3407,
            use_rslora = False,
            loftq_config = None,
        )

        args = TrainingArguments(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_ratio=0.03,
            num_train_epochs=2,
            learning_rate=2e-4, # ToDo: Adjust on best model
            # Use bf16 when the hardware supports it, otherwise fall back to fp16.
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=30,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir=f"outputs/{MODEL_NAME}_r_{r_value}",
            report_to="wandb",
            # Evaluate and checkpoint once per epoch so the best checkpoint (lowest loss)
            # can be restored at the end and early stopping has a metric to watch.
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=5,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            dataset_text_field="text",
            max_seq_length=max_seq_length,
            dataset_num_proc=2,
            packing=False,
            args=args,
        )

        # Stop as soon as one evaluation fails to improve on the best loss.
        trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=1))
        trainer.train()

        # NOTE(review): the first positional parameter of Trainer.push_to_hub appears to be
        # the commit message, so this string is probably not used as the repository name
        # (which would then be derived from `output_dir`) — confirm the intended target repo.
        trainer.push_to_hub(f"{MODEL_NAME}_{r_value}", token=HF_TOKEN)

        FastLanguageModel.for_inference(model)
        results = _generate_answers(model, tokenizer, test_dataset["prompts"])

        df = pd.DataFrame(results)
        failed = False
    finally:
        # Drop every reference to the model (the trainer holds one too) and collect
        # garbage *before* flushing the CUDA cache; otherwise nothing can be released.
        trainer = model = tokenizer = None
        gc.collect()
        torch.cuda.empty_cache()
        # Always close the W&B run, marking it failed if an exception is propagating.
        wandb.finish(exit_code=1 if failed else 0)

    return df

# Run one fine-tune + generation pass at the configured LoRA rank; `df` receives the
# returned DataFrame of prompt / generated_answer rows.
df = train_and_evaluate(R)

In [None]:
# Write the generations to disk first so that a failed upload cannot lose them.
df.to_csv(f"{MODEL_NAME}_{R}.csv", index=False)

# Then publish the same rows as a dataset on the Hub.
dataset = Dataset.from_pandas(df)
dataset.push_to_hub(f"pofce/{MODEL_NAME}_{R}_ex", token=HF_TOKEN)