In [16]:
import pandas as pd
import numpy as np
from datasets import Dataset

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import torch

### Prepare Dataset

In [5]:
alicia_dataset = np.load('datasets/alicia_dataset.npy', allow_pickle=True)

In [11]:
hf_dataset = Dataset.from_list(alicia_dataset.tolist())
print(hf_dataset)

Dataset({
    features: ['context', 'response'],
    num_rows: 12
})


In [12]:
def format_qwen_chat(example):
    return {
        "text": f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['context']}<|im_end|>\n<|im_start|>assistant\n{example['response']}<|im_end|>"
    }

In [None]:
formatted_dataset = hf_dataset.map(format_qwen_chat)
print(formatted_dataset)

### Load Model LoRa

In [None]:
model_name = "Qwen/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,  # or use 4bit for more savings
    device_map="auto",
    trust_remote_code=True
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Could not load bitsandbytes native library: /lib64/libc.so.6: version `GLIBC_2.34' not found (required by /nas/longleaf/home/smerrill/.conda/envs/llm/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda126.so)
Traceback (most recent call last):
  File "/nas/longleaf/home/smerrill/.conda/envs/llm/lib/python3.9/site-packages/bitsandbytes/cextension.py", line 85, in <module>
    lib = get_native_library()
  File "/nas/longleaf/home/smerrill/.conda/envs/llm/lib/python3.9/site-packages/bitsandbytes/cextension.py", line 72, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
  File "/nas/longleaf/home/smerrill/.conda/envs/llm/lib/python3.9/ctypes/__init__.py", line 460, in LoadLibrary
    return self._dlltype(name)
  File "/nas/longleaf/home/smerrill/.conda/envs/llm/lib/pytho

### Tokenize Dataset

In [None]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=1024, padding="max_length")

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)

### Set up Training

In [None]:
training_args = TrainingArguments(
    output_dir="./lora-alicia-model",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    save_strategy="epoch",
    fp16=True,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

In [None]:
trainer.train()
model.save_pretrained("./lora-alicia-model")
tokenizer.save_pretrained("./lora-alicia-model")