In [9]:
%pip install transformers trl accelerate torch bitsandbytes peft datasets -qU

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.12 requires torch<2.1,>=1.7, but you have torch 2.4.1 which is incompatible.
torchaudio 2.0.2+cu118 requires torch==2.0.1, but you have torch 2.4.1 which is incompatible.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 2.4.1 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 2.4.1 which is incompatible.
torchvision 0.15.2+cu118 requires torch==2.0.1, but you have torch 2.4.1 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [None]:
hf_token = "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"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", token=hf_token)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", token=hf_token, torch_dtype="auto", device_map="auto")

In [None]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
from datasets import load_dataset

ds = load_dataset("Open-Orca/OpenOrca", split="train")

In [None]:
import random

dataset_length = 10**5

dataset_size = len(ds)
random_indices = random.sample(range(dataset_size), dataset_length)

sampled_dataset = ds.select(random_indices)

train_size = int(0.9 * dataset_length)
train_dataset = sampled_dataset.select(range(train_size))
test_dataset = sampled_dataset.select(range(train_size, dataset_length))


In [None]:
def process_func(example):
    MAX_LENGTH = 512
    
    messages = [
        {"role": "system", "content": example["system_prompt"]},
        {"role": "user", "content": example["question"]},
        {"role": "assistant", "content": example["response"]},
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    input_ids = tokenizer.encode(text=text.strip(), add_special_tokens=True, truncation=True, max_length=MAX_LENGTH)

    labels = input_ids[:]  

    pad_len = MAX_LENGTH - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * pad_len
    labels += [tokenizer.pad_token_id] * pad_len

    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

    return {
        "input_ids": input_ids,
        "labels": labels,
    }


In [None]:
tokenized_train = train_dataset.map(process_func, remove_columns=train_dataset.column_names, batched=False)

In [None]:
tokenized_test = test_dataset.map(process_func, remove_columns=test_dataset.column_names, batched=False)

In [None]:
import evaluate
from transformers import TrainingArguments
from transformers import Trainer, DataCollatorForSeq2Seq

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")



def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    predictions = np.argmax(logits, axis=-1)
    seq_len = (labels != -100).sum(-1)
    loss = np.exp(logits.mean())  

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    bleu = bleu_metric.compute(predictions=decoded_predictions, references=decoded_labels)
    rouge = rouge_metric.compute(predictions=decoded_predictions, references=decoded_labels)
    meteor = meteor_metric.compute(predictions=decoded_predictions, references=decoded_labels)

    return {
        "loss": float(loss),
        "perplexity": float(np.exp(loss)),
        "bleu": bleu["bleu"],
        "rouge": rouge["rougeL"].mid.fmeasure,
        "meteor": meteor["meteor"],
    }

# EXP1: Default model (with fine-tune)

In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
import torch
torch.cuda.empty_cache()

# Default model training (via peft adapter)

> The weight matrix is scaled by lora_alpha\lora_rank, and a higher alpha value assigns more weight to the LoRA activations. We chose 16 since this was common practice in training scripts we reviewed.

https://github.com/QwenLM/Qwen/blob/main/recipes/finetune/deepspeed/finetune_lora_single_gpu.ipynb

https://github.com/QwenLM/Qwen/blob/main/finetune.py

params - example defaults

In [16]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# default config
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# prefered config
peft_good_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
)

In [17]:
#model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_good_config)

In [18]:
model.print_trainable_parameters()

trainable params: 737,280 || all params: 494,770,048 || trainable%: 0.1490


In [None]:
args = TrainingArguments(
    output_dir="qwen_instruct_fine_tune",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    warmup_steps=1,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch", 
    learning_rate=1e-4,
    lr_scheduler_type="constant",
    fp16=True,
    eval_accumulation_steps=1

)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
import matplotlib.pyplot as plt

train_losses = []
eval_losses = []
train_step_numbers = []  
eval_step_numbers = []  

for log in trainer.state.log_history:
    if 'loss' in log:
        train_losses.append(log['loss'])
        train_step_numbers.append(log['step'])
    if 'eval_loss' in log:
        eval_losses.append(log['eval_loss'])
        eval_step_numbers.append(log['step']) 

plt.figure(figsize=(10, 5))
plt.plot(train_step_numbers, train_losses, label='Train Loss', color='blue')
plt.plot(eval_step_numbers, eval_losses, label='Eval Loss', color='orange')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Train and Eval Loss over Steps')
plt.legend()
plt.show()

In [None]:
evaluation_results = trainer.evaluate(eval_dataset=tokenized_test)

for metric, value in evaluation_results.items():
    print(f"{metric}: {value:.4f}")


  0%|          | 0/1250 [00:00<?, ?it/s][A
  0%|          | 2/1250 [00:00<09:22,  2.22it/s][A
  0%|          | 3/1250 [00:02<20:11,  1.03it/s][A
  0%|          | 4/1250 [00:04<30:46,  1.48s/it][A
  0%|          | 5/1250 [00:07<38:41,  1.86s/it][A
  0%|          | 6/1250 [00:11<52:16,  2.52s/it][A
  1%|          | 7/1250 [00:14<55:00,  2.66s/it][A
  1%|          | 8/1250 [00:18<1:03:26,  3.07s/it][A
  1%|          | 9/1250 [00:22<1:07:48,  3.28s/it][A
  1%|          | 10/1250 [00:25<1:12:01,  3.48s/it][A
  1%|          | 11/1250 [00:30<1:17:49,  3.77s/it][A
  1%|          | 12/1250 [00:34<1:22:57,  4.02s/it][A
  1%|          | 13/1250 [00:39<1:28:19,  4.28s/it][A
  1%|          | 14/1250 [00:45<1:36:49,  4.70s/it][A
  1%|          | 15/1250 [00:51<1:41:44,  4.94s/it][A
  1%|▏         | 16/1250 [00:56<1:47:25,  5.22s/it][A
  1%|▏         | 17/1250 [01:04<1:59:01,  5.79s/it][A
  1%|▏         | 18/1250 [01:10<2:03:25,  6.01s/it][A
  2%|▏         | 19/1250 [01:17<2:08:08,

[0;31mKernelOutOfMemory[0m: Kernel ran out of memory and has been restarted. If the restart fails, restart the kernel from the Kernel menu.
If the error persists, try choosing a different configuration or optimizing your code.

# Metrics

In [35]:
evaluation_results = trainer.evaluate(eval_dataset=tokenized_test)

for metric, value in evaluation_results.items():
    print(f"{metric}: {value:.4f}")




  0%|          | 0/1250 [00:00<?, ?it/s][A[A[A


  0%|          | 2/1250 [00:00<05:27,  3.81it/s][A[A[A


  0%|          | 3/1250 [00:01<09:50,  2.11it/s][A[A[A

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.31 GiB. GPU 0 has a total capacity of 31.74 GiB of which 66.38 MiB is free. Including non-PyTorch memory, this process has 31.67 GiB memory in use. Of the allocated memory 28.92 GiB is allocated by PyTorch, and 2.38 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)