In [1]:
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [2]:
dataset = datasets.load_dataset("openai/gsm8k", "main")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer.pad_token = tokenizer.eos_token

In [3]:
# Load model directly
config = BitsAndBytesConfig(
    load_in_8bit=True,  # Load model weights in 8-bit format
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    device_map="auto",  # Automatically maps layers across GPUs/CPUs
    quantization_config=config,  # Apply quantization to model
)
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
question = dataset["train"][0]["question"]
answer = dataset["train"][0]["answer"]

In [16]:
q_tokenized = tokenizer(question, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
q_tokenized = {k: v.to(model.device) for k, v in q_tokenized.items()}
# labels = tokenizer(answer, return_tensors="pt")
labels = tokenizer(answer, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
labels = {k: v.to(model.device) for k, v in labels.items()}
labels["input_ids"] = torch.where(labels["input_ids"] == tokenizer.eos_token_id, -100, labels["input_ids"])


In [17]:
q_tokenized

{'input_ids': tensor([[128000,     45,   4306,    689,   6216,  27203,    311,    220,   2166,
             315,   1077,   4885,    304,   5936,     11,    323,   1243,   1364,
            6216,   4376,    439,   1690,  27203,    304,   3297,     13,   2650,
            1690,  27203,   1550,  42701,    689,   4662,  31155,    304,   5936,
             323,   3297,     30, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}

In [18]:
labels

{'input_ids': tensor([[128000,     45,   4306,    689,   6216,    220,   2166,     14,     17,
             284,   1134,   2166,     14,     17,     28,   1187,   2511,   1187,
           27203,    304,   3297,    627,     45,   4306,    689,   6216,    220,
            2166,     10,   1187,    284,   1134,   2166,     10,   1187,     28,
            5332,   2511,   5332,  27203,  31155,    304,   5936,    323,   3297,
             627,    827,    220,   5332,   -100,   -100,   -100,   -100,   -100,
            -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
            -100]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}

In [19]:
with torch.no_grad():
    output = model(
        input_ids=q_tokenized["input_ids"],
        attention_mask=q_tokenized["attention_mask"],
        labels=labels["input_ids"],
        return_dict=True,
    )

In [20]:
output.loss

tensor(13.6753, device='cuda:0')

In [12]:
query = "Write a story about a robot that learns how to play the piano."
tokenizer.padding_side = 'right'
token_output = tokenizer(query, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
token_output = {k: v.to(model.device) for k, v in token_output.items()}

In [13]:
token_output

{'input_ids': tensor([[128000,   8144,    264,   3446,    922,    264,  12585,    430,  47310,
            1268,    311,   1514,    279,  27374,     13, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}

In [8]:

gen_output = model.generate(
    token_output["input_ids"], 
    attention_mask=token_output["attention_mask"], 
    max_length=128, 
    temperature=0.8,
    top_k=50,
    top_p=0.9,
)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [9]:
print(tokenizer.batch_decode(gen_output, skip_special_tokens=True)[0])

Write a story about a robot that learns how to play the piano. Describe how the robot learns and what he does with this knowledge. What other kinds of instruments or games might a robot be able to learn with enough time and practice?
Write a story about a robot that learns how to play the piano. Describe how the robot learns and what he does with this knowledge. What other kinds of


In [11]:
token_output

{'input_ids': tensor([[128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
          128001, 128001, 128001, 128001, 128000,   8144,    264,   3446,    922,
             264,  12585,    430,  47310,   1268,    311,   1514,    279,  27374,
              13]], device='cuda:0'),
 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [12]:
STOP

NameError: name 'STOP' is not defined

In [None]:
# prompt = ds['train']['question'][0]
# input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# res = model.generate(
#     input_ids,
#     max_length=200,
#     pad_token_id=tokenizer.eos_token_id,
#     attention_mask=torch.tensor([[1] * 50]),
#     num_return_sequences=1,
# )

# Lora Finetuning

In [None]:
# LoRA configuration
# Task Type : PeftModelForQuestionAnswering
lora_config = LoraConfig(
    r=16,  # Rank for the low-rank matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout probability for LoRA layers
    bias="none",  # How biases are handled
    task_type="CAUSAL_LM"  # Sequence-to-sequence task for language modeling
)

# Apply PEFT to the model
peft_model = get_peft_model(model, lora_config)


In [None]:
peft_model.device

device(type='cuda', index=0)

In [None]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer.train()


Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 GiB. GPU 0 has a total capacity of 23.87 GiB of which 21.42 GiB is free. Including non-PyTorch memory, this process has 2.45 GiB memory in use. Of the allocated memory 1.91 GiB is allocated by PyTorch, and 12.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)