In [3]:
!pip install unsloth trl transformers datasets torch

Collecting unsloth
  Using cached unsloth-2025.5.7-py3-none-any.whl.metadata (47 kB)
Collecting trl
  Using cached trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting unsloth_zoo>=2025.5.8 (from unsloth)
  Using cached unsloth_zoo-2025.5.8-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Using cached xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Using cached tyro-0.9.21-py3-none-any.whl.metadata (10 kB)
Collecting trl
  Using cached trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata

In [4]:
from unsloth import  FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments
from datasets import Dataset
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-24 00:02:45.707150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748044965.892760      75 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748044965.945925      75 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# 1. Load the model & tokenizer (CPU mode)
print('Loading')
model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name="unsloth/Llama-3.2-1B-bnb-4bit",  # unquantized CPU-friendly model
    model_name = 'unsloth/Meta-Llama-3.1-8B',
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit=True,
    device_map = 'auto'
)

Loading
==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0, #0.05,
    bias="none",
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = 2048,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.5.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
data = [
        {"instruction": "What is the Go-Getter book about?", "input": "", "response": "It is about determination and perseverance"},
        {"instruction": "Write an alternative ending where The Blue Vase fails the test.", "input": "", "output": "In this version, Cappy decides not to pursue the blue vase, leading Cappy Ricks to reconsider his judgment about who truly deserves to lead."},

    # Add more samples here...
]

dataset = Dataset.from_list(data)

def formatting_func(example):
    return {
        "text": f"""### Instruction:
        {example['instruction']}

        ### Input:
        {example['input']}

        ### Response:
        {example['response']}"""
    }

dataset = dataset.map(formatting_func)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [8]:

training_args = SFTConfig(
        dataset_text_field = "text",
        output_dir="./sft_finetune_cpu",
        max_seq_length = 2048,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,
        logging_steps = 1,
        optim = "adamw_8bit",
        fp16=False, # no fp16 on CPU
        seed = 3407,
)

In [9]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=training_args,
    formatting_func=formatting_func,
)

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

In [12]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient() 

personal_key_for_api = user_secrets.get_secret("WandB login key")

!wandb login $personal_key_for_api
print('Done')

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
Done


In [13]:
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [15]:
print('Trainng the model')
print(f"Model is loaded on: {next(model.parameters()).device}")

import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient() 

personal_key_for_api = user_secrets.get_secret("WandB login key")

!wandb login $personal_key_for_api
wandb.init()

print('Done')

print('Now training the model')
trainer.train()

print('Done')

Trainng the model
Model is loaded on: cuda:0
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Currently logged in as: [33mgcpcertuser9806[0m ([33mgcpcertuser9806-cibc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Done
Now training the model


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2 | Num Epochs = 60 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 20,971,520/8,000,000,000 (0.26% trained)


Step,Training Loss
1,4.4513
2,4.4513
3,4.448
4,4.4292
5,4.3797
6,4.2937
7,4.1717
8,4.0211
9,3.8423
10,3.6331


Done


Saving the Model

In [16]:
trainer.save_model("/kaggle/working/my_model")
tokenizer.save_pretrained("/kaggle/working/my_model")


('/kaggle/working/my_model/tokenizer_config.json',
 '/kaggle/working/my_model/special_tokens_map.json',
 '/kaggle/working/my_model/tokenizer.json')

In [20]:
#from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

#model_path = "/kaggle/working/my_model"

#tokenizer = AutoTokenizer.from_pretrained(model_path)
#model = AutoModelForCausalLM.from_pretrained(model_path)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "### Instruction:\nWhat is 'The Go-Getter' about?\n### Response:\n"

output = pipe(prompt, max_new_tokens=50, do_sample=True, temperature=0.7)
print(output[0]["generated_text"])


Device set to use cuda:0


AttributeError: 'LlamaModel' object has no attribute 'max_seq_length'