In [None]:
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth
!pip install datasets
######################################
# INZSTALLAZIONE UNSLOTH
######################################

In [3]:
#####################################
# IMPORTING LIBRERIE
#####################################
import os
import gc
import random
import torch
import time
import json
import pandas as pd
from IPython.display import Markdown
from datasets import load_dataset, Dataset
import random
from unsloth import (
    FastLanguageModel 
)
from transformers import TrainerCallback
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-13 11:57:45.640269: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747137465.829704      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747137465.884217      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
########################################
# Qualche iperparametro
########################################
MODEL_NAME  = "google/gemma-2-2b-it"

MAX_LENGTH  = 2048
N_QBITS     = 4

OUTPUT_DIR  = "/kaggle/working/output"
LOG_DIR     = "/kaggle/working/logs"
LOSS_DIR    = "/kaggle/working/loss"
DATASET     = "tau/commonsense_qa"

SEED        = 33

In [5]:
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(LOSS_DIR, exist_ok=True)

In [6]:
#########################################
# Scelta della quantizzazione
########################################
print(f"Quantizzazione del modello: {N_QBITS} bit")
load_in_4bit         = True if N_QBITS == 4 else False
load_in_8bit         = True if N_QBITS == 8 else False
print(f"Quantizzazione 4 bit modello:{load_in_4bit}")
print(f"Quantizzazione 8 bit modello:{load_in_8bit}")

Quantizzazione del modello: 4 bit
Quantizzazione 4 bit modello:True
Quantizzazione 8 bit modello:False


In [7]:
#######################################
# Caricamento del modello
######################################
print(f"Caricamento del modello e del tokenizer {MODEL_NAME}...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name           = MODEL_NAME,
    dtype                = None,
    max_seq_length       = MAX_LENGTH,
    load_in_4bit         = load_in_4bit,
    #load_in_8bit         = load_in_8bit,
    #full_finetuning      = False,
)

Caricamento del modello e del tokenizer google/gemma-2-2b-it...
==((====))==  Unsloth 2025.5.1: Fast Gemma2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [8]:
######################################
# Inserimento Matrici LoRA
######################################
model = FastLanguageModel.get_peft_model(
    model,
    r                          = 8,

    target_modules             = ["q_proj", "k_proj", "v_proj", "o_proj",
                                  "gate_proj", "up_proj", "down_proj"],

    #Permette di applicare LoRA solo a componenti specifiche del modello, ottimizzando il bilanciamento
    #tra efficienza computazionale e capacità di apprendimento
    #q_proj, k_proj, v_proj, o_proj: Parti delle proiezioni nella self-attention.
    #gate_proj, up_proj, down_proj: Componenti dei feed-forward layer

    lora_alpha                 = 16,
    lora_dropout               = 0, # 0 è ottimizzato
    bias                       = "none",

    use_gradient_checkpointing = "unsloth",
    random_state               = SEED,
    max_seq_length             = MAX_LENGTH,
    use_rslora                 = False,
    loftq_config               = None,
)

Unsloth 2025.5.1 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [None]:
##########################################
# Verifica dei token di sistema
# Switiching del padding-size
##########################################
print(f"PAD-TOKEN: {tokenizer.pad_token}")
print(f"BOS-TOKEN: {tokenizer.bos_token}")
print(f"EOS-TOKEN: {tokenizer.eos_token}")

##########################################
# Abbiamo a che fare con modelli autoregressivi quindi processano da sinistra a destra.
# se il padding è a sinistra, i token reali restano alla fine, vicino al contesto immediato del modello, e l’attenzione mascherata funziona meglio.
# Il padding a destra può confondere il modello, perché i [PAD] arrivano dopo i token validi, e il modello potrebbe tentare di "predire" i PAD.
############################################
print(f"before --- TOKENIZER PADDING SIZE:{tokenizer.padding_side}")
tokenizer.padding_side = "left"
print(f"after --- TOKENIZER PADDING SIZE:{tokenizer.padding_side}")

PAD-TOKEN: <pad>
BOS-TOKEN: <bos>
EOS-TOKEN: <eos>
before --- TOKENIZER PADDING SIZE:right
after --- TOKENIZER PADDING SIZE:left


In [10]:
########################################
# L'IT ha il chat temaplte, lo visualizziamo per verifica
#######################################
tokenizer.chat_template

"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"

In [11]:
##########################################
# CARICAMENTO E FORMATTAZIONE DEL DATASET
#########################################

def load_commonsense_qa_split(split: str) -> pd.DataFrame:
    """Carica e filtra le colonne principali del dataset CommonsenseQA."""
    data = load_dataset(DATASET, split=split).to_pandas()
    return data[["question", "choices", "answerKey"]]

def format_choices_column(df: pd.DataFrame, new_col: str = "answers") -> pd.DataFrame:
    """Formatta le opzioni multiple in una stringa leggibile."""
    def format_choices(row):
        labels = list(row["choices"]["label"])
        texts = list(row["choices"]["text"])
        return "\n".join([f"{label}) {text}" for label, text in zip(labels, texts)])
    
    df[new_col] = df.apply(format_choices, axis=1)
    return df

def expand_answer_key(row) -> str:
    """Restituisce la risposta corretta formattata (label + testo)."""
    labels = list(row["choices"]["label"])
    texts = list(row["choices"]["text"])
    key = row["answerKey"]
    if key in labels:
        idx = labels.index(key)
        return f"{labels[idx]}) {texts[idx]}"
    else:
        return None  # oppure una stringa alternativa tipo "N/A"

def prepare_commonsense_dataset(n_samples: int = 10000, seed: int = 42) -> pd.DataFrame:
    """Prepara il dataset CommonsenseQA pronto per il fine-tuning."""
    train_df = load_commonsense_qa_split("train")
    val_df = load_commonsense_qa_split("validation")

    train_df = format_choices_column(train_df)
    val_df = format_choices_column(val_df)

    train_df["answerKey"] = train_df.apply(expand_answer_key, axis=1)
    val_df["answerKey"] = val_df.apply(expand_answer_key, axis=1)

    train_df = train_df[["question", "answers", "answerKey"]]
    val_df = val_df[["question", "answers", "answerKey"]]

    full_df = pd.concat([train_df, val_df], ignore_index=True)

    sampled_df = full_df.sample(n=n_samples, random_state=seed).reset_index(drop=True)

    return sampled_df.rename(columns={
        "question" : "INSTRUCTION",
        "answers"  : "ANSWERS",
        "answerKey": "RESPONSE"
    })

common_sense_dataset = prepare_commonsense_dataset(n_samples=10000)

README.md:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/160k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

In [12]:
dataset = Dataset.from_pandas(common_sense_dataset)

In [13]:
#########################################
# Definizione dei diversi prompt -> maggiori prompt maggiore dinamicità
# Conversione del dataset in modo conversazionale
#########################################
user_prompts  = [
    "Given the following common sense question, explain the reasoning behind the correct answer.",
    "Solve the following logic question and explain your choice clearly.",
    "How would you approach answering this common sense question? Provide a detailed explanation.",
    "Please answer the following question and explain the logic behind your answer.",
    "Here is a logic question. Choose the correct answer and justify your choice.",
    "Select the best answer for this question and explain why it is the most reasonable.",
    "Can you choose the most logical answer and explain your reasoning?",
    "What is the best answer to this question? Provide a clear explanation.",
    "Given the question below, explain how you arrived at the correct answer.",
    "Take the following logic question and explain your answer step by step.",
    "Please select the correct answer and explain your reasoning in detail.",
    "Here is a question that requires common sense. Explain why the answer you chose is the best.",
    "Please choose the most sensible answer to the question and justify your response.",
    "How would you solve the following logic-based question? Provide a detailed explanation.",
    "Answer the question below by explaining the reasoning behind your choice.",
    "Given the question below, provide a step-by-step explanation of how you selected the correct answer."
]

def create_conversation(sample):
  return {
    "messages": [
      #{"role": "system", "content": system_message}, GEMMA NON HA IL SYSTEM_PROMPT
      {"role": "user", "content": random.choice(user_prompts) + " " + sample["INSTRUCTION"] + " \n"+ sample["ANSWERS"]},
      {"role": "model", "content": sample["RESPONSE"]}
    ]
  }

In [14]:
dataset = dataset.map(create_conversation,batched=False)
print(dataset[1]['messages'])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

[{'content': 'What is the best answer to this question? Provide a clear explanation. which half of the earth do marmots live in? \nA) south\nB) northern hemisphere\nC) north america\nD) vancouver island\nE) jungle', 'role': 'user'}, {'content': 'B) northern hemisphere', 'role': 'model'}]


In [15]:
############################################
# Applicazione del chat template al modello
# Rimozione del token <bos> in quanto viene aggiunto dopo
############################################
def apply_chat_template_to_example(example):
    
    result_text = tokenizer.apply_chat_template(example['messages'], tokenize=False, add_generation_prompt=False)
    if result_text.startswith("<bos>"):
        result_text = result_text[len("<bos>"):]
        
    return {"prompt": result_text}

In [16]:
dataset = dataset.map(apply_chat_template_to_example,batched=False)
print(dataset[1]['prompt'])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

<start_of_turn>user
What is the best answer to this question? Provide a clear explanation. which half of the earth do marmots live in? 
A) south
B) northern hemisphere
C) north america
D) vancouver island
E) jungle<end_of_turn>
<start_of_turn>model
B) northern hemisphere<end_of_turn>



In [17]:
######################################
# Definizione di una callback, per salvarci la loss e per plottarla
######################################
class LossCallback(TrainerCallback):
    def __init__(self):

        self.train_losses = {}
        #self.eval_losses = {}

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return

        step = state.global_step

        if "loss" in logs:
            self.train_losses[step] = logs["loss"]

        if "loss" in logs:
            print(f"[Step {step}] Train Loss: {logs['loss']}")

In [18]:
#######################################
# Definizione degli iperparametri per il training
#####################################

#model.config.use_cache = False #SOLO PER TRAINING

training_args = SFTConfig(
    do_train                    = True,

    dataset_text_field          = "prompt",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 8,

    num_train_epochs            = 3,

    learning_rate               = 2e-4,
    lr_scheduler_type           = "linear",

    logging_strategy            = "steps",
    save_strategy               = 'steps',
    logging_steps               = 10,
    save_steps                  = 100,

    #warmup_steps                = 150,
    optim                       = "adamw_8bit",
    seed                        = SEED,

    fp16                        = not is_bfloat16_supported(),
    bf16                        = is_bfloat16_supported(),
   # load_best_model_at_end      = True,

    weight_decay                = 0.03,
    report_to                   = "none",
)

In [19]:
##############################
# Definizione del trainer
###############################

loss_callback = LossCallback()

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    max_seq_length     = MAX_LENGTH,
    train_dataset      = dataset,
    args               = training_args,
    packing            = False,
    callbacks          = [loss_callback]
)

Unsloth: Tokenizing ["prompt"] (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [20]:
#####################################
# Verifica di quello che passiamo come input al modello
#####################################
print(trainer.processing_class.decode(trainer.train_dataset[0]['input_ids']))

<bos><start_of_turn>user
Select the best answer for this question and explain why it is the most reasonable. What kind of place is an excavation site? 
A) working ground
B) quarry
C) desert
D) work area
E) under ground<end_of_turn>
<start_of_turn>model
D) work area<end_of_turn>



In [21]:
####################################
# Addestramento
####################################
print("Avvio del fine-tuning...")
trainer.train()
print("Fine del fine-tuning...")

Avvio del fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 3 | Total steps = 468
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 10,383,360/2,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,5.0847
20,3.4327
30,2.4253
40,1.803
50,1.5053
60,1.3799
70,1.3457
80,1.3039
90,1.3016
100,1.2961


[Step 10] Train Loss: 5.0847
[Step 20] Train Loss: 3.4327
[Step 30] Train Loss: 2.4253
[Step 40] Train Loss: 1.803
[Step 50] Train Loss: 1.5053
[Step 60] Train Loss: 1.3799
[Step 70] Train Loss: 1.3457
[Step 80] Train Loss: 1.3039
[Step 90] Train Loss: 1.3016
[Step 100] Train Loss: 1.2961
[Step 110] Train Loss: 1.3061
[Step 120] Train Loss: 1.2967
[Step 130] Train Loss: 1.2686
[Step 140] Train Loss: 1.2936
[Step 150] Train Loss: 1.2737
[Step 160] Train Loss: 1.2482
[Step 170] Train Loss: 1.2121
[Step 180] Train Loss: 1.2052
[Step 190] Train Loss: 1.1977
[Step 200] Train Loss: 1.2045
[Step 210] Train Loss: 1.1927
[Step 220] Train Loss: 1.1901
[Step 230] Train Loss: 1.1925
[Step 240] Train Loss: 1.1975
[Step 250] Train Loss: 1.1954
[Step 260] Train Loss: 1.1965
[Step 270] Train Loss: 1.1807
[Step 280] Train Loss: 1.1955
[Step 290] Train Loss: 1.1885
[Step 300] Train Loss: 1.1661
[Step 310] Train Loss: 1.18
[Step 320] Train Loss: 1.1266
[Step 330] Train Loss: 1.0776
[Step 340] Train Loss:

In [22]:
#################################
# Salvataggio del modello e tokenizzatore
#################################
token       = "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"
output_repo = 'stefra/GEMMA2BITCOMMONSENSE'
model.push_to_hub(output_repo, token = token, private = True)
tokenizer.push_to_hub(output_repo, token = token, private = True)

README.md:   0%|          | 0.00/581 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

Saved model to https://huggingface.co/stefra/GEMMA2BITCOMMONSENSE


tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

In [23]:
torch.cuda.empty_cache()
gc.collect()

9270

In [24]:
##############################
# Salvataggio della loss
##############################
def save_losses_to_json(train_losses, output_dir):
    losses = {
        "train_losses": train_losses,
    }

    with open(f"{output_dir}/losses.json", "w") as f:
        json.dump(losses, f, indent=4)
    print(f"Losses salvate in {output_dir}/losses.json")

save_losses_to_json(loss_callback.train_losses, LOSS_DIR)

Losses salvate in /kaggle/working/loss/losses.json


In [25]:
#############################
# Plot della loss
#############################
import json
import matplotlib.pyplot as plt

def load_losses(file_name, loss_type):
    with open(file_name, 'r') as f:
        data = json.load(f)
        return data[loss_type] 

losses_CS = load_losses(f"/kaggle/working/{output_dir}/losses.json", "train_losses")

losses_CS_steps = list(map(int, losses_CS.keys()))
losses_CS_values = list(losses_CS.values())

plt.figure(figsize=(14, 8))

plt.plot(losses_CS_steps, losses_CS_values, label="loss_values", color="b", marker="o")

plt.title("Train Loss per COMMON SENSE - Gemma 2B-IT", fontsize=16)
plt.xlabel("Steps", fontsize=14)
plt.ylabel("Loss", fontsize=14)
plt.legend()
plt.grid(True)
plt.tight_layout()

plt.savefig("/kaggle/working/losses_COMMONSENSE_plot.png", dpi=900)
plt.show()

NameError: name 'output_dir' is not defined