# Import delle librerie

In [1]:
from huggingface_hub import notebook_login
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import gc
#import unsloth
from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm import tqdm
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

pd.set_option('display.max_columns', None)  # Mostra tutte le colonne
pd.set_option('display.width', None)        # Non tronca l'output a una larghezza fissa
pd.set_option('display.max_colwidth', None)

MAX_LENGHT = 2048

#os.environ["CUDA_LAUNCH_BLOCKING"]="1"
os.environ["TOKENIZERS_PARALLELISM"]="true"
!huggingface-cli login --token ######

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `UNIVERSAL_TOKEN` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `UNIVERSAL_TOKEN`


In [2]:
#File curlora.py

def compute_selection_probabilities(A):
    column_norms_squared = torch.sum(A**2, axis=0)
    row_norms_squared = torch.sum(A**2, axis=1)
    total_sum_squares = torch.sum(column_norms_squared)
    column_probs = column_norms_squared / total_sum_squares
    row_probs = row_norms_squared / total_sum_squares
    return column_probs, row_probs


def select_indices_with_replacement(probs, k):
    inverted_P = (1 / (probs + 0.001)).float()

    # Normalize the inverted probabilities
    probs = inverted_P / inverted_P.sum()
    
    # Sposta su CPU e converti in numpy
    if torch.is_tensor(probs):
        probs = probs.detach().cpu().numpy()
    
    return np.random.choice(len(probs), size=k, replace=True, p=probs)


def adjust_duplicates(selected_indices, A, axis):
    unique_indices, counts = np.unique(selected_indices, return_counts=True)
    adjusted_matrix = A[:, unique_indices] if axis == 1 else A[unique_indices, :]
    
    for idx, count in enumerate(counts):
        if count > 1:
            scaling_factor = np.sqrt(count)
            if axis == 1:
                adjusted_matrix[:, idx] *= scaling_factor
            else:
                adjusted_matrix[idx, :] *= scaling_factor
    
    return adjusted_matrix, unique_indices


def cur_decomposition(A, c):
    r = c
    column_probs, row_probs = compute_selection_probabilities(A)
    selected_columns = select_indices_with_replacement(column_probs, c)
    selected_rows = select_indices_with_replacement(row_probs, r)
    
    C = A[:, selected_columns]
    R = A[selected_rows, :]
    
    U = torch.empty(C.shape[1], R.shape[0])
    U = torch.zeros_like(U).to("cuda") #* 0.00
    
    return C, U, R


class CURModule(nn.Module):
    def __init__(self, W, rank):
        super(CURModule, self).__init__()
        C, U, R = cur_decomposition(W, rank)
        self.C = C * 1.0
        self.R = R * 1.0
        self.U = nn.Parameter(U)
        #self.d = torch.nn.Dropout(0.05)

    def forward(self, x):
        W_approx = torch.matmul(torch.matmul(self.C, self.U), self.R)
        try:
            x = torch.matmul(x, W_approx.t())
        except:
            x = torch.matmul(x, W_approx)
        #x = self.d(x)
        return x


class CURLoRAMLP(nn.Module):
    def __init__(self, base_model, rank=8, alpha=1):
        super(CURLoRAMLP, self).__init__()
        self.base_model = base_model
        self.rank = rank
        self.alpha = alpha
        # Identify the layer to adapt (the last layer)
        layer_to_adapt = base_model.layers[-1]
        # Freeze the parameters of the base model
        for param in self.base_model.parameters():
            param.requires_grad = False

        self.cur_module = CURModule(layer_to_adapt.weight, self.rank)
    
    def forward(self, x):
        x = self.base_model.layers[:-1](x)  # Use all layers except the last one
        x_0 = torch.matmul(x, self.base_model.layers[-1].weight.t()) 
        x_adapted = self.cur_module(x)
        x = x_0 + (self.alpha * x_adapted) + self.base_model.layers[-1].bias
        return x


class LinearWithCURLoRA(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.curlora = CURModule(linear.weight, rank)
        self.rank = rank
        self.alpha = alpha

    def forward(self, x):
        x_0 = self.linear(x)
        x_adapted = self.curlora(x)
        x = x_0 + (self.alpha * x_adapted) #+ self.linear.bias
        return x

In [3]:
#File utils.py

# I used this function "replace_linear_with_lora" from
# https://github.com/rasbt/LLMs-from-scratch/blob/main/appendix-E/01_main-chapter-code/appendix-E.ipynb
def replace_linear_with_curlora(model, rank, alpha):
    for name, module in model.named_children():
        #if isinstance(module, torch.nn.Linear):
        if any(l in name for l in ["q_proj", "v_proj", "k_proj"]):
            setattr(model, name, LinearWithCURLoRA(module, rank, alpha))
        else:
            replace_linear_with_curlora(module, rank, alpha)

def replace_linear_with_lora(model, rank, alpha):
    for name, module in model.named_children():
        #if isinstance(module, torch.nn.Linear):
        if any(l in name for l in ["q_proj", "v_proj", "k_proj"]):
            # Replace the Linear layer with LinearWithLoRA
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            # Recursively apply the same function to child modules
            replace_linear_with_lora(module, rank, alpha)


def load_model_and_tokenizer(model_name, repo_weights_name, load_params, device = "cuda"):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cuda"
    )
                                
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    tokenizer.pad_token = tokenizer.eos_token

    for param in model.parameters():
        param.requires_grad = False

    #Attivazione CURLoRA
    replace_linear_with_curlora(model, rank=8, alpha=16)

    if load_params: 
        #Caricamento dei pesi del modello addestrato
        safetensor_path = hf_hub_download(repo_id=repo_weights_name, filename="model.safetensors")
        state_dict = load_file(safetensor_path, device=device)
        
        model.load_state_dict(state_dict, strict=False)
    
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total trainable parameters after: {total_params:,}")
    
    return model, tokenizer

# Caricamento del modello

In [4]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
repo_weights_name_1 = "francescoocurcio/Llama3.2_3B_CSQA"
repo_weights_name_2 = "francescoocurcio/Llama3.2_3B_CSQA_LI"

In [5]:
load_model1 = False
if load_model1:
    model, tokenizer = load_model_and_tokenizer(model_name, repo_weights_name_1, load_params = True)
else:
    model, tokenizer = load_model_and_tokenizer(model_name, repo_weights_name_2, load_params = True)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

2025-07-17 11:20:08.038197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752751208.265589      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752751208.330435      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Total trainable parameters after: 3,072


In [5]:
load_params = True
if load_params:
    model, tokenizer = load_model_and_tokenizer(model_name, repo_weights_name_2, load_params = True)
else:
    model, tokenizer = load_model_and_tokenizer(model_name, repo_weights_name_2, load_params = False)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

2025-07-17 09:53:45.484960: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752746025.717427      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752746025.789384      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Total trainable parameters after: 3,072


# Testing CommonSense_QA

In [6]:
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

# carica il dataset
csqa_val = load_dataset("tau/commonsense_qa",split="validation")

def score_text(model, tokenizer, text, device, max_length=512, do_debug=False):
    """
    Calcola la loss (sum of negative log-likelihood) su tutta la sequenza `text`.
    Restituisce un punteggio tale che la risposta più “plausibile”
    è quella col punteggio più basso.
    """
    # Tokenizzazione
    enc = tokenizer(text,
                    return_tensors="pt",
                    truncation=True,
                    max_length=max_length).to(device)
    input_ids = enc.input_ids
    # DEBUG: cosa ricevo in input?
    #if do_debug: 
        #print(f"[score_text] text: {text!r}")
        #print(f"[score_text] input_ids ({input_ids.shape}): {input_ids.tolist()}")

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        # HuggingFace ritorna `loss` come media sui token validi:
        nll = outputs.loss * input_ids.size(1)

    # DEBUG: che loss ottengo?
    #print(f"[score_text] loss (mean): {outputs.loss.item():.4f},  nll (sum): {nll.item():.4f}")
    return nll.item()


def evaluate_csqa(model, tokenizer, val_dataset, device="cuda", max_length=512, debug_examples=5):
    """
    Valuta l'accuracy su CommonSenseQA mediante ranking delle opzioni
    secondo la minore somma di NLL (o minore perplexity),
    confrontando le lettere (A, B, C, D, E) invece degli indici.
    """
    
    model.eval()
    model.to(device)

    correct = 0
    total = 0

    for i, ex in enumerate(tqdm(val_dataset, desc="CSQA eval")):
        question = ex["question"]
        choices  = ex["choices"]["text"]    # lista di stringhe delle opzioni
        true_key = ex["answerKey"]          # es. "A", "B", ...

        do_debug = (i < debug_examples)
        
        #if do_debug:
            #print(f"\n--- Example {i+1} ---")
            #print(f"Q: {question!r}")
            #print(f"Options: {choices}, true answer letter: {true_key}")

        # Calcolo dei punteggi NLL per ciascuna opzione
        scores = []
        for idx, choice in enumerate(choices):
            prompt = f"{question}  Answer: {choice}"
            score = score_text(model, tokenizer, prompt, device, max_length, do_debug)
            scores.append(score)
            # Stampo sia l'indice che la lettera corrispondente
            letter = chr(ord('A') + idx)
            #print(f"  choice {letter} ({choice!r}): score = {score:.4f}")

        # Trovo l'indice della score minima e ne ricavo la lettera
        pred_idx   = int(np.argmin(scores))
        pred_letter = chr(ord('A') + pred_idx)

        is_correct = (pred_letter == true_key)
        #print(f"[evaluate_csqa] Predicted letter = {pred_letter}, True letter = {true_key}, Correct? {is_correct}")

        if is_correct:
            correct += 1
        total += 1

        #print("----------------------------\n")

    accuracy = correct / total
    print(f"\nFinal accuracy: {accuracy:.4f} ({correct}/{total})")
    return accuracy

acc = evaluate_csqa(model, tokenizer, csqa_val, device="cuda", max_length=512)
print(f"CommonSenseQA accuracy: {acc:.2f}")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/160k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

CSQA eval: 100%|██████████| 1221/1221 [04:34<00:00,  4.45it/s]


Final accuracy: 0.3030 (370/1221)
CommonSenseQA accuracy: 0.30





# Verifica delle loss di addestramento

In [7]:
# Carica il dataset direttamente dalla tua repo
dataset = load_dataset("francescoocurcio/epoch_losses_dataset_CSQA_LI", split="train")

# Converti in DataFrame pandas
df = dataset.to_pandas()

# Mostra le prime righe
print(df)

README.md:   0%|          | 0.00/297 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3 [00:00<?, ? examples/s]

   epoch  mean_loss
0      1   0.349385
1      2   0.253107
2      3   0.238424


In [8]:
dataset = load_dataset("francescoocurcio/all_batch_losses_dataset_CSQA_LI", split="train")

# Converti in DataFrame pandas
df = dataset.to_pandas()

# Mostra le prime righe
print(df)

README.md:   0%|          | 0.00/279 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/112k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15000 [00:00<?, ? examples/s]

       batch_loss
0        1.516834
1        1.360186
2        0.803444
3        1.758728
4        1.563305
...           ...
14995    0.273831
14996    0.488716
14997    0.741090
14998    0.369727
14999    0.314238

[15000 rows x 1 columns]


# Calcolo della perplexity

In [7]:
import torch
import torch.nn.functional as F
from tqdm import tqdm


from datasets import load_dataset

wikidataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

txt = wikidataset["text"]
txt = [s for s in txt if s != '']
txt = "".join(txt)


def calculate_perplexity(model, tokenizer, text, device='cuda', max_length=512, stride=256):
    """
    Calcola la perplexity su un testo utilizzando il modello causal LM.
    
    Args:
        model: AutoModelForCausalLM già in eval() mode e su device.
        tokenizer: AutoTokenizer corrispondente.
        text: stringa di input.
        device: 'cuda' o 'cpu'.
        max_length: lunghezza massima del contesto (tipicamente <= modello.config.n_positions).
        stride: quantità di token di overlap tra finestra e finestra.
        
    Returns:
        perplexity (float)
    """
    model.eval()
    model.to(device)
    encodings = tokenizer(text, return_tensors='pt')
    input_ids = encodings.input_ids.to(device)
    n_tokens = input_ids.size(1)
    nlls = []
    # scorri il testo in finestre sovrapposte
    for begin in tqdm(range(0, n_tokens, stride), desc="Calcolo Perplexity"):
        end = min(begin + max_length, n_tokens)
        input_ids_slice = input_ids[:, begin:end]
        
        # i token di cui calcolare la loss; il resto viene ignorato
        target_ids = input_ids_slice.clone()
        target_ids[:, :- (end - begin - stride if end < n_tokens else 0)] = -100
        
        with torch.no_grad():
            outputs = model(input_ids_slice)
            logits = outputs.logits  # shape [1, seq_len, vocab_size]
        
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = target_ids[..., 1:].contiguous()
        
        loss = F.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=-100,
            reduction='sum',  # somma la loss sui token validi
        )
        nlls.append(loss)
    
    total_nll = torch.stack(nlls).sum()
    # numero totale di token validi (escludiamo quelli marcati -100)
    valid_tokens = (input_ids.ne(tokenizer.pad_token_id)).sum() - 1  
    perplexity = torch.exp(total_nll / valid_tokens)
    return perplexity.item()

model.to("cuda")
ppl = calculate_perplexity(model, tokenizer, txt)
print("Perplexity:", round(ppl, 2))

README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (288937 > 131072). Running this sequence through the model will result in indexing errors
Calcolo Perplexity: 100%|██████████| 1129/1129 [07:33<00:00,  2.49it/s]


Perplexity: 66.6
