# Epicrisis Fine-Tuning + Export ONNX (Colab)

Flujo completo:
1) Entrenar (LoRA) con Transformers + PEFT + TRL (sin Unsloth)
2) (Opcional) Merge del LoRA con el modelo base
3) Exportar a ONNX (INT4) con el builder de onnxruntime-genai
4) (Opcional) Export oficial Transformers.js q4f16


In [1]:
# Mount Google Drive at /content/drive (Colab-only API); later cells write their
# artifacts under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Root folder on Google Drive for the artifacts of this run. It uses the same value as the
# main configuration cell further down, so this early cell does not create a second,
# orphaned adapter folder directly under ".../fine-tuning".
BASE_DIR = "/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b"
ADAPTER_DIR = f"{BASE_DIR}/epicrisis-lora-adapter"

import os
# Create the adapter folder up front; exist_ok keeps re-running the cell harmless.
os.makedirs(ADAPTER_DIR, exist_ok=True)

In [2]:
!pip -q uninstall -y torch torchvision torchaudio

# Instala el trío alineado con el stack actual de Colab (CUDA 12.6)
!pip -q install --index-url https://download.pytorch.org/whl/cu126 \
  torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0

# Dependencias estables para SFT + LoRA + export ORT GenAI (evita cambios de API inesperados)
!pip -q install -U \
  "transformers==4.52.4" \
  "trl==0.11.4" \
  "peft==0.13.2" \
  "accelerate==0.34.2" \
  datasets bitsandbytes

# Export ONNX + cuantización (ORT GenAI builder) + dependencia onnx-ir
!pip -q install -U onnx onnxruntime onnxruntime-genai onnx-ir


In [6]:
# Reset the Hugging Face training stack: uninstall the trl / transformers / peft /
# accelerate builds that are present, then reinstall all four at exact pinned versions.
# NOTE(review): other cells in this notebook also install transformers; whichever install
# runs last decides the version that is actually in the environment.
!pip -q uninstall -y trl transformers peft accelerate
!pip -q install -U "transformers==4.46.3" "trl==0.11.4" "peft==0.13.2" "accelerate==0.34.2"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Environment sanity check: report the installed PyTorch-family versions, the CUDA
# version torch was built against, and whether a CUDA device is usable.
import torch, torchvision, torchaudio

for label, version in (
    ("torch", torch.__version__),
    ("torchvision", torchvision.__version__),
    ("torchaudio", torchaudio.__version__),
):
    print(label, version)

print("cuda", torch.version.cuda)
print("cuda available", torch.cuda.is_available())

torch 2.9.0+cu126
torchvision 0.24.0+cu126
torchaudio 2.9.0+cu126
cuda 12.6
cuda available True


## Subir dataset
Sube `train.jsonl` y `validation.jsonl`.

In [8]:
# Open Colab's upload dialog. The recorded output shows train.jsonl and validation.jsonl
# being saved under their own names; later cells read them from /content.
# NOTE(review): `uploaded` is not used by any other visible cell.
from google.colab import files
uploaded = files.upload()


Saving train.jsonl to train.jsonl
Saving validation.jsonl to validation.jsonl


In [9]:
# Repeated Drive mount. In the recorded run Drive was already mounted, so this call only
# reported that fact; it matters when the session starts from this point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Base configuration: model id, dataset files and the Drive folders used by later cells.
import os

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
TRAIN_FILE = "/content/train.jsonl"
VAL_FILE   = "/content/validation.jsonl"

# Everything is stored on Google Drive under this folder.
BASE_DIR = "/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b"
ADAPTER_DIR = f"{BASE_DIR}/epicrisis-lora-adapter"

# (Optional) destination for a merged model (NOT recommended when the goal is the
# ORT GenAI builder).
MERGED_DIR = f"{BASE_DIR}/epicrisis-merged"

# Create both output folders; exist_ok makes the cell safe to re-run.
for _folder in (ADAPTER_DIR, MERGED_DIR):
    os.makedirs(_folder, exist_ok=True)

# Echo the effective settings so the run is self-describing.
for _label, _value in (
    ("MODEL_NAME:", MODEL_NAME),
    ("TRAIN_FILE:", TRAIN_FILE),
    ("VAL_FILE:", VAL_FILE),
    ("ADAPTER_DIR:", ADAPTER_DIR),
    ("MERGED_DIR:", MERGED_DIR),
):
    print(_label, _value)

MODEL_NAME: Qwen/Qwen2.5-0.5B-Instruct
TRAIN_FILE: /content/train.jsonl
VAL_FILE: /content/validation.jsonl
ADAPTER_DIR: /content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/epicrisis-lora-adapter
MERGED_DIR: /content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/epicrisis-merged


In [2]:
# Fine-tuning con Transformers + PEFT (LoRA) + TRL (SFTTrainer) - SIN Unsloth
import json
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Sequence-length limit handed to SFTTrainer further down in this cell.
max_seq_length = 2048

# 1) Dataset: load the train / validation JSON files (records carry
#    instruction / input / output fields, which format_example reads).
data_files = dict(train=TRAIN_FILE, validation=VAL_FILE)
ds = load_dataset("json", data_files=data_files)

def format_example(example, tokenizer):
    """Render one dataset record as a single chat-formatted string.

    Builds a three-turn conversation (fixed system prompt, user turn, assistant turn)
    and hands it to ``tokenizer.apply_chat_template`` with ``tokenize=False`` and
    ``add_generation_prompt=False``.

    Args:
        example: Mapping with the keys "instruction", "input" and "output".
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        Whatever ``apply_chat_template`` returns for the conversation.
    """
    instruction = example["instruction"]
    # The structured input is pretty-printed as JSON, keeping non-ASCII characters as-is.
    input_json = json.dumps(example["input"], ensure_ascii=False, indent=2)

    system_turn = {"role": "system", "content": "Eres un médico experto en redacción de epicrisis clínicas en español."}
    user_turn = {"role": "user", "content": instruction + "\n" + input_json}
    assistant_turn = {"role": "assistant", "content": example["output"]}

    return tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )

# 2) Tokenizer + model (SDPA attention is requested so xformers is not needed).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    # Fall back to EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # `torch_dtype` is the keyword accepted by the transformers releases pinned in this
    # notebook; the shorter `dtype` alias only exists in later releases and would be
    # forwarded to the model constructor as an unexpected argument here.
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="sdpa",  # stable on Colab without xformers
)

# 3) LoRA configuration, then wrap the base model with the adapter and report how many
#    parameters are trainable.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Names of the modules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

def map_fn(ex):
    """Convert one raw record into ``{"text": <chat-formatted string>}``."""
    rendered = format_example(ex, tokenizer)
    return {"text": rendered}

# Apply the formatter to each split (train first, then validation) and drop the original
# columns so only the "text" column remains.
train_ds, eval_ds = (
    ds[split_name].map(map_fn, remove_columns=ds[split_name].column_names)
    for split_name in ("train", "validation")
)

# 4) Training arguments (no 8-bit optimizer, to avoid extra dependencies).
args = TrainingArguments(
    output_dir="/content/_trainer-out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,   # 8 sequences per optimizer step on each device
    warmup_steps=10,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    eval_strategy="steps",
    # Evaluate every 10 optimizer steps. With the 321 training examples of the recorded
    # run, one epoch is only ~41 optimizer steps (321 / 8), so the previous value of 50
    # was never reached and no validation loss was ever reported.
    eval_steps=10,
    save_strategy="no",              # the adapter is saved manually after training
    report_to="none",
)

import inspect

# The keyword that carries the tokenizer differs between TRL releases: older ones (such
# as the trl 0.11.4 pinned in this notebook) take `tokenizer=`, newer ones take
# `processing_class=`. Pick whichever the installed SFTTrainer actually declares so the
# call does not fail with an unexpected-keyword error.
_sft_init_params = inspect.signature(SFTTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _sft_init_params else "tokenizer"

trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    dataset_text_field="text",       # column produced by map_fn
    max_seq_length=max_seq_length,
    args=args,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

# 5) Persist the trained adapter model and the tokenizer to the Drive folder so they
#    survive the Colab session.
os.makedirs(ADAPTER_DIR, exist_ok=True)
for _artifact in (trainer.model, tokenizer):
    _artifact.save_pretrained(ADAPTER_DIR)
print("✅ Adapter LoRA guardado en:", ADAPTER_DIR)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/321 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/321 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

  super().__init__(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss


✅ Adapter LoRA guardado en: /content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/epicrisis-lora-adapter


In [None]:
# (Optional) Merge LoRA -> base model (not required for the ONNX export with the
# ORT GenAI builder). Uncomment if you need a merged checkpoint for some other reason.
# NOTE(review): this snippet relies on Unsloth (FastLanguageModel /
# save_pretrained_merged), while the training cell of this notebook builds the model
# with plain Transformers + PEFT — presumably it must be adapted before it can run;
# TODO confirm.

# from unsloth import FastLanguageModel
# model = trainer.model
# model = FastLanguageModel.for_inference(model)
# model.save_pretrained_merged(MERGED_DIR, tokenizer, save_method="merged_16bit")
# print("Modelo merged guardado en:", MERGED_DIR)


## Exportar a ONNX + cuantizar (INT4) con onnxruntime-genai (sin Optimum)

In [4]:
# Install the builder dependency that was missing (onnx-ir).
!pip -q install -U onnx-ir

# (Recommended) upgrade onnxruntime-genai, onnx and onnxruntime in case an older
# release is installed.
# NOTE(review): none of these installs is version-pinned, so re-runs may resolve
# different releases.
!pip -q install -U onnxruntime-genai onnx onnxruntime

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/139.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Confirm onnx-ir is importable and report its version when the module exposes one.
import onnx_ir

print("onnx_ir OK:", getattr(onnx_ir, "__version__", "import ok"))

onnx_ir OK: 0.1.14


In [7]:
# 1) Make sure the builder dependencies are installed (all unpinned, latest available).
!pip -q install -U onnx-ir onnx onnxruntime onnxruntime-genai

# 2) Move transformers to a release that includes Qwen2.5-VL
# (this does NOT affect the adapter that was already trained).
# NOTE(review): presumably the builder needs that model family to be importable even
# though the model exported here is Qwen2.5-0.5B-Instruct — TODO confirm.
# NOTE(review): this replaces the transformers release installed earlier for training;
# a kernel that already imported transformers keeps the old one until it is restarted.
!pip -q install -U "transformers==4.52.4" "tokenizers>=0.20.0" "huggingface-hub>=0.24.0"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/10.5 MB[0m [31m72.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m10.5/10.5 MB[0m [31m176.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m115.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m101.9 MB/s[0m eta [36m0:00:00[0m
[?25h

> **Nota sobre HF_TOKEN:** para este modelo público normalmente no necesitas token.  
> Si en algún caso necesitas autenticarte, usa `!huggingface-cli login` o los *Colab Secrets* (no pegues tokens en el notebook).


In [8]:
%%bash
set -e

MODEL_NAME="Qwen/Qwen2.5-0.5B-Instruct"
ADAPTER_DIR="/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/epicrisis-lora-adapter"

OUT_CPU="/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/onnx-cpu-int4"
OUT_WEBGPU="/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/onnx-webgpu-int4"

# build_package <execution-provider> <output-dir>
# Runs the ORT GenAI model builder once: INT4 precision, base model fetched by name,
# LoRA adapter read from ADAPTER_DIR. `set -e` aborts the cell if a build fails.
build_package() {
  python -m onnxruntime_genai.models.builder \
    -m "${MODEL_NAME}" \
    -o "$2" \
    -p int4 \
    -e "$1" \
    --extra_options hf_remote=true adapter_path="${ADAPTER_DIR}"
}

# Same order as before: CPU package first, then WebGPU.
build_package cpu "${OUT_CPU}"
build_package webgpu "${OUT_WEBGPU}"

echo "✅ ONNX CPU: ${OUT_CPU}"
echo "✅ ONNX WebGPU: ${OUT_WEBGPU}"


Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, BF16 CUDA, FP16 TRT-RTX, INT4 CPU, INT4 CUDA, INT4 DML, INT4 WebGPU
Extra options: {'hf_remote': 'true', 'adapter_path': '/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/epicrisis-lora-adapter'}
GroupQueryAttention (GQA) is used in this model.
Reading embedding layer
Reading decoder layer 0
Reading decoder layer 1
Reading decoder layer 2
Reading decoder layer 3
Reading decoder layer 4
Reading decoder layer 5
Reading decoder layer 6
Reading decoder layer 7
Reading decoder layer 8
Reading decoder layer 9
Reading decoder layer 10
Reading decoder layer 11
Reading decoder layer 12
Reading decoder layer 13
Reading decoder layer 14
Reading decoder layer 15
Reading decoder layer 16
Reading decoder layer 17
Reading decoder layer 18
Reading decoder layer 19
Reading decoder layer 20
Reading decoder layer 21
Reading decoder layer 22
Reading decoder layer 23
Reading final norm
Reading LM head
Sa

2026-01-15 06:20:43,076 numexpr.utils [INFO] - NumExpr defaulting to 2 threads.
2026-01-15 06:20:45.456253: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768458045.489094   16268 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768458045.499403   16268 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768458045.524424   16268 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768458045.524503   16268 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 0

In [None]:
import os, glob

OUT_CPU="/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/onnx-cpu-int4"
OUT_WEBGPU="/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/onnx-webgpu-int4"

def summarize_dir(p):
    """Describe the contents of an export folder.

    Args:
        p: Path of the folder to inspect.

    Returns:
        A dict with the same five keys whether or not the folder exists:
        "path", "exists", "file_count" (number of top-level entries),
        "sample_files" (first 25 top-level entry names, sorted) and
        "onnx_files_found" (first 10 ``*.onnx`` paths found recursively, sorted).
    """
    if not os.path.isdir(p):
        # Same schema as the success case so callers can index any key safely.
        return {
            "path": p,
            "exists": False,
            "file_count": 0,
            "sample_files": [],
            "onnx_files_found": [],
        }
    entries = sorted(os.listdir(p))
    # glob order is filesystem-dependent; sort so the truncated sample is deterministic.
    onnx_files = sorted(glob.glob(p + "/**/*.onnx", recursive=True))
    return {
        "path": p,
        "exists": True,
        "file_count": len(entries),
        "sample_files": entries[:25],
        "onnx_files_found": onnx_files[:10],
    }

print("CPU package:", summarize_dir(OUT_CPU))
print("WebGPU package:", summarize_dir(OUT_WEBGPU))


In [None]:
import onnxruntime_genai as og

MODEL_DIR="/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/onnx-cpu-int4"
prompt = "Redacta una epicrisis breve para un paciente con neumonía adquirida en la comunidad."

# Fine-tuning rendered every example through the tokenizer's chat template with this
# system message, so the smoke test wraps the prompt the same way instead of feeding the
# bare sentence.
# NOTE(review): the markers below follow the ChatML layout used by Qwen2.5-Instruct —
# TODO confirm against the chat template stored with the saved tokenizer.
SYSTEM_PROMPT = "Eres un médico experto en redacción de epicrisis clínicas en español."
chat_prompt = (
    f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
    f"<|im_start|>user\n{prompt}<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# Use a dedicated name so the PyTorch `model` from the training cell is not overwritten.
onnx_model = og.Model(MODEL_DIR)
tok = og.Tokenizer(onnx_model)

params = og.GeneratorParams(onnx_model)
params.set_search_options(max_length=220)  # limit passed to the generator's search options

gen = og.Generator(onnx_model, params)
input_tokens = tok.encode(chat_prompt)
gen.append_tokens(input_tokens)

# Generate token by token until the generator reports completion.
while not gen.is_done():
    gen.generate_next_token()

# Print only the completion: drop the prompt tokens from the front of the sequence.
output_tokens = gen.get_sequence(0)
print(tok.decode(output_tokens[len(input_tokens):]))


## Prueba rápida (Python) con onnxruntime-genai (CPU)

In [None]:
import os
import onnxruntime_genai as og

# CPU package written by the builder cell of this notebook. The previous relative path
# ("./epicrisis-onnx-webgpu-int4") is not produced by any cell, and this section is the
# CPU check.
model_dir = "/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/onnx-cpu-int4"
# In Python the execution provider that is really used depends on the installed
# onnxruntime-genai build; the CPU package is the one to use for validating the logic.

# List the package contents (the label promises files, so print files rather than the path).
if os.path.isdir(model_dir):
    print("Archivos en model_dir:", sorted(os.listdir(model_dir)))
else:
    print("Archivos en model_dir:", model_dir, "(no existe; ejecuta primero la celda del builder)")

# Minimal example (may vary between versions). If it fails, use the official ORT GenAI
# example for your installed version.


## (Opcional) Reorganizar estructura para Transformers.js
Si vas a usar Transformers.js, normalmente esperas una carpeta `onnx/` y archivos tokenizer/config en raíz. Con ORT GenAI el paquete es distinto; para Transformers.js puede requerir adaptación.

In [None]:
import os, shutil, json

# File-name endings treated as ONNX artifacts: the graph itself plus external-weight
# files. Both "model.onnx_data" and "model.onnx.data" spellings are covered.
# NOTE(review): presumably ".onnx.data" is the spelling the ORT GenAI builder emits —
# TODO confirm against an actual export folder.
_ONNX_SUFFIXES = ('.onnx', '.onnx_data', '.onnx.data')


def reorganize_for_transformers_js(out_dir):
    """Move ONNX artifacts into ``<out_dir>/onnx`` and tag ``config.json``.

    Args:
        out_dir: Folder holding the exported model files.

    Returns:
        Path of the ``onnx`` sub-folder (created if missing).

    Side effects:
        Every regular file directly inside ``out_dir`` whose name ends with one of
        ``_ONNX_SUFFIXES`` is moved into the sub-folder. When ``config.json`` exists,
        its ``transformers.js_config`` entry is overwritten and the file is rewritten
        with two-space indentation.
    """
    onnx_dir = os.path.join(out_dir, 'onnx')
    os.makedirs(onnx_dir, exist_ok=True)

    for name in os.listdir(out_dir):
        source = os.path.join(out_dir, name)
        # Only regular files are moved; sub-folders are left where they are.
        if os.path.isfile(source) and name.endswith(_ONNX_SUFFIXES):
            shutil.move(source, os.path.join(onnx_dir, name))

    cfg_path = os.path.join(out_dir, 'config.json')
    if os.path.exists(cfg_path):
        with open(cfg_path, 'r', encoding='utf-8') as cfg_file:
            cfg = json.load(cfg_file)
        cfg['transformers.js_config'] = {
            'dtype': 'fp16',
            'kv_cache_dtype': {
                'fp16': 'float16'
            }
        }
        with open(cfg_path, 'w', encoding='utf-8') as cfg_file:
            json.dump(cfg, cfg_file, indent=2)

    return onnx_dir


out_dir = './epicrisis-finetuned-onnx'
onnx_dir = reorganize_for_transformers_js(out_dir)

print('Listo:', out_dir)


## (Opcional) Export oficial Transformers.js (q4f16)
Requiere scripts de Transformers.js.

In [None]:
# Clone Transformers.js, install its Python requirements, run its conversion script with
# q4f16 quantization, then return to the parent directory.
# NOTE(review): --model_id points at ../epicrisis-merged, and no other cell visible in
# this notebook writes that folder (the merge cell is commented out) — TODO confirm it
# exists before running.
# NOTE(review): the clone is not pinned to a revision, so the requirements path and the
# converter flags should be verified against the Transformers.js conversion docs.
!git clone https://github.com/huggingface/transformers.js.git -q
%cd transformers.js
!pip -q install -r requirements.txt
!python3 scripts/convert.py \
  --model_id ../epicrisis-merged \
  --task text-generation-with-past \
  --quantize q4f16 \
  --output_dir ../epicrisis-finetuned-tjs
%cd ..


## Descargar resultados

In [None]:
import os, shutil

# WebGPU package written by the builder cell. The previous command zipped a relative
# "epicrisis-onnx-webgpu-int4" folder that no cell creates and then printed the success
# message regardless of whether zip had failed.
WEBGPU_PACKAGE_DIR = "/content/drive/MyDrive/fine-tuning/epicrisis-qwen25-05b/onnx-webgpu-int4"
ZIP_BASENAME = "epicrisis-onnx-webgpu-int4"

if not os.path.isdir(WEBGPU_PACKAGE_DIR):
    # Fail loudly instead of reporting a ZIP that was never written.
    raise FileNotFoundError(
        f"No existe {WEBGPU_PACKAGE_DIR}; ejecuta primero la celda del builder."
    )

# Archive the folder (kept as a single top-level directory inside the ZIP) into the
# current working directory; make_archive returns the path of the file it wrote.
zip_path = shutil.make_archive(
    ZIP_BASENAME,
    "zip",
    root_dir=os.path.dirname(WEBGPU_PACKAGE_DIR),
    base_dir=os.path.basename(WEBGPU_PACKAGE_DIR),
)
print('ZIP creado:', zip_path)