# Epicrisis Fine-Tuning + Export ONNX (Colab)

Flujo completo:
1) Entrenar (LoRA) con Unsloth
2) Merge del LoRA con el modelo base
3) Exportar a ONNX (fp16) con Optimum
4) (Opcional) Exportar con el script oficial de Transformers.js (q4f16)


In [None]:
# Instalar dependencias
!pip -q install 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'
!pip -q install --no-deps trl peft accelerate bitsandbytes
!pip -q install transformers datasets optimum[exporters] onnx onnxruntime


## Subir dataset
Sube `train.jsonl` y `validation.jsonl`.

In [None]:
# Open Colab's upload dialog. The cells below read train.jsonl and
# validation.jsonl by relative path, so both files must be uploaded here.
from google.colab import files
uploaded = files.upload()


In [None]:
# Base configuration shared by the cells below.
# NOTE: the shell cells further down hard-code './epicrisis-merged' instead of
# reading MERGED_DIR, so keep both in sync when changing it.
MODEL_NAME = 'Qwen/Qwen2.5-1.5B-Instruct'  # base checkpoint to fine-tune
TRAIN_FILE = 'train.jsonl'  # training records
VAL_FILE = 'validation.jsonl'  # evaluation records
OUTPUT_DIR = './epicrisis-model-finetuned'  # trainer checkpoints and the saved LoRA adapter
MERGED_DIR = './epicrisis-merged'  # base model with the LoRA weights merged in

print('MODEL_NAME:', MODEL_NAME)


In [None]:
# Fine-tuning con Unsloth
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# Sequence length passed both to the model loader and, later, to the trainer.
MAX_SEQ_LENGTH = 2048
# No explicit dtype is requested; presumably Unsloth then chooses one for the
# current GPU — TODO confirm against the Unsloth docs.
DTYPE = None
# Load the base weights in 4-bit.
LOAD_IN_4BIT = True

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)

# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj'
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias='none',
    use_gradient_checkpointing='unsloth',
    random_state=42,  # fixed seed, same value as the trainer seed below
)

def format_prompt(example):
    """Render one dataset record as a chat-formatted training string.

    Args:
        example: Mapping with optional ``instruction``, ``input`` and
            ``output`` keys. A key that is present but ``None`` (e.g. JSON
            ``null``) is treated the same as a missing key.

    Returns:
        A dict ``{'text': ...}`` whose value is the system/user/assistant
        conversation rendered by the notebook-level ``tokenizer``'s chat
        template, without a trailing generation prompt.
    """
    import json

    def _value_or(key, default):
        # Fall back to the default for both absent and null fields so that
        # the literal string "None" never ends up in the prompt.
        value = example.get(key)
        return default if value is None else value

    instruction = _value_or('instruction', 'Epicrisis:')
    input_data = _value_or('input', {})
    output = _value_or('output', '')

    # Serialize every JSON container the same way. str() on a list would emit
    # Python repr (single quotes, True/None) instead of JSON.
    if isinstance(input_data, (dict, list)):
        input_str = json.dumps(input_data, ensure_ascii=False)
    else:
        input_str = str(input_data)

    messages = [
        {
            'role': 'system',
            'content': 'Eres un asistente medico experto en redaccion de epicrisis clinicas en espanol.'
        },
        {
            'role': 'user',
            'content': f"{instruction}\n{input_str}"
        },
        {
            'role': 'assistant',
            'content': output
        },
    ]

    # The assistant turn is already part of the messages, so no generation
    # prompt is appended.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {'text': text}

# Each JSONL file is loaded on its own. split='train' appears on both calls,
# presumably because a lone data file is exposed under the default 'train'
# split name — val_dataset still holds the validation records.
train_dataset = load_dataset('json', data_files=TRAIN_FILE, split='train')
val_dataset = load_dataset('json', data_files=VAL_FILE, split='train')
# Add the chat-formatted 'text' column produced by format_prompt.
train_dataset = train_dataset.map(format_prompt)
val_dataset = val_dataset.map(format_prompt)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step per device
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    optim='adamw_8bit',
    weight_decay=0.01,
    max_grad_norm=0.3,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    save_steps=50,  # checkpoints are written at the same cadence as evaluation
    save_total_limit=3,
    # Mixed precision: bf16 when the GPU reports support for it, fp16 otherwise.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    seed=42,
    report_to='none',
)

# NOTE(review): tokenizer=, dataset_text_field= and max_seq_length= are passed
# directly to SFTTrainer; whether they are accepted depends on the installed
# TRL version, which the install cell does not pin — confirm.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field='text',
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
)

trainer_stats = trainer.train()
print('Loss final:', trainer_stats.metrics.get('train_loss'))

# Save the model (the LoRA-wrapped one, per the message below) and the
# tokenizer into OUTPUT_DIR, the same directory the trainer uses for checkpoints.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print('Modelo LoRA guardado en', OUTPUT_DIR)


In [None]:
# Merge LoRA con modelo base
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Reload the tokenizer that was saved next to the adapter.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
# Load the original base checkpoint in fp16 (the training cell loaded it in
# 4-bit) so the merged weights are saved in half precision.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto',
)
# Attach the trained adapter from OUTPUT_DIR, then fold it into the base
# weights so the result is a plain model without the PEFT wrapper.
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
model = model.merge_and_unload()
# Write the merged model as safetensors plus the tokenizer into MERGED_DIR.
model.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)
print('Modelo merged guardado en', MERGED_DIR)


## Exportar a ONNX (fp16, GPU)
La exportación en fp16 requiere GPU: ejecuta esta celda en un entorno con CUDA y conserva `--device cuda`.

In [None]:
# Export the merged model to ONNX in fp16 on the GPU for the
# text-generation-with-past task, writing to ./epicrisis-finetuned-onnx.
# The model path is hard-coded and must match MERGED_DIR from the config cell.
# NOTE(review): --no-dynamic-axes together with --batch_size 1 and
# --sequence_length 2 looks like it fixes the exported input shapes to those
# sizes. Confirm the target runtime can still feed prompts of other lengths,
# or drop --no-dynamic-axes.
!optimum-cli export onnx \
  --model ./epicrisis-merged \
  --task text-generation-with-past \
  --dtype fp16 \
  --device cuda \
  --opset 18 \
  --no-dynamic-axes \
  --batch_size 1 \
  --sequence_length 2 \
  ./epicrisis-finetuned-onnx


## Reorganizar estructura para Transformers.js (fp16)

In [None]:
import json
import os
import shutil

# Suffixes of the exported model files that belong in the onnx/ subfolder.
_ONNX_SUFFIXES = ('.onnx', '.onnx_data')


def reorganize_for_transformers_js(out_dir, dtype='fp16', kv_cache_dtype='float16'):
    """Lay out an ONNX export directory the way this notebook ships it.

    Moves every entry in ``out_dir`` whose name ends in ``.onnx`` or
    ``.onnx_data`` into ``out_dir/onnx`` and, when ``out_dir/config.json``
    exists, sets its ``transformers.js_config`` entry.

    Args:
        out_dir: Directory produced by the ONNX export.
        dtype: Value stored under ``transformers.js_config.dtype`` and used
            as the key of the ``kv_cache_dtype`` mapping.
        kv_cache_dtype: Value stored for ``dtype`` in ``kv_cache_dtype``.

    Returns:
        The path of the ``onnx`` subdirectory.
    """
    onnx_dir = os.path.join(out_dir, 'onnx')
    os.makedirs(onnx_dir, exist_ok=True)

    # os.listdir returns a snapshot, so moving entries while looping is safe.
    for name in os.listdir(out_dir):
        if name.endswith(_ONNX_SUFFIXES):
            shutil.move(os.path.join(out_dir, name), os.path.join(onnx_dir, name))

    cfg_path = os.path.join(out_dir, 'config.json')
    # Best effort: a missing config.json is skipped rather than treated as an error.
    if os.path.exists(cfg_path):
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(cfg_path, 'r', encoding='utf-8') as cfg_file:
            cfg = json.load(cfg_file)
        cfg['transformers.js_config'] = {
            'dtype': dtype,
            'kv_cache_dtype': {
                dtype: kv_cache_dtype
            }
        }
        with open(cfg_path, 'w', encoding='utf-8') as cfg_file:
            json.dump(cfg, cfg_file, indent=2)

    return onnx_dir


out_dir = './epicrisis-finetuned-onnx'
onnx_dir = reorganize_for_transformers_js(out_dir)

print('Listo:', out_dir)


## (Opcional) Export oficial Transformers.js (q4f16)
Requiere clonar el repositorio de Transformers.js, que incluye el script de conversión `scripts/convert.py`.

In [None]:
# Clone Transformers.js and run its conversion script on the merged model,
# then return to the notebook's working directory.
# NOTE(review): the clone is not pinned to a tag or commit, so the script's
# command-line interface may differ between runs. Confirm that the checked-out
# scripts/convert.py accepts `--quantize q4f16` and `--output_dir`, and that
# requirements.txt exists at the repository root.
# NOTE(review): re-running this cell will presumably fail at `git clone` once
# the transformers.js directory already exists.
!git clone https://github.com/huggingface/transformers.js.git -q
%cd transformers.js
!pip -q install -r requirements.txt
!python3 scripts/convert.py \
  --model_id ../epicrisis-merged \
  --task text-generation-with-past \
  --quantize q4f16 \
  --output_dir ../epicrisis-finetuned-tjs
%cd ..


## Descargar resultados

In [None]:
# Zip the fp16 ONNX export directory and hand the archive to the browser.
# Only epicrisis-finetuned-onnx is packaged; the optional
# epicrisis-finetuned-tjs output is not included.
!zip -r epicrisis-finetuned-onnx.zip epicrisis-finetuned-onnx
from google.colab import files
files.download('epicrisis-finetuned-onnx.zip')
