In [5]:
import torch
def get_supported_dtype(device: str) -> torch.dtype:
    """Choose the floating-point dtype to load model weights in for ``device``.

    Args:
        device: Device string; only the exact value ``"cuda"`` selects a
            reduced-precision dtype, anything else is treated as non-CUDA.

    Returns:
        ``torch.bfloat16`` when ``device`` is ``"cuda"`` and
        ``torch.cuda.is_bf16_supported()`` reports support, ``torch.float16``
        for any other ``"cuda"`` device, and ``torch.float32`` otherwise.
    """
    if device != "cuda":
        return torch.float32
    if torch.cuda.is_bf16_supported():
        return torch.bfloat16
    # There is no `torch.cuda.is_fp16_supported()` probe to call here; the
    # previous code raised AttributeError on GPUs without bf16. float16 is the
    # fallback for CUDA devices that fail the bf16 check.
    return torch.float16

In [None]:
# pip install -U "transformers>=4.41" "peft>=0.11" "accelerate>=0.33" torch

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ---- config ----
BASE_MODEL_ID = "google/gemma-3-270m-it"          # change me
LORA_ADAPTER_ID = "boreasg/sft-model-runpod-test_20260123_065252"     # change me
# Use the GPU when torch can see one; the weight dtype is derived from that choice.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
DTYPE = get_supported_dtype(DEVICE)

# ---- tokenizer (shared by both models; loaded from the base checkpoint) ----
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)

# ---- models ----
# Two independent copies of the base checkpoint are loaded with identical
# settings: one stays as the plain baseline, the other receives the LoRA
# adapter, so the baseline is never touched by PEFT.
device_map = None if DEVICE != "cuda" else "auto"
_base_load_kwargs = {"torch_dtype": DTYPE, "device_map": device_map}

base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **_base_load_kwargs)
base_for_peft = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **_base_load_kwargs)
sft_model = PeftModel.from_pretrained(base_for_peft, LORA_ADAPTER_ID)

# Put both models in eval mode before generating.
base_model.eval()
sft_model.eval()

# ---- quick debug prints: device, parameter dtypes, EOS id ----
print("DEVICE:", DEVICE)
print("base_model first param dtype:", next(base_model.parameters()).dtype)
print("sft_model first param dtype:", next(sft_model.parameters()).dtype)
print("tokenizer eos_token_id:", tokenizer.eos_token_id)

# ---- generation helper that can run any model instance ----
@torch.inference_mode()
def generate_with_model(
    model,
    prompt: str,
    *,
    max_new_tokens: int = 512,
    do_sample: bool = False,
    sanity_check: bool = True,
) -> str:
    """Generate a reply to ``prompt`` with ``model`` and return only the new text.

    The prompt is wrapped in a fixed system + user conversation. It is rendered
    with the module-level ``tokenizer``'s chat template when one is set, and
    with a plain "System/User/Assistant" text layout otherwise.

    Args:
        model: Object exposing ``parameters()``, ``__call__(**inputs)`` (result
            has ``.logits``) and ``generate(**inputs, ...)``.
        prompt: User message to answer.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        sanity_check: When true (default), run one extra forward pass first and
            print whether the logits contain NaN and their max absolute value.
            Failures of this check are printed, not raised.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    system_prompt = "You are a helpful, concise assistant. Answer directly and accurately."

    if getattr(tokenizer, "chat_template", None) and hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # The rendered template text already contains the special tokens it
        # needs; tokenizing it with the default add_special_tokens=True would
        # prepend them a second time (e.g. a duplicated BOS).
        inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
    else:
        text = f"System: {system_prompt}\nUser: {prompt}\nAssistant:"
        inputs = tokenizer(text, return_tensors="pt")

    # Move the inputs to the device holding the model's first parameter.
    model_device = next(model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    if sanity_check:
        # Best-effort diagnostic only: report problems but never block generation.
        try:
            logits = model(**inputs).logits
            print(f"Sanity: logits nan? {logits.isnan().any().item()} max_abs={logits.abs().max().item():.3e}")
        except Exception as e:
            print("Sanity forward failed:", e)

    prompt_len = inputs["input_ids"].shape[1]
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + continuation; keep only the continuation.
    output_text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return output_text.strip()

# ---- compare runner ----
def compare_models(prompt: str):
    """Run ``prompt`` through the base model and the SFT model and print both replies.

    Each generation is attempted independently; if one raises, the failure is
    printed and the other model is still run.

    Args:
        prompt: User message passed to ``generate_with_model`` for each model.

    Returns:
        A dict ``{"base": ..., "sft": ...}`` holding each model's output text,
        or ``None`` for a model whose generation did not produce a result.
    """
    outputs = {"base": None, "sft": None}

    print("--- Prompt ---", prompt, "", sep="\n")

    print("--- Base model output ---")
    try:
        outputs["base"] = generate_with_model(base_model, prompt)
        print(outputs["base"])
    except Exception as err:
        print("Base generation failed:", err)

    print()
    print("--- SFT (PEFT) model output ---")
    try:
        outputs["sft"] = generate_with_model(sft_model, prompt)
        print(outputs["sft"])
    except Exception as err:
        print("SFT generation failed:", err)

    return outputs

# ---- example ----
# Runs one comparison when this code executes as the main module. `result` is
# the dict returned by compare_models, with keys "base" and "sft".
if __name__ == "__main__":
    result = compare_models("Explain LoRA in 3 bullet points.")
    # result contains both outputs for programmatic use


Here are 3 bullet points summarizing the role of LoRA in AI:

*   LoRA (Low-Rank Adaptation) allows you to fine-tune an AI model without needing to train it from scratch, making it more efficient and adaptable to new tasks or datasets.
*   It leverages pre-trained models and fine-tuning techniques to achieve state-of-the-art performance on specific tasks or datasets.


In [7]:
# NOTE(review): `chat` is not defined in any cell visible in this excerpt —
# confirm it is defined in an earlier cell so this runs on a fresh kernel.
chat("Write a short message to a friend")

"Hi [Friend's Name], Just wanted to say hi! Hope you're doing well. Let me know if you need anything!"