In [1]:
# Install Unsloth, upgrade libraries, and install Gradio for the web interface
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --upgrade transformers trl accelerate peft
!pip install gradio


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-n65_9g4o/unsloth_d0b0c94634244d1a8f11c12d18acbe38
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-n65_9g4o/unsloth_d0b0c94634244d1a8f11c12d18acbe38
  Resolved https://github.com/unslothai/unsloth.git to commit 229e2ecc67756f36316dfcbea42396f59eef44e0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.9.9 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.9.9-py3-none-any.whl.metadata (31 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git-

In [3]:
import os

import torch
from unsloth import FastLanguageModel
from peft import PeftModel
import gradio as gr

# ===================================================================================
# 1. Configuration
# ===================================================================================
# These three values are forwarded unchanged to FastLanguageModel.from_pretrained
# for both the base and the fine-tuned model loads.
max_seq_length = 2048 # Sequence length limit passed to the loader (the base model used here is Llama 3.2)
load_in_4bit = True # Passed as `load_in_4bit`; the checkpoint loaded below is a "bnb-4bit" variant
dtype = None # Autodetect for your GPU

# --- ⬇️ IMPORTANT: Make sure this path is correct for your model ⬇️ ---
# Path to your fine-tuned LoRA adapters (local path or Hugging Face Hub ID)
your_finetuned_model_path = "jardemr/fiap_tech_challenge_22_09"


# ===================================================================================
# 2. Load Both Models
# ===================================================================================
# Single source of truth for the base checkpoint (the base model you used for fine-tuning).
BASE_MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

# Read the Hugging Face token from the environment rather than hard-coding it in the
# notebook. `or None` maps an unset or empty variable to "no explicit token", which is
# the loader's default.
hf_token = os.environ.get("HF_TOKEN") or None


def _load_base_model():
    """Load a fresh instance of the base model together with its tokenizer.

    Returns:
        The (model, tokenizer) pair returned by FastLanguageModel.from_pretrained,
        loaded with the notebook's configuration values.
    """
    return FastLanguageModel.from_pretrained(
        model_name=BASE_MODEL_NAME,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        token=hf_token,
    )


print(f"Loading Base Model ({BASE_MODEL_NAME})...")
base_model, tokenizer = _load_base_model()
FastLanguageModel.for_inference(base_model)
print("✅ Base Model Loaded.")


print(f"Loading Fine-Tuned Model from: {your_finetuned_model_path}...")
# Load a second, independent copy of the base model so that applying the adapters
# cannot affect `base_model`; its tokenizer is discarded because the one loaded
# above is reused.
ft_model, _ = _load_base_model()
# Apply your saved adapters
ft_model = PeftModel.from_pretrained(ft_model, your_finetuned_model_path)
FastLanguageModel.for_inference(ft_model)
print("✅ Fine-Tuned Model Loaded.")


# ===================================================================================
# 3. Define the Comparison Function using the Alpaca Prompt
# ===================================================================================
# --- Define the EXACT Alpaca Prompt You Used for Training ---
# The three `{}` placeholders are filled positionally with str.format: the instruction,
# the input (the title), and the response. compare_models passes "" for the response,
# so the inference prompt ends right after the "### Response:" line.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def _generate_response(model, inputs, gen_kwargs):
    """Run `model.generate` on the prompt and return only the newly generated text.

    Args:
        model: Object exposing `generate` (the base or the fine-tuned model).
        inputs: Tokenized prompt produced by the tokenizer call in compare_models.
        gen_kwargs: Keyword arguments forwarded to `model.generate`.

    Returns:
        The decoded text of the first sequence, with the prompt tokens sliced off
        and special tokens skipped.
    """
    prompt_length = inputs.input_ids.shape[1]
    output_tokens = model.generate(**inputs, **gen_kwargs)
    # Drop the leading prompt tokens so only the continuation is decoded.
    new_tokens = output_tokens[:, prompt_length:]
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]


def compare_models(title):
    """Generate a description for `title` with both the base and fine-tuned models.

    Args:
        title: Title text entered by the user. Leading and trailing whitespace
            is removed before it is placed into the prompt.

    Returns:
        A `(base_response, ft_response)` tuple of strings. If `title` is None,
        empty, or whitespace-only, no generation is run and both elements are a
        short message asking for a title.
    """
    # Guard: an empty textbox would otherwise trigger two full generations on a
    # prompt with no input.
    if title is None or not title.strip():
        message = "Please enter a title."
        return message, message
    title = title.strip()

    print(f"Generating for title: '{title}'...")
    instruction = "Generate a detailed description for the following title."

    # Prepare the prompt; the response slot is left empty for the model to complete.
    inference_prompt = alpaca_prompt.format(instruction, title, "")
    inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")

    # Generation parameters (sampling is enabled, so outputs vary between calls)
    gen_kwargs = {
        "max_new_tokens": 256,
        "do_sample": True,
        "temperature": 0.6,
        "repetition_penalty": 1.1,
    }

    # Same prompt and same settings for both models so the outputs are comparable.
    base_response = _generate_response(base_model, inputs, gen_kwargs)
    ft_response = _generate_response(ft_model, inputs, gen_kwargs)

    print("Generation complete.")
    return base_response, ft_response

# ===================================================================================
# 4. Launch the Gradio Web Interface
# ===================================================================================
# The UI is declared inside a gr.Blocks context; components are created in the
# order they appear below, so statement order matters here.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Model Comparison: Llama 3.2 Base vs. Fine-Tuned")
    gr.Markdown("Enter a product title below to see how the Base Model and your Fine-Tuned Model respond.")

    # Text input whose value is passed to compare_models as `title`.
    inp = gr.Textbox(label="Enter a Title (Input)", placeholder="Example: Professional Visual Basic 6 Databases")
    btn = gr.Button("Generate Responses")

    # One output box per model, grouped in a single gr.Row.
    with gr.Row():
        out_base = gr.Textbox(label="Base Model Response", lines=10)
        out_ft = gr.Textbox(label="Fine-Tuned Model Response", lines=10)

    # On click, call compare_models with the textbox value; its two return values
    # are written to out_base and out_ft in that order.
    btn.click(fn=compare_models, inputs=inp, outputs=[out_base, out_ft])

# This creates a public link for you to use the interface
demo.launch(share=True)

Loading Base Model (unsloth/Llama-3.2-3B-Instruct-bnb-4bit)...
==((====))==  Unsloth 2025.9.7: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Base Model Loaded.
Loading Fine-Tuned Model from: jardemr/fiap_tech_challenge_22_09...
==((====))==  Unsloth 2025.9.7: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled -

