<a href="https://colab.research.google.com/github/renato-penna/fiap-tech-challenge-fase03/blob/main/fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mount Google Drive

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used by the
# formatting cell live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


### Format Dataset

In [1]:
import json
from datasets import load_dataset # Although load_dataset is imported, we'll process line by line for memory efficiency.

# Source dataset on the mounted Drive; the conversion loop reads it one line
# at a time and parses each line as a separate JSON document.
DATA_PATH = "/content/drive/MyDrive/Fiap/trnTreaded.json"
# Destination file for the reformatted records (one JSON object per line).
OUTPUT_PATH_DATASET = "/content/drive/MyDrive/Fiap/formatted_trn.json"

def format_dataset_into_model_input(data):
    """
    Turn one raw dataset record into an (instruction, input, output) triple.

    Args:
        data: Dict for a single dataset item. The optional "prompt" and
            "completion" keys are read; a missing key or an explicit None is
            treated as an empty string.

    Returns:
        Tuple ``(instruction, input_text, response)`` where:
          * ``instruction`` is a fixed instruction string,
          * ``input_text`` is the stripped text found between "Question:" and
            "Answer:" in the prompt ("" when "Question:" is absent),
          * ``response`` is the stripped text following "Answer:" in the
            prompt, or the stripped "completion" field when the prompt has no
            "Answer:" marker or nothing after it.
    """
    # `or ""` also covers records where the key exists but holds null/None,
    # which would otherwise fail on .split()/.strip().
    prompt = data.get("prompt") or ""
    completion = data.get("completion") or ""

    instruction = "Generate a description for the following item."

    # Text between "Question:" and "Answer:".
    question_parts = prompt.split("Question:")
    if len(question_parts) > 1:
        input_text = question_parts[1].split("Answer:")[0].strip()
    else:
        input_text = ""

    # Text that follows "Answer:" inside the prompt, if any.
    answer_parts = prompt.split("Answer:")
    response = answer_parts[1].strip() if len(answer_parts) > 1 else ""

    # A prompt that merely ends with "Answer:" carries no answer text; in that
    # case (and when the marker is missing) the answer is in 'completion'.
    if not response:
        response = completion.strip()

    return instruction, input_text, response

# Stream the dataset one line at a time (each line is parsed as its own JSON
# document) so the whole file never has to be held in memory.
skipped_lines = 0

with open(DATA_PATH, 'r', encoding='utf-8') as input_file, \
     open(OUTPUT_PATH_DATASET, 'w', encoding='utf-8') as output_file:

    for line in input_file:
        # Blank lines (e.g. a trailing newline at the end of the file) are not
        # records; skip them quietly instead of reporting a JSON error.
        if not line.strip():
            continue

        try:
            item = json.loads(line)
            instruction, input_text, response = format_dataset_into_model_input(item)

            formatted_item = {
                "instruction": instruction,
                "input": input_text,
                "output": response
            }

            # ensure_ascii=False keeps non-ASCII characters readable in the output file.
            output_file.write(json.dumps(formatted_item, ensure_ascii=False) + '\n')
        except json.JSONDecodeError as e:
            skipped_lines += 1
            print(f"Skipping invalid JSON line: {line.strip()} - Error: {e}")
        except Exception as e:
            # Best effort: one malformed record must not abort the whole conversion.
            skipped_lines += 1
            print(f"An error occurred processing line: {line.strip()} - Error: {e}")

# Surface dropped records so a partially converted dataset does not go unnoticed.
if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) that could not be processed")
print(f"Dataset salvo em {OUTPUT_PATH_DATASET}")

Dataset salvo em /content/drive/MyDrive/Fiap/formatted_trn.json


### Install Dependencies

In [1]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install transformers datasets

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-0whl8x7i/unsloth_2b0fdbca4d1c4db2a8fd59911b176cce
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-0whl8x7i/unsloth_2b0fdbca4d1c4db2a8fd59911b176cce
  Resolved https://github.com/unslothai/unsloth.git to commit e025ca90131dcdc4ae752c591116d5e58ef3adfc
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.9.5 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.9.5-py3-none-any.whl.metadata (9.5 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git

### Setup and Imports

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments


# Settings forwarded to FastLanguageModel.from_pretrained in the model-loading cell.
max_seq_length = 2048
# NOTE(review): None presumably lets Unsloth choose the dtype automatically — confirm against the Unsloth docs.
dtype = None
load_in_4bit = True
# Catalogue of candidate model ids. Not referenced by any of the visible cells:
# the model-loading cell hard-codes its own model_name.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]

### Load Model and Tokenizer

In [None]:
# Load the "unsloth/llama-3-8b-bnb-4bit" checkpoint and its tokenizer, using the
# sequence length, dtype and 4-bit flag defined in the setup cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

### Configure LoRA

In [None]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters: rank 16, alpha 16, no dropout and
# no bias terms, with a fixed random_state for repeatable runs.
# NOTE(review): this rebinds `model`, so re-running this cell without first
# re-running the model-loading cell would presumably wrap an already-wrapped
# model — confirm before re-executing out of order.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)