In [2]:
# Instalação de dependências
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Importação e Preparação do Dataset
import json
import pandas as pd
from datasets import load_dataset


In [8]:
# Carregar o JSON diretamente usando streaming
file_path = "/content/drive/MyDrive/fiap/tech-challenge-03/trn.json"

# Carregar o dataset com as colunas "title" e "content"
dataset = load_dataset("json", data_files=file_path, split="train", streaming=True)

# Função de processamento para formatar os prompts e respostas
def format_dataset(example):
    """
    Formata cada exemplo do dataset para fine-tuning.
    """
    # Remover linhas com dados ausentes
    if example["title"] and example["content"]:
        return {
            "prompt": f"Pergunta: Descreva o produto | Contexto: {example['title']}",
            "response": example["content"]
        }
    # Return an empty dictionary if title or content is missing to avoid None values
    return {}

# Filtrar exemplos com valores não nulos em "title" e "content"
dataset = dataset.filter(lambda example: example["title"] is not None and example["content"] is not None)

# Aplicar transformação no dataset
formatted_dataset = dataset.map(format_dataset, remove_columns=["title", "content"])

In [11]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Carregar o tokenizer e modelo GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenizar o dataset
def tokenize_function(examples):
    """
    Tokeniza os exemplos no modo batched.
    """
    inputs = [
        example["prompt"] + "\nResposta: " + example["response"]
        for example in zip(examples["prompt"], examples["response"])
    ]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "response"])

num_examples = sum(1 for _ in dataset)
print(f"Total de exemplos: {num_examples}")

# Calcule os passos
batch_size = 4
max_steps = num_examples // batch_size
print(f"Passos necessários: {max_steps}")

# Configuração do treinamento
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    max_steps=max_steps,  # Especifique o número de passos manualmente
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    report_to="none"
)


# Inicializar o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Iniciar o treinamento
trainer.train()


max_steps is given, it will override any value given in num_train_epochs


Total de exemplos: 2248619
Passos necessários: 562154


TypeError: tuple indices must be integers or slices, not str

In [None]:
# Avaliação e Geração de Respostas
def generate_response(question, title):
    input_text = f"Pergunta: {question} | Contexto: {title}"
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(inputs["input_ids"], max_length=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Teste do modelo treinado
question = "Qual é a descrição do produto?"
title = "Laptop Dell Inspiron 15"
print(generate_response(question, title))