In [5]:
# Instalação de dependências
!pip install transformers datasets scikit-learn



In [8]:
# Mount Google Drive under /content/drive so files stored there can be read
# from this Colab session.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Stream a JSON Lines file in chunks and format each row as a prompt/response pair
def process_json_in_chunks(file_path, chunk_size=10000, drop_empty=True):
    """
    Process a large JSON Lines file in chunks and format it for fine-tuning.

    Each chunk is reduced to the "title" and "content" columns, rows with
    missing values are removed, and the remaining rows are turned into
    "prompt" / "response" pairs.

    Parameters:
        file_path (str): Path to the JSON Lines file (one JSON object per line).
        chunk_size (int): Number of rows read per chunk.
        drop_empty (bool): When True (default), also discard rows whose title
            or content is an empty or whitespace-only string. ``dropna`` alone
            keeps such rows, which produces training pairs with no text.
            Pass False to only drop missing values.

    Yields:
        pd.DataFrame: One DataFrame per chunk with exactly the columns
        "prompt" and "response". A yielded chunk may be empty when all of
        its rows were filtered out.
    """
    for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size):
        # Keep only the columns we need and drop rows with missing values
        rows = chunk[["title", "content"]].dropna()

        if drop_empty:
            # Blank strings are not NaN, so they must be filtered explicitly
            has_text = (
                rows["title"].astype(str).str.strip().ne("")
                & rows["content"].astype(str).str.strip().ne("")
            )
            rows = rows[has_text]

        # Build a fresh frame instead of assigning columns onto a slice;
        # casting the title to str keeps the concatenation valid for
        # non-string titles (e.g. purely numeric ones).
        yield pd.DataFrame(
            {
                "prompt": "Pergunta: Descreva o produto | Contexto: "
                + rows["title"].astype(str),
                "response": rows["content"],
            }
        )

In [17]:
# Importação e Preparação do Dataset
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the training file on the mounted Drive
file_path = "/content/drive/MyDrive/fiap/tech-challenge-03/trn.json"

# Materialise every formatted chunk (note: all chunks are held in memory at once)
processed_data = list(process_json_in_chunks(file_path))

# Stitch the chunks together into one DataFrame with a fresh 0..n-1 index
final_df = pd.concat(processed_data, ignore_index=True)



NameError: name 'final_df' is not defined

In [18]:
# Split into training and validation sets
train_data, val_data = train_test_split(
    final_df,
    test_size=0.1,    # hold out 10% of the rows for validation
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Tokenização e Configuração do Modelo
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base checkpoint to fine-tune
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 ships without a padding token; reuse EOS so that padding="max_length"
# in the tokenizer call works.
tokenizer.pad_token = tokenizer.eos_token
# Keep the model config in sync with the tokenizer so training and generate()
# agree on which id is padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize one prompt/response pair for causal language modelling
def tokenize_function(example, max_length=128):
    """
    Encode a single example as one causal-LM training sequence.

    GPT-2 is a decoder-only model: it learns by predicting the next token of
    the same sequence it receives as input. The prompt and the response are
    therefore concatenated into one text (terminated by EOS) and the labels
    are a copy of ``input_ids``. Using ``text_target`` (the seq2seq pattern)
    would instead produce labels that are not aligned with the inputs.

    Parameters:
        example: Mapping-like row providing "prompt" and "response" strings.
        max_length (int): Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with "input_ids", "attention_mask" and "labels".
        Label positions that correspond to padding are set to -100 so the
        loss ignores them.
    """
    text = f'{example["prompt"]}\n{example["response"]}{tokenizer.eos_token}'
    encoded = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # The pad token is the EOS token, so padding is identified through the
    # attention mask rather than the token id; this keeps the real EOS as a
    # training target while masking the padding.
    encoded["labels"] = [
        token_id if keep == 1 else -100
        for token_id, keep in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Tokenize both splits into plain lists of encodings.
# A list is indexed by position, which is how Trainer's DataLoader fetches
# items. The Series returned by DataFrame.apply(axis=1) keeps the shuffled row
# labels left by train_test_split, so dataset[i] would look up a label that
# may not exist (KeyError) instead of the i-th example.
train_encodings = [tokenize_function(row) for row in train_data.to_dict("records")]
val_encodings = [tokenize_function(row) for row in val_data.to_dict("records")]

In [None]:
# Configuração e Execução do Treinamento
from transformers import Trainer, TrainingArguments

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The install cell does not pin a version, so pick whichever keyword
# the installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=500,
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once at the end of every epoch
)

# Build the Trainer around the model and the tokenized splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=val_encodings,
)

# Fine-tuning
trainer.train()

In [None]:
# Avaliação e Geração de Respostas
def generate_response(question, title, max_new_tokens=50):
    """
    Generate a model answer for a question about a product title.

    Parameters:
        question (str): Question text inserted after "Pergunta:".
        title (str): Product title inserted after "Contexto:".
        max_new_tokens (int): Maximum number of tokens to generate on top of
            the prompt. Unlike ``max_length``, this budget does not shrink
            as the prompt gets longer.

    Returns:
        str: The decoded sequence (prompt followed by the generated text),
        with special tokens removed.
    """
    input_text = f"Pergunta: {question} | Contexto: {title}"
    # Move the encoded prompt to the model's device: after training on a GPU
    # the model lives there, and CPU inputs would raise a device mismatch.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,  # passes input_ids together with the attention mask
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no dedicated pad id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test of the fine-tuned model on a single example
question = "Qual é a descrição do produto?"
title = "Laptop Dell Inspiron 15"
print(generate_response(question=question, title=title))