In [22]:
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from docling.document_converter import DocumentConverter

# ==============================================================================
# CONFIGURAÇÃO DO OLLAMA
# ==============================================================================
# Chat model reached through the Ollama server at localhost:11434.
llm = ChatOllama(
    model="gemma3:12b",
    base_url="http://localhost:11434",
    num_ctx=8192,      # larger context so more chunks can be processed
    num_predict=1024,  # more tokens so answers come out complete
    temperature=0.3,   # raised for less robotic answers
)

# Embedding model, reached through the same Ollama endpoint.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434",
)

# ==============================================================================
# CARREGAR PDF COM DOCLING (MELHORADO)
# ==============================================================================
def load_pdf(pdf_path, chunk_size=1500, chunk_overlap=300):
    """Convert a PDF to markdown with Docling and split it into chunks.

    Args:
        pdf_path: Location of the PDF; passed unchanged to
            ``DocumentConverter.convert``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 1500.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Defaults to 300.

    Returns:
        The list returned by ``RecursiveCharacterTextSplitter.split_documents``;
        every chunk carries ``{"source": str(pdf_path)}`` as metadata.
    """
    print(f"üìÑ Carregando: {pdf_path}")

    # Convert the PDF into a single markdown string.
    converter = DocumentConverter()
    result = converter.convert(pdf_path)
    markdown = result.document.export_to_markdown()

    # Keep the source as a plain string so the metadata is a simple scalar
    # even when the caller passes a pathlib.Path.
    # NOTE(review): vector stores typically reject non-scalar metadata - confirm.
    doc = Document(page_content=markdown, metadata={"source": str(pdf_path)})

    # Split on markdown headings first, then paragraphs, lines, sentences,
    # words and finally single characters.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n## ", "\n### ", "\n\n", "\n", ". ", " ", ""],
        length_function=len,
    )
    chunks = splitter.split_documents([doc])

    print(f"‚úÖ {len(chunks)} chunks criados")
    return chunks

# ==============================================================================
# CRIAR VECTORSTORE
# ==============================================================================
def create_vectorstore(chunks):
    """Build a Chroma vector store from ``chunks``.

    The documents are embedded with the module-level ``embeddings`` object.
    Progress messages are printed before and after the store is built, and
    the object returned by ``Chroma.from_documents`` is handed back.
    """
    print("üîÆ Criando vectorstore...")
    store = Chroma.from_documents(chunks, embeddings)
    print("‚úÖ Vectorstore pronto!")
    return store

# ==============================================================================
# FAZER PERGUNTA COM PROMPT OTIMIZADO
# ==============================================================================
def _similarity_percent(score):
    """Convert a retrieval score into a similarity percentage, floored at 0."""
    return max(0, (1 - score / 2) * 100)


def _quality_marker(similarity):
    """Pick the marker printed next to a chunk for a similarity percentage."""
    if similarity > 70:
        return "üü¢"
    if similarity > 50:
        return "üü°"
    return "üî¥"


def _build_prompt(context, query):
    """Fill the answer-only-from-excerpts prompt with the context and question."""
    return f"""Voc√™ √© um assistente especializado em an√°lise de documentos. Use APENAS as informa√ß√µes dos trechos fornecidos para responder.

TRECHOS DO DOCUMENTO:
{context}

INSTRU√á√ïES:
- Responda de forma clara, completa e estruturada
- Cite os trechos espec√≠ficos quando relevante (ex: "Conforme o Trecho 2...")
- Se a informa√ß√£o n√£o estiver nos trechos, diga claramente que n√£o encontrou
- Use exemplos e detalhes dos trechos quando dispon√≠veis
- Organize a resposta em par√°grafos ou t√≥picos quando apropriado

PERGUNTA: {query}

RESPOSTA:"""


def ask(vectorstore, query: str, k: int = 5, score_threshold: float = 0.5) -> dict:
    """Retrieve chunks for ``query``, ask the LLM to answer from them, and print a report.

    Args:
        vectorstore: Object exposing ``similarity_search_with_score(query, k=...)``
            that returns ``(document, score)`` pairs.
        query: The question to answer.
        k: Number of chunks to retrieve (default 5).
        score_threshold: Similarity, on a 0-1 scale, that a chunk must exceed
            to be used (default 0.5, i.e. 50%).

    Returns:
        A dict with ``"answer"`` (the LLM text, or a fixed notice when nothing
        was retrieved), ``"chunks"`` (the ``(document, score)`` pairs used) and
        ``"num_chunks_used"``.
    """
    print(f"\nüîç {query}\n")

    docs_scores = vectorstore.similarity_search_with_score(query, k=k)

    # Nothing came back at all: skip the LLM call rather than prompting it
    # with an empty context.
    if not docs_scores:
        answer = "Nenhum trecho foi recuperado do vectorstore."
        print(answer)
        return {"answer": answer, "chunks": [], "num_chunks_used": 0}

    # A lower score means a closer match. Similarity is computed as
    # 1 - score/2, so "similarity > score_threshold" is the same test as
    # "score < 2 - 2*score_threshold".
    # NOTE(review): assumes the store returns a distance in roughly [0, 2];
    # confirm against the configured distance metric.
    max_score = 2 - score_threshold * 2
    filtered_docs = [(doc, score) for doc, score in docs_scores if score < max_score]

    if not filtered_docs:
        print("‚ö†Ô∏è Nenhum trecho relevante encontrado. Tentando com threshold mais baixo...")
        # Fall back to the first three results (presumably the best ranked)
        # regardless of their score.
        filtered_docs = docs_scores[:3]

    # Number the excerpts so the answer can cite them ("Trecho 2", ...).
    context = "\n\n".join(
        f"[Trecho {i}]\n{doc.page_content}"
        for i, (doc, _) in enumerate(filtered_docs, 1)
    )

    response = llm.invoke(_build_prompt(context, query)).content

    print("üí¨ Resposta:")
    print("=" * 80)
    print(response)
    print("=" * 80)

    # List every chunk that was used, with its similarity and full text on one line.
    print(f"\nüìö {len(filtered_docs)} Chunks utilizados:\n")
    for i, (doc, score) in enumerate(filtered_docs, 1):
        similarity = _similarity_percent(score)
        preview = doc.page_content.replace('\n', ' ')

        print(f"{_quality_marker(similarity)} Chunk {i}: {similarity:.1f}% similar")
        print(f"   {preview}...\n")

    return {
        "answer": response,
        "chunks": filtered_docs,
        "num_chunks_used": len(filtered_docs),
    }

# ==============================================================================
# FUNÇÃO DE DIAGNÓSTICO
# ==============================================================================
def diagnostico(vectorstore, query, k=10):
    """Print the top ``k`` retrieved chunks for ``query`` without calling the LLM.

    For each ``(document, score)`` pair returned by
    ``vectorstore.similarity_search_with_score`` this prints a quality marker,
    the similarity percentage, the raw score, the first 300 characters of the
    chunk, and a separator line. Returns ``None``.
    """
    print(f"\nüî¨ DIAGN√ìSTICO: '{query}'\n")

    separator = "-" * 80
    hits = vectorstore.similarity_search_with_score(query, k=k)

    rank = 0
    for document, distance in hits:
        rank += 1

        # Similarity is 1 - score/2 expressed as a percentage, never below 0.
        percent = max(0, (1 - distance / 2) * 100)

        if percent > 70:
            marker = "üü¢"
        elif percent > 50:
            marker = "üü°"
        else:
            marker = "üî¥"

        excerpt = document.page_content[:300]
        print(f"{marker} Chunk {rank}: {percent:.1f}% (score: {distance:.4f})")
        print(f"Conte√∫do: {excerpt}...")
        print(separator)

# ==============================================================================
# SETUP (EXECUTE ESTA CÉLULA UMA VEZ)
# ==============================================================================
pdf_path = "datasets/tattoo.pdf"  # <- change this to point at your PDF

# Load the PDF into chunks, then index them; later cells use both names.
chunks = load_pdf(pdf_path)
vectorstore = create_vectorstore(chunks)

# Usage banner, emitted with a single print call (one argument per line).
print(
    "\n" + "=" * 80,
    "üöÄ RAG OTIMIZADO PRONTO!",
    "=" * 80,
    "\nUso b√°sico:",
    "  ask(vectorstore, 'sua pergunta')",
    "\nUso avan√ßado:",
    "  ask(vectorstore, 'sua pergunta', k=8, score_threshold=0.6)",
    "\nDiagn√≥stico:",
    "  diagnostico(vectorstore, 'sua pergunta')",
    "=" * 80,
    sep="\n",
)

2026-01-11 21:50:23,473 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-11 21:50:23,498 - INFO - Going to convert document batch...
2026-01-11 21:50:23,499 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-11 21:50:23,500 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2026-01-11 21:50:23,536 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-01-11 21:50:23,552 [RapidOCR] download_file.py:60: File exists and is valid: /home/clayton/miniconda3/envs/ml311/lib/python3.11/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-01-11 21:50:23,553 [RapidOCR] main.py:53: Using /home/clayton/miniconda3/envs/ml311/lib/python3.11/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-01-11 21:50:23,641 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-01-11 21:50:23,643 [RapidOCR] download_file.py:60: File exists and is vali

üìÑ Carregando: datasets/tattoo.pdf


[32m[INFO] 2026-01-11 21:50:23,726 [RapidOCR] download_file.py:60: File exists and is valid: /home/clayton/miniconda3/envs/ml311/lib/python3.11/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2026-01-11 21:50:23,727 [RapidOCR] main.py:53: Using /home/clayton/miniconda3/envs/ml311/lib/python3.11/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.onnx[0m
2026-01-11 21:50:23,829 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2026-01-11 21:50:23,829 - INFO - Accelerator device: 'cuda:0'
2026-01-11 21:50:24,647 - INFO - Accelerator device: 'cuda:0'
2026-01-11 21:50:25,125 - INFO - Processing document tattoo.pdf
2026-01-11 21:50:54,877 - INFO - Finished converting document tattoo.pdf in 31.41 sec.


‚úÖ 109 chunks criados
üîÆ Criando vectorstore...


2026-01-11 21:51:03,250 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"


‚úÖ Vectorstore pronto!

üöÄ RAG OTIMIZADO PRONTO!

Uso b√°sico:
  ask(vectorstore, 'sua pergunta')

Uso avan√ßado:
  ask(vectorstore, 'sua pergunta', k=8, score_threshold=0.6)

Diagn√≥stico:
  diagnostico(vectorstore, 'sua pergunta')


In [24]:
# 1. Check what was extracted
print(f"Total de chunks: {len(chunks)}")
print("\nPrimeiro chunk (primeiros 500 chars):")
# Cap the preview at 500 characters so the output matches the label above.
print(chunks[0].page_content[:500])
print("\n" + "=" * 80)

# 2. Check chunk sizes (in characters)
tamanhos = [len(c.page_content) for c in chunks]
print("\nEstat√≠sticas dos chunks:")
print(f"  M√©dia: {sum(tamanhos)/len(tamanhos):.0f} caracteres")
print(f"  Menor: {min(tamanhos)} caracteres")
print(f"  Maior: {max(tamanhos)} caracteres")
# Chunks shorter than 50 characters are the ones reported as "vazios".
print(f"  Chunks vazios: {sum(1 for t in tamanhos if t < 50)}")

Total de chunks: 109

Primeiro chunk (primeiros 500 chars):
This article has been accepted for publication in IEEE Access. This is the author's version which has not been fully edited and

Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.

Digital Object Identifier 10.1109/ACCESS.2017.DOI


Estat√≠sticas dos chunks:
  M√©dia: 1043 caracteres
  Menor: 102 caracteres
  Maior: 1498 caracteres
  Chunks vazios: 0


In [28]:
# Query the pipeline with the default k and score_threshold. The call is the
# cell's last expression, so the returned dict is displayed after the report.
ask(vectorstore, "show the content of TABLE 2. Tattoo segmentation models.")

2026-01-11 21:55:57,490 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"



üîç show the content of TABLE 2. Tattoo segmentation models.



2026-01-11 21:57:07,501 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


üí¨ Resposta:
Conforme solicitado, apresento o conte√∫do da TABLE 2. Tattoo segmentation models, extra√≠da dos trechos fornecidos:

**TABLE 2. Tattoo segmentation models.**

| Ref.   | Network       | Semantic?   | Open-Set?   |
|--------|---------------|-------------|-------------|
| [9]    | ConvNet       | No          | No          |
| [31]   | AlexNet + VGG | No          | No          |
| [13]   | ViT-based     | No          | No          |
| Ours   | SegFormer     | Yes         | Yes         |

**Observa√ß√µes:**

*   A tabela lista quatro modelos de segmenta√ß√£o de tatuagens: [9], [31], [13] e "Ours" (que se refere ao modelo apresentado no documento). (Trecho 5)
*   Para cada modelo, a tabela indica a arquitetura da rede neural utilizada ("Network"), se o modelo realiza segmenta√ß√£o sem√¢ntica ("Semantic?") e se √© um modelo de "Open-Set?". (Trecho 5)
*   O modelo "Ours" (do pr√≥prio trabalho) utiliza a arquitetura SegFormer e √© tanto um modelo de segmenta√ß√£o sem√¢ntica qua

{'answer': 'Conforme solicitado, apresento o conte√∫do da TABLE 2. Tattoo segmentation models, extra√≠da dos trechos fornecidos:\n\n**TABLE 2. Tattoo segmentation models.**\n\n| Ref.   | Network       | Semantic?   | Open-Set?   |\n|--------|---------------|-------------|-------------|\n| [9]    | ConvNet       | No          | No          |\n| [31]   | AlexNet + VGG | No          | No          |\n| [13]   | ViT-based     | No          | No          |\n| Ours   | SegFormer     | Yes         | Yes         |\n\n**Observa√ß√µes:**\n\n*   A tabela lista quatro modelos de segmenta√ß√£o de tatuagens: [9], [31], [13] e "Ours" (que se refere ao modelo apresentado no documento). (Trecho 5)\n*   Para cada modelo, a tabela indica a arquitetura da rede neural utilizada ("Network"), se o modelo realiza segmenta√ß√£o sem√¢ntica ("Semantic?") e se √© um modelo de "Open-Set?". (Trecho 5)\n*   O modelo "Ours" (do pr√≥prio trabalho) utiliza a arquitetura SegFormer e √© tanto um modelo de segmenta√ß√£o se

In [26]:
# Dump every chunk (content and metadata), each followed by blank lines.
for chunk in chunks:
    print(chunk)
    print("\n\n\n")

page_content='This article has been accepted for publication in IEEE Access. This is the author's version which has not been fully edited and

Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.

Digital Object Identifier 10.1109/ACCESS.2017.DOI' metadata={'source': 'datasets/tattoo.pdf'}




page_content='## Open-Set Tattoo Semantic Segmentation

ANDERSON BRILHADOR 1 , RODRIGO TCHALSKI DA SILVA 1 , CARLOS ROBERTO MODINEZ-JUNIOR 1 , GABRIEL DE ALMEIDA SPADAFORA 1 , HEITOR SILV√âRIO LOPES 1 , AND ANDR√â EUG√äNIO LAZZARETTI 1 , (Member, IEEE).

1 Federal University of Technology - Paran√°, Av. Sete de Setembro, 3165, Curitiba, 80230-901, Paran√°, Brazil.

Corresponding author: Anderson Brilhador (e-mail: andersonbrilhador@gmail.com).' metadata={'source': 'datasets/tattoo.pdf'}




page_content='ABSTRACT Tattoos can serve as an essential source of biometric information for public security, aiding in identifying suspects and victims. In order to automate tattoo classi