In [48]:
!pip install langchain langchain_community beautifulsoup4 chromadb sentence-transformers
!pip install -U lxml

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [41]:
import os
import tempfile
import time
from pathlib import Path
from shutil import rmtree
from typing import Any, List, Sequence

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, BSHTMLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma

In [42]:
def load_documents_from_directory(
    path: str = "./data/python-3.13-docs-html",
    glob: str = "**/*.html",
    use_bshtml_loader: bool = True,
    show_progress: bool = True,
    use_multithreading: bool = True,
    get_text_separator: str = " "
) -> List[Any]:
    """
    Load the files under a directory through LangChain's `DirectoryLoader`.

    Parameters
    ----------
    path : str
        Directory handed to `DirectoryLoader` (e.g. "./data/python-3.13-docs-html").
    glob : str
        File pattern handed to `DirectoryLoader` (default: "**/*.html").
    use_bshtml_loader : bool
        If True, `BSHTMLLoader` is passed as `loader_cls`; otherwise
        `DirectoryLoader` is left to use its own default loader class.
    show_progress : bool
        Forwarded to `DirectoryLoader` unchanged.
    use_multithreading : bool
        Forwarded to `DirectoryLoader` unchanged.
    get_text_separator : str
        Passed to `BSHTMLLoader` through `loader_kwargs`. Ignored when
        `use_bshtml_loader` is False.

    Returns
    -------
    List[Any]
        The value returned by `DirectoryLoader.load()`.

    Notes
    -----
    Prints a one-line summary with the number of loaded documents and `path`.
    """
    # Options common to both loader configurations.
    loader_options = {
        "path": path,
        "glob": glob,
        "show_progress": show_progress,
        "use_multithreading": use_multithreading,
    }
    # The HTML-specific loader and its settings are only added on request.
    if use_bshtml_loader:
        loader_options["loader_cls"] = BSHTMLLoader
        loader_options["loader_kwargs"] = {'get_text_separator': get_text_separator}

    loaded = DirectoryLoader(**loader_options).load()
    print(f"üìÇ {len(loaded)} documentos carregados de '{path}'")
    return loaded


In [43]:
def split_documents(
    documents: Sequence[Any],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    add_start_index: bool = True
) -> List[Any]:
    """
    Split documents into chunks with `RecursiveCharacterTextSplitter`.

    Parameters
    ----------
    documents : Sequence[Document]
        Documents to split; copied into a list before being handed to the splitter.
    chunk_size : int, optional (default: 1000)
        Forwarded to the splitter as `chunk_size`.
    chunk_overlap : int, optional (default: 200)
        Forwarded to the splitter as `chunk_overlap`.
    add_start_index : bool, optional (default: True)
        Forwarded to the splitter as `add_start_index`.

    Returns
    -------
    List[Document]
        The value returned by the splitter's `split_documents`.

    Notes
    -----
    Prints a progress line before splitting and a summary line afterwards.
    """
    # --- PHASE 2: PROCESSING AND INDEXING ---
    print("2. Dividindo os textos em peda√ßos (chunks)...")
    splitter_settings = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "add_start_index": add_start_index,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)
    pieces = splitter.split_documents(list(documents))
    print(f"‚úÇÔ∏è {len(pieces)} chunks criados (tamanho={chunk_size}, overlap={chunk_overlap}, add_start_index={add_start_index})")
    return pieces

In [44]:
def ensure_writable_directory(path: str) -> str:
    """
    Make sure `path` exists as a directory and that files can be created in it.

    - Creates the directory (and missing parents) if it does not exist.
    - Checks write permission by creating a uniquely named probe file inside it;
      the probe is removed automatically when it is closed.

    A unique probe name is used so that an existing file in the directory is
    never overwritten or deleted by the check, and no probe is left behind when
    the write itself fails.

    Parameters
    ----------
    path : str
        Directory to check (used in this notebook as the Chroma persist directory).

    Returns
    -------
    str
        The same `path`, unchanged, when the check succeeds.

    Raises
    ------
    OSError
        If the probe file cannot be created, written, or removed. The original
        error is chained as the cause. Errors from creating the directory itself
        propagate unchanged.
    """
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)

    try:
        # NamedTemporaryFile picks an unused name and deletes the file on close.
        with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", dir=p, prefix=".write_test_"
        ) as probe:
            probe.write("ok")
            # Force the data out so a failing write surfaces inside this try block.
            probe.flush()
    except OSError as e:
        raise OSError(
            f"Diret√≥rio '{path}' n√£o √© grav√°vel. "
            "Escolha outro caminho (ex.: './chroma_db_*') ou ajuste permiss√µes."
        ) from e

    return path

In [45]:
def build_hf_embeddings(
    model_name: str = "intfloat/multilingual-e5-small",
    device: str = "cpu",
    normalize_embeddings: bool = True,
    **kwargs: Any,
) -> HuggingFaceEmbeddings:
    """
    Build a `HuggingFaceEmbeddings` instance for vectorising text.

    Parameters
    ----------
    model_name : str, optional (default: "intfloat/multilingual-e5-small")
        Forwarded to `HuggingFaceEmbeddings` as `model_name`.
    device : str, optional (default: "cpu")
        Used to build the default `model_kwargs` (`{"device": device}`).
        Not applied when the caller supplies `model_kwargs` explicitly.
    normalize_embeddings : bool, optional (default: True)
        Used to build the default `encode_kwargs`
        (`{"normalize_embeddings": normalize_embeddings}`).
        Not applied when the caller supplies `encode_kwargs` explicitly.
    **kwargs : Any
        Extra keyword arguments forwarded to `HuggingFaceEmbeddings`. A
        `model_kwargs` or `encode_kwargs` entry here replaces the corresponding
        default wholesale.

    Returns
    -------
    HuggingFaceEmbeddings
        The configured embeddings wrapper.
    """
    # Fill in defaults only for the options the caller did not provide.
    kwargs.setdefault("model_kwargs", {"device": device})
    kwargs.setdefault("encode_kwargs", {"normalize_embeddings": normalize_embeddings})
    return HuggingFaceEmbeddings(model_name=model_name, **kwargs)


In [46]:
def build_and_persist_chroma_index(
    documents: Sequence[Any],
    embeddings: HuggingFaceEmbeddings,
    persist_directory: str,
) -> Chroma:
    """
    Build a Chroma vector store from `documents` via `Chroma.from_documents`.

    Parameters
    ----------
    documents : Sequence[Document]
        Documents (ideally already chunked) to index; copied into a list first.
    embeddings : HuggingFaceEmbeddings
        Embeddings object passed to Chroma as `embedding`.
    persist_directory : str
        Directory passed to Chroma as `persist_directory`.

    Returns
    -------
    Chroma
        The vector store returned by `Chroma.from_documents`.

    Notes
    -----
    This function does not write files itself; writing under
    `persist_directory` is left entirely to `Chroma.from_documents`.
    """
    chunk_list = list(documents)
    return Chroma.from_documents(
        documents=chunk_list,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

In [47]:
# Pipeline configuration: source HTML folder, file pattern, and index location.
INPUT_DIR = "./data/python-3.13-docs-html"
GLOB = "**/*.html"
PERSIST_DIR = "./chroma_db_python_iniciante"

# Check the index directory first so a permission problem surfaces before
# any documents are loaded or embedded.
ensure_writable_directory(PERSIST_DIR)

# Step 1: load the HTML files as documents.
docs = load_documents_from_directory(
    path=INPUT_DIR,
    glob=GLOB,
    use_bshtml_loader=True,
    show_progress=True,
    use_multithreading=True,
    get_text_separator=" "
)

# Step 2: split the loaded documents into overlapping chunks.
chunks = split_documents(
    documents=docs,
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Step 3: create the embeddings wrapper (CPU, normalised vectors).
emb = build_hf_embeddings(
    model_name="intfloat/multilingual-e5-small",
    device="cpu",
    normalize_embeddings=True
)

# Step 4: build the Chroma vector store from the chunks.
vs = build_and_persist_chroma_index(
    documents=chunks,
    embeddings=emb,
    persist_directory=PERSIST_DIR
)

# Models used for testing: BAAI/bge-base-en-v1.5, llama3.2:3b-instruct-q4_K_M, qwen2.5:1.5b-instruct-q4_K_M, phi3:mini and intfloat/multilingual-e5-small

 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 163/164 [00:25<00:00,  6.33it/s]


üìÇ 163 documentos carregados de './data/python-3.13-docs-html'
2. Dividindo os textos em peda√ßos (chunks)...
‚úÇÔ∏è 9293 chunks criados (tamanho=1000, overlap=200, add_start_index=True)
