# Gerador de Embedding e salvar em um banco vetorial

In [1]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

In [2]:
# Configurações
pdf_folder = 'pdf'
output_folder = 'vectorstore'
chunk_size = 1000
chunk_overlap = 200
model_name = "BAAI/bge-m3"

## Cria a pasta de saída se não existir

In [3]:
os.makedirs(output_folder, exist_ok=True)

## Embedding

In [4]:
embeddings = HuggingFaceEmbeddings(model_name = model_name)


  from tqdm.autonotebook import tqdm, trange


In [5]:
input_test = "Um teste de embedding"
result = embeddings.embed_query(input_test)
result

[-0.024902960285544395,
 -0.0032636155374348164,
 -0.03654949739575386,
 0.010908588767051697,
 -0.006346181500703096,
 -0.015045144595205784,
 0.032469943165779114,
 0.01756499707698822,
 0.01342172920703888,
 0.002943760482594371,
 0.002918571000918746,
 -0.02915963903069496,
 0.01816560886800289,
 -0.0060061318799853325,
 0.007251395843923092,
 -0.009097201749682426,
 0.002471750136464834,
 -0.0006890395889058709,
 0.00029020095826126635,
 -0.03151862323284149,
 -0.028475789353251457,
 -0.04494095966219902,
 0.03219582885503769,
 -0.0011045010760426521,
 0.015106625854969025,
 0.03537397459149361,
 -0.021542729809880257,
 -0.006963697262108326,
 0.0072737629525363445,
 0.016461744904518127,
 0.020529672503471375,
 0.00537773035466671,
 0.017369721084833145,
 -0.04840398207306862,
 -0.03653369098901749,
 -0.014139743521809578,
 0.02123447321355343,
 -0.022698258981108665,
 -0.033381037414073944,
 0.0032317503355443478,
 0.012383311055600643,
 0.016676444560289383,
 -0.000308634247630

## Inicializa uma lista para armazenar todos os documentos


In [6]:
all_docs = []

for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        print(f"Processando: {filename}")
        
       
        loader = PyPDFLoader(os.path.join(pdf_folder, filename))
        docs = loader.load()
        
       
        all_docs.extend(docs)

Processando: Processo_0000746-09.2022.5.13.0033.pdf


## Divide todos os documentos em chunks

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
splits = text_splitter.split_documents(all_docs)

## Cria embeddings e armazena no FAISS

In [8]:

vectorstore = FAISS.from_documents(splits, embeddings)

In [10]:
vectorstore.index.reconstruct(0)

array([-0.0265008 ,  0.0441442 ,  0.00557468, ...,  0.00730532,
       -0.00920477,  0.02945846], dtype=float32)

## Salva o vectorstore

In [10]:

vectorstore.save_local(os.path.join(output_folder, "db_faiss_all_pdfs"))   

print("Processamento concluído e convertidos em embeddings e salvos no FAISS.")

Processamento concluído e convertidos em embeddings e salvos no FAISS.


## Avalia o carregamento do vectorstore existente

In [12]:
print("Carregando o banco de dados FAISS existente...")
existing_db_path = 'vectorstore/db_faiss_all_pdfs'

vectorstore = FAISS.load_local(existing_db_path, embeddings, allow_dangerous_deserialization=True)

Carregando o banco de dados FAISS existente...


## Processa cada novo arquivo PDF na pasta

In [None]:

for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        print(f"Processando novo PDF: {filename}")
        
        # Carrega o PDF
        loader = PyPDFLoader(os.path.join(pdf_folder, filename))
        docs = loader.load()
        
        # Divide o texto em chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        splits = text_splitter.split_documents(docs)
        
        # Adiciona os novos documentos ao vectorstore existente
        vectorstore.add_documents(splits)
        
        print(f"Embeddings adicionados para: {filename}")

## Salva o vectorstore atualizado

In [None]:

updated_db_path = os.path.join(output_folder, "db_faiss_updated")
vectorstore.save_local(updated_db_path)

print(f"Processamento concluído. Banco de dados FAISS atualizado e salvo em: {updated_db_path}")