# Geração de embeddings e armazenamento em um banco vetorial

In [1]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

In [2]:
# Configuration: every value a reader might want to tune lives in this cell.

# Input / output locations (relative to the notebook's working directory).
pdf_folder = "pdf_ppg"
output_folder = "vectorstore"

# Name of the embedding model handed to HuggingFaceEmbeddings.
model_name = "BAAI/bge-m3"

# Text-splitting parameters passed to RecursiveCharacterTextSplitter.
chunk_size = 1_000
chunk_overlap = 200

## Cria a pasta de saída se não existir

In [3]:
# Create the output directory up front; exist_ok=True makes this a no-op
# when the directory already exists, so the cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)

## Embedding

In [4]:
# Instantiate the embedding wrapper for the model selected in the configuration cell.
# The same `embeddings` object is reused below to build and to reload the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name=model_name)


  from tqdm.autonotebook import tqdm, trange


In [10]:
# Quick check of the embedding model: embed one short sentence.
input_test = "Um teste de embedding"
result = embeddings.embed_query(input_test)
# Bare last expression so the notebook displays the returned embedding values.
result

[-0.02490304782986641,
 -0.0032635200768709183,
 -0.036549534648656845,
 0.010908761993050575,
 -0.006346122361719608,
 -0.01504512969404459,
 0.03247008100152016,
 0.017564868554472923,
 0.013421802781522274,
 0.0029437681660056114,
 0.0029186538886278868,
 -0.02915957011282444,
 0.01816558837890625,
 -0.006006191018968821,
 0.007251410745084286,
 -0.009097369387745857,
 0.0024718865752220154,
 -0.0006890571676194668,
 0.00029019167413935065,
 -0.03151866793632507,
 -0.028475850820541382,
 -0.04494099318981171,
 0.03219576179981232,
 -0.0011045554419979453,
 0.015106668695807457,
 0.03537403419613838,
 -0.021542703732848167,
 -0.006963635794818401,
 0.007273894269019365,
 0.0164616871625185,
 0.02052968367934227,
 0.005377693567425013,
 0.017369689419865608,
 -0.04840395599603653,
 -0.03653370216488838,
 -0.014139838516712189,
 0.021234503015875816,
 -0.022698242217302322,
 -0.033381011337041855,
 0.0032316986471414566,
 0.01238334272056818,
 0.01667656935751438,
 -0.00030855819932185

## Carrega todos os PDFs da pasta e armazena as páginas em uma única lista


In [5]:
# Accumulates the documents returned by the loader for every PDF in `pdf_folder`.
all_docs = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# order in which files are loaded (and later indexed) reproducible across runs.
for filename in sorted(os.listdir(pdf_folder)):
    # Case-insensitive extension check so files named "*.PDF" are not silently skipped.
    if not filename.lower().endswith('.pdf'):
        continue

    print(f"Processando: {filename}")

    loader = PyPDFLoader(os.path.join(pdf_folder, filename))
    docs = loader.load()

    all_docs.extend(docs)

# Stop here with a clear message instead of passing an empty list to the later cells.
if not all_docs:
    raise FileNotFoundError(f"Nenhum documento PDF carregado a partir de: {pdf_folder}")

Processando: 2017 - A new shortrecorded photoplethysmogram dataset for blood pressure monitoring in China.pdf
Processando: 2020 - A_Noninvasive_Blood_Glucose_Monitoring_System_Based_on_Smartphone_PPG_Signal_Processing_and_Machine_Learning.pdf
Processando: 2023 - Intensive care photoplethysmogram datasets and machine-learning for blood.pdf
Processando: 2023 - MACHINE LEARNING-BASED DIABETES DETECTION USING.pdf
Processando: 2024 - Non-Invasive_Blood_Glucose_Estimation_Based_on_Machine_Learning_Algorithms_Using_PPG_Signals.pdf
Processando: 2024 - Non-Invasive_Continuous_Real-Time_Blood_Glucose_Estimation_Using_PPG_Features-based_Convolutional_Autoencoder_with_TinyML_Implementation.pdf
Processando: 2024 - Non-Invasive_Glucose_Measurement_Technologies_Recent_Advancements_and_Future_Challenges.pdf
Processando: 2024 - PPG-Based_Feature_Extraction_for_Type_II_Diabetes_Prediction_A_Machine_Learning_Approach.pdf


## Divide todos os documentos em chunks

In [6]:

# Break every loaded document into overlapping chunks, using the sizes
# defined in the configuration cell.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
splits = text_splitter.split_documents(all_docs)

## Cria embeddings e armazena no FAISS

In [7]:

# Embed every chunk with `embeddings` and build the FAISS vector store from them.
vectorstore = FAISS.from_documents(
    documents=splits,
    embedding=embeddings,
)

In [8]:
# Read back the vector stored at position 0 of the underlying FAISS index,
# as a quick check that the index was populated.
vectorstore.index.reconstruct(0)

array([-0.04739801, -0.04351531, -0.06772435, ...,  0.01635654,
       -0.0265494 ,  0.00369587], dtype=float32)

## Salva o vectorstore

In [9]:

# Persist the index under <output_folder>/db_faiss_all_pdfs so it can be reloaded later.
vectorstore.save_local(
    folder_path=os.path.join(output_folder, "db_faiss_all_pdfs"),
)

print("Processamento concluído e convertidos em embeddings e salvos no FAISS.")

Processamento concluído e convertidos em embeddings e salvos no FAISS.


## Avalia o carregamento do vectorstore existente

In [11]:
print("Carregando o banco de dados FAISS existente...")
# Build the path from `output_folder` (same expression used when saving) instead of
# hard-coding it, so changing the configuration cell cannot leave this path stale.
existing_db_path = os.path.join(output_folder, "db_faiss_all_pdfs")

# SECURITY: allow_dangerous_deserialization=True permits loading pickled data from
# this folder. Only load index folders written by this notebook or another trusted source.
vectorstore = FAISS.load_local(existing_db_path, embeddings, allow_dangerous_deserialization=True)

Carregando o banco de dados FAISS existente...


## Processa cada novo arquivo PDF na pasta

In [None]:

# The splitter does not depend on the file being processed, so build it once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Collect the source paths already present in the loaded index, so that PDFs indexed
# earlier are skipped and re-running this cell does not insert the same chunks twice.
# NOTE(review): relies on each stored document carrying the loader's file path under
# metadata["source"] — confirm for the PyPDFLoader version in use.
indexed_sources = set()
for doc_id in vectorstore.index_to_docstore_id.values():
    stored = vectorstore.docstore.search(doc_id)
    # getattr guards against a non-document result coming back for an id.
    source = getattr(stored, "metadata", {}).get("source")
    if source:
        indexed_sources.add(os.path.normpath(source))

# sorted(): os.listdir returns entries in arbitrary order; sort for a reproducible run.
for filename in sorted(os.listdir(pdf_folder)):
    # Case-insensitive extension check so "*.PDF" files are not silently skipped.
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    if os.path.normpath(pdf_path) in indexed_sources:
        print(f"PDF já indexado, ignorando: {filename}")
        continue

    print(f"Processando novo PDF: {filename}")

    # Load the PDF.
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # Split the text into chunks.
    splits = text_splitter.split_documents(docs)

    # Nothing to embed for this file (e.g. no extractable text), so skip the add.
    if not splits:
        print(f"Nenhum texto extraído de: {filename}")
        continue

    # Append the new chunks to the already-loaded vector store.
    vectorstore.add_documents(splits)

    print(f"Embeddings adicionados para: {filename}")

## Salva o vectorstore atualizado

In [None]:

# Write the (possibly extended) index to its own folder, leaving the
# original "db_faiss_all_pdfs" folder untouched.
updated_db_path = os.path.join(output_folder, "db_faiss_updated")
vectorstore.save_local(folder_path=updated_db_path)

print(f"Processamento concluído. Banco de dados FAISS atualizado e salvo em: {updated_db_path}")