**FIRST PART - PREPROCESSING, EMBEDDING, VECTOR DB**

*IMPORT*

In [32]:
print('Wait for import...')
import os
import glob
import fitz #to import pdf
import re
import chromadb #for vectordb
from chromadb.config import Settings
from FlagEmbedding import BGEM3FlagModel
from langchain_text_splitters import RecursiveCharacterTextSplitter
import ollama
import time
import torch
from tqdm import tqdm
print('Ready!')

Wait for import...
Ready!


*FUNCTIONS*

In [33]:
def device():
    """Choose the torch device name used for the embedding model.

    Returns:
        'mps' when ``torch.backends.mps.is_available()`` is true, otherwise
        'cpu'. The outcome is also printed.
    """
    has_mps = torch.backends.mps.is_available()
    print('mps ready' if has_mps else '=( no mps')
    return 'mps' if has_mps else 'cpu'

def path_extraction(path_manuals):
    """Return the paths of the ``*.pdf`` files located directly in ``path_manuals``.

    Args:
        path_manuals: Directory to scan (not recursive).

    Returns:
        A sorted list of paths (``path_manuals`` joined with each file name);
        empty when nothing matches.
    """
    # glob yields entries in an arbitrary, filesystem-dependent order. Sorting
    # makes the processing order (and anything numbered from it) reproducible.
    return sorted(glob.glob(os.path.join(path_manuals, '*.pdf')))

def cleaning(text):
    """Apply a minimal normalisation to raw extracted text.

    Steps, in order: drop NUL characters, collapse every run of spaces/tabs
    into one space, reduce runs of three or more newlines to two, and strip
    leading/trailing whitespace.
    """
    without_nulls = text.replace('\x00', '')  # NUL removed before storing in the db
    single_spaced = re.sub(r'[ \t]+', ' ', without_nulls)
    compacted = re.sub(r'\n{3,}', '\n\n', single_spaced)
    return compacted.strip()

def chunkin(text, chunk_size=500, chunk_overlap=100):
    """Clean ``text`` and split it into overlapping chunks.

    Args:
        text: Raw text; it is passed through ``cleaning`` first.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        The list of chunk strings produced by the langchain splitter.
    """
    clean_text = cleaning(text)
    # langchain splitter; the separators go from paragraph breaks down to
    # single characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    return text_splitter.split_text(clean_text)

def text_preprocessing(path_manuals):
    """Extract, clean and chunk the text of every PDF found in ``path_manuals``.

    Args:
        path_manuals: Directory containing the PDF manuals.

    Returns:
        A pair ``(all_chunks, all_metadatas)`` of equal-length lists: the chunk
        strings and, for each chunk, a ``{'source': <pdf file name>}`` dict.
    """
    pdf_files = path_extraction(path_manuals)
    all_chunks = []
    all_metadatas = []

    print(f"Processing {len(pdf_files)} files...")

    for pdf_path in tqdm(pdf_files, desc="Lettura PDF"):
        file_name = os.path.basename(pdf_path)
        # The document only has to stay open while its pages are read; join
        # avoids rebuilding a growing string once per page.
        with fitz.open(pdf_path) as doc:
            full_text = ''.join(page.get_text() for page in doc)

        doc_chunks = chunkin(full_text)
        all_chunks.extend(doc_chunks)
        # One separate dict per chunk. Both lists grow by len(doc_chunks) here,
        # so they cannot get out of step and no length check is needed after.
        all_metadatas.extend({'source': file_name} for _ in doc_chunks)

    print(f"Extraction {len(all_chunks)} total chunks from {len(pdf_files)} files COMPLETE.")
    return all_chunks, all_metadatas

def create_vector_db(db_path, collec_name, embedding_model, chunks, metadatas, batch_size=500):
    """Embed ``chunks`` and store them in a persistent Chroma collection.

    Args:
        db_path: Directory of the persistent Chroma database.
        collec_name: Name of the collection to create or reuse.
        embedding_model: Object whose ``encode(list_of_str)`` returns a mapping
            with a ``'dense_vecs'`` entry.
        chunks: Chunk texts to embed and store.
        metadatas: Metadata dicts aligned one-to-one with ``chunks``.
        batch_size: Number of chunks embedded and written per iteration.

    Raises:
        ValueError: If ``chunks`` and ``metadatas`` differ in length, or if
            ``batch_size`` is smaller than 1.
    """
    tot_chunk_n = len(chunks)

    # Fail before any database or model work: misaligned inputs would attach
    # metadata to the wrong chunk, and a negative step makes the range below
    # empty, i.e. nothing stored while still reporting success.
    if tot_chunk_n != len(metadatas):
        raise ValueError(
            f"chunks ({tot_chunk_n}) and metadatas ({len(metadatas)}) must have the same length"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    client = chromadb.PersistentClient(path=db_path)
    # To rebuild the collection from scratch between attempts, call
    # client.delete_collection(collec_name) before the next line.
    collection = client.get_or_create_collection(name=collec_name)

    print(f"Generating embeddings and saving {tot_chunk_n} chunks...")

    # Process chunks in batches to avoid running out of memory.
    for i in range(0, tot_chunk_n, batch_size):

        end_index = min(i + batch_size, tot_chunk_n)

        batch_chunks = chunks[i : end_index]
        batch_metas = metadatas[i : end_index]
        # Ids are positional, so they are unique within one run.
        batch_ids = [f"id_{k}" for k in range(i, end_index)]

        encoded_output = embedding_model.encode(batch_chunks)
        batch_embeddings = encoded_output['dense_vecs']

        # upsert instead of add: re-running against an existing collection
        # refreshes the records stored under these ids rather than keeping
        # stale content. Ids beyond the new total are not removed.
        collection.upsert(
            ids=batch_ids,
            embeddings=batch_embeddings,
            documents=batch_chunks,
            metadatas=batch_metas
        )

    print(f"Done! {len(chunks)} vettors saved into: '{db_path}'.")

**SET VARIABLES**

In [35]:
#PATH
# Both locations can be overridden through environment variables so the
# notebook runs on another machine; the defaults are the original values.
PDF_PATH = os.environ.get('MANUALS_PDF_PATH', 'new_manuals')
DB_PATH = os.environ.get('CHROMA_DB_PATH', "/Users/raffaeleciccarone/Desktop/project_ibm/chroma_db")
COLLECTION_NAME = "manuali_manutenzione_test"
#BATCH SIZE
BATCH_SIZE = 500 #5461 max size for local sqlite
#MAKE CHUNKING
all_chunks, all_metadatas = text_preprocessing(PDF_PATH)
print(f'{len(all_chunks)} ready!')
#DEFINE EMBEDDING MODEL
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True, devices=device())
print("Embedding model: BGE-M3")
print("Ready!")

Processing 7 files...


Lettura PDF: 100%|██████████| 7/7 [00:03<00:00,  2.02it/s]


Extraction 7223 total chunks from 7 files COMPLETE.
7223 ready!
mps ready


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Embedding model: BGE-M3
Ready!


In [36]:
# Pass the configured BATCH_SIZE explicitly; otherwise the constant is never
# used and the function silently falls back to its own default.
create_vector_db(DB_PATH, COLLECTION_NAME, model, all_chunks, all_metadatas, batch_size=BATCH_SIZE)

Generating embeddings and saving 7223 chunks...


pre tokenize: 100%|██████████| 2/2 [00:00<00:00,  6.62it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 2/2 [07:40<00:00, 230.44s/it]
pre tokenize: 100%|██████████| 2/2 [00:00<00:00,  5.82it/s]
Inference Embeddings: 100%|██████████| 2/2 [03:02<00:00, 91.09s/it] 
pre tokenize: 100%|██████████| 2/2 [00:00<00:00, 13.08it/s]
Inference Embeddings: 100%|██████████| 2/2 [14:07<00:00, 423.91s/it]
pre tokenize: 100%|██████████| 2/2 [00:00<00:00,  5.56it/s]
Inference Embeddings: 100%|██████████| 2/2 [20:21<00:00, 610.75s/it]
pre tokenize: 100%|██████████| 2/2 [00:00<00:00,  2.25it/s]
Inference Embeddings: 100%|██████████| 2/2 [21:24<00:00, 642.28s/it] 
pre tokenize: 100%|██████████| 2/2 [00:00<00:00, 16.27it/s]
Inference Embeddings: 100%|██████████| 2/2 [11:48<00:00, 354.0

Done! 7223 vettors saved into: '/Users/raffaeleciccarone/Desktop/project_ibm/chroma_db'.


**TEST INFO | retrieval**

In [39]:
def retrieve_documents(query, embedding_model, collection, n_results=3):
    """
    Look up the chunks closest to ``query`` in the vector DB.

    Prints the query and a short preview of every hit (rank, distance, source
    file, first 200 characters), then returns the raw mapping produced by
    ``collection.query`` so it can be handed to the LLM step.
    """
    print(f"\nUser query: '{query}'")

    # Only the dense vector of the encoded query is used for the lookup.
    dense_query = embedding_model.encode(query)['dense_vecs']

    # query_embeddings expects a list of vectors, hence the one-element list.
    hits = collection.query(
        include=["documents", "metadatas", "distances"],
        n_results=n_results,
        query_embeddings=[dense_query],
    )

    print(f"\n=== TOP {n_results} RESULTS ===")

    # Every field holds one list per query; a single query was sent, so index 0.
    ranked_hits = zip(hits['documents'][0], hits['metadatas'][0], hits['distances'][0])
    for rank, (doc, meta, dist) in enumerate(ranked_hits, start=1):
        print(f"\nCIUUUF CIUUUUUUF n. #{rank} (distance: {dist:.4f})")
        print(f"From: {meta.get('source', 'Unknown')}")
        print(f"Chunk text: {doc[:200]}...")
        print("=D" * 50)

    return hits


# Sample query in Italian ("How is maintenance of the braking system performed?").
query_user = "Come si esegue la manutenzione del sistema di frenata?"
# Open the persisted database and fetch the existing collection by name;
# get_collection (unlike get_or_create_collection) does not create it.
client = chromadb.PersistentClient(path=DB_PATH)
collection = client.get_collection(name=COLLECTION_NAME)

# Reuses the embedding model loaded in the configuration cell, so that cell
# must have been run first. The returned mapping is kept for later use.
context_results = retrieve_documents(
    query=query_user,
    embedding_model=model,  
    collection=collection,  
    n_results=5
)


User query: 'Come si esegue la manutenzione del sistema di frenata?'

=== TOP 5 RESULTS ===

CIUUUF CIUUUUUUF n. #1 (distance: 0.6472)
From: ETR_500.pdf
Chunk text: la pressione residua all'interno dell'impianto non possa provocare infortuni.
—
Scaricare l'aria dell'impianto.
—
Aprire, con la chiave in dotazione al personale, lo sportello dei serbatoi freno (fig....
=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D

CIUUUF CIUUUUUUF n. #2 (distance: 0.6542)
From: ETR_500.pdf
Chunk text: la pressione residua all'interno dell'impianto non possa provocare infortuni.
—
Scaricare l'aria dell'impianto.
—
Aprire, con la chiave in dotazione al personale, lo sportello in cui è ubicato il grup...
=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D=D

CIUUUF CIUUUUUUF n. #3 (distance: 0.6693)
From: MR1-3-A.pdf
Chunk text: 1.2. 
Funzionamento generale dell’impianto freno
Il sistema frenante prevede:
