In [1]:
import sys

# Add the directory to sys.path
sys.path.append('/home/qianyucazelles/chatbot_rag/src')
from utils.doc_to_vertex_search import *

from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
import os

In [2]:
from datetime import datetime
def print_with_time(message):
    """Print ``message`` to stdout, prefixed with the current local time.

    The emitted line has the form ``[YYYY-MM-DD HH:MM:SS] message``.
    """
    # A datetime format spec inside an f-string is applied via strftime.
    print(f"[{datetime.now():%Y-%m-%d %H:%M:%S}] {message}")

In [3]:
def pdf_to_vector_search(
    pdf_path,
    CHATBOT_ID: str,
    unstructure_loader: bool=True,
    split_to_article: bool=False
):
    """Load one PDF, chunk it and add the chunks to the chatbot's vector store.

    Args:
        pdf_path: Location of the PDF file to ingest.
        CHATBOT_ID: Chatbot identifier; selects the parameter file
            ``../vector_store_me_parameters/{CHATBOT_ID}_me.json``.
        unstructure_loader: When true, the file is read with
            ``UnstructuredPDFLoader``; otherwise with ``PyPDFLoader``.
        split_to_article: When true, the loaded pages are passed through
            ``split_pages_into_artiles`` before being chunked.

    Returns:
        Whatever ``add_splits_to_vector_store`` returns for the chunks
        (presumably the ids of the newly stored documents -- confirm
        against the helper module).
    """
    print_with_time(f"Process file:{pdf_path} ")

    # Pick the loader implementation, then read the document.
    path_text = f'{pdf_path}'
    if not unstructure_loader:
        pdf_loader = PyPDFLoader(path_text)
    else:
        print("Using unstrucure loader")
        pdf_loader = UnstructuredPDFLoader(path_text)
    loaded_pages = pdf_loader.load()

    # Optionally regroup the pages into articles before chunking.
    chunk_source = split_pages_into_artiles(loaded_pages) if split_to_article else loaded_pages
    chunks = text_to_chunk(chunk_source)

    # Build the vector store from the embeddings and the per-chatbot parameters.
    embedding_model = get_embeddings()
    store_config = get_me_parameters(f'../vector_store_me_parameters/{CHATBOT_ID}_me.json')
    vector_store = get_vector_store(store_config, embedding_model)

    new_ids = add_splits_to_vector_store(chunks, vector_store)

    print_with_time(f"{pdf_path} is added to vector store.")
    return new_ids

In [11]:
def process_folder(CHATBOT_ID, folder_path, pdf_doc_ids, other_files):
    """Recursively ingest every PDF found under ``folder_path``.

    Sub-directories are walked depth-first. Each file whose name ends in
    ``.pdf`` (case-insensitive) is passed to ``pdf_to_vector_search``; every
    other file is only recorded.

    Args:
        CHATBOT_ID: Chatbot identifier, forwarded to ``pdf_to_vector_search``.
        folder_path: Directory to scan.
        pdf_doc_ids: Dict updated in place. Maps each PDF path to the value
            returned by ``pdf_to_vector_search``, or to ``[]`` when ingesting
            that file failed.
        other_files: List updated in place with the path of every non-PDF
            file encountered.

    Returns:
        None. Results are reported through ``pdf_doc_ids`` and ``other_files``.
    """
    for item in os.listdir(folder_path):

        item_path = os.path.join(folder_path, item)
        print_with_time(f"Scanning {item_path}")

        if os.path.isdir(item_path):

            print_with_time(f"{item_path} is folder, now go to new level")
            process_folder(CHATBOT_ID, item_path, pdf_doc_ids, other_files)

        elif item.lower().endswith('.pdf'):

            print_with_time(f"{item_path} is pdf, now process to vector store")
            try:
                doc_ids = pdf_to_vector_search(item_path, CHATBOT_ID)
            except Exception as error:
                # Best effort: one unreadable PDF must not abort the whole
                # scan, but the reason is logged instead of being silently
                # dropped. Catching Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit stop a long run.
                print_with_time(
                    f"Failed to process {item_path}: {type(error).__name__}: {error}"
                )
                doc_ids = []
            pdf_doc_ids[item_path] = doc_ids

        else:
            other_files.append(item_path)

In [12]:
def main(CHATBOT_ID: str,
         root_folder: str
         ):
    """Ingest every PDF under ``root_folder`` and write two JSON manifests.

    The folder tree is walked by ``process_folder``. Afterwards two files are
    written next to the vector-store parameter files:

    * ``../vector_store_me_parameters/{CHATBOT_ID}_pdf_doc_ids.json`` -- the
      mapping of PDF path to the ids collected for it.
    * ``../vector_store_me_parameters/{CHATBOT_ID}_non_pdf_files.json`` -- the
      list of files that were skipped because they are not PDFs.

    Args:
        CHATBOT_ID: Chatbot identifier; used for ingestion and as the prefix
            of both output file names.
        root_folder: Top-level directory to scan.

    Returns:
        None.
    """
    # Imported here so the function does not depend on ``json`` happening to
    # be re-exported by a star-import elsewhere in the notebook.
    import json

    pdf_data = {}
    other_files = []

    process_folder(CHATBOT_ID, root_folder, pdf_data, other_files)

    # Writing PDF data to JSON file.
    # UTF-8 with ensure_ascii=False keeps accented file names readable in the
    # manifest instead of being escaped as \uXXXX sequences.
    pdf_output_file = f"../vector_store_me_parameters/{CHATBOT_ID}_pdf_doc_ids.json"
    with open(pdf_output_file, "w", encoding="utf-8") as json_file:
        json.dump(pdf_data, json_file, indent=4, ensure_ascii=False)
    print(f"PDF data has been written to {pdf_output_file}")

    # Writing other files list to JSON file
    other_output_file = f"../vector_store_me_parameters/{CHATBOT_ID}_non_pdf_files.json"
    with open(other_output_file, "w", encoding="utf-8") as json_file:
        json.dump(other_files, json_file, indent=4, ensure_ascii=False)
    print(f"Non-PDF files list has been written to {other_output_file}")



In [6]:
!ls ../documents/ecg_hr/

'CONVENTION COLLECTIVE HPA (à jour au 13 02 2024).docx.pdf'
'Nouveau RI 2021 Version finale_FR EN_signed.pdf'


In [13]:
if __name__ == "__main__":
    # Ingest every document stored under the selected chatbot's folder.
    CHATBOT_ID = "ecg_assurance"
    root_folder = "/home/qianyucazelles/chatbot_rag/src/documents/" + CHATBOT_ID + "/"
    main(CHATBOT_ID=CHATBOT_ID, root_folder=root_folder)

[2024-04-17 16:36:43] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024
[2024-04-17 16:36:43] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024 is folder, now go to new level
[2024-04-17 16:36:43] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE
[2024-04-17 16:36:43] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE is folder, now go to new level
[2024-04-17 16:36:43] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf
[2024-04-17 16:36:43] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf is pdf, now process to vector store
[2024-04-17 16:36:43] Process file:/home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


Waiting
...successfully added to vector store with 19 new doc ids
[2024-04-17 16:37:30] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf is added to vector store.
[2024-04-17 16:37:30] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE Allianz 2024.pdf
[2024-04-17 16:37:30] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE Allianz 2024.pdf is pdf, now process to vector store
[2024-04-17 16:37:30] Process file:/home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE Allianz 2024.pdf 
Using unstrucure loader


Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


Waiting
......successfully added to vector store with 39 new doc ids
[2024-04-17 16:39:17] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE Allianz 2024.pdf is added to vector store.
[2024-04-17 16:39:17] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE LPS Allianz 2024.pdf
[2024-04-17 16:39:17] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE LPS Allianz 2024.pdf is pdf, now process to vector store
[2024-04-17 16:39:17] Process file:/home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE LPS Allianz 2024.pdf 
Using unstrucure loader


Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


Waiting
.successfully added to vector store with 9 new doc ids
[2024-04-17 16:39:37] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE LPS Allianz 2024.pdf is added to vector store.
[2024-04-17 16:39:37] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant 2024 Police locaux de stockage Allianz.pdf
[2024-04-17 16:39:37] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant 2024 Police locaux de stockage Allianz.pdf is pdf, now process to vector store
[2024-04-17 16:39:37] Process file:/home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant 2024 Police locaux de stockage Allianz.pdf 
Using unstrucure loader


Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


Waiting
........successfully added to vector store with 41 new doc ids
[2024-04-17 16:39:50] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant 2024 Police locaux de stockage Allianz.pdf is added to vector store.
[2024-04-17 16:39:50] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant LPS 2024 police locaux de stockage Allianz.pdf
[2024-04-17 16:39:50] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant LPS 2024 police locaux de stockage Allianz.pdf is pdf, now process to vector store
[2024-04-17 16:39:50] Process file:/home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant LPS 2024 police locaux de stockage Allianz.pdf 
Using unstrucure loader


Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


Waiting
.successfully added to vector store with 7 new doc ids
[2024-04-17 16:39:54] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant LPS 2024 police locaux de stockage Allianz.pdf is added to vector store.
[2024-04-17 16:39:54] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/ECG ITALIA - véhicules
[2024-04-17 16:39:54] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/ECG ITALIA - véhicules is folder, now go to new level
[2024-04-17 16:39:54] Scanning /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/ECG ITALIA - véhicules/polizza ECG Italia Srl n. 533662573 _ copia per direzione firmata.pdf
[2024-04-17 16:39:54] /home/qianyucazelles/chatbot_rag/src/documents/ecg_assurance/Contrats 2024/ECG ITALIA - véhicules/polizza ECG Italia Srl n. 533662573 _ copia per direzione firmata.pdf is pdf, now process to vector store
[2024-04-17 16:39:54] Process file:

PDFPageCountError: Unable to get page count.
Syntax Error: Gen inside xref table too large (bigger than INT_MAX)
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Gen inside xref table too large (bigger than INT_MAX)
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table


In [None]:
# parameters = get_me_parameters(f'../vector_store_me_parameters/{CHATBOT_ID}_me.json')

In [None]:
# # init the aiplatform package
# from google.cloud import aiplatform

# aiplatform.init(project=parameters['PROJECT_ID'], location=parameters['LOCATION'])

In [None]:
# my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(parameters['ME_INDEX_ENDPOINT_ID'])

In [None]:
# my_index_endpoint.undeploy_all()
# my_index_endpoint.delete(force=True)

In [None]:
# me.similarity_search("way to reserve a booking", k=4)