In [1]:
import sys

# Add the directory to sys.path
sys.path.append('/home/qianyucazelles/chatbot_rag/src')
from utils.doc_to_vertex_search import *

from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
import os

KeyboardInterrupt: 

In [None]:
from datetime import datetime
def print_with_time(message):
    current_time = datetime.now()
    formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{formatted_time}] {message}")

In [None]:
def read_txt_to_list(input_file):
    # Initialize an empty list to store the item paths
    item_paths = []

    # Open the file in read mode
    with open(input_file, "r") as f:
        # Read each line in the file
        for line in f:
            # Remove trailing whitespace and newline characters
            line = line.strip()
            # Append the cleaned line to the list
            item_paths.append(line)
    return item_paths

In [None]:
def print_with_line(message):
    print("-"*100)
    print_with_time(message)
    
def print_with_star(message):
    print("*"*100)
    print_with_time(message)
    

In [None]:
def pdf_to_vector_search(
    pdf_path,
    me,
    unstructure_loader: bool=True,
    split_to_article: bool=False
):
    print_with_time(f"Process file:{pdf_path} ")
    
    #load pages
    if unstructure_loader:
        print("Using unstrucure loader")
        loader = UnstructuredPDFLoader(f'{pdf_path}')
    else:
        loader = PyPDFLoader(f'{pdf_path}')
    pages = loader.load()
    
    print_with_line(f"len of pages:len(pages)")
    
    #split to article
    if split_to_article:
        articles = split_pages_into_artiles(pages)
        doc_splits = text_to_chunk(articles)
    else:
        doc_splits = text_to_chunk(pages)
        

    print_with_line(f"n splits:len(doc_splits)")
    
    #add to vector store
    doc_ids = add_splits_to_vector_store(doc_splits,me)
    
    
    if len(doc_ids)>0:
        
        print_with_line(f"Added to vector store with {len(doc_ids)} ids.")
        
        with open("../add_docs_to_vectorstore_logs/processed_pdfs.txt", "a") as f:
            f.write(f"'{pdf_path}',\n")
        with open("../add_docs_to_vectorstore_logs/processed_pdfs_doc_ids.txt", "a") as f:
            f.write(f"'{pdf_path}': {doc_ids},\n")
        
    else:
        
        print_with_star("No doc ids returned")
        
        with open(f"../add_docs_to_vectorstore_logs/unprocessed_pdfs.txt", "a") as f:
            f.write(f"'{pdf_path}'," + "\n")
            
    return doc_ids

In [None]:
def process_folder(me,CHATBOT_ID, folder_path, pdf_doc_ids, other_files):
    processed_pdfs = read_txt_to_list("../add_docs_to_vectorstore_logs/processed_pdfs.txt")     
    for item in os.listdir(folder_path):
        
        item_path = os.path.join(folder_path, item)
        print_with_time(f"Scanning {item_path}")
        
        if os.path.isdir(item_path):
            
            print_with_time(f"{item_path} is folder, now go to new level")
            process_folder(me,CHATBOT_ID,item_path,pdf_doc_ids, other_files)
            
        elif item.lower().endswith('.pdf'):
            if item not in processed_pdfs:
                
                print_with_time(f"{item_path} is pdf, now process to vector store")
                try:
                    
                    doc_ids = pdf_to_vector_search(item_path,me)
                    
                except Exception as e:
                    with open(f"../add_docs_to_vectorstore_logs/unprocessed_pdfs.txt", "a") as f:
                        f.write(f"'{item_path}'," + "\n")
                    print(f"An error occurred while processing {item_path}: {e}")
                    doc_ids=[]
                pdf_doc_ids[item_path]=doc_ids
            
        else:
            other_files.append(item_path)

In [None]:
def main(me,
         CHATBOT_ID: str,
         root_folder: str
         ):
    
    pdf_data = {}
    other_files = []
    
    process_folder(me,CHATBOT_ID,root_folder, pdf_data, other_files)
    
    # Writing PDF data to JSON file
    pdf_output_file = f"../vector_store_me_parameters/{CHATBOT_ID}_pdf_doc_ids.json"
    with open(pdf_output_file, "w") as json_file:
        json.dump(pdf_data, json_file, indent=4)
    print(f"PDF data has been written to {pdf_output_file}")
    
    # Writing other files list to JSON file
    other_output_file = f"../vector_store_me_parameters/{CHATBOT_ID}_non_pdf_files.json"
    with open(other_output_file, "w") as json_file:
        json.dump(other_files, json_file, indent=4)
    print(f"Non-PDF files list has been written to {other_output_file}")



In [None]:
if __name__ == "__main__":
    CHATBOT_ID = "ecg_assurance"
    root_folder = f"../documents/{CHATBOT_ID}/"
    # get embeddings
    embeddings = get_embeddings()

    #get parameters
    parameters = get_me_parameters(f'../vector_store_me_parameters/{CHATBOT_ID}_me.json')

    #get vector store
    me = get_vector_store(parameters, embeddings)
    
    main(me,CHATBOT_ID,root_folder)

2024-04-17 19:37:35.964096: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


[2024-04-17 19:37:41] Scanning ../documents/ecg_assurance/Contrats 2024
[2024-04-17 19:37:41] ../documents/ecg_assurance/Contrats 2024 is folder, now go to new level
[2024-04-17 19:37:41] Scanning ../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE
[2024-04-17 19:37:41] ../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE is folder, now go to new level
[2024-04-17 19:37:41] Scanning ../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf
[2024-04-17 19:37:41] ../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf is pdf, now process to vector store
[2024-04-17 19:37:41] Process file:../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf 
Using unstrucure loader


  from .autonotebook import tqdm as notebook_tqdm


----------------------------------------------------------------------------------------------------
[2024-04-17 19:38:19] len of pages:len(pages)
----------------------------------------------------------------------------------------------------
[2024-04-17 19:38:19] n splits:len(doc_splits)
Waiting
...successfully added to vector store with 19 new doc ids
----------------------------------------------------------------------------------------------------
[2024-04-17 19:38:24] Added to vector store with 19 ids.
[2024-04-17 19:38:24] Scanning ../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE Allianz 2024.pdf
[2024-04-17 19:38:24] ../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE Allianz 2024.pdf is pdf, now process to vector store
[2024-04-17 19:38:24] Process file:../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/Avenant police LOCAUX DE STOCKAGE Allianz 2024.pdf 
Using unstrucure loader

In [None]:
# pdf_path = "/home/qianyucazelles/internal_knowledge_chatbot/src/documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf","ecg_assurance"
# CHATBOT_ID = "ecg_assurance"
# root_folder = f"/home/qianyucazelles/chatbot_rag/src/documents/{CHATBOT_ID}/"
# # get embeddings
# embeddings = get_embeddings()

# #get parameters
# parameters = get_me_parameters(f'../vector_store_me_parameters/{CHATBOT_ID}_me.json')

# #get vector store
# me = get_vector_store(parameters, embeddings)
# # pdf_to_vector_search(pdf_path,me)

In [None]:
# pdf_to_vector_search("../documents/ecg_assurance/Contrats 2024/SIEGES/Generali - Si\u00e8ge Montpellier.pdf",me)

In [None]:
# loader = UnstructuredPDFLoader("../documents/ecg_assurance/Contrats 2024/LOCAUX DE STOCKAGE/NEW Contratto polizza magazzino Manerba.pdf")

In [None]:
# pages = loader.load()


In [None]:
# #split to article
# split_to_article= False
# if split_to_article:
#     articles = split_pages_into_artiles(pages)
#     doc_splits = text_to_chunk(articles)
# else:
#     doc_splits = text_to_chunk(pages)
    


# #add to vector store
# doc_ids = add_splits_to_vector_store(doc_splits,me)

In [None]:
# parameters = get_me_parameters(f'../vector_store_me_parameters/{CHATBOT_ID}_me.json')

In [None]:
# # init the aiplatform package
# from google.cloud import aiplatform

# aiplatform.init(project=parameters['PROJECT_ID'], location=parameters['LOCATION'])

In [None]:
# my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(parameters['ME_INDEX_ENDPOINT_ID'])

In [None]:
# my_index_endpoint.undeploy_all()
# my_index_endpoint.delete(force=True)

In [None]:
# me.similarity_search("way to reserve a booking", k=4)