In [3]:
# Copy ecg_dpo_2_me.json from the gs://ecg_dpo_2 bucket into ../vector_store_me_parameters/.
# NOTE(review): the run cell of this notebook loads
# ../vector_store_me_parameters/ecg_dpo_me.json (CHATBOT_ID = "ecg_dpo"), not the
# ecg_dpo_2_me.json fetched here — confirm which parameter file is intended.
!gsutil cp gs://ecg_dpo_2/me_parameters/ecg_dpo_2_me.json ../vector_store_me_parameters/

Copying gs://ecg_dpo_2/me_parameters/ecg_dpo_2_me.json...
/ [1 files][  390.0 B/  390.0 B]                                                
Operation completed over 1 objects/390.0 B.                                      


In [4]:
import sys

# Add the directory to sys.path
sys.path.append('/home/qianyucazelles/internal_knowledge_chatbot/src')
from utils.doc_to_vertex_search import *
from utils.general_utils import *

from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
import os

In [5]:
import nltk
def nb_tokens(text):
    """Return how many tokens ``nltk.word_tokenize`` produces for ``text``."""
    tokens = nltk.word_tokenize(text)
    return len(tokens)

In [6]:
def load_pdf_to_langchain_doc(
    pdf_path,
    unstructure_loader: bool=True):
    """Load a PDF with one of the two PDF loaders imported by this notebook.

    Args:
        pdf_path: Location of the PDF. It is formatted into a string before
            being handed to the loader.
        unstructure_loader: When truthy, ``UnstructuredPDFLoader`` is used
            (and a notice is printed); otherwise ``PyPDFLoader`` is used.

    Returns:
        Whatever the selected loader's ``load()`` call returns.
    """
    # Guard clause for the quiet PyPDF path; the other path announces itself.
    if not unstructure_loader:
        return PyPDFLoader(f"{pdf_path}").load()

    print("Using unstrucure loader")
    return UnstructuredPDFLoader(f"{pdf_path}").load()

In [7]:
def pdf_to_vector_search(
    pdf_path,
    me,
    unstructure_loader: bool=False,
    split_to_article: bool=False
):
    """Load one PDF, chunk it, add the chunks to the vector store and log the outcome.

    Args:
        pdf_path: Path of the PDF to ingest.
        me: Vector-store handle; it is only forwarded to
            ``add_splits_to_vector_store``.
        unstructure_loader: Forwarded to ``load_pdf_to_langchain_doc`` to pick
            the PDF loader.
        split_to_article: When truthy, pages are first regrouped with
            ``split_pages_into_artiles`` before chunking; otherwise the pages
            are chunked directly.

    Returns:
        The ids returned by ``add_splits_to_vector_store``.

    Side effects:
        Appends to text logs in ``../add_docs_to_vectorstore_logs``:
        ``processed_pdfs.txt`` and ``processed_pdfs_doc_ids.txt`` when ids
        were returned, ``unprocessed_pdfs.txt`` when none were.
    """
    log_dir = "../add_docs_to_vectorstore_logs"
    # Create the log folder *before* touching the vector store: if it were
    # missing, the write below would fail after the chunks had already been
    # inserted, leaving them unrecorded and re-inserted on the next run.
    os.makedirs(log_dir, exist_ok=True)

    print_with_time(f"Process file:{pdf_path} ")

    # load pages
    pages = load_pdf_to_langchain_doc(pdf_path, unstructure_loader)
    print_with_line(f"len of pages:{len(pages)}")

    # optionally regroup pages into articles, then chunk
    if split_to_article:
        articles = split_pages_into_artiles(pages)
        doc_splits = doc_to_chunk(articles)
    else:
        doc_splits = doc_to_chunk(pages)
    print_with_line(f"n splits:{len(doc_splits)}")

    # add to vector store
    doc_ids = add_splits_to_vector_store(doc_splits, me)

    # log process
    if len(doc_ids) > 0:
        print_with_line(f"Added to vector store with {len(doc_ids)} ids.")

        with open(f"{log_dir}/processed_pdfs.txt", "a") as f:
            f.write(f"'{pdf_path}',\n")
        with open(f"{log_dir}/processed_pdfs_doc_ids.txt", "a") as f:
            f.write(f"'{pdf_path}': {doc_ids},\n")
    else:
        print_with_star("No doc ids returned")

        with open(f"{log_dir}/unprocessed_pdfs.txt", "a") as f:
            f.write(f"'{pdf_path}',\n")

    return doc_ids

In [8]:
def process_folder(me,CHATBOT_ID, folder_path, pdf_doc_ids, other_files):
    """Recursively walk ``folder_path`` and ingest every not-yet-processed PDF.

    Args:
        me: Vector-store handle, forwarded to ``pdf_to_vector_search``.
        CHATBOT_ID: Not used here other than being passed on to recursive calls.
        folder_path: Directory to scan.
        pdf_doc_ids: Dict updated in place, mapping each attempted PDF path to
            the ids returned for it (``[]`` when ingestion raised).
        other_files: List updated in place with the paths of non-PDF files.

    Returns:
        None; results are reported through ``pdf_doc_ids`` and ``other_files``.
    """
    try:
        logged_entries = read_txt_to_list("../add_docs_to_vectorstore_logs/processed_pdfs.txt")
    except Exception:
        # Best effort: without a readable log nothing is skipped. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit through.
        logged_entries = []

    # The log is written one entry per line as  '<full path>',  so strip the
    # surrounding quotes / trailing comma before comparing. Exactly what
    # read_txt_to_list returns is not visible here, so entries that are
    # already clean pass through unchanged.
    processed_pdfs = {
        str(entry).strip().rstrip(",").strip("'\"")
        for entry in (logged_entries or [])
    }

    # Sorted so that processing order (and therefore the logs) is reproducible.
    for item in sorted(os.listdir(folder_path)):
        item_path = os.path.join(folder_path, item)
        print_with_time(f"Scanning {item_path}")

        if os.path.isdir(item_path):
            print_with_time(f"{item_path} is folder, now go to new level")
            process_folder(me, CHATBOT_ID, item_path, pdf_doc_ids, other_files)

        elif item.lower().endswith('.pdf'):
            # Match on the full path (what the log stores); the bare file name
            # is still accepted so logs holding only names keep working.
            if item_path in processed_pdfs or item in processed_pdfs:
                print_with_time(f"{item_path} already processed, skipping")
                continue

            print_with_time(f"{item_path} is pdf, now process to vector store")
            try:
                doc_ids = pdf_to_vector_search(item_path, me)
            except Exception as e:
                # Make sure the log folder exists so that recording the
                # failure cannot itself abort the whole walk.
                os.makedirs("../add_docs_to_vectorstore_logs", exist_ok=True)
                with open("../add_docs_to_vectorstore_logs/unprocessed_pdfs.txt", "a") as f:
                    f.write(f"'{item_path}'," + "\n")
                print_with_star(f"An error occurred while processing {item_path}: {e}")
                doc_ids = []
            pdf_doc_ids[item_path] = doc_ids

        else:
            other_files.append(item_path)

In [9]:
def main(me,
         CHATBOT_ID: str,
         root_folder: str
         ):
    """Ingest every PDF under ``root_folder`` and write two JSON reports.

    Args:
        me: Vector-store handle, forwarded to ``process_folder``.
        CHATBOT_ID: Prefix used in the names of the two report files.
        root_folder: Directory handed to ``process_folder`` for scanning.

    Side effects:
        Writes, in ``../add_docs_to_vectorstore_logs``:
        ``{CHATBOT_ID}_pdf_doc_ids.json`` (PDF path -> ids) and
        ``{CHATBOT_ID}_non_pdf_files.json`` (list of non-PDF paths).
    """
    # Imported here because the notebook has no explicit ``import json``; it
    # would otherwise only be in scope if a star import happened to leak it.
    import json

    # Make sure the report folder exists before the long ingestion run, so the
    # results are not lost to a FileNotFoundError at the very end.
    os.makedirs("../add_docs_to_vectorstore_logs", exist_ok=True)

    pdf_data = {}
    other_files = []

    process_folder(me, CHATBOT_ID, root_folder, pdf_data, other_files)

    # Writing PDF data to JSON file
    pdf_output_file = f"../add_docs_to_vectorstore_logs/{CHATBOT_ID}_pdf_doc_ids.json"
    with open(pdf_output_file, "w") as json_file:
        json.dump(pdf_data, json_file, indent=4)
    print(f"PDF data has been written to {pdf_output_file}")

    # Writing other files list to JSON file
    other_output_file = f"../add_docs_to_vectorstore_logs/{CHATBOT_ID}_non_pdf_files.json"
    with open(other_output_file, "w") as json_file:
        json.dump(other_files, json_file, indent=4)
    print(f"Non-PDF files list has been written to {other_output_file}")



In [11]:
# Folder handed to main() in the run cell; plain literal (no placeholders, so no f-prefix).
root_folder = "../documents/ecg_dpo/"

In [12]:
# Run cell: build the embeddings and vector store, then ingest everything under root_folder.
if __name__ == "__main__":
    # Used to build the parameter-file path below and, inside main(), the report file names.
    CHATBOT_ID = "ecg_dpo"
    # NOTE: root_folder is not assigned in this cell; it must already have been set by the
    # cell that defines it (currently "../documents/ecg_dpo/"), so run that cell first.
    # get embeddings
    embeddings = get_embeddings()

    #get parameters
    # NOTE(review): this resolves to ../vector_store_me_parameters/ecg_dpo_me.json, whereas the
    # gsutil cell of this notebook downloads ecg_dpo_2_me.json — confirm the intended file.
    parameters = get_me_parameters(f'../vector_store_me_parameters/{CHATBOT_ID}_me.json')

    #get vector store
    me = get_vector_store(parameters, embeddings)
    
    main(me,CHATBOT_ID,root_folder)

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


[2024-04-19 17:01:18] Scanning ../documents/ecg_dpo/Code civil
[2024-04-19 17:01:18] ../documents/ecg_dpo/Code civil is folder, now go to new level
[2024-04-19 17:01:18] Scanning ../documents/ecg_dpo/Code civil/LEGITEXT000006070721.pdf
[2024-04-19 17:01:18] ../documents/ecg_dpo/Code civil/LEGITEXT000006070721.pdf is pdf, now process to vector store
[2024-04-19 17:01:18] Process file:../documents/ecg_dpo/Code civil/LEGITEXT000006070721.pdf 
----------------------------------------------------------------------------------------------------
[2024-04-19 17:01:21] len of pages:679
----------------------------------------------------------------------------------------------------
[2024-04-19 17:01:21] n splits:679
Waiting
.....................................................................................................................................successfully added to vector store with 679 new doc ids
-----------------------------------------------------------------------------------