Steps:
- Load pdfs
- Split text
- Embeddings
- Build Index
- Deploy index to endpoint/Store chunks in cloud
- QA chain


In [1]:
import numpy as np
import json

# Vertex parameters settings

In [2]:
# Read the active gcloud project ID. The IPython `!` capture yields the command's
# output lines, and the first line is kept as the value.
# NOTE(review): assumes the value is on the first output line (no warnings printed before it) — confirm.
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]

# Same approach for the compute region configured in gcloud.
LOCATION = ! gcloud config get-value compute/region
LOCATION = LOCATION[0]

In [30]:
# Sanity check: show the project ID and region resolved from gcloud.
print(PROJECT_ID, LOCATION)

ecg-ai-416210 europe-west1


In [3]:
# matching engine
CHATBOT_NAME = "legal-chatbot"
ME_INDEX_NAME = f"{PROJECT_ID}-{CHATBOT_NAME}-me-index"  # @param {type:"string"}
ME_EMBEDDING_DIR = f"{PROJECT_ID}-{CHATBOT_NAME}-me-bucket"  # @param {type:"string"}
ME_DIMENSIONS = 768  # when using Vertex PaLM Embedding

# Load pdfs

In [4]:
# !pip install langchain

In [5]:
# !pip install pypdf

In [6]:
from langchain.document_loaders import PyPDFLoader

In [7]:
# Load the source PDF. `pages` is the list the article-splitting cell iterates
# over; each item exposes `page_content` and `metadata`.
local_pdf_path = "pdfs/Homair-FR.pdf"
loader = PyPDFLoader(local_pdf_path)
pages = loader.load()

In [8]:
# pages[0]

# Split documents

##### - Split each page on the "ARTICLE" header and add a title

In [9]:
import re
from langchain_core.documents.base import Document

# Split every page on the "ARTICLE" header so that each Document holds one
# article, carrying the page metadata plus a "title" entry.
header_spliter = "ARTICLE"  # loop-invariant, so defined once up front
articles = []
title_ = ''  # first line of the most recent article header seen so far
for pn_, page in enumerate(pages):

    page_content = page.page_content
    metadata = page.metadata.copy()  # copy so the loader's own metadata dict is not mutated
    # Shift the loader's page number up by one (the recorded output shows the
    # first page as 'page': 1 — presumably the loader counts from 0).
    metadata['page'] += 1

    for i, s_ in enumerate(page_content.split(header_spliter)):

        first_line = s_.split("\n")[0]

        if pn_ + i == 0:  # first page, first split: the text before the first header
            # The title is read from the 4th line of the preamble (TBU). Fall back to
            # the first line instead of raising IndexError when the preamble is shorter.
            preamble_lines = s_.split("\n")
            title_line = preamble_lines[3] if len(preamble_lines) > 3 else first_line
            title_dict = {"title": title_line}
        elif i == 0:
            # Text before the first header of a later page continues the article that
            # started on a previous page: merge it into the last Document.
            # The original condition was `i==0 & len_first_line<10`; because `&` binds
            # tighter than `==` and `<`, it evaluated to plain `i == 0`. The length guard
            # never took effect, and enabling it would send a long leading chunk to the
            # `else` branch and give it a fabricated "ARTICLE" header, so the condition
            # is now written as what it actually did.
            title_dict = {"title": f'{header_spliter}{title_}'}
            previous_part = articles.pop().page_content
            s_ = f"{previous_part} \n\n {s_}"
        else:
            # A new article: its title is the header line, and the header word removed
            # by split() is put back in front of the text.
            title_ = first_line
            title_dict = {"title": f'{header_spliter}{title_}'}
            s_ = f"{header_spliter}{s_}"
        metadata_article = {**metadata, **title_dict}
        article = Document(page_content=s_, metadata=metadata_article)
        articles.append(article)

In [10]:
# Inspect the first Document (the preamble before the first "ARTICLE" header).
articles[0]

Document(page_content=' \n \n1/7 \n CONDITIONS GENERALES DE LOCATION  \n \nAfin de pouvoir bénéficier des prestations proposées par la société Homair Vacances au travers de  \nses marques commerciales , nous vous demandons de lire attentivement les présentes Conditions \nGénérales de Location.  \n \nAfin de faciliter la lecture du présent texte, nous avons employé le masculin comme genre neutre, \nsans discrimination, pour désigner aussi bien les femmes que les hommes.  \n \nVersion publiée le 23/01/202 4 \n \n', metadata={'source': 'pdfs/Homair-FR.pdf', 'page': 1, 'title': ' CONDITIONS GENERALES DE LOCATION  '})

In [11]:
# Inspect a regular article produced by the split.
articles[2]

Document(page_content="ARTICLE 2 - LEXIQUE  \n• « Camping »  : Etablissement touristique exploité en tout ou partie (uniquement location \nd’Hébergements) par la Société  ; \n• « Catalogue »  : tous les supports de communication en version papier (et en version \nnumérique ) présentant tout ou partie des  Campings  et des Hébergements proposés par la \nSociété.   \n• « Client »  : personne physique assimilée à un particulier qui effectue et est à l’origine du \npaiement d’une réservation pour son propre compte. Le Client peut également être un \npartenaire (par exemple, un Comité Social Economique) qui effectue une réservation pour l e \ncompte d'un ayant droit. La notion de Client s’oppose, entre autres, aux notions de participant \nau séjour, groupe, occupant, visiteur, résident ou collectivité.  \n• « Conditions Générales » ou «  CGL »  : les présentes conditions générales de location.  \n• « Contrat de location  » : la confirmation de réservation et les CG L constituent ensemble le

In [12]:
# Inspect a later article to check its content and title metadata.
articles[18]

Document(page_content='ARTICLE 18- SERVICES DU CAMPING  \nLa Société fait ses meilleurs efforts pour maintenir à jour les informations et descriptifs relatifs aux \nServices du Camping qu’elle propose sur l’ensemble des Sites internet et Catalogue . Il peut arriver \nque certaines activités et installations proposées par les Campings des Sociétés  exploitantes, et \nindiquées dans le descriptif, soient supprimées ou modifiées, notamment pour des raisons \nclimatiques, en cas de force majeure, ou de dysfonctionnement en avant ou arrière -saison. La Société \ninformera dans la m esure du possible le Client des travaux ou aménagements entrepris sur les \nCampings lors de son séjour.  \n \n', metadata={'source': 'pdfs/Homair-FR.pdf', 'page': 7, 'title': 'ARTICLE 18- SERVICES DU CAMPING  '})

##### - Split articles into chunks

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
# Chunking configuration: target size 1500 with an overlap of 50, splitting on
# paragraph breaks first and then on sentence-ending punctuation.
# NOTE(review): sizes are presumably measured in characters (the splitter's default
# length function) — confirm against the installed LangChain version.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=50,
    separators=["\n\n", ".", "!", "?"] #, ".", "!", "?", ",", " ", ""
)

In [15]:
# Break the articles into chunks with the splitter configured above.
doc_splits = text_splitter.split_documents(articles)

# Record each chunk's position in the overall sequence in its metadata.
for chunk_no, chunk_doc in enumerate(doc_splits):
    chunk_doc.metadata["chunk"] = chunk_no

print(f"# of documents = {len(doc_splits)}")

# of documents = 73


# Vectorstore and Embeddings

In [16]:
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils

In [17]:
# Helper object used below to create the index, deploy it, and look up its IDs
# for this project, region and index name.
mengine = MatchingEngineUtils(PROJECT_ID, LOCATION, ME_INDEX_NAME)

## - Create the index and deploy the endpoint (first use only)

##### - Vector Store parameters

##### - create bucket and dummy init file

In [47]:
# Make a Google Cloud Storage bucket for your Matching Engine index
# `set -x` echoes the expanded command (see the recorded output); the $NAME
# references are filled in from the notebook's Python variables.
! set -x && gsutil mb -p $PROJECT_ID -l $LOCATION gs://$ME_EMBEDDING_DIR

+ gsutil mb -p ecg-ai-416210 -l europe-west1 gs://ecg-ai-416210-legal-chatbot-me-bucket
Creating gs://ecg-ai-416210-legal-chatbot-me-bucket/...


In [53]:
# Create a dummy embeddings file to initialize when creating the index
import uuid
# dummy embedding
init_embedding = {"id": str(uuid.uuid4()), "embedding": list(np.zeros(ME_DIMENSIONS))}

# dump embedding to a local file
with open("embeddings_0.json", "w") as f:
    json.dump(init_embedding, f)

# write embedding to Cloud Storage
! set -x && gsutil cp embeddings_0.json gs://{ME_EMBEDDING_DIR}/init_index/embeddings_0.json

+ gsutil cp embeddings_0.json gs://ecg-ai-416210-legal-chatbot-me-bucket/init_index/embeddings_0.json
Copying file://embeddings_0.json [Content-Type=application/json]...
/ [1 files][  3.8 KiB/  3.8 KiB]                                                
Operation completed over 1 objects/3.8 KiB.                                      


##### - create index

In [58]:
# Create the index from the placeholder embeddings folder uploaded above.
# The recorded log shows the helper creating the index because none existed yet.
index = mengine.create_index(
    embedding_gcs_uri=f"gs://{ME_EMBEDDING_DIR}/init_index",
    dimensions=ME_DIMENSIONS,
    index_update_method="streaming",
    index_algorithm="tree-ah",
)
# Print the resource name only when the helper returned an index object.
if index:
    print(index.name)

INFO:root:Index ecg-ai-416210-legal-chatbot-me-index does not exists. Creating index ...
INFO:root:Creating index with long running operation projects/500033913879/locations/europe-west1/indexes/1183584684882264064/operations/3123674601360457728
INFO:root:Poll the operation to create index ...
INFO:root:Index ecg-ai-416210-legal-chatbot-me-index created with resource name as projects/500033913879/locations/europe-west1/indexes/1183584684882264064


.projects/500033913879/locations/europe-west1/indexes/1183584684882264064


##### - deploy index to endpoint

In [None]:
# Deploy the index. Per the recorded log, the helper creates the endpoint when it
# does not exist and then deploys the index to it (a long-running operation).
index_endpoint = mengine.deploy_index()
# Summarize the endpoint only when the helper returned one.
# NOTE(review): in the recorded run the public domain name and the deployed-index
# list printed empty — the returned object may not reflect the finished deployment.
if index_endpoint:
    print(f"Index endpoint resource name: {index_endpoint.name}")
    print(
        f"Index endpoint public domain name: {index_endpoint.public_endpoint_domain_name}"
    )
    print("Deployed indexes on the index endpoint:")
    for d in index_endpoint.deployed_indexes:
        print(f"    {d.id}")

INFO:root:Index endpoint ecg-ai-416210-legal-chatbot-me-index-endpoint does not exists. Creating index endpoint...
INFO:root:Deploying index to endpoint with long running operation projects/500033913879/locations/europe-west1/indexEndpoints/5425227865958383616/operations/1970753096753610752
INFO:root:Poll the operation to create index endpoint ...
INFO:root:Index endpoint ecg-ai-416210-legal-chatbot-me-index-endpoint created with resource name as projects/500033913879/locations/europe-west1/indexEndpoints/5425227865958383616 and endpoint domain name as 
INFO:root:Deploying index with request = id: "ecg_ai_416210_legal_chatbot_me_index_20240327140725"
index: "projects/500033913879/locations/europe-west1/indexes/1183584684882264064"
display_name: "ecg_ai_416210_legal_chatbot_me_index_20240327140725"
dedicated_resources {
  machine_spec {
    machine_type: "e2-standard-2"
  }
  min_replica_count: 2
  max_replica_count: 10
}

INFO:root:Poll the operation to deploy index ...


......................

INFO:root:Deployed index ecg-ai-416210-legal-chatbot-me-index to endpoint ecg-ai-416210-legal-chatbot-me-index-endpoint


.Index endpoint resource name: projects/500033913879/locations/europe-west1/indexEndpoints/5425227865958383616
Index endpoint public domain name: 
Deployed indexes on the index endpoint:


https://console.cloud.google.com/vertex-ai/matching-engine/indexes?project=ecg-ai-416210

## - Configure Matching Engine as Vector Store

In [18]:
# Look up the resource names of the index and its endpoint; both are passed to
# the vector store below and saved to the parameters file.
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()
print(f"ME_INDEX_ID={ME_INDEX_ID}")
print(f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}")

ME_INDEX_ID=projects/500033913879/locations/europe-west1/indexes/1183584684882264064
ME_INDEX_ENDPOINT_ID=projects/500033913879/locations/europe-west1/indexEndpoints/5425227865958383616


In [19]:
# Gather the project and Matching Engine settings defined in this notebook so
# they can be saved to a single JSON file.
me_parameters = {
    "PROJECT_ID": PROJECT_ID,
    "LOCATION": LOCATION,
    "CHATBOT_NAME": CHATBOT_NAME,
    "ME_INDEX_ID": ME_INDEX_ID,
    "ME_INDEX_ENDPOINT_ID": ME_INDEX_ENDPOINT_ID,
    "ME_INDEX_NAME": ME_INDEX_NAME,
    "ME_EMBEDDING_DIR": ME_EMBEDDING_DIR,
    "ME_DIMENSIONS": ME_DIMENSIONS,
}

# Path of the JSON file the parameters are saved to
file_path = "me_parameters.json"

# Write the dictionary to the JSON file (pretty-printed with a 4-space indent)
with open(file_path, "w") as json_file:
    json.dump(me_parameters, json_file, indent=4)

# Confirm the write. The cell's recorded output shows this message, but the
# statement that printed it was missing from the code.
print(f"Parameters have been written to {file_path}")

Parameters have been written to me_parameters.json


##### - set embeddings

In [22]:
# Re-import the local `utils` package so edits to it are picked up without
# restarting the kernel.
# NOTE(review): importlib.reload re-executes only the module passed to it; submodules
# that were already imported (e.g. utils.matching_engine) are not reloaded by this
# call — confirm this is sufficient.
import importlib
import utils
importlib.reload(utils)

<module 'utils' from '/home/jupyter/chatbot_rag/src/utils/__init__.py'>

In [23]:
from utils.custom_vertexai_embeddings import CustomVertexAIEmbeddings

In [24]:
# Embeddings API integrated with langChain
# Rate and batch settings handed to the embeddings wrapper: requests per minute
# and number of instances per batch.
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)



##### - initialize vector store

In [25]:
# Build the vector store on top of the deployed index and endpoint.
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=LOCATION,
    # Splitting "gs://<bucket>" on "/" and taking element 2 yields the bare bucket name.
    gcs_bucket_name=f"gs://{ME_EMBEDDING_DIR}".split("/")[2],
    embedding=embeddings,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
)

##### - Add document splits as embeddings to the Matching Engine index

In [26]:
def _as_restricts(chunk):
    """Return one chunk's metadata as a list of namespace/allow_list entries."""
    meta = chunk.metadata
    return [
        {"namespace": "source", "allow_list": [meta["source"]]},
        {"namespace": "page", "allow_list": [str(meta["page"])]},
        {"namespace": "title", "allow_list": [str(meta["title"])]},
        {"namespace": "chunk", "allow_list": [str(meta["chunk"])]},
    ]


# Parallel lists: the raw text of every chunk and its metadata entries.
texts = [chunk.page_content for chunk in doc_splits]
metadatas = [_as_restricts(chunk) for chunk in doc_splits]

In [27]:
# Embed the chunks and add them to the index; the call's return value is kept as doc_ids.
# NOTE(review): re-running this cell presumably indexes the same chunks again — confirm.
doc_ids = me.add_texts(texts=texts, metadatas=metadatas)

Waiting
.............

INFO:root:Indexed 73 documents to Matching Engine.


##### - test similarity search

In [28]:
# Test whether search from vector store is working
# (French query because the indexed document is in French; k=4 asks for four results.)
me.similarity_search("moyens de réserver un séjour", k=4)

Waiting


[Document(page_content='.  \n \nModalités de Réservation  Il existe quatre moyens de réserver un séjour :  \n \n• -Par téléphone  : auprès de nos équipes de réservation et via les numéros suivants (coût d’un appel \nlocal) :  \n• pour Homair Vacances  au 04.84.39.08.60  \n• pour Tohapi au 04.48.20.20.20  ou +33 4 48 20 20 20 (depuis l’étranger)  \n \n• -Par Internet  : sur les Sites internet de la Société.  \n \n• -Par mail  : via les formulaire s de contact suivants  : \n• pour Homair Vacances  : https://contact.homair.com/hc/fr/requests/new  \n• pour Tohapi  :  reservations@tohapi.fr   \n• pour Marvilla Parks  : https://contact.marvilla -parks.com/hc/fr/requests/new   \n \n• Sur place  : exclusivement pour les Campings Homair Vacances , auprès de l’accueil du Camping.  \n \nLa Réservation d’un séjour s’effectue selon les étapes  suivantes :  \n \n1)  Le Client sélectionne le Camping de son choix selon son descriptif.  \n2) Le Client sélectionne la durée du séjour, la date de départ, 

# Set chatbot

##### - set template

In [29]:
from langchain.prompts import PromptTemplate
# Prompt for the answering step. It has two placeholders: {question} (used twice,
# before and after the context) and {context}, which is delimited by the "=" rulers.
template = """SYSTEM: You are an intelligent assistant helping the users with their questions on papers related to laws conditions and always cite the article that you're referring to.

Question: {question}

Strictly Use ONLY the following pieces of context to answer the question at the end. Think step-by-step and then answer.

Do not try to make up an answer:
 - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
 - If the context is empty, just say "I do not know the answer to that."

=============
{context}
=============

Question: {question}
Helpful Answer:"""

In [30]:
# Wrap the template string in a PromptTemplate; it is passed to the QA chain below.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

##### - set memory

In [31]:
# Conversation memory for the chain: stored under the "chat_history" key, with
# "answer" named as the output to record.
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

##### - set vertex matching engine as retriever

In [32]:
# Create chain to answer questions
# Retrieval settings: how many results to request and the distance threshold.
# NOTE(review): how "search_distance" is applied depends on the project's
# MatchingEngine wrapper — confirm whether it is an upper or lower bound.
NUMBER_OF_RESULTS = 5
SEARCH_DISTANCE_THRESHOLD = 0.6

# Expose index to the retriever
retriever = me.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": NUMBER_OF_RESULTS,
        "search_distance": SEARCH_DISTANCE_THRESHOLD,
    },
    filters=None,
)

##### - set llm

In [38]:
from langchain.llms import VertexAI

In [40]:
# Text model used by the QA chain, with its output-length and sampling settings.
llm = VertexAI(
    model_name="text-bison@002",
    max_output_tokens=1024,
    temperature=0.2,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

##### - chatbot

In [34]:
from langchain.chains import ConversationalRetrievalChain

In [44]:
# Assemble the chatbot: the LLM, the Matching Engine retriever, the conversation
# memory and the custom prompt for the answering step.
qa = ConversationalRetrievalChain.from_llm(
        llm=llm, 
        retriever=retriever,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": QA_CHAIN_PROMPT},
        return_source_documents=True,
        return_generated_question=True #to solve error
    )

##### - test

In [None]:
# Ask a sample question and display only the "answer" entry of the chain's result.
qa({"question": "What are the methods to reserve a stay?"})['answer']