In [7]:
import numpy as np
import json

In [None]:
# Set credentials
# gcloud auth application-default login

# Vertex parameters

In [8]:
from utils.doc_to_vertex_search import get_me_parameters
file_path = "me_parameters.json"
parameters =  get_me_parameters(file_path)

In [9]:
PROJECT_ID = parameters['PROJECT_ID']
LOCATION = parameters['LOCATION']
CHATBOT_NAME = parameters['CHATBOT_NAME']
ME_INDEX_ID = parameters['ME_INDEX_ID']
ME_INDEX_ENDPOINT_ID = parameters['ME_INDEX_ENDPOINT_ID']
ME_INDEX_NAME = parameters['ME_INDEX_NAME']
ME_EMBEDDING_DIR = parameters['ME_EMBEDDING_DIR']
ME_DIMENSIONS = parameters['ME_DIMENSIONS']

# Initiate Embeddings, vector store and retriever

In [10]:
# initiate embeddings
from utils.custom_vertexai_embeddings import CustomVertexAIEmbeddings
# Embeddings API integrated with langChain
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


In [11]:
# initiate vector store
from utils.matching_engine import MatchingEngine
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=LOCATION,
    gcs_bucket_name=f"gs://{ME_EMBEDDING_DIR}".split("/")[2],
    embedding=embeddings,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
    # credentials_path=credentials_path
)

In [None]:
# Test whether search from vector store is working
# me.similarity_search("moyens de réserver un séjour", k=4)

In [12]:
# Expose index to the retriever
NUMBER_OF_RESULTS = 2
SEARCH_DISTANCE_THRESHOLD = 0.6

retriever = me.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": NUMBER_OF_RESULTS,
        "search_distance": SEARCH_DISTANCE_THRESHOLD,
    },
    filters=None,
)

In [None]:
# qn = "What're the methods to reserve a stay?"

In [None]:
# retriever.get_relevant_documents(qn)

# Initiate llm

## local llm

In [4]:
from langchain.llms import HuggingFacePipeline

In [1]:
#ref: https://medium.com/aimonks/exploring-offline-rag-with-langchain-zephyr-7b-beta-and-decilm-7b-c0626e09ee1f
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig, BitsAndBytesConfig
from transformers import TextStreamer

torch_dtype = torch.float16
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(device)

model_id = "HuggingFaceH4/zephyr-7b-beta" #"google/gemma-2b-it"

#download model
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id,
#                                                 torch_dtype=torch_dtype, 
#                                                 low_cpu_mem_usage=True,
#                                                 #quantization_config= quantization_config
#                                                 )
# model.save_pretrained('zephyr-7b-beta-model', max_shard_size="1000MB")
# tokenizer.save_pretrained('zephyr-7b-beta-tokenizer')
# del model
# del tokenizer
# torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm
2024-04-02 15:46:41.727197: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards: 100%|██████████| 8/8 [01:12<00:00,  9.00s/it]


In [2]:
# load it from the local folder:
tokenizer = AutoTokenizer.from_pretrained("zephyr-7b-beta-tokenizer",torch_dtype=torch_dtype)
model = AutoModelForCausalLM.from_pretrained("zephyr-7b-beta-model", low_cpu_mem_usage=True, torch_dtype=torch_dtype,quantization_config= quantization_config)
streamer = TextStreamer(tokenizer, skip_prompt=True)

Loading checkpoint shards: 100%|██████████| 15/15 [01:01<00:00,  4.12s/it]


In [5]:
# ref: https://github.com/langchain-ai/langchain/issues/2918
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1026,
    do_sample= True,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.2,
    streamer=streamer
)

gpu_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
type(gpu_llm)

In [6]:
!nvidia-smi

Tue Apr  2 15:55:22 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    35W /  70W |   9220MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            On   | 00000000:00:05.0 Off |                    0 |
| N/A   76C    P0    33W /  70W |   8554MiB / 15360MiB |      0%      Default |
|       

# Initiate template

In [None]:
# gemme-2b-it template https://medium.com/@mohammed97ashraf/building-a-retrieval-augmented-generation-rag-model-with-gemma-and-langchain-a-step-by-step-f917fc6f753f

In [13]:
from langchain.prompts import PromptTemplate
template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question, in its original English.
                        Chat History:
                        {chat_history}
                        Follow-Up Input: {question}
                        Standalone question:"""
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(template)

# Initiate memory

In [14]:
# Set memory
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key=f"chat_history",
    return_messages=True,
    output_key='answer'
)

# Set chatbot

In [15]:
from langchain.chains import ConversationalRetrievalChain

In [16]:
from langchain.prompts import PromptTemplate
template = """SYSTEM: You are an intelligent assistant helping the users with their questions on papers related to laws conditions and always cite the article that you're referring to.
Please alway reply at the same language that question uses.

Question: {question}

Strictly Use ONLY the following pieces of context to answer the question at the end. Think step-by-step and then answer.

Do not try to make up an answer:
 - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
 - If the context is empty, just say "I do not know the answer to that."

=============
{context}
=============

Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [17]:
qa = ConversationalRetrievalChain.from_llm(
        llm=gpu_llm, 
        retriever=retriever,
        chain_type="stuff",
        memory=memory,
        combine_docs_chain_kwargs={"prompt": QA_CHAIN_PROMPT},
        return_source_documents=True,
        return_generated_question=True #to solve error
    )

# Test

In [None]:
# reference to get helpful answer
# https://huggingface.co/spaces/cvachet/pdf-chatbot/blob/main/app.py


In [18]:
memory.clear()

In [19]:
message = "What are the methods to reserve a stay?"

In [20]:
response = qa.invoke({"question": message})

Waiting
There are four ways to reserve a stay: by phone through our reservation teams using the numbers provided below (local call charges), for Homair Vacances at 04.84.39.08.60 and for Tohapi at 04.48.20.20.20 or +33 4 48 20 20 20 when calling from abroad; through the company's websites; by email using the following addresses: for Homair Vacances at https://contact.homair.com/hc/fr/requests/new, for Tohapi at reservations@tohapi.fr and for Marvilla Parks at https://contact.marvilla -parks.com/hc/fr/requests/new; directly at the campsite reception desk for Homair Vacances campsites only. The reservation process follows these steps: first, the customer selects the desired campsite based on its description; second, the customer specifies the duration of the stay, the departure date, the number of occupants and the type of accommodation or pitch; third, the customer enters any benefits received, providing relevant codes where applicable (partner code or promotional code); fourth, the cus

In [21]:
!nvidia-smi

Tue Apr  2 15:57:44 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   9220MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            On   | 00000000:00:05.0 Off |                    0 |
| N/A   77C    P0    34W /  70W |  13894MiB / 15360MiB |      0%      Default |
|       

In [None]:
response = qa.invoke({"question": "Can you give more details?"})

In [None]:
!nvidia-smi

In [None]:
# response = qa({"question": "I mean details about how to reserve a stay?"})