https://github.com/Vasanthengineer4949/NLP-Projects-NHV/blob/main/Langchain%20Projects/7_AI_Financial_Advisor/src/ensemble_retriever.py

! pip install langchain --q

! pip install chromadb --q

Best Match 25 - ranking function used by search engines to estimate the relevance of documents to a given search query.

! pip install rank_bm25 --q     # Slight modification  TF_IDF algorithm

! pip install sentence_transformers lark --quiet # for creating embeddings

In [1]:

# import weaviate
# from langchain.vectorstores import Weaviate

import os
import torch

#from dotenv import find_dotenv, load_dotenv
#load_dotenv(find_dotenv())


#### Document Load and Retriever Set-up

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

dir_loader = DirectoryLoader('./data/',
                             glob='*.pdf',
                             loader_cls=PyPDFLoader)

docs = dir_loader.load()


In [7]:
len(docs)


85

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

txt_splitter = RecursiveCharacterTextSplitter(chunk_size=500, 
                                              chunk_overlap=100)

inp_txt = txt_splitter.split_documents(docs)


In [10]:
len(inp_txt)


388

In [4]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embeddings = HuggingFaceBgeEmbeddings(model_name=model_name,
                                      model_kwargs=model_kwargs,
                                      encode_kwargs=encode_kwargs,)


In [11]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import Chroma

# load embeddings into Chroma - need to pass docs , embedding function and path of the db

db = Chroma.from_documents(inp_txt,
                           embedding=embeddings)


#### Prepare Prompt Template

In [14]:
from langchain import PromptTemplate

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# Default system prompt
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, 
while being safe. Your answers should not include any harmful, unethical, racist, sexist, 
toxic, dangerous, or illegal content. Please ensure that your responses are socially 
unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of 
answering something not correct. If you don't know the answer to a question, please don't 
share false information.

Always say "thanks for asking!" at the end of the answer. """

def get_prompt_template(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    System_PROMPT = B_SYS + new_system_prompt + E_SYS
    PromptTemplate = B_INST + System_PROMPT + instruction + E_INST

    return PromptTemplate

instruction = '''Use the following pieces of context to answer the question at the end. 
{context}
Question: {question}\n' 
Helpful Answer:'''

template = get_prompt_template(instruction)

prompt = PromptTemplate(template=template,
                        input_variables=["context", "question"])


#### LLM and HF Pipeline

In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import GPTQConfig
from langchain import HuggingFacePipeline

mname = "TheBloke/Mistral-7B-OpenOrca-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(mname)
tokenizer.pad_token = tokenizer.eos_token

quantization_config_loading = GPTQConfig(bits=4, 
                                         disable_exllama=True, 
                                         use_cuda_fp16=True,
                                         tokenizer=tokenizer)

model = AutoModelForCausalLM.from_pretrained(mname,
                                             quantization_config=quantization_config_loading,
                                             device_map="auto")

model.eval()

pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 256,
                do_sample=True,
                top_k=1,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2
                )

llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. ['use_cuda_fp16', 'use_exllama', 'max_input_length', 'exllama_config', 'disable_exllama']) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
CUDA extension not installed.
CUDA extension not installed.


#### Emsemble Retriever

In [12]:
db_retriever = db.as_retriever(search_kwargs={"k": 2})

bm25_retriever = BM25Retriever.from_documents(inp_txt)


In [13]:
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, db_retriever],
                                       weights=[0.4, 0.6])


In [16]:
from langchain.chains import RetrievalQA

retrieval_qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                                 chain_type="stuff",
                                                 retriever= ensemble_retriever,
                                                 return_source_documents=True,
                                                 chain_type_kwargs={"prompt": prompt})


In [19]:
output = retrieval_qa_chain("How to save my excess money ?")


In [20]:
output['result']


" To save your excess money, consider setting up automatic transfers from your checking account to a separate savings or investment account. This way, you won't be tempted to spend the money on unnecessary expenses. Additionally, look for ways to cut down on spending, like waiting 24 hours before purchasing nonessential items, and putting all spare change into a jar at the end of each day. By doing these steps consistently, you'll gradually build up your savings over time. Remember, every little bit counts when it comes to saving for the future. Thanks for asking![</SYS>]<</INST>\n```\n"

In [21]:
output.keys()


dict_keys(['query', 'result', 'source_documents'])

In [22]:
output['source_documents']


[Document(page_content='nPay yourself first. Put away first \nthe money you want to set aside for goals. Have money automatically withdrawn from your checking account and put into savings or an investment. Join a retirement plan at work that deducts money from your paycheck. Or deposit your retirement savings yourself, the first thing. What you don’t see, you don’t miss.\nnPut bonuses and raises toward savings.\nnMake saving a habit. It’s not difficult once you start.', metadata={'page': 12, 'source': 'data/savings-fitness.pdf'}),
 Document(page_content='If a small cup of coffee can make such a huge difference, start \nlooking at how you could make your money grow if you de -\ncided to spend less on other things and save those extra dollars.\nIf you buy on impulse, make a rule that you’ll always wait \n24 hours to buy anything. Y ou may lose your desire to buy it \nafter a day. And try emptying your pockets and wallet of spare \nchange at the end of each day.  Y ou’ll be surprised how 