https://github.com/insightbuilder/python_de_learners_data/blob/main/code_script_notebooks/projects/exploring_bard/selfQueryingRetriever_QAChains.ipynb

In [1]:
import torch 
import time
import transformers # HF import
from langchain import HuggingFacePipeline # To build the HF pipeline using Llama-2
from langchain import PromptTemplate,  LLMChain # To create PromptTemplate and LLMChain
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM , AutoModel  # For creating the model and tokenizer


In [2]:
model_name = 'meta-llama/Llama-2-7b-chat-hf' # Model path for Llama-2 finetuned chat model

# Not used further in this cell: device placement is delegated to `device_map='auto'` below.
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True
)

# Loaded for inspection only; it is not passed to `from_pretrained` below.
model_config = transformers.AutoConfig.from_pretrained(
    model_name,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
)

# Inference only: switch off training-mode layers such as dropout.
model.eval()

# The model is already instantiated, quantized and placed on its devices, so
# `torch_dtype` / `device_map` are not repeated here.
pipe = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    # top_k=1 restricts sampling to the single most likely token, so the
    # output is effectively greedy even though do_sample is enabled.
    do_sample=True,
    top_k=1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2,
)

# Generation settings live on `pipe`. The former model_kwargs={'temperature': 0.7}
# was not applied to an already-built pipeline (and temperature cannot change
# the result with top_k=1), so it is dropped to avoid a misleading setting.
llm = HuggingFacePipeline(pipeline=pipe)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Document Preparation

In [3]:
#Loading the documents from langchain resources folder

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdf(path_pdf, chunk_size=350, chunk_overlap=20):
  """Load a PDF and split its pages into overlapping text chunks.

  Args:
    path_pdf: Path of the PDF file, handed to ``PyPDFLoader``.
    chunk_size: Maximum chunk length, measured with ``len``.
    chunk_overlap: Overlap between neighbouring chunks, measured with ``len``.

  Returns:
    The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``
    for the loaded pages.
  """
  pages = PyPDFLoader(path_pdf).load()

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            length_function=len)

  return splitter.split_documents(pages)


In [5]:

# Smoke-test load_pdf on a single file before processing every PDF in the folder
agent_pg = load_pdf("./Agents.pdf")


In [6]:
# Show the first chunk to inspect its page_content and metadata
agent_pg[0]


Document(page_content='5/31/23, 6:16 AM Agents — \x00\x00 LangChain 0.0.186\nhttps://python.langchain.com/en/stable/modules/agents.html 1/3Agents\nContents\nAction Agents\nPlan-and-Execute Agents\nConceptual Guide\nSome applications will require not just a predetermined chain of calls to LLMs/other tools, but', metadata={'source': './Agents.pdf', 'page': 0})

In [7]:
import glob

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the order of chunks in all_docs is the same on every run.
file_list = sorted(glob.glob("./*.pdf"))

# Chunk every PDF in the working directory into one flat list.
all_docs = []
for pdf_path in file_list:
  all_docs.extend(load_pdf(pdf_path))


In [8]:
# Total number of chunks collected across all PDFs
len(all_docs)


2822

#### Creation of Embeddings

We use an open-source sentence-transformers model (`all-MiniLM-L6-v2`) to create the embeddings for the document chunks.

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed the document chunks for the vector store
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


#### Vector Store

In [10]:
from langchain.vectorstores import Chroma

# Embed every chunk and index it in a Chroma vector store.
# Only the documents and the embedding function are passed; no database path
# (persist_directory) is given in this call.

db = Chroma.from_documents(all_docs,
                           embedding=embeddings)


In [11]:
# Plain vector-store retriever (default settings) with a sample query, as a
# baseline before the self-querying retriever is introduced
db_retriever = db.as_retriever()
db_retriever.get_relevant_documents("langchain concepts")


[Document(page_content='It creates a vibrant and thriving ecosystem.\nIntegrations : Guides for how other products can be used with LangChain.\nDependents : List of repositories that use LangChain.\nSkip to main content\x00\x00\nCTRL  + K', metadata={'page': 2, 'source': './WelcometoLangChain.pdf'}),
 Document(page_content='LangChainHub : The LangChainHub is a place to share and explore other prompts, chains,\nand agents.\nGallery : A collection of great projects that use Langchain, compiled by the folks at\nKyrolabs . Useful for finding inspiration and example implementations.\nTracing : A guide on using tracing in LangChain to visualize the execution of chains and', metadata={'page': 3, 'source': './WelcometoLangChain.pdf'}),
 Document(page_content='5/31/23, 6:10 AM Welcome to LangChain — \x00\x00 LangChain 0.0.186\nhttps://python.langchain.com/en/stable/ 1/4Welcome to LangChain\nContents\nGetting Started\nModules\nUse Cases\nReference Docs\nEcosystem\nAdditional Resources\nLangChain

#### Metadata Field Info

In [12]:
from langchain.chains.query_constructor.base import AttributeInfo

# Short description of what the indexed documents contain.
document_content_description = "Text documents from Langchain help and concept documentation"

# Metadata attributes described to the self-query retriever; the names match
# the 'source' and 'page' keys carried in each chunk's metadata.
source_field = AttributeInfo(
    name="source",
    type="string",
    description="Filename and location of the source file",
)
page_field = AttributeInfo(
    name="page",
    type="integer",
    description="Page number on which the document is found",
)
metadata_field_info = [source_field, page_field]


#### Creating a Retrieval QA Chain using LLM (llama-2)

In [13]:
# Import the self-querying retriever that the Q/A chain will use

from langchain.retrievers.self_query.base import SelfQueryRetriever


In [14]:
# Self-querying retriever: the LLM turns the question into a query plus a
# metadata filter that is run against the Chroma store.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

retriever.get_relevant_documents("What are some concepts of Agents")


OutputParserException: Parsing text
```json
{
    "query": "agents",
    "filter": "and(gt(page, 5), contains(source, \\"Agents\\"))"
}
```
 raised following error:
Got invalid JSON object. Error: Expecting ',' delimiter: line 3 column 53 (char 77)

In [15]:
# Rebuild the self-querying retriever with enable_limit=True so a count in the
# question ("3 concepts") can be emitted as a "limit" in the structured query.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    enable_limit=True,
    verbose=True,
)

retriever.get_relevant_documents("Explain 3 concepts of Chains")


OutputParserException: Parsing text
```json
{
    "query": "concepts chains",
    "filter": "and(gt(page, 5), contains(source, 'chains'))",
    "limit": 3
}
```
 raised following error:
Unexpected token Token('COMMA', ',') at line 1, column 12.
Expected one of: 
	* LPAR
Previous tokens: [Token('CNAME', 'page')]


In [None]:
# Another query that contains a count ("2"), sent to the current `retriever`
retriever.get_relevant_documents("Give 2 example of autonomous agent")


In [16]:
from langchain.chains import RetrievalQAWithSourcesChain


In [17]:
# Question-answering chain that answers from the documents returned by the
# self-querying retriever and reports their sources.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
)

# Use invoke() rather than calling the chain object directly: direct calls are
# deprecated in recent LangChain releases.
chain.invoke({"question": "Give 2 types of agents"})


  warn_deprecated(


{'question': 'Give 2 types of agents',
 'answer': ' There are two types of agents:\n\n1. LLM-augmented autonomous agents\n2. AgentBench\n\n',
 'sources': ''}

In [18]:
# invoke() replaces the deprecated direct call; return_only_outputs=False keeps
# the input question in the returned dict alongside the outputs.
chain.invoke({"question": "How to combine LLMs"},
             return_only_outputs=False)


OutputParserException: Parsing text
```json
{
    "query": "LLM combination",
    "filter": "and(gt(page, 5), lt(page, 10))",
    "limit": 2
}
```
 raised following error:
Unexpected token Token('COMMA', ',') at line 1, column 12.
Expected one of: 
	* LPAR
Previous tokens: [Token('CNAME', 'page')]


In [20]:
# invoke() replaces the deprecated direct call; return_only_outputs=False keeps
# the input question in the returned dict alongside the outputs.
chain.invoke({"question": "Provide 2 examples of combining LLMs"},
             return_only_outputs=False)


OutputParserException: Parsing text
```json
{
    "query": "combine llms",
    "filter": "and(or(gt(page, 5), lt(page, 10)), gt(source, \\".*LLM.*\\"))"
}
```
 raised following error:
Got invalid JSON object. Error: Expecting ',' delimiter: line 3 column 65 (char 95)

In [21]:
# invoke() replaces the deprecated direct call; return_only_outputs=False keeps
# the input question in the returned dict alongside the outputs.
chain.invoke({"question": "4 Concepts in langchain"},
             return_only_outputs=False)


OutputParserException: Parsing text
```json
{
    "query": "concept",
    "filter": "and(gt(page, 1), lt(page, 5))",
    "limit": 4
}
```
 raised following error:
Unexpected token Token('COMMA', ',') at line 1, column 12.
Expected one of: 
	* LPAR
Previous tokens: [Token('CNAME', 'page')]


In [None]:
# invoke() replaces the deprecated direct call; return_only_outputs=False keeps
# the input question in the returned dict alongside the outputs.
chain.invoke({"question": "Explain consistency in langchain."},
             return_only_outputs=False)
