In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


import time

# from langchain.chat_models import ChatOpenAI
from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# # save to disk
# db2 = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")
# docs = db2.similarity_search(query)

In [3]:
# DOCUMETNS_DB_DIR = ''

In [4]:

class DocSearchWrapper:
    def __init__(self):
        # db = Chroma(
        #     persist_directory=DOCUMETNS_DB_DIR,
        #     # embedding_function=OpenAIEmbeddings(),
        #     embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
        # )
        # load the document and split it into chunks
        loader = PyPDFLoader("Q.pdf")
        documents = loader.load()
        
        # split it into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            # Set a really small chunk size, just to show.
            chunk_size=100,
            chunk_overlap=20,
        )
        docs = text_splitter.split_documents(documents)
        
        # create the open-source embedding function
        embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        
        # load it into Chroma
        db = Chroma.from_documents(docs, embedding_function)
        

        retriever = db.as_retriever(search_kwargs={"k": 3})

        model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"
        #model_basename = "gptq_model-4bit-128g"

        use_triton = False

        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

        model = AutoGPTQForCausalLM.from_quantized(
            model_name_or_path,
            use_safetensors=True,
            trust_remote_code=True,
            device="cuda:0",
            use_triton=use_triton,
            quantize_config=None,
        )

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=4096,
            temperature=0.8,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        self.llm = HuggingFacePipeline(pipeline=pipe)

        # Interactive questions and answers
        self.CRChain = ConversationalRetrievalChain.from_llm(
            # llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            # condense_question_llm=self.llm
            # condense_question_llm=ChatOpenAI(),
        )

        self.chat_history = []

    def getdb(self):
        return self.db

    def search_docbase(self, query):
        result = self.CRChain({"question": query, "chat_history": self.chat_history})

        self.chat_history.append((query, result["answer"]))

        return result

    def clear_history(self):
        self.chat_history = []

In [6]:

if __name__ == "__main__":
    doc_search = DocSearchWrapper()

    while True:
        query = input("\nEnter a query: ")
        if query == "exit":
            break
        if query == "clear":
            doc_search.clear_history()
            continue
        if query.strip() == "":
            continue

        # Get the answer from the chain
        start = time.time()
        res = doc_search.search_docbase(query)
        print(res)

        answer, docs = res["answer"], res["source_documents"]
        end = time.time()

        # Print the result
        print("\n\n> Question:")
        print(query)
        print(f"\n> Answer (took {round(end - start, 2)} s.):")
        print(answer)

        # Print the relevant sources used for the answer
        print("Sources:\n")
        for document in docs:
            # print("> " + document.metadata["source"] + f": page({document.metadata['page']})")
            print("> " + document.metadata["source"])

INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
Some weights of the model checkpoint at /home/shehroz/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GPTQ/snapshots/ea078917a7e91c896787c73dba935f032ae658e9/model.safetensors were not used when initializing LlamaForCausalLM: {'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.37.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.38.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.


Enter a query:  What is your name?


  warn_deprecated(


{'question': 'What is your name?', 'chat_history': [('What is your name?', "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n3\n\n5\n\nInc., 2015.\n\nQuestion: What is your name?\nHelpful Answer: My name is John Smith.")], 'answer': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n3\n\n5\n\nInc., 2015.\n\nQuestion: What is your name?\nHelpful Answer: My name is John Smith.", 'source_documents': [Document(page_content='3', metadata={'source': 'Q.pdf', 'page': 2}), Document(page_content='5', metadata={'source': 'Q.pdf', 'page': 4}), Document(page_content='Inc., 2015.', metadata={'source': 'Q.pdf', 'page': 11})]}


> Question:
What is your name?

> Answer (took 0.33 s.):
Use the following pieces of context to answer the question at the end. If you don't know


Enter a query:  What is Attention Is All You Need?




{'question': 'What is Attention Is All You Need?', 'chat_history': [('What is your name?', "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n3\n\n5\n\nInc., 2015.\n\nQuestion: What is your name?\nHelpful Answer: My name is John Smith."), ('What is Attention Is All You Need?', 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nself-attention and started\n\nattention is\n\nof self-attention we\n\nQuestion: Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n\nHuman: What is your name?\nAssistant: Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make 


Enter a query:  How many layers is the encoder stacked of?




{'question': 'How many layers is the encoder stacked of?', 'chat_history': [('What is your name?', "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n3\n\n5\n\nInc., 2015.\n\nQuestion: What is your name?\nHelpful Answer: My name is John Smith."), ('What is Attention Is All You Need?', 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nself-attention and started\n\nattention is\n\nof self-attention we\n\nQuestion: Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n\nHuman: What is your name?\nAssistant: Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try 


Enter a query:  What is the particular attention called?




{'question': 'What is the particular attention called?', 'chat_history': [('What is your name?', "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n3\n\n5\n\nInc., 2015.\n\nQuestion: What is your name?\nHelpful Answer: My name is John Smith."), ('What is Attention Is All You Need?', 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nself-attention and started\n\nattention is\n\nof self-attention we\n\nQuestion: Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n\nHuman: What is your name?\nAssistant: Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to


Enter a query:  How many parallel attention layers are employed?




{'question': 'How many parallel attention layers are employed?', 'chat_history': [('What is your name?', "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n3\n\n5\n\nInc., 2015.\n\nQuestion: What is your name?\nHelpful Answer: My name is John Smith."), ('What is Attention Is All You Need?', 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nself-attention and started\n\nattention is\n\nof self-attention we\n\nQuestion: Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n\nHuman: What is your name?\nAssistant: Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'

KeyboardInterrupt: Interrupted by user


Enter a query:  clear


In [None]:
doc_search = DocSearchWrapper()


In [None]:
print(doc_search.getdb())