In [None]:
!pip install langchain==0.0.263
!pip install llama-index==0.8.1.post1
!pip install huggingface-hub==0.16.4
!pip install requests==2.31.0
!pip install pypdf==3.15.1

# Vector store and embedding dependencies

In [None]:
!pip install qdrant-client==1.4.0

In [None]:
!pip install sentence-transformers==2.2.2

In [None]:
#Import necessary stuff
import json
import os
from typing import Sequence

import qdrant_client
import requests
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document as LangDoc
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from llama_index import LLMPredictor, get_response_synthesizer, Prompt
from llama_index import LangchainEmbedding, ServiceContext
from llama_index import set_global_service_context
from llama_index.indices.vector_store import VectorIndexRetriever
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.readers.file.base import SimpleDirectoryReader
from llama_index.readers.schema.base import Document as LIDoc
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.vector_stores.types import VectorStoreQueryMode

In [None]:
# Load the PDF to index; SimpleDirectoryReader yields llama-index documents.
documents = SimpleDirectoryReader(input_files=["./TestDocument.pdf"],
                                  recursive=True).load_data()

# Name of the Qdrant collection used for both persisting and querying.
collection_table = "test01"

# Stored on every document under metadata["custom_meta"]; "file_id" is the key
# used to detect that embeddings for a file were already persisted.
custom_meta = {"file_id": "unique-file-id",
               "display_file_name": "display_file_name",
               "display_folder_name": "display_folder_name",
               }

# Credentials are taken from the environment when present so that real secrets
# never have to be written into the notebook. The placeholders are only
# fallbacks and must be replaced (or the variables exported) before running.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "<hugging-face-token>")
qdrant_uri = os.environ.get("QDRANT_URI", "<qdrant-uri>")
qdrant_api_key = os.environ.get("QDRANT_API_KEY", "<qdrant_api_key>")

# Hugging Face Hub repo id of the model used to generate answers.
model_name = "declare-lab/flan-alpaca-base"

temperature = 0
# Passed as `cache_folder` when the embedding model is created.
local_storage_directory = None
llm = HuggingFaceHub(repo_id=model_name,
                     model_kwargs={"temperature": temperature,
                                   "max_length": 512})

In [None]:
# Embedding model used to vectorise document chunks and queries.
# NOTE(review): `model_name` is the same repo id that is passed to
# HuggingFaceHub as the answer-generating LLM — confirm it is really the
# intended embedding model rather than a dedicated sentence-embedding model.
embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                   cache_folder=local_storage_directory
                                   )

In [None]:
# Headers for direct REST calls to Qdrant.
# NOTE(review): no other cell shown in this notebook reads `api_headers`;
# get_collection_via_filter builds its own identical dict — confirm whether
# this variable is still needed.
api_headers = {
    'api-key': qdrant_api_key,
    'Content-Type': 'application/json'
}

# Shared Qdrant client handed to QdrantVectorStore when persisting and querying.
q_client = qdrant_client.QdrantClient(
    qdrant_uri,
    api_key=qdrant_api_key  # For Qdrant Cloud, None for local instance
)

In [None]:
# Wrap the LangChain LLM and embeddings in their llama-index adapters.
llm_predictor = LLMPredictor(llm=llm)
embed_model = LangchainEmbedding(embeddings)
# NOTE(review): two chunk sizes are supplied here (chunk_size=512 and a
# TokenTextSplitter with chunk_size=300) — confirm which one llama-index
# actually applies when an explicit node_parser is given.
service_context = ServiceContext.from_defaults(embed_model=embed_model,
                                               llm_predictor=llm_predictor,
                                               chunk_size=512,
                                               node_parser=SimpleNodeParser(
                                                   text_splitter=TokenTextSplitter(chunk_size=300,
                                                                                   chunk_overlap=20)))
# Registered globally; get_query_engine builds its index without passing a
# service context, so it presumably relies on this default.
set_global_service_context(service_context)

In [None]:
def split_docs(documents, chunk_size=1024, chunk_overlap=20, use_tokenizer="tiktoken"):
    """
    Split documents into smaller chunks and return them in their original format.

    The input may be a list of LangChain documents or llama-index documents
    (the type of the first element decides). llama-index documents are
    converted to LangChain format for splitting and converted back afterwards.
    Newlines and tabs in each chunk are replaced by spaces and carriage
    returns are removed.

    @param documents: list of LangChain or llama-index documents
    @param chunk_size: maximum chunk size handed to the text splitter
    @param chunk_overlap: overlap between consecutive chunks
    @param use_tokenizer: "tiktoken" selects a tiktoken-based
        CharacterTextSplitter (cl100k_base); any other value selects
        RecursiveCharacterTextSplitter
    @return: the list of chunked documents; the unmodified input if it is
        empty/None or if splitting fails (the error is printed)
    """
    # Nothing to split: return the input as-is instead of failing on
    # documents[0] and printing a misleading error.
    if not documents:
        return documents

    ret_list = documents
    try:
        if use_tokenizer == "tiktoken":
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(encoding_name="cl100k_base",
                                                                        chunk_size=chunk_size,
                                                                        chunk_overlap=chunk_overlap)
        else:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        # The first element decides the format of the whole list.
        input_is_langchain = isinstance(documents[0], LangDoc)
        if input_is_langchain:
            langchain_documents = documents
        else:
            langchain_documents = [d.to_langchain_format() for d in documents]

        chunks = text_splitter.split_documents(langchain_documents)

        # Flatten whitespace: '\n' and '\t' become spaces, '\r' is dropped.
        whitespace_table = str.maketrans({'\n': ' ', '\r': None, '\t': ' '})
        for chunk in chunks:
            chunk.page_content = chunk.page_content.translate(whitespace_table)

        if input_is_langchain:
            ret_list = chunks
        else:
            ret_list = [LIDoc.from_langchain_format(chunk) for chunk in chunks]
    except Exception as ex:
        # Best effort: report the problem and fall back to the unsplit input.
        err_template = "{0} {1!r}"
        err_message = err_template.format(ex.__class__.__name__, ex.args)
        print(err_message)
    return ret_list


def ensure_documents_format(documents, ensure_format):
    """
    Return `documents` converted to the requested document flavour.

    LangChain Document: doc.page_content, extra_info
    llama-index: doc.text, metadata

    `documents` must be a list and `ensure_format` must be "llama-index" or
    "langchain". Only the first element is inspected to decide whether a
    conversion is needed. If the conversion fails, the error is printed and
    the original list is returned.
    """
    if not isinstance(documents, list):
        raise Exception('DOCUMENTS_MUST_BE_TYPE_LIST {}'.format(type(documents)))
    assert ensure_format in ["llama-index", "langchain"]

    converted = documents
    try:
        if not documents:
            print("No documents passed to ensure_documents_format({})".format(ensure_format))
        elif ensure_format == "llama-index":
            first_is_llama = isinstance(documents[0], LIDoc)
            if not first_is_llama:
                converted = [LIDoc.from_langchain_format(item) for item in documents]
        elif ensure_format == "langchain":
            first_is_lang = isinstance(documents[0], LangDoc)
            if not first_is_lang:
                converted = [item.to_langchain_format() for item in documents]
    except Exception as ex:
        # Conversion is best effort; report and hand back the input list.
        print("{0} {1!r}".format(ex.__class__.__name__, ex.args))
    return converted


def embeddings_for_file_id_exist(collection_table, file_id) -> bool:
    """
    Tell whether the collection already holds at least one point for `file_id`.

    The lookup filters on the payload key "custom_meta.file_id" through
    get_collection_via_filter.

    @param collection_table: collection name
    @param file_id: value to match against custom_meta.file_id
    @return: True when the response status is "ok" and at least one point was
        returned; False otherwise. A response with an unexpected shape is
        reported on stdout and treated as "no embeddings".
    """
    key_filter_dict = {"key": "custom_meta.file_id",
                       "match": {"any": [file_id]}}
    req_resp = get_collection_via_filter(collection_table, key_filter_dict=key_filter_dict)
    try:
        return req_resp["status"] == "ok" and len(req_resp["result"]["points"]) > 0
    except Exception as ex:
        # Report malformed responses the same way the other helpers do,
        # instead of hiding them.
        err_template = "{0} {1!r}"
        err_message = err_template.format(ex.__class__.__name__, ex.args)
        print(err_message)
        return False

In [None]:
def get_collection_via_filter(collection_table, key_filter_dict, timeout: float = 30.0) -> dict:
    """
    Scroll a Qdrant collection for points matching a single payload filter.

    Sends POST /collections/{collection_name}/points/scroll to `qdrant_uri`,
    authenticated with `qdrant_api_key`. Only one point is requested
    ("limit": 1).

    @param collection_table: collection name
    @param key_filter_dict: the dictionary that contains the key to search on and the match
    Example:
    key_filter_dict = {"key":"custom_meta.file_id",
    "match" :{
                        "any": [
                            "64b84ee3eb4bba01ab8588a4"

                        ]
                    }
                    }
    @param timeout: seconds to wait for the server before giving up; without
        it a stalled connection would block forever
    @return: the decoded JSON body of the response. The HTTP status code is
        not checked here; callers inspect the "status" field of the body.
    @raise: whatever `requests.post` raises on connection problems or timeout,
        and whatever `.json()` raises if the body is not valid JSON
    """
    # Tolerate a trailing slash on the base URI so the path has no "//".
    base_uri = qdrant_uri.rstrip("/")
    endpoint_url = "{}/collections/{}/points/scroll".format(base_uri, collection_table)
    req_payload = {
        "limit": 1,
        "filter": {
            "must": [
                {
                    "key": key_filter_dict["key"],
                    "match": key_filter_dict["match"]
                }
            ]
        }
    }
    headers = {
        'api-key': qdrant_api_key,
        'Content-Type': 'application/json'
    }

    response = requests.post(endpoint_url, json=req_payload, headers=headers, timeout=timeout)
    return response.json()

In [None]:
def persist_embeddings(collection_table: str, doc_path: str = None, documents: Sequence = None,
                       custom_meta: dict = None) -> int:
    """
    Embed documents and persist the vectors in a Qdrant collection.

    If `doc_path` is given, the documents are loaded from that path with
    UnstructuredFileLoader and replace `documents`; if loading fails, the
    error is printed and the `documents` argument is used as a fallback.
    Loaders return LangChain documents while SimpleDirectoryReader returns
    llama-index documents; either kind is accepted because the documents are
    converted to llama-index format before indexing.

    Every document gets `custom_meta` stored under metadata["custom_meta"].
    When `custom_meta` carries a truthy "file_id" and the collection already
    contains points for it, nothing is written.

    @param collection_table: Qdrant collection name
    @param doc_path: optional path of a file to load
    @param documents: documents to persist when `doc_path` is not given
    @param custom_meta: optional metadata dict attached to every document;
        None is treated as an empty dict
    @return: 1 when the embeddings were written; 2 otherwise (an error
        occurred, there were no documents, or embeddings for the file_id
        already exist)
    """
    result = 2
    # A missing custom_meta used to make the file_id lookup raise TypeError,
    # so every call relying on the default failed. Treat it as "no metadata".
    meta = custom_meta if custom_meta is not None else {}
    try:
        if doc_path:
            loader = UnstructuredFileLoader(doc_path)
            try:
                documents = loader.load()
            except Exception as ex:
                # Report the load failure; fall through with the documents
                # that were passed in (if any).
                err_template = "{0} {1!r}"
                err_message = err_template.format(ex.__class__.__name__, ex.args)
                print(err_message)

        if documents:
            for d in documents:
                d.metadata["custom_meta"] = meta
            documents = ensure_documents_format(documents=documents, ensure_format="llama-index")
            documents = split_docs(documents)

            vector_store = QdrantVectorStore(client=q_client, collection_name=collection_table)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Skip indexing when this file was already embedded.
            file_id = meta.get("file_id")
            data_exists = bool(file_id) and embeddings_for_file_id_exist(collection_table, file_id)
            if not data_exists:
                VectorStoreIndex.from_documents(documents, storage_context=storage_context,
                                                service_context=service_context)
                result = 1

    except Exception as ex:
        result = 2
        err_template = "{0} {1!r}"
        err_message = err_template.format(ex.__class__.__name__, ex.args)
        print(err_message)
    return result

In [None]:
def conduct_query(query, collection_table: str):
    """
    Answer `query` from the vectors stored in `collection_table`.

    Builds a QA prompt, obtains a query engine for the collection and runs
    the query through it. Returns a tuple (answer, answer_records); when
    `collection_table` is falsy, or the query fails (the error is printed),
    the result is ("", []).
    """
    answer, answer_records = "", []
    prompt_text = "".join([
        "We have provided context information below. \n",
        "---------------------\n",
        "{context_str}",
        "\n---------------------\n",
        "Given this information, please answer the question in as much detail as possible: {query_str}\n",
        "\nDo not make up answers.'\n",
        "\nIf you are unsure and the answer is not explicitly written in the documentation, say 'I dont know'.\n",
        "\nIf  the answer is not explicitly in the documentation, say 'I dont know'.\n",
    ])

    # Without a collection there is nothing to query.
    if not collection_table:
        return answer, answer_records

    qa_prompt = Prompt(prompt_text)
    try:
        engine = get_query_engine(collection_table, QA_TEMPLATE=qa_prompt)
        answer, answer_records = conduct_query_from_query_engine(query, engine)
    except Exception as ex:
        print("{0} {1!r}".format(ex.__class__.__name__, ex.args))

    return answer, answer_records


def get_query_engine(collection_table, QA_TEMPLATE=None, similarity_top_k: int = 3) -> RetrieverQueryEngine:
    """
    Build a retriever-backed query engine over an existing Qdrant collection.

    @param collection_table: name of the Qdrant collection to query
    @param QA_TEMPLATE: optional prompt used as the text QA template of the
        response synthesizer
    @param similarity_top_k: number of most similar nodes the retriever
        fetches per query (defaults to 3, the previously hard-coded value)
    @return: a RetrieverQueryEngine combining the vector retriever with a
        response synthesizer that uses the notebook's `service_context`
    """
    vector_store = QdrantVectorStore(client=q_client, collection_name=collection_table)

    # No service context is passed here; presumably the globally registered
    # one is picked up — TODO confirm.
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

    # NOTE(review): hybrid retrieval (VectorStoreQueryMode.HYBRID) is currently
    # not enabled; `alpha` is still passed and presumably only matters in
    # hybrid mode — confirm before relying on it.
    retriever = VectorIndexRetriever(
        similarity_top_k=similarity_top_k,
        alpha=0.5,
        index=index
    )

    response_synthesizer = get_response_synthesizer(
        service_context=service_context,
        text_qa_template=QA_TEMPLATE,
    )

    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
    )

    return query_engine

In [None]:
def _source_entry(snode) -> dict:
    """Build the source_list record for one retrieved node, using "" for anything unavailable."""
    node = getattr(snode, "node", None)

    try:
        source_text = node.get_text()
    except Exception:
        source_text = ""

    metadata = getattr(node, "metadata", None)

    def _meta(key):
        # Each key is looked up on its own so that a missing "page_label"
        # does not also discard an available "file_name".
        try:
            return metadata[key]
        except Exception:
            return ""

    return {"page": _meta("page_label"),
            "file_name": _meta("file_name"),
            "score": getattr(snode, "score", ""),
            "source_text": source_text}


def conduct_query_from_query_engine(query, query_engine) -> tuple:
    """
    Run `query` through `query_engine` and collect the answer with its sources.

    @param query: the question to ask
    @param query_engine: object exposing `query(query)`; the returned object
        must provide `response` and `source_nodes`
    @return: tuple (ans, ans_dict_list) where `ans` is the response with one
        leading newline removed (if it is a string starting with one), and
        `ans_dict_list` is a one-element list
        [{"answer": ans, "source_list": [...]}]. Each source entry has the
        keys "page", "file_name", "score" and "source_text"; values that are
        not available are "". Sources are only collected when `source_nodes`
        is a non-empty list.
    """
    ans_obj = query_engine.query(query)
    ans = ans_obj.response
    # Only strings can carry the leading newline; leave anything else (e.g.
    # None) untouched.
    if isinstance(ans, str) and ans.startswith("\n"):
        ans = ans[1:]

    source_nodes = ans_obj.source_nodes
    if isinstance(source_nodes, list):
        source_list = [_source_entry(snode) for snode in source_nodes]
    else:
        source_list = []

    ans_dict_list = [{"answer": ans, "source_list": source_list}]
    return ans, ans_dict_list

In [None]:
# Embed the loaded documents into the collection. The call returns 1 when the
# vectors were written and 2 otherwise; as the last expression of the cell the
# value is shown as the cell output.
persist_embeddings(collection_table, None, documents, custom_meta)

In [None]:
# Ask a question about the indexed document: print the plain answer first,
# then the answer together with its source entries as JSON.
ans, ans_dict_list = conduct_query(query="What is anomaly detection in autonomous driving?", collection_table=collection_table)
print(ans)
print(json.dumps(ans_dict_list))

In [None]:
# Second query against the same collection.
# NOTE(review): this looks like a question the document is not expected to
# answer (the prompt asks the model to say 'I dont know') — confirm intent.
ans, ans_dict_list = conduct_query(query="Why am I doing this?", collection_table=collection_table)
print(ans)
print(json.dumps(ans_dict_list))