# Parse the PDF and upload to the Vector Store


## 1. Load the PDF


In [None]:
from docling.document_converter import DocumentConverter

# We use docling to parse the PDF. The conversion result (`result.document`)
# is what gets chunked in the next step.

# This might run for a couple of minutes as the PDF is fairly large.
source = "../../fixtures/Delta Lake Definitive Guide.pdf"
converter = DocumentConverter()
result = converter.convert(source)

## 2. Chunk the parsed PDF


In [None]:
from docling.chunking import HybridChunker
from transformers import AutoTokenizer

# Docling has a hybrid chunker that uses a recursive approach to chunk the document.

# We set a max number of tokens per chunk so that the chunks fit in the context window of the embedding model.
# The "jinaai/jina-embeddings-v3" model has a context window of 8192 tokens.
# However, the reranker model used in the upcoming modules has a token window of 1024.
# While the reranker theoretically uses a sliding-window approach for chunks longer than 1024 tokens,
# we simply enforce a max token limit of 1024 per chunk.

# We get the tokenizer of the embedding model from huggingface so that token
# counts are measured the same way the embedding model measures them.
EMBED_MODEL_ID = "jinaai/jina-embeddings-v3"
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)


# We set the max tokens to 8192 // 8 = 1024.
# Floor division keeps MAX_TOKENS an int; `8192 / 8` would produce the float
# 1024.0, which is not a valid token count type.
MAX_TOKENS = 8192 // 8

chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=MAX_TOKENS,
    merge_peers=True,
)

# `result` is the docling conversion result produced by the PDF loading cell.
# The chunker returns an iterator, so we materialise it into a list for reuse.
chunk_iter = chunker.chunk(dl_doc=result.document)
chunks = list(chunk_iter)

## 3. Structure and Embed the chunks


In [None]:
from sentence_transformers import SentenceTransformer

# Here we use the "jinaai/jina-embeddings-v3" model to embed the chunk text.
# The model is loaded with `default_task` set to "retrieval.passage" because
# it is used to embed the document chunks (passages) that go into the index.
# `trust_remote_code=True` allows the custom code shipped with the model to run.
model_passage = SentenceTransformer(
    "jinaai/jina-embeddings-v3",
    trust_remote_code=True,
    model_kwargs={"default_task": "retrieval.passage"},
)

In [None]:
# We will be using 'direct access' to the vector index. Therefore we need to provide both the text and an embedding vector.
# It is also possible to create a vector index that computes the embedding vectors automatically.
# However, that requires an embeddings endpoint with task `llm/v1/embeddings` - which you can host with a serving endpoint.
# As we do not currently have one, we create the embedding vectors manually.

simple_chunks = []
for ix, chunk in enumerate(chunks):
    exported = chunk.export_json_dict()
    raw_text = exported["text"]

    # Each record follows the schema that is declared when the vector index is created:
    # id, text, text_vector and heading.
    # Note that the id must be unique. `ix` restarts from 0 on every run, so rerunning
    # this code with new data reuses the same ids.
    # Consider using a distinct, stable id per chunk instead.
    text = str(raw_text)
    text_vector = model_passage.encode(raw_text).tolist()
    # A missing or empty headings entry is stored as an empty string.
    headings = exported.get("meta", {}).get("headings", "")

    simple_chunks.append(
        {
            "id": ix,
            "text": text,
            "text_vector": text_vector,
            "heading": str(headings or ""),
        }
    )

In [None]:
# Example: inspect the first structured chunk (id, text, text_vector, heading)
simple_chunks[0]

## 4. Insert the chunks into the vector index

In [None]:
# source: https://docs.databricks.com/aws/en/generative-ai/create-query-vector-search

# Define the catalog, schema, vector search endpoint name, and vector index name.
# The fully qualified index name is built later as "<CATALOG>.<SCHEMA>.<INDEX>".
CATALOG = "generative_ai_toolkit"
SCHEMA = "use_cases"
vector_search_endpoint_name = "generative_ai_toolkit_vs_endpoint"
INDEX = "delta_lake_definitive_guide_index"

In [None]:
from databricks.connect import DatabricksSession as SparkSession

# Create (or reuse) a remote Databricks Connect session on serverless compute.
spark = SparkSession.builder.remote(serverless=True).getOrCreate()

In [None]:
from databricks.vector_search.client import VectorSearchClient

# Initialise the vector search client (no explicit arguments are passed here).
vsc = VectorSearchClient()


# Check if the endpoint exists
def endpoint_exists(endpoint_name: str, client: VectorSearchClient):
    """Look up a vector search endpoint by name.

    Args:
        endpoint_name: Name of the endpoint to look up.
        client: Client whose ``get_endpoint`` method is used for the lookup.

    Returns:
        Whatever ``client.get_endpoint`` returns when the lookup succeeds,
        or ``False`` when the raised error message contains "NOT_FOUND".

    Raises:
        Exception: Any lookup error whose message does not contain
            "NOT_FOUND" is propagated unchanged.
    """
    try:
        details = client.get_endpoint(endpoint_name)
    except Exception as err:
        # Only a "not found" error means the endpoint is absent; anything
        # else is a genuine failure and must reach the caller.
        if "NOT_FOUND" not in str(err):
            raise
        return False
    return details


# Create the vector search (store) endpoint
def create_vector_search_endpoint(endpoint_name: str, client: VectorSearchClient):
    """Return the named vector search endpoint, creating it first if needed.

    Args:
        endpoint_name: Name of the endpoint to fetch or create.
        client: Client used for every endpoint call made by this function.

    Returns:
        The endpoint details: either the freshly created endpoint, or the
        existing one when its state is "ONLINE".

    Raises:
        Exception: If the existing endpoint is still "PROVISIONING", or is in
            any other state than "ONLINE". Errors raised by the client while
            looking up or creating the endpoint propagate unchanged.
    """
    endpoint = endpoint_exists(endpoint_name, client)
    if not endpoint:
        # Create through the client that was passed in, not the module-level
        # `vsc`, so the function works with any client instance.
        client.create_endpoint_and_wait(
            name=endpoint_name, endpoint_type="STANDARD", verbose=True
        )
        return client.get_endpoint(endpoint_name)

    state = endpoint["endpoint_status"]["state"]
    if state == "ONLINE":
        return endpoint
    if state == "PROVISIONING":
        raise Exception(f"Endpoint is provisioning - {endpoint}")
    raise Exception(
        f"Error with the endpoint - this shouldn't happen. Please delete it and try again.\nEndpoint details: {endpoint}"
    )


# Make sure the vector search endpoint exists (it is created if it is missing).
create_vector_search_endpoint(vector_search_endpoint_name, client=vsc)

In [None]:
# Check if the index exists
def index_exists(vsc, endpoint_name, index_full_name):
    """Describe a vector index, or report that it does not exist.

    Args:
        vsc: Client whose ``get_index`` method is used for the lookup.
        endpoint_name: Name of the endpoint that serves the index.
        index_full_name: Fully qualified name of the index.

    Returns:
        The result of ``describe()`` on the index when the lookup succeeds,
        or ``False`` when the raised error message contains
        "RESOURCE_DOES_NOT_EXIST".

    Raises:
        Exception: Any other lookup error is propagated unchanged.
    """
    try:
        return vsc.get_index(endpoint_name, index_full_name).describe()
    except Exception as e:
        if "RESOURCE_DOES_NOT_EXIST" in str(e):
            return False
        # Any other failure must not be mistaken for "the index is missing":
        # returning a falsy value here would make the caller try to create
        # the index and hide the real error.
        raise


# Create the index if it does not exist, otherwise fetch the existing one.
# Either way `index` is bound afterwards, so the following cells also work
# when this notebook is rerun against an index that already exists.
index_full_name = f"{CATALOG}.{SCHEMA}.{INDEX}"

if index_exists(vsc, vector_search_endpoint_name, index_full_name):
    index = vsc.get_index(vector_search_endpoint_name, index_full_name)
else:
    index = vsc.create_direct_access_index(
        endpoint_name=vector_search_endpoint_name,
        index_name=index_full_name,
        primary_key="id",
        # Size of the vectors stored in the `text_vector` column.
        embedding_dimension=1024,
        embedding_vector_column="text_vector",
        # Must match the keys of the records that are upserted into the index.
        schema={
            "id": "int",
            "text": "string",
            "text_vector": "array<float>",
            "heading": "string",
        },
    )

In [None]:
# Upsert the chunks into the index.
# Each record carries the keys id, text, text_vector and heading.
index.upsert(simple_chunks)

## 5. Test the vector index


In [None]:
# References: https://api-docs.databricks.com/python/vector-search/databricks.vector_search.html#databricks.vector_search.index.VectorSearchIndex.upsert

In [None]:
# Example question used to test retrieval against the vector index.
query = "How did ETL work in the first generation platforms?"

In [None]:
# We use the same embedding model to embed the query, this time loaded with
# the query task instead of the passage task.
# It is bound to its own name so that `model_passage` keeps pointing at the
# passage model: rebinding `model_passage` here would make a rerun of the
# chunk-embedding cell embed the passages with the query task.
model_query = SentenceTransformer(
    "jinaai/jina-embeddings-v3",
    trust_remote_code=True,
    model_kwargs={"default_task": "retrieval.query"},  # this time we embed for query
)

query_vector = model_query.encode(query).tolist()

In [None]:
# Provide both the query text and the query vector for the similarity search.
# Only the id, text and heading columns are requested, for up to 8 results.
results = index.similarity_search(
    query_text=query,
    query_vector=query_vector,
    columns=["id", "text", "heading"],
    num_results=8,
    query_type="hybrid",
    # filters={"heading NOT": '1 Introduction'}
)

In [None]:
# Simply structure the retrieved results


def parse_results(results):
    """Turn a similarity search response into a list of row dictionaries.

    Args:
        results: Mapping with the rows under ``results["result"]["data_array"]``
            and the column descriptors under ``results["manifest"]["columns"]``;
            each descriptor provides the column name under ``"name"``.

    Returns:
        One dict per row, mapping each column name to the value found at the
        same position in that row.
    """
    rows = results["result"]["data_array"]
    columns = results["manifest"]["columns"]

    # Values are looked up by position, so a row that is shorter than the
    # column list raises IndexError rather than being silently truncated.
    return [
        {column["name"]: row[position] for position, column in enumerate(columns)}
        for row in rows
    ]


# Convert the raw search response into a list of {column name: value} dicts.
parsed_results = parse_results(results)

In [None]:
# Display the parsed search results.
parsed_results