# Environment Setup

In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.workspace import ExportFormat
import os
import base64
import logging

def create_catalog_schema_volume(spark, catalog_name, schema_name, volume_name):
    """Make sure the catalog, schema, and volume exist.

    Runs three ``CREATE ... IF NOT EXISTS`` statements through ``spark.sql``,
    in the order catalog, schema, volume, so each parent exists before its
    child is created.

    Args:
        spark: Object exposing a ``sql(statement)`` method.
        catalog_name: Name of the catalog.
        schema_name: Name of the schema inside ``catalog_name``.
        volume_name: Name of the volume inside the schema.

    Raises:
        Exception: Whatever ``spark.sql`` raises; it is logged and re-raised.
    """
    try:
        schema_fqn = f"{catalog_name}.{schema_name}"
        ddl_statements = (
            f"CREATE CATALOG IF NOT EXISTS {catalog_name}",
            f"CREATE SCHEMA IF NOT EXISTS {schema_fqn}",
            f"CREATE VOLUME IF NOT EXISTS {schema_fqn}.{volume_name}",
        )
        for ddl in ddl_statements:
            spark.sql(ddl)
        logging.info("Catalog, schema, and volume created successfully.")
    except Exception as err:
        logging.error(f"Error creating catalog, schema, or volume: {err}")
        raise

def export_and_upload_files(workspace_folder, volume_path):
    """Copy the files directly under a workspace folder into a volume.

    Lists ``workspace_folder`` through the Databricks SDK, exports every entry
    whose object type is ``FILE``, base64-decodes the exported content, and
    writes it to ``{volume_path}/original/<file name>``. Entries of any other
    object type are skipped.

    Args:
        workspace_folder: Workspace path whose entries are listed.
        volume_path: Root path of the target volume; files are written into
            its ``original`` subdirectory, which is created if missing.

    Raises:
        Exception: Any error raised while listing, exporting, or writing is
            logged and re-raised.
    """
    w = WorkspaceClient()
    source_files = f"{volume_path}/original"

    try:
        # The target subdirectory does not exist on a freshly created volume;
        # without this the first open() below fails with FileNotFoundError.
        os.makedirs(source_files, exist_ok=True)

        for item in w.workspace.list(workspace_folder):
            if item.object_type.name != "FILE":
                continue

            file_path = item.path
            file_name = os.path.basename(file_path)

            logging.info(f"Exporting {file_path} into memory...")

            export_response = w.workspace.export(path=file_path, format=ExportFormat.AUTO)
            # The export payload is base64 text; treat a missing payload as an
            # empty file instead of letting b64decode(None) raise TypeError.
            file_content = base64.b64decode(export_response.content or "")

            with open(f"{source_files}/{file_name}", "wb") as f:
                f.write(file_content)

            logging.info(f"Written {file_name} into {volume_path}")

        logging.info("All files successfully written to Volume!")
    except Exception as e:
        logging.error(f"Error exporting or uploading files: {e}")
        raise

In [0]:
# Register the notebook's text widgets with their default values.
for _widget_name, _widget_default in (
    ("catalog_name", "rohitb_demo"),
    ("schema_name", "pdf_chat"),
    ("volume_name", "files"),
    ("workspace_folder", "/Workspace/Users/rohit.bhagwat@databricks.com/pdf-chat-bot/data"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values back, in the same order they were registered.
catalog_name, schema_name, volume_name, workspace_folder = (
    dbutils.widgets.get(_key)
    for _key in ("catalog_name", "schema_name", "volume_name", "workspace_folder")
)

# Root path of the volume, built from the three name parameters.
volume_path = "/".join(("", "Volumes", catalog_name, schema_name, volume_name))
local_tmp_dir = "/dbfs/tmp/pdf-chat-bot-data"

# Parse PDF Files using Databricks Document Parsing

## Background
Databricks is previewing Document Parsing, which extracts structured content from unstructured documents. This unlocks the full potential of data that’s currently trapped in unusable formats (e.g., scanned images)—automatically preparing it for a wide variety of analytic and AI use cases. You can use it ad hoc or in a continuous pipeline. 

Specifically, this preview provides three user-defined functions (UDFs):  
`ai_parse` extracts the contextual layout metadata from the document (e.g., title, header, footer). It also extracts the content of the document (e.g., text paragraphs, tables) and represents it in text and markdown format.   
`ai_extract_table_schema` extracts the schema from the document, represented as a STRING.  
`ai_extract_table_data` extracts the table data from the document, represented as a STRUCT.  

If you’re building a RAG or Vector Search application, then Databricks recommends `ai_parse` because the output format caters to these use cases. But if you have a use case that requires a structured table, then Databricks recommends `ai_extract_table_schema` and `ai_extract_table_data`.

In general, `ai_parse` is more performant for larger documents because it can process each page in parallel. However, it also requires more compute because it makes multiple calls per document.

> **Note:** The connector is currently powered by third-party models hosted by Azure OpenAI.


In [0]:
import logging

# Ask for the root logger to be configured at ERROR level.
logging.basicConfig(level=logging.ERROR)

# Step 1: make sure the catalog, schema, and volume exist.
create_catalog_schema_volume(spark, catalog_name, schema_name, volume_name)

# Step 2: copy the workspace files into the volume's "original" folder.
export_and_upload_files(workspace_folder, volume_path)

# Step 3: read every file under "original" as binary, run ai_parse() on its
# content, and expose the top-level fields of the result as separate columns.
parse_query = f"""
        WITH corpus AS (
          SELECT
            path,
            ai_parse(content, map('format', 'pdf')) AS parsed
          FROM
            READ_FILES('{volume_path}/original/*', format => 'binaryFile')
        )
        SELECT
          path,
          parsed,
          parsed:document AS document,
          parsed:pages AS pages,
          parsed:elements AS elements,
          parsed:_corrupted_data AS _corrupted_data
        FROM
          corpus
          """
df = spark.sql(parse_query)
display(df)

In [0]:
# Persist the parsed documents as a table. Overwrite mode keeps this cell
# re-runnable: with the default mode, saveAsTable raises once the table exists,
# while every other step of the notebook is idempotent.
df.write.mode("overwrite").saveAsTable(f"{catalog_name}.{schema_name}.parsed_pdf_docs")

In [0]:
import os

# Collect (name, parsed) pairs to the driver. "name" is the last "/"-separated
# segment of whatever follows the first ":" in the stored path.
parsed_records = spark.sql(f"select element_at(split(split(path, ':')[1], '/'), -1) AS name, parsed from {catalog_name}.{schema_name}.parsed_pdf_docs").collect()
workspace_path = '/Workspace/Users/rohit.bhagwat@databricks.com/pdf-chat-bot/data/parsed_text/'

# The output folder is not created anywhere else; without this the first
# open() below fails with FileNotFoundError.
os.makedirs(workspace_path, exist_ok=True)

# Write one "<name>.txt" per document containing the string form of the
# parsed result. The encoding is explicit so non-ASCII text does not depend on
# the driver's locale.
for row in parsed_records:
    with open(f"{workspace_path}{row['name']}.txt", "w", encoding="utf-8") as file:
        file.write(str(row['parsed']))

In [0]:
import os

# Destination folder inside the volume for the text dumps.
target_volume = f'/Volumes/{catalog_name}/{schema_name}/{volume_name}/parsed/'

# The "parsed" folder is not created anywhere else; without this the first
# open() below fails with FileNotFoundError.
os.makedirs(target_volume, exist_ok=True)

# Write one "<name>.txt" per collected record containing the string form of
# the parsed result. The encoding is explicit so non-ASCII text does not
# depend on the driver's locale.
for row in parsed_records:
    with open(f"{target_volume}{row['name']}.txt", "w", encoding="utf-8") as file:
        file.write(str(row['parsed']))