In [None]:
from common import utils

# Catalog and schema names, read through the shared configuration helper.
CATALOG_NAME = utils.config_value("catalog_name")
SCHEMA_NAME = utils.config_value("schema_name")

# Assistant name: "<catalog>_<schema>_knowledge_assistant", passed through
# utils.snake_case.
KNOWLEDGE_ASSISTANT_NAME = utils.snake_case(
    f"{CATALOG_NAME}_{SCHEMA_NAME}_knowledge_assistant"
)

# Sync interval in milliseconds; the configured value is coerced to int so it
# can be used directly in timedelta arithmetic.
KNOWLEDGE_ASSISTANT_SYNC_INTERVAL_MS = int(
    utils.config_value("knowledge_assistant_sync_interval_ms")
)

In [1]:
import json
from datetime import datetime, timedelta, timezone
from common import vector_search
from common import knowledge_assistant


def create_knowledge_assistant():
    """Create the knowledge assistant named ``KNOWLEDGE_ASSISTANT_NAME``.

    Builds a request body that declares two knowledge sources -- a files
    source pointing at ``vector_search.VOLUME_PATH`` and an index source
    pointing at ``vector_search.INDEX_NAME`` -- logs the serialized body at
    INFO level, and POSTs it to the knowledge-assistants endpoint.

    Returns:
        None. Whatever ``utils.http_request`` returns is discarded.
    """
    endpoint = utils.api_url("/api/2.0/knowledge-assistants")

    # Top-level assistant fields; lookups stay in the order the body lists them.
    assistant_name = KNOWLEDGE_ASSISTANT_NAME
    assistant_description = utils.config_value("knowledge_assistant_description")
    assistant_instructions = utils.config_value("knowledge_assistant_instructions", "")

    # Source 1: files stored under the configured volume path.
    files_source = {
        "name": "files",
        "type": "files",
        "description": utils.config_value("file_source_description"),
        "files": {"path": vector_search.VOLUME_PATH},
    }

    # Source 2: the vector search index, with its URI and text columns.
    index_source = {
        "name": "vector_search_index",
        "type": "index",
        "description": utils.config_value("vector_search_index_description"),
        "index": {
            "name": vector_search.INDEX_NAME,
            "doc_uri_col": "path",
            "text_col": "text",
        },
    }

    request_body = {
        "name": assistant_name,
        "description": assistant_description,
        "instructions": assistant_instructions,
        "knowledge_sources": [
            {"files_source": files_source},
            {"index_source": index_source},
        ],
    }

    log = utils.logger()
    log.info("knowledge-assistant create request body:\n%s", json.dumps(request_body))
    utils.http_request(endpoint, method="POST", json=request_body)


def _latest_files_source_update_ms(knowledge_sources):
    """Return the newest successful-update timestamp (ms) among file sources.

    Entries without a truthy ``files_source`` are ignored. If any file source
    is not in state ``KNOWLEDGE_SOURCE_STATE_UPDATED``, or has no usable
    ``last_successful_update_timestamp_ms``, the scan stops and ``None`` is
    returned so the caller treats the assistant as needing a sync.

    Args:
        knowledge_sources: iterable of knowledge-source dicts.

    Returns:
        The largest timestamp as an ``int``, or ``None`` when there are no
        file sources or at least one of them has no trustworthy timestamp.
    """
    latest_ms = None
    for knowledge_source in knowledge_sources:
        if not knowledge_source.get("files_source"):
            continue
        timestamp = None
        if knowledge_source.get("state", "") == "KNOWLEDGE_SOURCE_STATE_UPDATED":
            # .get() instead of indexing: a missing field means "unknown",
            # which should force a sync rather than abort with KeyError.
            timestamp = knowledge_source.get("last_successful_update_timestamp_ms")
        if not timestamp:
            return None
        # NOTE(review): int() keeps the comparison and the later division
        # working if the API delivers the timestamp as a numeric string --
        # confirm the actual JSON type returned by the service.
        timestamp = int(timestamp)
        if latest_ms is None or timestamp > latest_ms:
            latest_ms = timestamp
    return latest_ms


# Create the assistant on first run; otherwise re-sync it only when its file
# sources have not been successfully updated within the configured interval.
ka = knowledge_assistant.get_by_name(KNOWLEDGE_ASSISTANT_NAME)
if not ka:
    create_knowledge_assistant()
else:
    # `or {}` / `or []` also cover keys that are present but null.
    knowledge_sources = (ka.get("knowledge_assistant") or {}).get("knowledge_sources") or []
    latest_last_successful_update_timestamp_ms = _latest_files_source_update_ms(knowledge_sources)

    # Always (re)bind last_update: previously it was left unbound (NameError on
    # a fresh kernel, or a stale value from an earlier run) whenever no
    # timestamp was available.
    last_update = None
    if latest_last_successful_update_timestamp_ms is not None:
        last_update = datetime.fromtimestamp(latest_last_successful_update_timestamp_ms / 1000, tz=timezone.utc)
    cutoff = datetime.now(timezone.utc) - timedelta(milliseconds=KNOWLEDGE_ASSISTANT_SYNC_INTERVAL_MS)

    if last_update is not None and last_update >= cutoff:
        utils.logger().info("Skipping knowledge assistant sync - last_update:%s", last_update.isoformat())
    else:
        # An unknown last_update (None) is treated as "needs sync".
        utils.logger().info(
            "Syncing knowledge assistant - last_update:%s",
            last_update.isoformat() if last_update is not None else None,
        )
        tile_id = ka.get("tile").get("tile_id")
        knowledge_assistant.sync(tile_id)




ModuleNotFoundError: No module named 'pyspark.dbutils'