# Notebook Purpose

* Set up the Whisper Large v3 PT endpoint
  * The model is already registered in Unity Catalog as `system.ai.whisper_large_v3`
* Perform batch audio transcription with the `ai_query` AI Function (LLM batch inference)

In [0]:
%pip install -U --quiet databricks-sdk==0.28.0 mlflow
# Restart the Python process so the packages installed above are the ones
# imported by the following cells.
dbutils.library.restartPython()

In [0]:
%run ./config

In [0]:
# Notebook parameter: name of the model serving endpoint that this notebook
# creates and later queries through ai_query.
dbutils.widgets.text("endpoint_name", defaultValue="whisper_large_v3_fins_genai")

In [0]:
# Unity Catalog path of the Whisper model to serve.
model_uc_path = "system.ai.whisper_large_v3"
# Model version to serve. This must match the version used when the endpoint
# is created, so that what is printed here is what actually gets deployed.
version = "3"
# Endpoint name comes from the notebook widget so it can be overridden per run.
endpoint_name = dbutils.widgets.get("endpoint_name")

print(f"Unity Catalog Model Path: {model_uc_path}")
print(f"Model Version: {version}")
print(f"Endpoint Name: {endpoint_name}")

# Deploy the Whisper Large V3 Model

In [0]:
import datetime
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput
from databricks.sdk.service.serving import AutoCaptureConfigInput

# Workspace client used by the deployment cell to manage serving endpoints.
# No arguments are passed, so credentials come from the SDK's default
# resolution (presumably the notebook's own context -- confirm if run elsewhere).
w = WorkspaceClient()

In [0]:
workload_type = "GPU_LARGE"
version = '3' # latest version based on system.ai

# Serving configuration: a single served model backed by the Unity Catalog
# Whisper model, on a small GPU workload.
config = EndpointCoreConfigInput.from_dict({
    "served_models": [
        {
            "name": endpoint_name,
            "model_name": model_uc_path,
            "model_version": version,
            "workload_type": workload_type,
            "workload_size": "Small",
            # Must be a real boolean: the SDK passes the value through as-is,
            # so the string "True" would be serialized as a JSON string.
            "scale_to_zero_enabled": True,
        }
    ]
})

# Creating an endpoint whose name is already taken raises an error, which
# would break every re-run of this notebook. Reuse the endpoint when it
# already exists (its current configuration is left untouched) and only
# create it otherwise. Either way, block until it is no longer updating.
existing_endpoint_names = {endpoint.name for endpoint in w.serving_endpoints.list()}

if endpoint_name in existing_endpoint_names:
    print(f"Endpoint '{endpoint_name}' already exists; reusing it.")
    model_details = w.serving_endpoints.wait_get_serving_endpoint_not_updating(name=endpoint_name)
else:
    model_details = w.serving_endpoints.create_and_wait(name=endpoint_name, config=config)


# Audio Transcription with AI Query

## Ingest Raw Audio with Auto Loader

In [0]:
from pyspark.sql.functions import regexp_extract

# Volume locations: where the source audio files live and where the streaming
# checkpoints are kept.
volume_path = f"/Volumes/{catalog}/{schema}/volume_audio_files/"
volume_checkpoints = f"/Volumes/{catalog}/{schema}/checkpoints"

# Make the target catalog/schema current so the unqualified table name used
# by the stream writer resolves there.
for use_statement in (f"USE CATALOG {catalog};", f"USE SCHEMA {schema};"):
    spark.sql(use_statement)

# Auto Loader source: read every file under the volume (recursively) as binary.
reader_options = {
    "cloudFiles.format": "binaryFile",
    "recursiveFileLookup": "true",
}
df = spark.readStream.format("cloudFiles").options(**reader_options).load(volume_path)

# Process everything currently available, write it to `raw_audio`, and block
# until the run finishes.
audio_writer = df.writeStream.trigger(availableNow=True)
audio_writer = audio_writer.option("checkpointLocation", f'{volume_checkpoints}/raw_audio')
ingest_query = audio_writer.table('raw_audio')
ingest_query.awaitTermination()

In [0]:
# Build the CTAS statement that transcribes every ingested audio file.
#
# The Python literal is a *raw* f-string (rf"""..."""): the regex below
# contains `\/` and `\d`, which are invalid escape sequences in a normal
# Python string and trigger a warning on newer Python versions. With the raw
# prefix the backslashes are passed through explicitly, and the SQL text sent
# to Spark is unchanged.
#
# In the SQL itself:
#   * policy_number is the run of digits following `policy_no_` in the
#     directory portion of the file path.
#   * ai_query sends the file's binary `content` to the serving endpoint and
#     returns the result as `transcripts`.
#   * CREATE TABLE IF NOT EXISTS means that an existing table is left as-is;
#     the query is not re-run over it.
ai_query_sql = rf"""
CREATE TABLE IF NOT EXISTS {catalog}.{schema}.audio_transcription AS ( 
  select
    regexp_extract(path, r'.*\/policy_no_(\d+)\/.*', 1) AS policy_number,
    ai_query(
      '{endpoint_name}',
      content,
      failOnError => True
    ) as transcripts
  from {catalog}.{schema}.raw_audio
)
"""

spark.sql(ai_query_sql)

In [0]:
# Show the transcription results. The table is referenced by its fully
# qualified name (the same one used when it is created) so this cell does not
# depend on which catalog/schema happens to be current in the session.
display(spark.table(f"{catalog}.{schema}.audio_transcription"))