In [0]:
%pip install databricks-vectorsearch
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting databricks-vectorsearch
  Downloading databricks_vectorsearch-0.57-py3-none-any.whl (16 kB)
Collecting deprecation>=2
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting mlflow-skinny<4,>=2.11.3
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl (1.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 7.2 MB/s eta 0:00:00
Collecting fastapi<1
  Downloading fastapi-0.116.1-py3-none-any.whl (95 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 95.6/95.6 kB 15.2 MB/s eta 0:00:00
Collecting pydantic<3,>=1.10.8
  Downloading pydantic-2.11.7-py3-none-any.whl (444 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 444.8/444.8 kB 37.7 MB/s eta 0:00:00
Collecting sqlparse<1,>=0.4.0
  Downloading sqlparse-0.5.3-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.4/44.4 kB 5.5 MB/s eta 0:00:00
Collecting cachetools<7,>=5.0.0
  

In [0]:
# Create the source Delta table that will feed the vector index.
# IF NOT EXISTS keeps this cell re-runnable: a second run is a no-op instead of
# failing because the table is already there.
spark.sql("""
CREATE TABLE IF NOT EXISTS telemetry.lwrca.document_table (
    id STRING,
    document STRING
)
USING DELTA
""")

DataFrame[]

In [0]:
# Sample documents: (id, document text). `id` is later used as the index primary key,
# so it must stay unique in the table.
data = [
    ("1", "Databricks simplifies big data and AI."),
    ("2", "Azure Cognitive Search enables powerful search capabilities."),
    ("3", "Delta Lake provides ACID transactions for Spark."),
    ("4", "Vector search is essential for semantic retrieval."),
    ("5", "Python is widely used for data science.")
]

df = spark.createDataFrame(data, ["id", "document"])

# Upsert keyed on `id` instead of a blind append: re-running this cell no longer
# inserts a second copy of every row. Unchanged rows are skipped (null-safe compare)
# so a re-run does not produce spurious changes in the table.
df.createOrReplaceTempView("document_table_updates")
spark.sql("""
MERGE INTO telemetry.lwrca.document_table AS target
USING document_table_updates AS source
ON target.id = source.id
WHEN MATCHED AND NOT (target.document <=> source.document) THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
""")

display(spark.table("telemetry.lwrca.document_table"))

id,document
3,Delta Lake provides ACID transactions for Spark.
5,Python is widely used for data science.
1,Databricks simplifies big data and AI.
2,Azure Cognitive Search enables powerful search capabilities.
4,Vector search is essential for semantic retrieval.


In [0]:
# Turn on Delta Change Data Feed for the source table before it is used as the
# source of a Delta Sync index.
enable_cdf_sql = """
ALTER TABLE telemetry.lwrca.document_table SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(enable_cdf_sql)

DataFrame[]

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Build the client with no explicit credentials. As the notice printed by this cell
# says, that means the notebook's authentication token is used, which the SDK
# recommends for development only (service-principal auth is suggested otherwise).
vsc = VectorSearchClient()

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.


In [0]:
# Exploratory aid: print the client's built-in documentation (constructor
# parameters and available methods). Nothing later in the notebook depends on it.
help(VectorSearchClient)

Help on class VectorSearchClient in module databricks.vector_search.client:

class VectorSearchClient(builtins.object)
 |  VectorSearchClient(workspace_url=None, personal_access_token=None, service_principal_client_id=None, service_principal_client_secret=None, azure_tenant_id=None, azure_login_id='2ff814a6-3304-4ab8-85cb-cd0e6f879c1d', disable_notice=False, credential_strategy=None)
 |  
 |  A client for interacting with the Vector Search service.
 |  
 |  This client provides methods for managing endpoints and indexes in the Vector Search service.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, workspace_url=None, personal_access_token=None, service_principal_client_id=None, service_principal_client_secret=None, azure_tenant_id=None, azure_login_id='2ff814a6-3304-4ab8-85cb-cd0e6f879c1d', disable_notice=False, credential_strategy=None)
 |      Initialize the VectorSearchClient.
 |      
 |      :param str workspace_url: The URL of the workspace.
 |      :param str personal_acce

In [0]:
# Ask the service which Vector Search endpoints exist; the raw response is printed
# as-is (an empty dict was returned when this was run with no endpoints).
endpoints = vsc.list_endpoints()
print("Available endpoints:", endpoints)

Available endpoints: {}


In [0]:
# Name of the Vector Search endpoint; the following cells create it, fetch it, and
# attach the index to it.
vector_search_endpoint_name = "vector-search-demo-endpoint"

In [0]:
# Create the endpoint only when it is not already present, so re-running the
# notebook does not fail on an "already exists" error.
# The list response is an empty dict when there are no endpoints; otherwise the
# endpoints are expected under the "endpoints" key.
# NOTE(review): if the workspace has enough endpoints for the list to be paginated,
# confirm the first page is sufficient for this check.
existing_endpoint_names = {
    ep.get("name") for ep in vsc.list_endpoints().get("endpoints", [])
}
if vector_search_endpoint_name not in existing_endpoint_names:
    vsc.create_endpoint(
        name=vector_search_endpoint_name,
        endpoint_type="STANDARD" # or "STORAGE_OPTIMIZED"
    )

# Display the endpoint description whether it was just created or already existed.
vsc.get_endpoint(name=vector_search_endpoint_name)

{'name': 'vector-search-demo-endpoint',
 'creator': 'senthilkumar.marimuthu@tvsmotor.com',
 'creation_timestamp': 1753070384426,
 'last_updated_timestamp': 1753070384426,
 'endpoint_type': 'STANDARD',
 'last_updated_user': 'senthilkumar.marimuthu@tvsmotor.com',
 'id': 'c9feecdc-1bf1-4e0c-b7f9-dd2e14eb5dea',
 'endpoint_status': {'state': 'PROVISIONING'},
 'num_indexes': 0}

In [0]:
# Fetch the endpoint's current description (it includes the provisioning state)
# and leave it as the cell's last expression so it is displayed.
endpoint = vsc.get_endpoint(name=vector_search_endpoint_name)
endpoint

{'name': 'vector-search-demo-endpoint',
 'creator': 'senthilkumar.marimuthu@tvsmotor.com',
 'creation_timestamp': 1753070384426,
 'last_updated_timestamp': 1753070384426,
 'endpoint_type': 'STANDARD',
 'last_updated_user': 'senthilkumar.marimuthu@tvsmotor.com',
 'id': 'c9feecdc-1bf1-4e0c-b7f9-dd2e14eb5dea',
 'endpoint_status': {'state': 'PROVISIONING'},
 'num_indexes': 0}

In [0]:
# Vector index
# Short index name, and the fully qualified name (prefixed with telemetry.lwrca)
# that is passed to the Vector Search client.
vs_index = "en_wiki_index"
vs_index_fullname = f"telemetry.lwrca.{vs_index}"

# Name of the embedding model endpoint handed to the index as
# `embedding_model_endpoint_name` when it is created.
embedding_model_endpoint = "databricks-gte-large-en"

In [0]:
# Create the Delta Sync index, or reuse it when it already exists on the endpoint,
# so the cell can be re-run without an "already exists" failure.
# The list response is expected to carry the indexes under "vector_indexes"
# (the key may be absent when the endpoint has none).
indexes_on_endpoint = vsc.list_indexes(vector_search_endpoint_name).get("vector_indexes", [])
index_already_exists = any(
    ix.get("name") == vs_index_fullname for ix in indexes_on_endpoint
)

if index_already_exists:
    index = vsc.get_index(
        endpoint_name=vector_search_endpoint_name,
        index_name=vs_index_fullname,
    )
else:
    index = vsc.create_delta_sync_index(
        endpoint_name=vector_search_endpoint_name,
        source_table_name='telemetry.lwrca.document_table',
        index_name=vs_index_fullname,
        pipeline_type='TRIGGERED',  # sync runs on demand rather than continuously
        primary_key="id",
        embedding_source_column="document",  # text column the service embeds
        embedding_model_endpoint_name=embedding_model_endpoint,
    )

# Show the index description (spec plus current status).
index.describe()

{'name': 'telemetry.lwrca.en_wiki_index',
 'endpoint_name': 'vector-search-demo-endpoint',
 'primary_key': 'id',
 'index_type': 'DELTA_SYNC',
 'delta_sync_index_spec': {'source_table': 'telemetry.lwrca.document_table',
  'embedding_source_columns': [{'name': 'document',
    'embedding_model_endpoint_name': 'databricks-gte-large-en'}],
  'pipeline_type': 'TRIGGERED',
  'pipeline_id': '54f00dd7-6270-4307-96dd-4dfb080e3243'},
 'status': {'detailed_state': 'PROVISIONING_INDEX',
  'message': 'Delta sync Index creation is pending. Check latest status: https://adb-1948512269941134.14.azuredatabricks.net/explore/data/telemetry/lwrca/en_wiki_index',
  'indexed_row_count': 0,
  'ready': False,
  'index_url': 'adb-1948512269941134.14.azuredatabricks.net/api/2.0/vector-search/indexes/telemetry.lwrca.en_wiki_index'},
 'creator': 'senthilkumar.marimuthu@tvsmotor.com',
 'endpoint_type': 'STANDARD'}

In [0]:
# Optional override (left disabled): uncomment to point the following cells at a different index.
#vs_index_fullname = "telemetry.lwrca.sample_doc_vectors_index"

In [0]:
# Look the index up by endpoint and fully qualified name, then show its current
# description (spec and sync status).
index = vsc.get_index(
    endpoint_name=vector_search_endpoint_name,
    index_name=vs_index_fullname,
)
index.describe()

{'name': 'telemetry.lwrca.en_wiki_index',
 'endpoint_name': 'vector-search-demo-endpoint',
 'primary_key': 'id',
 'index_type': 'DELTA_SYNC',
 'delta_sync_index_spec': {'source_table': 'telemetry.lwrca.document_table',
  'embedding_source_columns': [{'name': 'document',
    'embedding_model_endpoint_name': 'databricks-gte-large-en'}],
  'pipeline_type': 'TRIGGERED',
  'pipeline_id': '54f00dd7-6270-4307-96dd-4dfb080e3243'},
 'status': {'detailed_state': 'ONLINE_TRIGGERED_UPDATE',
  'message': 'Index is online but is currently is in the process of re-syncing initial data. Check latest status: https://adb-1948512269941134.14.azuredatabricks.net/explore/data/telemetry/lwrca/en_wiki_index',
  'indexed_row_count': 5,
  'triggered_update_status': {'last_processed_commit_version': 2,
   'last_processed_commit_timestamp': '2025-07-21T03:41:31Z',
   'triggered_update_progress': {'latest_version_currently_processing': 2,
    'num_synced_rows': 5,
    'total_rows_to_sync': 5,
    'sync_progress_co

In [0]:
import time

# Poll the index until its detailed state starts with "ONLINE".
# Unlike an unbounded loop, this gives up after a deadline and stops immediately
# if the index reports a failed state, so the notebook cannot hang forever.
index_wait_timeout_s = 30 * 60   # overall budget for the index to come online
index_poll_interval_s = 5        # pause between status checks

wait_deadline = time.monotonic() + index_wait_timeout_s
while True:
    # Tolerate a missing/None `status` or `detailed_state` by treating it as
    # "not ready yet" rather than raising AttributeError.
    detailed_state = (index.describe().get('status') or {}).get('detailed_state') or ''
    if detailed_state.startswith('ONLINE'):
        break
    if 'FAILED' in detailed_state:
        raise RuntimeError(f"Index entered a failed state: {detailed_state}")
    if time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"Index not ONLINE after {index_wait_timeout_s} seconds "
            f"(last state: {detailed_state or 'unknown'})"
        )
    print("Waiting for index to be ONLINE...")
    time.sleep(index_poll_interval_s)

print("Index is ONLINE")
index.describe()

Index is ONLINE


{'name': 'telemetry.lwrca.en_wiki_index',
 'endpoint_name': 'vector-search-demo-endpoint',
 'primary_key': 'id',
 'index_type': 'DELTA_SYNC',
 'delta_sync_index_spec': {'source_table': 'telemetry.lwrca.document_table',
  'embedding_source_columns': [{'name': 'document',
    'embedding_model_endpoint_name': 'databricks-gte-large-en'}],
  'pipeline_type': 'TRIGGERED',
  'pipeline_id': '54f00dd7-6270-4307-96dd-4dfb080e3243'},
 'status': {'detailed_state': 'ONLINE_NO_PENDING_UPDATE',
  'message': 'Index creation succeeded. Check latest status: https://adb-1948512269941134.14.azuredatabricks.net/explore/data/telemetry/lwrca/en_wiki_index',
  'indexed_row_count': 5,
  'triggered_update_status': {'last_processed_commit_version': 2,
   'last_processed_commit_timestamp': '2025-07-21T03:41:31Z'},
  'ready': True,
  'index_url': 'adb-1948512269941134.14.azuredatabricks.net/api/2.0/vector-search/indexes/telemetry.lwrca.en_wiki_index'},
 'creator': 'senthilkumar.marimuthu@tvsmotor.com',
 'endpoint_

In [0]:
# Run a text query against the index and keep the two closest matches.
# Only the `document` column is requested; the response shown for this cell also
# carries a `score` column alongside it.
search_request = {
    "query_text": "python used widely",
    "columns": ['document'],
    "num_results": 2,
}
results = index.similarity_search(**search_request)

results

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.


{'manifest': {'column_count': 2,
  'columns': [{'name': 'document'}, {'name': 'score'}]},
 'result': {'row_count': 2,
  'data_array': [['Python is widely used for data science.', 0.0034594734],
   ['Databricks simplifies big data and AI.', 0.0021118384]]},
 'debug_info': {'response_time': 1494.0,
  'ann_time': 18.0,
  'embedding_gen_time': 1470.0}}