
# This Notebook Deploys a Fine-tuned Adapter + Base SBERT Model

**NOTE**:
* Use the 14.3 LTS ML runtime to avoid the Python `snappy-c.h` related issue when performing model deployment
* Be sure to use the same model environment you used to log/register the model

In [0]:
%pip install -U databricks-sdk
%pip install -U sentence-transformers
%pip install -U mlflow
# NOTE(review): python-snappy is pinned; presumably this relates to the `snappy-c.h` deployment issue -- confirm.
%pip install python-snappy==0.7.3
%pip install einops
# torch and torchvision are pinned to a specific pair of versions.
%pip install torch==2.4.0 torchvision==0.19.0
# Restart the Python process so the packages installed above are the ones picked up by later imports.
dbutils.library.restartPython()

In [0]:
# Notebook parameters: target catalog, target schema and the registered adapter model name.
dbutils.widgets.text(
    name="target_catalog",
    defaultValue="dev_catalog",
    label="Catalog",
)
dbutils.widgets.text(
    name="target_schema",
    defaultValue="dev_schema",
    label="Schema",
)
dbutils.widgets.text(
    name="ft_adapter_model",
    defaultValue="snowflake-arctic-embed-m-long-linear-adapter",
    label="Fine-tuned adapter model",
)

In [0]:
# Read the three widget values, in the same order they were declared.
catalog, schema, ft_adapter_model = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("target_catalog", "target_schema", "ft_adapter_model")
)
print(f"catalog: {catalog}, schema: {schema}, embedding model: {ft_adapter_model}")

# Download the Pretrained Embedding Model from HuggingFace

In [0]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import torch

In [0]:
from mlflow.tracking import MlflowClient

def get_latest_model_version(model_name):
    """Return the highest version number registered for a model.

    Args:
        model_name: Name of the registered model; it is embedded in the
            ``name = '...'`` filter passed to the registry search.

    Returns:
        The largest version number, as an ``int``, among the model versions
        returned by the search.

    Raises:
        ValueError: If the search returns no versions for ``model_name``.
    """
    client = MlflowClient()
    model_version_infos = client.search_model_versions(f"name = '{model_name}'")
    # Compare versions as integers so that, e.g., 10 ranks above 9.
    versions = [int(info.version) for info in model_version_infos]
    if not versions:
        # A bare max() on an empty list would raise an unhelpful
        # "max() arg is an empty sequence"; say which model was missing instead.
        raise ValueError(f"No registered versions found for model '{model_name}'")
    return max(versions)

# If the instructor needs to update the model, change the schema to SHARED_SCHEMA.
latest_model_version = get_latest_model_version(
    f"{catalog}.{schema}.{ft_adapter_model}"
)
print(f"To deploy {ft_adapter_model}/{latest_model_version}")

# Load Fine-tuned Adapter to Test

In [0]:
# Serving endpoint settings; the endpoint reuses the adapter model's name.
endpoint_name = ft_adapter_model
workload_type = "GPU_SMALL"

# catalog.schema.model name of the registered model, and the URI of the version to deploy.
model_uc_path = f"{catalog}.{schema}.{ft_adapter_model}"
model_uri = f"models:/{model_uc_path}/{latest_model_version}"

# Switch the session to the target catalog and schema.
spark.sql(f"USE CATALOG {catalog}")
spark.sql(f"USE SCHEMA {schema}")

In [0]:
import mlflow

# Load the selected registered model version through MLflow's sentence-transformers flavor.
load_sbert_model = mlflow.sentence_transformers.load_model(model_uri=model_uri)

In [0]:
# Smoke test: encode two sample sentences with the loaded model before deploying it.
load_sbert_model.encode(["Hello world", "This is a test"])

# Serving the model with Model Serving

In [0]:
# Create or update serving endpoint
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput, ServedModelInput

# Endpoint configuration: serve a single version of the registered model.
config = EndpointCoreConfigInput.from_dict({
    "served_models": [
        {
            "name": endpoint_name,
            "model_name": model_uc_path,
            # The SDK declares model_version as a string; the value computed
            # from the registry is an int, so convert it explicitly.
            "model_version": str(latest_model_version),
            "workload_type": workload_type,
            "workload_size": "Small",
            # The SDK declares this field as a bool; the string "True" is not one.
            "scale_to_zero_enabled": True,
        }
    ]
})

In [0]:
import datetime
from databricks.sdk import WorkspaceClient

# Create the serving endpoint and block until the call returns or 30 minutes elapse.
# NOTE(review): this only creates; re-running against an existing endpoint presumably fails -- confirm.
w = WorkspaceClient()
w.serving_endpoints.create_and_wait(
    name=endpoint_name,
    timeout=datetime.timedelta(minutes=30),
    config=config,
)