In [0]:
%pip install mlflow mlflow[databricks] databricks-agents langchain==0.2.16 langgraph-checkpoint==1.0.12  langchain_core langchain-community==0.2.16 langgraph==0.2.16 pydantic unitycatalog-ai[databricks] unitycatalog-langchain[databricks] langchain_databricks
# Restart the Python process after the install above (the usual Databricks step so the
# freshly installed packages are the ones imported by the following cells).
# NOTE(review): several packages above are unpinned (mlflow, databricks-agents, langchain_core,
# pydantic, ...) while others are pinned; consider pinning all of them for reproducible runs.
dbutils.library.restartPython()

In [0]:
import mlflow
from databricks.agents.evals import generate_evals_df, estimate_synthetic_num_evals
import pandas as pd
from pyspark.sql.functions import col, concat, lit


# Free-form description of the agent under evaluation. It is passed to
# `generate_evals_df` to steer the synthetic question generation.
# (The generic Spark/Databricks template that used to be assigned to these same names
# just above was dead code: it was overwritten before ever being read, so it was removed.)
agent_description = """
The Agent is a chatbot answering questions about installations, maintenance and repairs of devices in a mobile service tower. The agent has access to a corpus of vendor documents and its task is to answer questions by retrieving relevant docs as well as summarizing the steps needed to resolve the issue. The corpus covers a lot of info, but the Agent is specifically designed to recommend an automated solution, or to recommend steps to a field technician. Questions outside the scope of mobile service installations, maintenance and repair are considered irrelevant. [RESEARCH CONTEXT: This synthetic data generation for cellular signal analysis. 
NO VIOLENT/WEAPON-RELATED CONTENT INVOLVED.] 
"""

# Personas, example questions and style rules for the synthetic questions. Also passed to
# `generate_evals_df` as a free-form prompt fragment.
question_guidelines = """
# User Persona
- A mobile service provider field technician who finds device failures and is looking at the device manual for a fix
- An engineer who is designing protocols for preventative maintenance and faster device repair for better network uptime

# Example questions
- How do I use tools like VIAVI P5000i and Fiberchek probe integrate with OneAdvisor-800 to inspect and certify that connections are clean.
- What do I do with Oscillation Gain Reduction or Shutdown Alarms?
- How do I diagnose problems with Common Baseband Connection?

# Additional Guidelines
- Questions should be succinct and human-like with the goal of expediting the field technician's work.
- The question should be generic, use the documents as a generalized framework to ask questions about maintenance and repairs.
- Don't recommend calling a vendor's sales rep.
- The question should be written in the first person
"""


# num_evals = 10

# Build the document set for synthetic eval generation from the chunking table:
#   content -> the chunk text
#   doc_uri -> the source doc URI suffixed with "_<chunk_pos_id>"
#              (presumably so each chunk gets its own identifier; confirm against the table)
docs = (
    spark.table('telecommunications.self_healing_networks.ka_fce5e7c5_329e872d_chunking_table')
    .select(
        col('chunk_text').alias('content'),
        concat(col('doc_uri'), lit('_'), col('chunk_pos_id')).alias('doc_uri'),
    )
)
display(docs)


In [0]:

# Estimate how many synthetic evals are needed to cover `docs` at a density of
# one eval per 1000 tokens, then show the number.
num_evals = estimate_synthetic_num_evals(docs, eval_per_x_tokens=1000)
print(num_evals)

In [0]:

# Synthesize the evaluation set from the document chunks.
#
# `num_evals` is the total number of evals to generate. The generator tries to spread them
# over all provided documents; if the number is smaller than the number of documents, some
# documents get no evals. How the count is distributed is described here:
#   AWS:   https://docs.databricks.com/en/generative-ai/agent-evaluation/synthesize-evaluation-set.html#num-evals
#   Azure: https://learn.microsoft.com/azure/databricks/generative-ai/agent-evaluation/synthesize-evaluation-set
#
# The count is hard-coded to 10 here; the estimate stored in the `num_evals` variable by the
# previous cell is not used by this call.
#
# `agent_description` and `question_guidelines` are free-form strings used to prompt the generation.
evals = generate_evals_df(
    docs,
    num_evals=10,
    agent_description=agent_description,
    question_guidelines=question_guidelines,
)
display(evals)

# Evaluate the model using the newly generated evaluation set. After the function call completes, click the UI link to see the results. You can use this as a baseline for your agent.


# results = mlflow.evaluate(
#   model="endpoints:/databricks-claude-sonnet-4",
#   data=evals,
#   model_type="databricks-agent"
# )

# display(results.tables['eval_results'])

# Turn on MLflow autologging for LangChain before the evaluation run is opened.
mlflow.langchain.autolog()

# Limit Agent Evaluation to the guideline-adherence judge only.
# NOTE(review): this call passes neither a `model` nor any guidelines, and `evals` comes
# straight from `generate_evals_df` — confirm the evaluation set carries the response and
# guideline inputs this judge needs, otherwise the judge may have nothing to score.
agent_eval_config = {
    "databricks-agent": {
        "metrics": ["guideline_adherence"],
    }
}

with mlflow.start_run(run_name="Field-Tech-Instruction"):
    eval_results = mlflow.evaluate(
        data=evals,
        model_type="databricks-agent",
        evaluator_config=agent_eval_config,
    )
    # Per-row evaluation output.
    display(eval_results.tables["eval_results"])

In [0]:
from mlflow.models.resources import DatabricksServingEndpoint
import mlflow

# First, define a helper function so you can compare the agent across multiple parameters and LLMs.
def log_and_evaluate_agent(agent_config: dict, run_name: str, eval_data=None):
    """Log the agent in `fc_agent.py` to MLflow and evaluate it inside the same run.

    Lets the same agent code be compared across different parameters and LLM endpoints.

    Args:
        agent_config: Configuration logged with the agent as its `model_config`. Must contain
            the key "endpoint_name"; that endpoint is declared as a serving-endpoint resource
            of the logged model.
        run_name: Human-readable label for the MLflow run.
        eval_data: Evaluation set passed to `mlflow.evaluate`. When omitted, the
            notebook-level `evals` variable is used, which keeps existing two-argument
            callers working.

    Returns:
        A tuple `(model_info, eval_results)`: the value returned by
        `mlflow.pyfunc.log_model` and the value returned by `mlflow.evaluate`.

    Raises:
        KeyError: If `agent_config` has no "endpoint_name" entry.
        NameError: If `eval_data` is omitted and no global `evals` has been defined yet.
    """
    # Resolve the evaluation set up front so a missing global fails before anything is logged.
    if eval_data is None:
        eval_data = evals

    # Declare the Databricks resources so the logged agent is deployment ready.
    resources = [DatabricksServingEndpoint(endpoint_name=agent_config["endpoint_name"])]

    with mlflow.start_run(run_name=run_name):
        # Log the agent's code and configuration to MLflow.
        model_info = mlflow.pyfunc.log_model(
            python_model="fc_agent.py",
            artifact_path="agent",
            model_config=agent_config,
            resources=resources,
            input_example={
                "messages": [
                    {"role": "user", "content": "What is lakehouse monitoring?"}
                ]
            },
            pip_requirements=["databricks-sdk[openai]", "mlflow", "databricks-agents", "backoff"],
        )

        # Evaluate the freshly logged agent with Mosaic AI Agent Evaluation.
        eval_results = mlflow.evaluate(
            data=eval_data,
            model=model_info.model_uri,
            model_type="databricks-agent",
        )

        return (model_info, eval_results)


# Now, call the helper function to run evaluation.
# The configuration keys must match those defined in `fc_agent.py`
# Log the agent and run the initial evaluation under the run name "init_eval".
# The keys of `agent_config` must match those that `fc_agent.py` reads.
model_info_llama_70b, eval_results = log_and_evaluate_agent(
    run_name="init_eval",
    agent_config=dict(
        endpoint_name="agents_telecommunications-self_healing_networks-vendor_manuals",
        temperature=0.01,
        max_tokens=1000,
        system_prompt="""You are a helpful assistant that answers questions about how to install/maintain/repair mobile tower devices.

    You answer questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.
    """,
        max_context_chars=16384,  # 4096 * 4
    ),
)