From 52eecbed7d00eb5a574c468fd13b819766b921c8 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 17:32:00 +0200 Subject: [PATCH 1/8] =?UTF-8?q?feat:=20Phase=201=20=E2=80=94=20tech=20tren?= =?UTF-8?q?ds=20agent=20with=20web=20search=20(Bing=20Grounding)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- evals/eval-config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/eval-config.json b/evals/eval-config.json index 971e043..bba838d 100644 --- a/evals/eval-config.json +++ b/evals/eval-config.json @@ -11,6 +11,6 @@ "groundedness": 0.75, "coherence": 0.80 }, - "phase_filter": null, - "notes": "Set phase_filter to '1' or '2' to run only phase-specific cases. null runs all." + "phase_filter": "1", + "notes": "Phase 1: Only web search queries evaluated. Phase 2 data analysis queries excluded." } From d419a15caa0be3932ba038c2648c4747ddcbacfa Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 17:40:24 +0200 Subject: [PATCH 2/8] fix: switch from BingGroundingTool to WebSearchTool (no connection required) - Replace bing_grounding tool with web_search in agent config and deploy script - WebSearchTool requires no Bing resource or connection setup - Update Bicep API version to 2025-06-01 and default region to swedencentral - Add --skip-foundry flag to bootstrap for using existing Foundry projects - Fix RBAC role assignment to use role definition ID for Azure AI User - Add SPN and resource metadata to .env output --- .env.example | 12 ++ .gitignore | 1 + agents/tech-trends-agent.json | 2 +- infra/main.bicep | 7 +- scripts/bootstrap.sh | 148 +++++++++++++++------- scripts/deploy_agent.py | 22 ++-- scripts/lifecycle/01-phase1-web-search.sh | 8 +- 7 files changed, 136 insertions(+), 64 deletions(-) diff --git a/.env.example b/.env.example index c9a24b9..2d2f0d1 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,9 @@ +# Identity (Service Principal) — populated by bootstrap.sh +AZURE_CLIENT_ID= +AZURE_TENANT_ID= +AZURE_SUBSCRIPTION_ID= +SP_OBJECT_ID= + # Azure AI Foundry endpoints (no secrets — auth is via OIDC or az login) FOUNDRY_TEST_ENDPOINT=https://eastus.api.azureml.ms/foundry/v1/subscriptions//resourceGroups//projects/ FOUNDRY_PROD_ENDPOINT=https://eastus.api.azureml.ms/foundry/v1/subscriptions//resourceGroups//projects/ @@ -7,3 +13,9 @@ GPT_DEPLOYMENT=gpt-4o-2024-11-20 # Bing Grounding connection name configured in the Foundry project BING_CONNECTION_NAME=bing-grounding + +# Resource metadata +RESOURCE_GROUP= +LOCATION=swedencentral +ACCOUNT_NAME= +GITHUB_REPO= diff --git a/.gitignore b/.gitignore index e617638..a15918c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ build/ # Environment .env +.bootstrap-state.json # IDE .vscode/ diff --git a/agents/tech-trends-agent.json b/agents/tech-trends-agent.json index 6a095ca..fb6c08b 100644 --- a/agents/tech-trends-agent.json +++ b/agents/tech-trends-agent.json @@ -5,7 +5,7 @@ "model": "${GPT_DEPLOYMENT}", "instructions_file": "prompts/tech-trends-agent.md", "tools": [ - { "type": "bing_grounding" } + { "type": "web_search" } ] }, "eval": { diff --git a/infra/main.bicep b/infra/main.bicep index 83d6851..8e7346d 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -23,7 +23,7 @@ param gptModelVersion string = '2024-11-20' param gptCapacity int = 30 // --- Cognitive Services account (hosts the Foundry project) --- -resource aiAccount 'Microsoft.CognitiveServices/accounts@2024-10-01' = { +resource aiAccount 'Microsoft.CognitiveServices/accounts@2025-06-01' = { name: accountName location: location kind: 'AIServices' @@ -33,11 +33,12 @@ resource aiAccount 'Microsoft.CognitiveServices/accounts@2024-10-01' = { properties: { customSubDomainName: accountName publicNetworkAccess: 'Enabled' + allowProjectManagement: true } } // --- AI Project --- -resource aiProject 'Microsoft.CognitiveServices/accounts/projects@2024-10-01' = { +resource aiProject 'Microsoft.CognitiveServices/accounts/projects@2025-06-01' = { parent: aiAccount name: projectName location: location @@ -45,7 +46,7 @@ resource aiProject 'Microsoft.CognitiveServices/accounts/projects@2024-10-01' = } // --- GPT model deployment --- -resource gptDeployment 'Microsoft.CognitiveServices/accounts/deployments@2024-10-01' = { +resource gptDeployment 'Microsoft.CognitiveServices/accounts/deployments@2025-06-01' = { parent: aiAccount name: gptDeploymentName sku: { diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index cd2d005..4763408 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -4,8 +4,8 @@ # # Provisions: # 1. Resource group -# 2. TEST Foundry project (Bicep) -# 3. PROD Foundry project (Bicep) +# 2. TEST Foundry project (Bicep) — skipped with --skip-foundry +# 3. PROD Foundry project (Bicep) — skipped with --skip-foundry # 4. App Registration + Service Principal # 5. 3 Federated credentials (main, PR, tags) # 6. RBAC role assignments @@ -16,15 +16,23 @@ # Usage: # ./scripts/bootstrap.sh \ # --resource-group rg-agent-devops \ -# --location eastus \ +# --location swedencentral \ # --account-name agentdevops \ # --github-repo san360/agent-devops +# +# # Skip Foundry project creation (use existing projects): +# ./scripts/bootstrap.sh \ +# --resource-group rg-agent-devops \ +# --account-name agentdevops \ +# --skip-foundry \ +# --test-endpoint "https://..." \ +# --prod-endpoint "https://..." set -euo pipefail # ---------- defaults ---------- RESOURCE_GROUP="" -LOCATION="eastus" +LOCATION="swedencentral" ACCOUNT_NAME="" GITHUB_REPO="san360/agent-devops" GPT_MODEL_NAME="gpt-4o" @@ -32,22 +40,34 @@ GPT_MODEL_VERSION="2024-11-20" GPT_DEPLOYMENT_NAME="gpt-4o-2024-11-20" GPT_CAPACITY=30 BING_CONNECTION_NAME="bing-grounding" +SKIP_FOUNDRY=false +TEST_ENDPOINT="" +PROD_ENDPOINT="" # ---------- parse args ---------- while [[ $# -gt 0 ]]; do case $1 in - --resource-group) RESOURCE_GROUP="$2"; shift 2 ;; - --location) LOCATION="$2"; shift 2 ;; - --account-name) ACCOUNT_NAME="$2"; shift 2 ;; - --github-repo) GITHUB_REPO="$2"; shift 2 ;; + --resource-group) RESOURCE_GROUP="$2"; shift 2 ;; + --location) LOCATION="$2"; shift 2 ;; + --account-name) ACCOUNT_NAME="$2"; shift 2 ;; + --github-repo) GITHUB_REPO="$2"; shift 2 ;; --gpt-deployment) GPT_DEPLOYMENT_NAME="$2"; shift 2 ;; - --gpt-capacity) GPT_CAPACITY="$2"; shift 2 ;; + --gpt-capacity) GPT_CAPACITY="$2"; shift 2 ;; + --skip-foundry) SKIP_FOUNDRY=true; shift ;; + --test-endpoint) TEST_ENDPOINT="$2"; shift 2 ;; + --prod-endpoint) PROD_ENDPOINT="$2"; shift 2 ;; *) echo "Unknown flag: $1"; exit 1 ;; esac done if [[ -z "$RESOURCE_GROUP" || -z "$ACCOUNT_NAME" ]]; then echo "Usage: $0 --resource-group --account-name [--location ] [--github-repo ]" + echo " Add --skip-foundry --test-endpoint --prod-endpoint to use existing projects" + exit 1 +fi + +if [[ "$SKIP_FOUNDRY" == true && ( -z "$TEST_ENDPOINT" || -z "$PROD_ENDPOINT" ) ]]; then + echo "ERROR: --skip-foundry requires both --test-endpoint and --prod-endpoint" exit 1 fi @@ -61,8 +81,14 @@ echo "============================================" echo " Resource Group: $RESOURCE_GROUP" echo " Location: $LOCATION" echo " Account Name: $ACCOUNT_NAME" -echo " Test Project: $TEST_PROJECT" -echo " Prod Project: $PROD_PROJECT" +echo " Skip Foundry: $SKIP_FOUNDRY" +if [[ "$SKIP_FOUNDRY" == true ]]; then + echo " TEST endpoint: $TEST_ENDPOINT (provided)" + echo " PROD endpoint: $PROD_ENDPOINT (provided)" +else + echo " Test Project: $TEST_PROJECT" + echo " Prod Project: $PROD_PROJECT" +fi echo " GitHub Repo: $GITHUB_REPO" echo " GPT Deployment: $GPT_DEPLOYMENT_NAME" echo "============================================" @@ -75,39 +101,43 @@ az group create \ --location "$LOCATION" \ --output none -# ---------- Step 2: Deploy TEST project ---------- -echo "[2/7] Deploying TEST Foundry project..." -TEST_OUTPUT=$(az deployment group create \ - --resource-group "$RESOURCE_GROUP" \ - --template-file infra/main.bicep \ - --parameters \ - accountName="${ACCOUNT_NAME}test" \ - projectName="$TEST_PROJECT" \ - gptDeploymentName="$GPT_DEPLOYMENT_NAME" \ - gptModelName="$GPT_MODEL_NAME" \ - gptModelVersion="$GPT_MODEL_VERSION" \ - gptCapacity="$GPT_CAPACITY" \ - --output json) - -TEST_ENDPOINT=$(echo "$TEST_OUTPUT" | python3 -c "import sys,json; print(json.load(sys.stdin)['properties']['outputs']['projectEndpoint']['value'])") -echo " TEST endpoint: $TEST_ENDPOINT" - -# ---------- Step 3: Deploy PROD project ---------- -echo "[3/7] Deploying PROD Foundry project..." -PROD_OUTPUT=$(az deployment group create \ - --resource-group "$RESOURCE_GROUP" \ - --template-file infra/main.bicep \ - --parameters \ - accountName="${ACCOUNT_NAME}prod" \ - projectName="$PROD_PROJECT" \ - gptDeploymentName="$GPT_DEPLOYMENT_NAME" \ - gptModelName="$GPT_MODEL_NAME" \ - gptModelVersion="$GPT_MODEL_VERSION" \ - gptCapacity="$GPT_CAPACITY" \ - --output json) - -PROD_ENDPOINT=$(echo "$PROD_OUTPUT" | python3 -c "import sys,json; print(json.load(sys.stdin)['properties']['outputs']['projectEndpoint']['value'])") -echo " PROD endpoint: $PROD_ENDPOINT" +# ---------- Step 2 & 3: Deploy Foundry projects (or skip) ---------- +if [[ "$SKIP_FOUNDRY" == true ]]; then + echo "[2/7] Skipping TEST Foundry project (using provided endpoint)" + echo "[3/7] Skipping PROD Foundry project (using provided endpoint)" +else + echo "[2/7] Deploying TEST Foundry project..." + TEST_OUTPUT=$(az deployment group create \ + --resource-group "$RESOURCE_GROUP" \ + --template-file infra/main.bicep \ + --parameters \ + accountName="${ACCOUNT_NAME}test" \ + projectName="$TEST_PROJECT" \ + gptDeploymentName="$GPT_DEPLOYMENT_NAME" \ + gptModelName="$GPT_MODEL_NAME" \ + gptModelVersion="$GPT_MODEL_VERSION" \ + gptCapacity="$GPT_CAPACITY" \ + --output json) + + TEST_ENDPOINT=$(echo "$TEST_OUTPUT" | python3 -c "import sys,json; print(json.load(sys.stdin)['properties']['outputs']['projectEndpoint']['value'])") + echo " TEST endpoint: $TEST_ENDPOINT" + + echo "[3/7] Deploying PROD Foundry project..." + PROD_OUTPUT=$(az deployment group create \ + --resource-group "$RESOURCE_GROUP" \ + --template-file infra/main.bicep \ + --parameters \ + accountName="${ACCOUNT_NAME}prod" \ + projectName="$PROD_PROJECT" \ + gptDeploymentName="$GPT_DEPLOYMENT_NAME" \ + gptModelName="$GPT_MODEL_NAME" \ + gptModelVersion="$GPT_MODEL_VERSION" \ + gptCapacity="$GPT_CAPACITY" \ + --output json) + + PROD_ENDPOINT=$(echo "$PROD_OUTPUT" | python3 -c "import sys,json; print(json.load(sys.stdin)['properties']['outputs']['projectEndpoint']['value'])") + echo " PROD endpoint: $PROD_ENDPOINT" +fi # ---------- Step 4: App Registration + Service Principal ---------- echo "[4/7] Creating App Registration and Service Principal..." @@ -159,7 +189,7 @@ SCOPE="/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP" az role assignment create \ --assignee "$SP_OBJ_ID" \ - --role "Azure AI User" \ + --role "53ca6127-db72-4b80-b1b0-d745d6d5456d" \ --scope "$SCOPE" \ --output none echo " + Azure AI User" @@ -235,3 +265,31 @@ json.dump({ }, open('$STATE_FILE', 'w'), indent=2) " echo " State saved to $STATE_FILE (used by teardown.sh)" + +# Save .env for local development +ENV_FILE=".env" +cat > "$ENV_FILE" < agents/tech-trends-agent.json << 'AGENT_EOF' "model": "${GPT_DEPLOYMENT}", "instructions_file": "prompts/tech-trends-agent.md", "tools": [ - { "type": "bing_grounding" } + { "type": "web_search" } ] }, "eval": { @@ -102,7 +102,7 @@ EVAL_EOF # --- Commit, push, open PR --- git add agents/ prompts/ evals/ -git commit -m "feat: Phase 1 — tech trends agent with web search (Bing Grounding)" +git commit -m "feat: Phase 1 — tech trends agent with web search" git push origin "$BRANCH" @@ -111,7 +111,7 @@ PR_URL=$(gh pr create \ --title "Phase 1: Tech Trends Agent with Web Search" \ --body "$(cat <<'PR_EOF' ## Summary -- Initial agent deployment with Bing Grounding (web search) capability +- Initial agent deployment with Web Search capability (no connection required) - System prompt defines structured research analyst behaviour - Evaluation runs Phase 1 queries only (5 test cases) From c7ac261e6f63a326b4dd29a4b579b57f5695e63a Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 17:44:48 +0200 Subject: [PATCH 3/8] fix: add Azure AI Developer role on Foundry account for agents/write permission --- scripts/bootstrap.sh | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 4763408..8eee2f9 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -192,14 +192,32 @@ az role assignment create \ --role "53ca6127-db72-4b80-b1b0-d745d6d5456d" \ --scope "$SCOPE" \ --output none -echo " + Azure AI User" +echo " + Azure AI User (on resource group)" az role assignment create \ --assignee "$SP_OBJ_ID" \ --role "Cognitive Services OpenAI User" \ --scope "$SCOPE" \ --output none -echo " + Cognitive Services OpenAI User" +echo " + Cognitive Services OpenAI User (on resource group)" + +# Azure AI Developer on Foundry account scope (needed for agents/write data action) +if [[ -n "$TEST_ENDPOINT" ]]; then + FOUNDRY_HOST=$(echo "$TEST_ENDPOINT" | sed -E 's|https://([^/]+)/.*|\1|') + FOUNDRY_ACCOUNT_NAME=$(echo "$FOUNDRY_HOST" | sed -E 's|\.services\.ai\.azure\.com||') + FOUNDRY_ACCOUNT_ID=$(az cognitiveservices account list \ + --query "[?name=='${FOUNDRY_ACCOUNT_NAME}'].id | [0]" -o tsv 2>/dev/null) + if [[ -n "$FOUNDRY_ACCOUNT_ID" ]]; then + az role assignment create \ + --assignee "$SP_OBJ_ID" \ + --role "Azure AI Developer" \ + --scope "$FOUNDRY_ACCOUNT_ID" \ + --output none + echo " + Azure AI Developer (on Foundry account: $FOUNDRY_ACCOUNT_NAME)" + else + echo " ! Could not resolve Foundry account ID — assign Azure AI Developer manually" + fi +fi # ---------- Step 7: GitHub Variables ---------- echo "[7/7] Setting GitHub repository variables..." From 0355c648510ecd5190443a450d9b26c55b028681 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 17:47:58 +0200 Subject: [PATCH 4/8] fix: assign Azure AI Developer role at project level, not account level --- scripts/bootstrap.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 8eee2f9..0b97a27 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -201,21 +201,33 @@ az role assignment create \ --output none echo " + Cognitive Services OpenAI User (on resource group)" -# Azure AI Developer on Foundry account scope (needed for agents/write data action) +# Azure AI Developer on Foundry project scope (needed for agents/write data action) if [[ -n "$TEST_ENDPOINT" ]]; then FOUNDRY_HOST=$(echo "$TEST_ENDPOINT" | sed -E 's|https://([^/]+)/.*|\1|') FOUNDRY_ACCOUNT_NAME=$(echo "$FOUNDRY_HOST" | sed -E 's|\.services\.ai\.azure\.com||') + TEST_PROJECT_NAME=$(echo "$TEST_ENDPOINT" | sed -E 's|.*/projects/([^/]+).*|\1|') FOUNDRY_ACCOUNT_ID=$(az cognitiveservices account list \ --query "[?name=='${FOUNDRY_ACCOUNT_NAME}'].id | [0]" -o tsv 2>/dev/null) if [[ -n "$FOUNDRY_ACCOUNT_ID" ]]; then az role assignment create \ --assignee "$SP_OBJ_ID" \ --role "Azure AI Developer" \ - --scope "$FOUNDRY_ACCOUNT_ID" \ + --scope "$FOUNDRY_ACCOUNT_ID/projects/$TEST_PROJECT_NAME" \ --output none - echo " + Azure AI Developer (on Foundry account: $FOUNDRY_ACCOUNT_NAME)" + echo " + Azure AI Developer (on project: $TEST_PROJECT_NAME)" else - echo " ! Could not resolve Foundry account ID — assign Azure AI Developer manually" + echo " ! Could not resolve Foundry account — assign Azure AI Developer on project manually" + fi +fi +if [[ -n "$PROD_ENDPOINT" && "$PROD_ENDPOINT" != "$TEST_ENDPOINT" ]]; then + PROD_PROJECT_NAME=$(echo "$PROD_ENDPOINT" | sed -E 's|.*/projects/([^/]+).*|\1|') + if [[ -n "$FOUNDRY_ACCOUNT_ID" ]]; then + az role assignment create \ + --assignee "$SP_OBJ_ID" \ + --role "Azure AI Developer" \ + --scope "$FOUNDRY_ACCOUNT_ID/projects/$PROD_PROJECT_NAME" \ + --output none + echo " + Azure AI Developer (on project: $PROD_PROJECT_NAME)" fi fi From 08d3533150071c62c84800e2b1ed6e049110d680 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 17:52:22 +0200 Subject: [PATCH 5/8] fix: use Azure AI User (Foundry User) role on account scope for agents/write --- scripts/bootstrap.sh | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 0b97a27..05ef463 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -201,33 +201,21 @@ az role assignment create \ --output none echo " + Cognitive Services OpenAI User (on resource group)" -# Azure AI Developer on Foundry project scope (needed for agents/write data action) +# Azure AI User (Foundry User) on Foundry account scope — grants agents/write data action if [[ -n "$TEST_ENDPOINT" ]]; then FOUNDRY_HOST=$(echo "$TEST_ENDPOINT" | sed -E 's|https://([^/]+)/.*|\1|') FOUNDRY_ACCOUNT_NAME=$(echo "$FOUNDRY_HOST" | sed -E 's|\.services\.ai\.azure\.com||') - TEST_PROJECT_NAME=$(echo "$TEST_ENDPOINT" | sed -E 's|.*/projects/([^/]+).*|\1|') FOUNDRY_ACCOUNT_ID=$(az cognitiveservices account list \ --query "[?name=='${FOUNDRY_ACCOUNT_NAME}'].id | [0]" -o tsv 2>/dev/null) if [[ -n "$FOUNDRY_ACCOUNT_ID" ]]; then az role assignment create \ --assignee "$SP_OBJ_ID" \ - --role "Azure AI Developer" \ - --scope "$FOUNDRY_ACCOUNT_ID/projects/$TEST_PROJECT_NAME" \ + --role "53ca6127-db72-4b80-b1b0-d745d6d5456d" \ + --scope "$FOUNDRY_ACCOUNT_ID" \ --output none - echo " + Azure AI Developer (on project: $TEST_PROJECT_NAME)" + echo " + Azure AI User / Foundry User (on account: $FOUNDRY_ACCOUNT_NAME)" else - echo " ! Could not resolve Foundry account — assign Azure AI Developer on project manually" - fi -fi -if [[ -n "$PROD_ENDPOINT" && "$PROD_ENDPOINT" != "$TEST_ENDPOINT" ]]; then - PROD_PROJECT_NAME=$(echo "$PROD_ENDPOINT" | sed -E 's|.*/projects/([^/]+).*|\1|') - if [[ -n "$FOUNDRY_ACCOUNT_ID" ]]; then - az role assignment create \ - --assignee "$SP_OBJ_ID" \ - --role "Azure AI Developer" \ - --scope "$FOUNDRY_ACCOUNT_ID/projects/$PROD_PROJECT_NAME" \ - --output none - echo " + Azure AI Developer (on project: $PROD_PROJECT_NAME)" + echo " ! Could not resolve Foundry account — assign Azure AI User (53ca6127-...) manually" fi fi From dad27752c19ee931be672de56016454698219474 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 17:55:32 +0200 Subject: [PATCH 6/8] fix: convert eval dataset from JSONL to JSON array for ai-agent-evals action --- .github/workflows/evaluate.yml | 2 +- .github/workflows/monitor.yml | 2 +- agents/tech-trends-agent.json | 2 +- evals/golden-dataset.json | 10 ++++++++++ evals/golden-dataset.jsonl | 8 -------- scripts/lifecycle/01-phase1-web-search.sh | 2 +- scripts/lifecycle/02-phase2-code-interpreter.sh | 2 +- tests/conftest.py | 2 +- 8 files changed, 16 insertions(+), 14 deletions(-) create mode 100644 evals/golden-dataset.json delete mode 100644 evals/golden-dataset.jsonl diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index c8d5659..e3a3f24 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -74,7 +74,7 @@ jobs: azure-ai-project-endpoint: ${{ vars.FOUNDRY_TEST_ENDPOINT }} deployment-name: ${{ vars.GPT_DEPLOYMENT }} agent-ids: "tech-trends-agent:latest" - data-path: "./evals/golden-dataset.jsonl" + data-path: "./evals/golden-dataset.json" evaluation-result-view: "all-scores" - name: Post evaluation summary to PR diff --git a/.github/workflows/monitor.yml b/.github/workflows/monitor.yml index 3eec93d..9241bff 100644 --- a/.github/workflows/monitor.yml +++ b/.github/workflows/monitor.yml @@ -29,7 +29,7 @@ jobs: azure-ai-project-endpoint: ${{ vars.FOUNDRY_PROD_ENDPOINT }} deployment-name: ${{ vars.GPT_DEPLOYMENT }} agent-ids: "tech-trends-agent:latest" - data-path: "./evals/golden-dataset.jsonl" + data-path: "./evals/golden-dataset.json" evaluation-result-view: "default" - name: Open GitHub issue if scores degraded diff --git a/agents/tech-trends-agent.json b/agents/tech-trends-agent.json index fb6c08b..9d6b181 100644 --- a/agents/tech-trends-agent.json +++ b/agents/tech-trends-agent.json @@ -9,7 +9,7 @@ ] }, "eval": { - "dataset": "evals/golden-dataset.jsonl", + "dataset": "evals/golden-dataset.json", "phase_filter": "1", "config": "evals/eval-config.json" }, diff --git a/evals/golden-dataset.json b/evals/golden-dataset.json new file mode 100644 index 0000000..3a16666 --- /dev/null +++ b/evals/golden-dataset.json @@ -0,0 +1,10 @@ +[ + {"query": "What are the top three AI model releases in the last 90 days and their key capabilities?", "ground_truth": "Response should identify at least 3 recent model releases with specific capability descriptions, cite web sources, and be structured with a summary and key findings section.", "phase": "1", "category": "trend_research"}, + {"query": "How is the major cloud provider landscape shifting in 2025 regarding AI infrastructure?", "ground_truth": "Response should cover at least 2 major cloud providers, discuss AI infrastructure investment or announcements, and cite current sources.", "phase": "1", "category": "trend_research"}, + {"query": "What is the current state of open-source LLM adoption in enterprise settings?", "ground_truth": "Response should address enterprise adoption, mention specific models or frameworks, and provide balanced perspective on open vs closed source.", "phase": "1", "category": "market_analysis"}, + {"query": "Summarise recent developer sentiment around AI coding tools based on community discussions.", "ground_truth": "Response should reflect actual developer perspectives, not vendor claims, and cite community sources such as surveys, forums or publications.", "phase": "1", "category": "community_sentiment"}, + {"query": "What are analysts predicting for AI chip demand over the next 12 months?", "ground_truth": "Response should include analyst predictions, reference specific companies or market segments, and note the source and date of predictions.", "phase": "1", "category": "market_forecast"}, + {"query": "Search for the latest GPU benchmark comparison data and calculate which GPU offers the best performance-per-dollar ratio based on the data you find.", "ground_truth": "Response should retrieve actual benchmark data, perform a calculation or comparison, present results in a structured format, and cite the data source.", "phase": "2", "category": "data_analysis"}, + {"query": "Find recent AI model API pricing tables and produce a comparison showing cost per million tokens for at least 4 models.", "ground_truth": "Response should retrieve current pricing data, present a structured comparison table, and identify the most cost-effective option for different use cases.", "phase": "2", "category": "data_analysis"}, + {"query": "Retrieve the latest Stack Overflow developer survey data on AI tool usage and compute the percentage increase in adoption compared to the prior year.", "ground_truth": "Response should locate survey data, perform a percentage calculation, present the result clearly, and note the source and survey year.", "phase": "2", "category": "data_analysis"} +] diff --git a/evals/golden-dataset.jsonl b/evals/golden-dataset.jsonl deleted file mode 100644 index 15b29a9..0000000 --- a/evals/golden-dataset.jsonl +++ /dev/null @@ -1,8 +0,0 @@ -{"query": "What are the top three AI model releases in the last 90 days and their key capabilities?", "ground_truth": "Response should identify at least 3 recent model releases with specific capability descriptions, cite web sources, and be structured with a summary and key findings section.", "phase": "1", "category": "trend_research"} -{"query": "How is the major cloud provider landscape shifting in 2025 regarding AI infrastructure?", "ground_truth": "Response should cover at least 2 major cloud providers, discuss AI infrastructure investment or announcements, and cite current sources.", "phase": "1", "category": "trend_research"} -{"query": "What is the current state of open-source LLM adoption in enterprise settings?", "ground_truth": "Response should address enterprise adoption, mention specific models or frameworks, and provide balanced perspective on open vs closed source.", "phase": "1", "category": "market_analysis"} -{"query": "Summarise recent developer sentiment around AI coding tools based on community discussions.", "ground_truth": "Response should reflect actual developer perspectives, not vendor claims, and cite community sources such as surveys, forums or publications.", "phase": "1", "category": "community_sentiment"} -{"query": "What are analysts predicting for AI chip demand over the next 12 months?", "ground_truth": "Response should include analyst predictions, reference specific companies or market segments, and note the source and date of predictions.", "phase": "1", "category": "market_forecast"} -{"query": "Search for the latest GPU benchmark comparison data and calculate which GPU offers the best performance-per-dollar ratio based on the data you find.", "ground_truth": "Response should retrieve actual benchmark data, perform a calculation or comparison, present results in a structured format, and cite the data source.", "phase": "2", "category": "data_analysis"} -{"query": "Find recent AI model API pricing tables and produce a comparison showing cost per million tokens for at least 4 models.", "ground_truth": "Response should retrieve current pricing data, present a structured comparison table, and identify the most cost-effective option for different use cases.", "phase": "2", "category": "data_analysis"} -{"query": "Retrieve the latest Stack Overflow developer survey data on AI tool usage and compute the percentage increase in adoption compared to the prior year.", "ground_truth": "Response should locate survey data, perform a percentage calculation, present the result clearly, and note the source and survey year.", "phase": "2", "category": "data_analysis"} diff --git a/scripts/lifecycle/01-phase1-web-search.sh b/scripts/lifecycle/01-phase1-web-search.sh index fc5668f..247acfa 100644 --- a/scripts/lifecycle/01-phase1-web-search.sh +++ b/scripts/lifecycle/01-phase1-web-search.sh @@ -37,7 +37,7 @@ cat > agents/tech-trends-agent.json << 'AGENT_EOF' ] }, "eval": { - "dataset": "evals/golden-dataset.jsonl", + "dataset": "evals/golden-dataset.json", "phase_filter": "1", "config": "evals/eval-config.json" }, diff --git a/scripts/lifecycle/02-phase2-code-interpreter.sh b/scripts/lifecycle/02-phase2-code-interpreter.sh index 066670a..c2df73e 100644 --- a/scripts/lifecycle/02-phase2-code-interpreter.sh +++ b/scripts/lifecycle/02-phase2-code-interpreter.sh @@ -40,7 +40,7 @@ cat > agents/tech-trends-agent.json << 'AGENT_EOF' ] }, "eval": { - "dataset": "evals/golden-dataset.jsonl", + "dataset": "evals/golden-dataset.json", "phase_filter": null, "config": "evals/eval-config.json" }, diff --git a/tests/conftest.py b/tests/conftest.py index 9f07b9b..c3249a4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,7 @@ def tmp_project(tmp_path): "tools": [{"type": "bing_grounding"}], }, "eval": { - "dataset": "evals/golden-dataset.jsonl", + "dataset": "evals/golden-dataset.json", "phase_filter": "1", "config": "evals/eval-config.json", }, From 97bcafa843c1e9b242a21c62c07d19ad0b21dbdc Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 17:58:35 +0200 Subject: [PATCH 7/8] fix: pass actual agent version to ai-agent-evals action instead of 'latest' --- .github/workflows/evaluate.yml | 2 +- .github/workflows/monitor.yml | 26 +++++++++++++++++++++++++- scripts/deploy_agent.py | 7 +++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index e3a3f24..5e81110 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -73,7 +73,7 @@ jobs: with: azure-ai-project-endpoint: ${{ vars.FOUNDRY_TEST_ENDPOINT }} deployment-name: ${{ vars.GPT_DEPLOYMENT }} - agent-ids: "tech-trends-agent:latest" + agent-ids: "tech-trends-agent:${{ steps.deploy.outputs.agent_version }}" data-path: "./evals/golden-dataset.json" evaluation-result-view: "all-scores" diff --git a/.github/workflows/monitor.yml b/.github/workflows/monitor.yml index 9241bff..cd9bbf3 100644 --- a/.github/workflows/monitor.yml +++ b/.github/workflows/monitor.yml @@ -16,19 +16,43 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install dependencies + run: pip install -r requirements.txt + - uses: azure/login@v2 with: client-id: ${{ vars.AZURE_CLIENT_ID }} tenant-id: ${{ vars.AZURE_TENANT_ID }} subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }} + - name: Get latest agent version + id: agent + env: + FOUNDRY_PROD_ENDPOINT: ${{ vars.FOUNDRY_PROD_ENDPOINT }} + run: | + VERSION=$(python3 -c " + import os + from azure.ai.projects import AIProjectClient + from azure.identity import DefaultAzureCredential + client = AIProjectClient(endpoint=os.environ['FOUNDRY_PROD_ENDPOINT'], credential=DefaultAzureCredential()) + agent = client.agents.get_agent('tech-trends-agent') + print(agent.version) + ") + echo "version=$VERSION" >> $GITHUB_OUTPUT + - name: Run evaluation against production agent id: eval uses: microsoft/ai-agent-evals@v3-beta with: azure-ai-project-endpoint: ${{ vars.FOUNDRY_PROD_ENDPOINT }} deployment-name: ${{ vars.GPT_DEPLOYMENT }} - agent-ids: "tech-trends-agent:latest" + agent-ids: "tech-trends-agent:${{ steps.agent.outputs.version }}" data-path: "./evals/golden-dataset.json" evaluation-result-view: "default" diff --git a/scripts/deploy_agent.py b/scripts/deploy_agent.py index 8f6338d..4cd433e 100644 --- a/scripts/deploy_agent.py +++ b/scripts/deploy_agent.py @@ -111,6 +111,13 @@ def deploy_agent(env: str, tools: list, semver: str): json.dump(artifact, f, indent=2) print(f"Deployed {agent.version} | artifact -> {artifact_path}") + + # Output for GitHub Actions + gh_output = os.environ.get("GITHUB_OUTPUT") + if gh_output: + with open(gh_output, "a") as f: + f.write(f"agent_version={agent.version}\n") + return artifact, artifact_path From f62a258108a7538385ef5c0f21884837f5c377ac Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:03:32 +0200 Subject: [PATCH 8/8] fix: restructure eval dataset to match ai-agent-evals v3-beta expected format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The action expects a JSON object with name, evaluators, and data fields — not a bare array. Also align evaluator names to builtin.* convention. --- evals/eval-config.json | 8 ++++---- evals/golden-dataset.json | 29 +++++++++++++++++++---------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/evals/eval-config.json b/evals/eval-config.json index bba838d..5cafb0a 100644 --- a/evals/eval-config.json +++ b/evals/eval-config.json @@ -1,9 +1,9 @@ { "evaluators": [ - "TaskAdherenceEvaluator", - "RelevanceEvaluator", - "GroundednessEvaluator", - "CoherenceEvaluator" + "builtin.task_adherence", + "builtin.relevance", + "builtin.groundedness", + "builtin.coherence" ], "thresholds": { "task_adherence": 0.80, diff --git a/evals/golden-dataset.json b/evals/golden-dataset.json index 3a16666..25a48ff 100644 --- a/evals/golden-dataset.json +++ b/evals/golden-dataset.json @@ -1,10 +1,19 @@ -[ - {"query": "What are the top three AI model releases in the last 90 days and their key capabilities?", "ground_truth": "Response should identify at least 3 recent model releases with specific capability descriptions, cite web sources, and be structured with a summary and key findings section.", "phase": "1", "category": "trend_research"}, - {"query": "How is the major cloud provider landscape shifting in 2025 regarding AI infrastructure?", "ground_truth": "Response should cover at least 2 major cloud providers, discuss AI infrastructure investment or announcements, and cite current sources.", "phase": "1", "category": "trend_research"}, - {"query": "What is the current state of open-source LLM adoption in enterprise settings?", "ground_truth": "Response should address enterprise adoption, mention specific models or frameworks, and provide balanced perspective on open vs closed source.", "phase": "1", "category": "market_analysis"}, - {"query": "Summarise recent developer sentiment around AI coding tools based on community discussions.", "ground_truth": "Response should reflect actual developer perspectives, not vendor claims, and cite community sources such as surveys, forums or publications.", "phase": "1", "category": "community_sentiment"}, - {"query": "What are analysts predicting for AI chip demand over the next 12 months?", "ground_truth": "Response should include analyst predictions, reference specific companies or market segments, and note the source and date of predictions.", "phase": "1", "category": "market_forecast"}, - {"query": "Search for the latest GPU benchmark comparison data and calculate which GPU offers the best performance-per-dollar ratio based on the data you find.", "ground_truth": "Response should retrieve actual benchmark data, perform a calculation or comparison, present results in a structured format, and cite the data source.", "phase": "2", "category": "data_analysis"}, - {"query": "Find recent AI model API pricing tables and produce a comparison showing cost per million tokens for at least 4 models.", "ground_truth": "Response should retrieve current pricing data, present a structured comparison table, and identify the most cost-effective option for different use cases.", "phase": "2", "category": "data_analysis"}, - {"query": "Retrieve the latest Stack Overflow developer survey data on AI tool usage and compute the percentage increase in adoption compared to the prior year.", "ground_truth": "Response should locate survey data, perform a percentage calculation, present the result clearly, and note the source and survey year.", "phase": "2", "category": "data_analysis"} -] +{ + "name": "tech-trends-agent-eval", + "evaluators": [ + "builtin.task_adherence", + "builtin.relevance", + "builtin.groundedness", + "builtin.coherence" + ], + "data": [ + {"query": "What are the top three AI model releases in the last 90 days and their key capabilities?", "ground_truth": "Response should identify at least 3 recent model releases with specific capability descriptions, cite web sources, and be structured with a summary and key findings section.", "phase": "1", "category": "trend_research"}, + {"query": "How is the major cloud provider landscape shifting in 2025 regarding AI infrastructure?", "ground_truth": "Response should cover at least 2 major cloud providers, discuss AI infrastructure investment or announcements, and cite current sources.", "phase": "1", "category": "trend_research"}, + {"query": "What is the current state of open-source LLM adoption in enterprise settings?", "ground_truth": "Response should address enterprise adoption, mention specific models or frameworks, and provide balanced perspective on open vs closed source.", "phase": "1", "category": "market_analysis"}, + {"query": "Summarise recent developer sentiment around AI coding tools based on community discussions.", "ground_truth": "Response should reflect actual developer perspectives, not vendor claims, and cite community sources such as surveys, forums or publications.", "phase": "1", "category": "community_sentiment"}, + {"query": "What are analysts predicting for AI chip demand over the next 12 months?", "ground_truth": "Response should include analyst predictions, reference specific companies or market segments, and note the source and date of predictions.", "phase": "1", "category": "market_forecast"}, + {"query": "Search for the latest GPU benchmark comparison data and calculate which GPU offers the best performance-per-dollar ratio based on the data you find.", "ground_truth": "Response should retrieve actual benchmark data, perform a calculation or comparison, present results in a structured format, and cite the data source.", "phase": "2", "category": "data_analysis"}, + {"query": "Find recent AI model API pricing tables and produce a comparison showing cost per million tokens for at least 4 models.", "ground_truth": "Response should retrieve current pricing data, present a structured comparison table, and identify the most cost-effective option for different use cases.", "phase": "2", "category": "data_analysis"}, + {"query": "Retrieve the latest Stack Overflow developer survey data on AI tool usage and compute the percentage increase in adoption compared to the prior year.", "ground_truth": "Response should locate survey data, perform a percentage calculation, present the result clearly, and note the source and survey year.", "phase": "2", "category": "data_analysis"} + ] +}