Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# Identity (Service Principal) — populated by bootstrap.sh
AZURE_CLIENT_ID=<app-registration-client-id>
AZURE_TENANT_ID=<entra-tenant-id>
AZURE_SUBSCRIPTION_ID=<azure-subscription-id>
SP_OBJECT_ID=<service-principal-object-id>

# Azure AI Foundry endpoints (no secrets — auth is via OIDC or az login)
FOUNDRY_TEST_ENDPOINT=https://eastus.api.azureml.ms/foundry/v1/subscriptions/<sub>/resourceGroups/<rg>/projects/<test-project>
FOUNDRY_PROD_ENDPOINT=https://eastus.api.azureml.ms/foundry/v1/subscriptions/<sub>/resourceGroups/<rg>/projects/<prod-project>
Expand All @@ -7,3 +13,9 @@ GPT_DEPLOYMENT=gpt-4o-2024-11-20

# Bing Grounding connection name configured in the Foundry project
BING_CONNECTION_NAME=bing-grounding

# Resource metadata
RESOURCE_GROUP=<resource-group-name>
LOCATION=swedencentral
ACCOUNT_NAME=<foundry-account-name>
GITHUB_REPO=<owner/repo>
4 changes: 2 additions & 2 deletions .github/workflows/evaluate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ jobs:
with:
azure-ai-project-endpoint: ${{ vars.FOUNDRY_TEST_ENDPOINT }}
deployment-name: ${{ vars.GPT_DEPLOYMENT }}
agent-ids: "tech-trends-agent:latest"
data-path: "./evals/golden-dataset.jsonl"
agent-ids: "tech-trends-agent:${{ steps.deploy.outputs.agent_version }}"
data-path: "./evals/golden-dataset.json"
evaluation-result-view: "all-scores"

- name: Post evaluation summary to PR
Expand Down
28 changes: 26 additions & 2 deletions .github/workflows/monitor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,44 @@ jobs:
steps:
- uses: actions/checkout@v4

# Provision a Python interpreter for the later steps in this job
# (the pip install and the inline python3 script).
- name: Set up Python
uses: actions/setup-python@v5
with:
# Quoted so YAML keeps the version as the string "3.12" instead of a float.
python-version: "3.12"
# Asks setup-python to cache pip downloads between runs.
cache: pip

# Install the packages listed in requirements.txt.
# NOTE(review): presumably this provides azure-ai-projects and azure-identity,
# which the inline script in a later step imports; the file is not shown here.
- name: Install dependencies
run: pip install -r requirements.txt

# Sign in to Azure. All three identifiers are read from the `vars` context
# (configuration variables); no client secret is passed in this step.
# NOTE(review): secret-less login presumably relies on OIDC federation, which
# needs `permissions: id-token: write` on the job; confirm, not visible here.
- uses: azure/login@v2
with:
client-id: ${{ vars.AZURE_CLIENT_ID }}
tenant-id: ${{ vars.AZURE_TENANT_ID }}
subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}

# Resolve the current version of the production "tech-trends-agent" and expose
# it as the step output `version` (read later as steps.agent.outputs.version),
# so the evaluation targets a pinned version rather than a moving tag.
- name: Get latest agent version
  id: agent
  env:
    FOUNDRY_PROD_ENDPOINT: ${{ vars.FOUNDRY_PROD_ENDPOINT }}
  run: |
    # Strict mode: abort on any failed command, unset variable or pipe failure.
    set -euo pipefail

    # The Python snippet prints the version on stdout; the endpoint is passed
    # through the environment instead of being interpolated into the script.
    VERSION=$(python3 -c "
    import os
    from azure.ai.projects import AIProjectClient
    from azure.identity import DefaultAzureCredential
    client = AIProjectClient(endpoint=os.environ['FOUNDRY_PROD_ENDPOINT'], credential=DefaultAzureCredential())
    agent = client.agents.get_agent('tech-trends-agent')
    print(agent.version)
    ")

    # Fail fast instead of handing "tech-trends-agent:" or
    # "tech-trends-agent:None" to the evaluation step (print() renders a
    # missing version attribute as the literal string "None").
    if [ -z "$VERSION" ] || [ "$VERSION" = "None" ]; then
      echo "::error::Could not resolve a version for tech-trends-agent" >&2
      exit 1
    fi

    # Quote the output file path so it is never word-split or globbed.
    echo "version=$VERSION" >> "$GITHUB_OUTPUT"

- name: Run evaluation against production agent
id: eval
uses: microsoft/ai-agent-evals@v3-beta
with:
azure-ai-project-endpoint: ${{ vars.FOUNDRY_PROD_ENDPOINT }}
deployment-name: ${{ vars.GPT_DEPLOYMENT }}
agent-ids: "tech-trends-agent:latest"
data-path: "./evals/golden-dataset.jsonl"
agent-ids: "tech-trends-agent:${{ steps.agent.outputs.version }}"
data-path: "./evals/golden-dataset.json"
evaluation-result-view: "default"

- name: Open GitHub issue if scores degraded
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ build/

# Environment
.env
.bootstrap-state.json

# IDE
.vscode/
Expand Down
4 changes: 2 additions & 2 deletions agents/tech-trends-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
"model": "${GPT_DEPLOYMENT}",
"instructions_file": "prompts/tech-trends-agent.md",
"tools": [
{ "type": "bing_grounding" }
{ "type": "web_search" }
]
},
"eval": {
"dataset": "evals/golden-dataset.jsonl",
"dataset": "evals/golden-dataset.json",
"phase_filter": "1",
"config": "evals/eval-config.json"
},
Expand Down
12 changes: 6 additions & 6 deletions evals/eval-config.json
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
{
"evaluators": [
"TaskAdherenceEvaluator",
"RelevanceEvaluator",
"GroundednessEvaluator",
"CoherenceEvaluator"
"builtin.task_adherence",
"builtin.relevance",
"builtin.groundedness",
"builtin.coherence"
],
"thresholds": {
"task_adherence": 0.80,
"relevance": 0.75,
"groundedness": 0.75,
"coherence": 0.80
},
"phase_filter": null,
"notes": "Set phase_filter to '1' or '2' to run only phase-specific cases. null runs all."
"phase_filter": "1",
"notes": "Phase 1: Only web search queries evaluated. Phase 2 data analysis queries excluded."
}
19 changes: 19 additions & 0 deletions evals/golden-dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"name": "tech-trends-agent-eval",
"evaluators": [
"builtin.task_adherence",
"builtin.relevance",
"builtin.groundedness",
"builtin.coherence"
],
"data": [
{"query": "What are the top three AI model releases in the last 90 days and their key capabilities?", "ground_truth": "Response should identify at least 3 recent model releases with specific capability descriptions, cite web sources, and be structured with a summary and key findings section.", "phase": "1", "category": "trend_research"},
{"query": "How is the major cloud provider landscape shifting in 2025 regarding AI infrastructure?", "ground_truth": "Response should cover at least 2 major cloud providers, discuss AI infrastructure investment or announcements, and cite current sources.", "phase": "1", "category": "trend_research"},
{"query": "What is the current state of open-source LLM adoption in enterprise settings?", "ground_truth": "Response should address enterprise adoption, mention specific models or frameworks, and provide a balanced perspective on open vs closed source.", "phase": "1", "category": "market_analysis"},
{"query": "Summarise recent developer sentiment around AI coding tools based on community discussions.", "ground_truth": "Response should reflect actual developer perspectives, not vendor claims, and cite community sources such as surveys, forums or publications.", "phase": "1", "category": "community_sentiment"},
{"query": "What are analysts predicting for AI chip demand over the next 12 months?", "ground_truth": "Response should include analyst predictions, reference specific companies or market segments, and note the source and date of predictions.", "phase": "1", "category": "market_forecast"},
{"query": "Search for the latest GPU benchmark comparison data and calculate which GPU offers the best performance-per-dollar ratio based on the data you find.", "ground_truth": "Response should retrieve actual benchmark data, perform a calculation or comparison, present results in a structured format, and cite the data source.", "phase": "2", "category": "data_analysis"},
{"query": "Find recent AI model API pricing tables and produce a comparison showing cost per million tokens for at least 4 models.", "ground_truth": "Response should retrieve current pricing data, present a structured comparison table, and identify the most cost-effective option for different use cases.", "phase": "2", "category": "data_analysis"},
{"query": "Retrieve the latest Stack Overflow developer survey data on AI tool usage and compute the percentage increase in adoption compared to the prior year.", "ground_truth": "Response should locate survey data, perform a percentage calculation, present the result clearly, and note the source and survey year.", "phase": "2", "category": "data_analysis"}
]
}
8 changes: 0 additions & 8 deletions evals/golden-dataset.jsonl

This file was deleted.

7 changes: 4 additions & 3 deletions infra/main.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ param gptModelVersion string = '2024-11-20'
param gptCapacity int = 30

// --- Cognitive Services account (hosts the Foundry project) ---
resource aiAccount 'Microsoft.CognitiveServices/accounts@2024-10-01' = {
resource aiAccount 'Microsoft.CognitiveServices/accounts@2025-06-01' = {
name: accountName
location: location
kind: 'AIServices'
Expand All @@ -33,19 +33,20 @@ resource aiAccount 'Microsoft.CognitiveServices/accounts@2024-10-01' = {
properties: {
customSubDomainName: accountName
publicNetworkAccess: 'Enabled'
allowProjectManagement: true
}
}

// --- AI Project ---
resource aiProject 'Microsoft.CognitiveServices/accounts/projects@2024-10-01' = {
resource aiProject 'Microsoft.CognitiveServices/accounts/projects@2025-06-01' = {
parent: aiAccount
name: projectName
location: location
properties: {}
}

// --- GPT model deployment ---
resource gptDeployment 'Microsoft.CognitiveServices/accounts/deployments@2024-10-01' = {
resource gptDeployment 'Microsoft.CognitiveServices/accounts/deployments@2025-06-01' = {
parent: aiAccount
name: gptDeploymentName
sku: {
Expand Down
Loading
Loading