diff --git a/ENV_SETUP.md b/ENV_SETUP.md index c1ad886..92d5dc0 100644 --- a/ENV_SETUP.md +++ b/ENV_SETUP.md @@ -59,6 +59,46 @@ export SEARCH_BACKEND=DuckDuckGo export NO_FORCE_TERMINAL=false ``` +## Alerting 服务环境变量(数据库 + Webhook 鉴权) + +用于接收 Alertmanager Webhook 并将事件入库。 + +### macOS/Linux +```bash +# 数据库连接(示例:本机 Docker Postgres) +export DB_HOST=localhost +export DB_PORT=5432 +export DB_USER=postgres +export DB_PASSWORD=postgres +export DB_NAME=zeroops +export DB_SSLMODE=disable + +# Webhook 鉴权(与 Alertmanager http_config 对齐,二选一) +# 1) Basic Auth +export ALERT_WEBHOOK_BASIC_USER=alert +export ALERT_WEBHOOK_BASIC_PASS=REDACTED +# 2) Bearer Token(如使用该方式,注释掉上面的 Basic) +# export ALERT_WEBHOOK_BEARER=your_token_here +``` + +### Windows(PowerShell) +```powershell +$env:DB_HOST="localhost" +$env:DB_PORT="5432" +$env:DB_USER="postgres" +$env:DB_PASSWORD="postgres" +$env:DB_NAME="zeroops" +$env:DB_SSLMODE="disable" + +# Basic Auth +$env:ALERT_WEBHOOK_BASIC_USER="alert" +$env:ALERT_WEBHOOK_BASIC_PASS="REDACTED" +# 或 Bearer +# $env:ALERT_WEBHOOK_BEARER="your_token_here" +``` + +> 启动服务后,可用 README 中的 curl 示例向 `/v1/integrations/alertmanager/webhook` 发送事件并在数据库中验证。 + ## 环境变量详细说明 ### 必需配置 diff --git a/client/package.json b/client/package.json index 97991b1..2cbb4b7 100644 --- a/client/package.json +++ b/client/package.json @@ -38,6 +38,7 @@ "typescript": "~5.8.0", "vite": "^7.0.6", "vite-plugin-vue-devtools": "^8.0.0", + "vue-eslint-parser": "^10", "vue-tsc": "^3.0.4" } } diff --git a/client/src/api/index.ts b/client/src/api/index.ts index 60c3d26..52d1887 100644 --- a/client/src/api/index.ts +++ b/client/src/api/index.ts @@ -2,7 +2,6 @@ import axios from 'axios' // 创建 axios 实例 const api = axios.create({ - baseURL: 'http://localhost:8070', // 发布准备服务端口 timeout: 10000, headers: { 'Content-Type': 'application/json' diff --git a/client/src/mock/api.ts b/client/src/mock/api.ts index 2724660..cc8406a 100644 --- a/client/src/mock/api.ts +++ b/client/src/mock/api.ts @@ -127,6 +127,38 @@ 
export class MockApiService { return { status: 200 } } + // 创建部署计划 - 新的API接口 + static async createDeployment(data: {service: string, version: string, scheduleTime?: string}): Promise<{ status: number, data: {id: string, message: string} }> { + await delay(500) + console.log(`Mock API: 创建部署计划 - service: ${data.service}, version: ${data.version}`) + + // 生成模拟的部署ID + const deployID = `deploy-${Date.now()}` + + // 模拟创建成功,返回状态码201 + return { + status: 201, + data: { + id: deployID, + message: 'deployment created successfully' + } + } + } + + // 更新部署计划 - 新的API接口 + static async updateDeployment(deployID: string, data: {version?: string, scheduleTime?: string}): Promise<{ status: number, data: {message: string} }> { + await delay(300) + console.log(`Mock API: 更新部署计划 - ${deployID}`, data) + + // 模拟更新成功,返回状态码200 + return { + status: 200, + data: { + message: 'deployment updated successfully' + } + } + } + // 获取部署变更记录 - 新的API接口 static async getDeploymentChangelog(start?: string, limit?: number): Promise { await delay(300) diff --git a/client/src/mock/services.ts b/client/src/mock/services.ts index 462e0e9..1894f22 100644 --- a/client/src/mock/services.ts +++ b/client/src/mock/services.ts @@ -3,9 +3,9 @@ export interface ServiceItem { name: string - deployState: 'InDeploying' | 'AllDeployFinish' + deployState: 'unrelease' | 'deploying' | 'stop' | 'rollback' | 'completed' health: 'Normal' | 'Warning' | 'Error' - dependencies: string[] + deps: string[] } export interface ServicesResponse { @@ -17,39 +17,39 @@ export const mockServicesData: ServicesResponse = { items: [ { name: "s3", - deployState: "InDeploying", - health: "Warning", - dependencies: [] // s3是根节点,无依赖 + deployState: "completed", + health: "Normal", + deps: [] // s3是根节点,无依赖 }, { name: "stg", - deployState: "InDeploying", - health: "Warning", - dependencies: ["s3"] // stg依赖s3 + deployState: "completed", + health: "Normal", + deps: ["s3"] // stg依赖s3 }, { name: "meta", - deployState: "AllDeployFinish", + deployState: 
"completed", health: "Normal", - dependencies: ["s3"] // meta依赖s3 + deps: ["s3"] // meta依赖s3 }, { name: "mq", - deployState: "AllDeployFinish", + deployState: "completed", health: "Normal", - dependencies: ["s3"] // mq依赖s3 + deps: ["s3"] // mq依赖s3 }, { name: "worker", - deployState: "AllDeployFinish", + deployState: "completed", health: "Normal", - dependencies: ["mq"] // worker依赖mq + deps: ["mq"] // worker依赖mq }, { name: "mongodb", - deployState: "AllDeployFinish", - health: "Error", - dependencies: ["meta"] // mongodb依赖meta + deployState: "completed", + health: "Normal", + deps: ["meta"] // mongodb依赖meta } ] } @@ -68,68 +68,69 @@ export interface ServiceVersion { export interface ServiceDetail { name: string - deployState: 'InDeploying' | 'AllDeployFinish' + deployState: 'unrelease' | 'deploying' | 'stop' | 'rollback' | 'completed' health: 'Normal' | 'Warning' | 'Error' - dependencies: string[] + deps: string[] versions: ServiceVersion[] } export const mockServiceDetails: Record = { "s3": { name: "s3", - deployState: "InDeploying", + deployState: "completed", health: "Normal", - dependencies: [], + deps: [], versions: [ - { label: "v1.0.0", value: 55, eta: "~ 2h 30m", anomalous: false, observing: false }, - { label: "v1.0.1", value: 30, eta: "~ 1h 10m", anomalous: false, observing: true, rolling: true, elapsedMin: 30, remainingMin: 60 }, - { label: "v1.0.3", value: 15, eta: "~ 40m", anomalous: false, observing: false, rolling: true, elapsedMin: 10, remainingMin: 30 } + { label: "v1.0.0", value: 60, eta: "~ 2h 30m", anomalous: false, observing: false }, + { label: "v1.0.1", value: 30, eta: "~ 1h 10m", anomalous: false, observing: false }, + { label: "v1.0.3", value: 10, eta: "~ 40m", anomalous: false, observing: false } ] }, "stg": { name: "stg", - deployState: "InDeploying", + deployState: "completed", health: "Normal", - dependencies: ["s3"], + deps: ["s3"], versions: [ { label: "v1.0.0", value: 70, eta: "~ 3h 00m", anomalous: false, observing: false }, - { 
label: "v1.0.2", value: 30, eta: "~ 30m", anomalous: false, observing: true, rolling: true, elapsedMin: 15, remainingMin: 20 } + { label: "v1.0.2", value: 30, eta: "~ 30m", anomalous: false, observing: false } ] }, "meta": { name: "meta", - deployState: "AllDeployFinish", + deployState: "completed", health: "Normal", - dependencies: ["s3"], + deps: ["s3"], versions: [ { label: "v1.0.3", value: 100, eta: "~ 25m", anomalous: false, observing: false } ] }, "mq": { name: "mq", - deployState: "AllDeployFinish", + deployState: "completed", health: "Normal", - dependencies: ["s3"], + deps: ["s3"], versions: [ { label: "v1.0.1", value: 100, eta: "~ 50m", anomalous: false, observing: false } ] }, "worker": { name: "worker", - deployState: "AllDeployFinish", + deployState: "completed", health: "Normal", - dependencies: ["mq"], + deps: ["mq"], versions: [ { label: "v1.0.1", value: 100, eta: "~ 20m", anomalous: false, observing: false } ] }, "mongodb": { name: "mongodb", - deployState: "AllDeployFinish", - health: "Error", - dependencies: ["meta"], + deployState: "completed", + health: "Normal", + deps: ["meta"], versions: [ - { label: "v1.0.1", value: 100, eta: "~ 1h 10m", anomalous: true, observing: false } + { label: "v1.0.1", value: 80, eta: "~ 1h 10m", anomalous: false, observing: false }, + { label: "v1.0.2", value: 20, eta: "~ 45m", anomalous: false, observing: false } ] } } @@ -283,13 +284,6 @@ export const mockDeploymentPlans: Record = { status: "InDeployment", scheduleTime: "2024-01-03T05:00:00Z", isPaused: false - }, - { - id: "2003", - service: "stg", - version: "v1.0.3", - status: "Finished", - finishTime: "2024-01-03T05:00:00Z" } ] }, @@ -978,7 +972,7 @@ export const mockServiceActiveVersions: Record 80%', labels: [ { key: 'service', value: 'meta' }, @@ -1537,9 +1539,9 @@ export const mockAlertDetails: Record = { }, 'alert-4': { id: 'alert-4', - state: 'Open', + state: 'Closed', level: 'Warning', - alertState: 'InProcessing', + alertState: 'Restored', title: 'gz 
Meta 数据库连接池使用率: 85% > 80%', labels: [ { key: 'service', value: 'meta' }, @@ -1700,3 +1702,437 @@ export const mockAlertRuleChangelog: AlertRuleChangelogResponse = { } ], } + +// ==================== 数据持久化函数 ==================== +// 使用 localStorage 实现前端数据持久化 + +/** + * 保存数据到 localStorage + */ +const saveDataToStorage = () => { + try { + localStorage.setItem('mockServiceActiveVersions', JSON.stringify(mockServiceActiveVersions)) + localStorage.setItem('mockAvailableVersions', JSON.stringify(mockAvailableVersions)) + console.log('数据已保存到 localStorage') + } catch (error) { + console.error('保存数据到 localStorage 失败:', error) + } +} + +/** + * 从 localStorage 加载数据 + */ +const loadDataFromStorage = () => { + try { + const activeVersionsData = localStorage.getItem('mockServiceActiveVersions') + const availableVersionsData = localStorage.getItem('mockAvailableVersions') + + if (activeVersionsData) { + const parsedData = JSON.parse(activeVersionsData) + Object.assign(mockServiceActiveVersions, parsedData) + console.log('已从 localStorage 加载活跃版本数据') + } + + if (availableVersionsData) { + const parsedData = JSON.parse(availableVersionsData) + Object.assign(mockAvailableVersions, parsedData) + console.log('已从 localStorage 加载可发布版本数据') + } + } catch (error) { + console.error('从 localStorage 加载数据失败:', error) + } +} + +// 页面加载时自动从 localStorage 恢复数据 +loadDataFromStorage() + +// ==================== 服务告警状态管理 ==================== +// 管理服务节点的告警状态,用于改变拓扑图中节点的颜色 + +/** + * 服务告警状态类型 + */ +export type ServiceAlertStatus = 'normal' | 'pending' | 'processing' + +/** + * 服务告警状态映射 + */ +export const serviceAlertStatusMap: Record = {} + +/** + * 保存服务告警状态到 localStorage + */ +const saveServiceAlertStatus = () => { + try { + localStorage.setItem('serviceAlertStatusMap', JSON.stringify(serviceAlertStatusMap)) + console.log('服务告警状态已保存到 localStorage') + } catch (error) { + console.error('保存服务告警状态失败:', error) + } +} + +/** + * 从 localStorage 加载服务告警状态 + */ +const loadServiceAlertStatus = () => { + try { + 
const data = localStorage.getItem('serviceAlertStatusMap') + if (data) { + const parsedData = JSON.parse(data) + Object.assign(serviceAlertStatusMap, parsedData) + console.log('已从 localStorage 加载服务告警状态') + } + } catch (error) { + console.error('从 localStorage 加载服务告警状态失败:', error) + } +} + +/** + * 根据告警状态更新服务状态 + * @param serviceName 服务名称 + * @param alertState 告警状态 + */ +export const updateServiceAlertStatus = (serviceName: string, alertState: string) => { + let status: ServiceAlertStatus = 'normal' + + switch (alertState) { + case 'Pending': + status = 'pending' + break + case 'InProcessing': + status = 'processing' + break + case 'Restored': + case 'AutoRestored': + default: + status = 'normal' + break + } + + serviceAlertStatusMap[serviceName] = status + saveServiceAlertStatus() + + console.log(`服务 ${serviceName} 告警状态更新为: ${status} (${alertState})`) + return status +} + +/** + * 获取服务告警状态 + * @param serviceName 服务名称 + */ +export const getServiceAlertStatus = (serviceName: string): ServiceAlertStatus => { + return serviceAlertStatusMap[serviceName] || 'normal' +} + +/** + * 清除服务告警状态 + * @param serviceName 服务名称 + */ +export const clearServiceAlertStatus = (serviceName: string) => { + delete serviceAlertStatusMap[serviceName] + saveServiceAlertStatus() + console.log(`已清除服务 ${serviceName} 的告警状态`) +} + +// 页面加载时自动从 localStorage 恢复服务告警状态 +loadServiceAlertStatus() + +// ==================== 服务版本告警状态管理 ==================== +// 让首页服务详情中的各版本颜色与告警状态同步 + +/** + * 服务版本告警状态映射:service -> version -> status + */ +export const serviceVersionAlertStatusMap: Record> = {} + +const saveServiceVersionAlertStatus = () => { + try { + localStorage.setItem('serviceVersionAlertStatusMap', JSON.stringify(serviceVersionAlertStatusMap)) + console.log('服务版本告警状态已保存到 localStorage') + } catch (error) { + console.error('保存服务版本告警状态失败:', error) + } +} + +const loadServiceVersionAlertStatus = () => { + try { + const data = localStorage.getItem('serviceVersionAlertStatusMap') + if (data) { + const 
parsed = JSON.parse(data) + Object.assign(serviceVersionAlertStatusMap, parsed) + console.log('已从 localStorage 加载服务版本告警状态') + } + } catch (error) { + console.error('从 localStorage 加载服务版本告警状态失败:', error) + } +} + +/** + * 根据告警状态更新服务版本状态 + * Pending -> pending, InProcessing -> processing, Restored/AutoRestored -> normal + */ +export const updateServiceVersionAlertStatus = (serviceName: string, version: string, alertState: string) => { + if (!serviceVersionAlertStatusMap[serviceName]) { + serviceVersionAlertStatusMap[serviceName] = {} + } + + let status: ServiceAlertStatus = 'normal' + switch (alertState) { + case 'Pending': + status = 'pending' + break + case 'InProcessing': + status = 'processing' + break + case 'Restored': + case 'AutoRestored': + default: + status = 'normal' + break + } + + serviceVersionAlertStatusMap[serviceName][version] = status + saveServiceVersionAlertStatus() + console.log(`服务 ${serviceName} 版本 ${version} 告警状态更新为: ${status} (${alertState})`) + return status +} + +/** + * 获取服务版本告警状态 + */ +export const getServiceVersionAlertStatus = (serviceName: string, version: string): ServiceAlertStatus => { + return serviceVersionAlertStatusMap[serviceName]?.[version] || 'normal' +} + +/** + * 清除服务版本告警状态 + */ +export const clearServiceVersionAlertStatus = (serviceName: string, version?: string) => { + if (version) { + if (serviceVersionAlertStatusMap[serviceName]) { + delete serviceVersionAlertStatusMap[serviceName][version] + } + } else { + delete serviceVersionAlertStatusMap[serviceName] + } + saveServiceVersionAlertStatus() + console.log(`已清除服务 ${serviceName} ${version ? 
'版本 ' + version : '所有版本'} 的告警状态`) +} + +// 页面加载时恢复服务版本告警状态 +loadServiceVersionAlertStatus() + +// ==================== 发布任务状态管理 ==================== +// 管理服务的发布任务状态,用于显示发布指示器 + +/** + * 发布任务状态类型 + */ +export type DeploymentStatus = 'idle' | 'deploying' + +/** + * 服务发布状态映射 + */ +export const serviceDeploymentStatusMap: Record = {} + +/** + * 保存服务发布状态到 localStorage + */ +const saveServiceDeploymentStatus = () => { + try { + localStorage.setItem('serviceDeploymentStatusMap', JSON.stringify(serviceDeploymentStatusMap)) + console.log('服务发布状态已保存到 localStorage') + } catch (error) { + console.error('保存服务发布状态失败:', error) + } +} + +/** + * 从 localStorage 加载服务发布状态 + */ +const loadServiceDeploymentStatus = () => { + try { + const data = localStorage.getItem('serviceDeploymentStatusMap') + if (data) { + const parsedData = JSON.parse(data) + Object.assign(serviceDeploymentStatusMap, parsedData) + console.log('已从 localStorage 加载服务发布状态') + } + } catch (error) { + console.error('从 localStorage 加载服务发布状态失败:', error) + } +} + +/** + * 设置服务发布状态 + * @param serviceName 服务名称 + * @param status 发布状态 + */ +export const setServiceDeploymentStatus = (serviceName: string, status: DeploymentStatus) => { + serviceDeploymentStatusMap[serviceName] = status + saveServiceDeploymentStatus() + console.log(`服务 ${serviceName} 发布状态更新为: ${status}`) +} + +/** + * 获取服务发布状态 + * @param serviceName 服务名称 + */ +export const getServiceDeploymentStatus = (serviceName: string): DeploymentStatus => { + return serviceDeploymentStatusMap[serviceName] || 'idle' +} + +/** + * 清除服务发布状态 + * @param serviceName 服务名称 + */ +export const clearServiceDeploymentStatus = (serviceName: string) => { + delete serviceDeploymentStatusMap[serviceName] + saveServiceDeploymentStatus() + console.log(`已清除服务 ${serviceName} 的发布状态`) +} + +// 页面加载时自动从 localStorage 恢复服务发布状态 +loadServiceDeploymentStatus() + +// ==================== 数据操作函数 ==================== +// 这些函数模拟后端操作数据的过程,用于支持前端的发布和回滚功能 + +/** + * 从可发布版本列表中移除指定版本 + * @param serviceName 
服务名称 + * @param version 要移除的版本 + */ +export const removeVersionFromAvailable = (serviceName: string, version: string) => { + if (mockAvailableVersions[serviceName]) { + const index = mockAvailableVersions[serviceName].items.findIndex(item => item.version === version) + if (index >= 0) { + mockAvailableVersions[serviceName].items.splice(index, 1) + console.log(`已从 ${serviceName} 的可发布版本列表中移除 ${version}`) + saveDataToStorage() // 保存到 localStorage + } + } +} + +/** + * 将版本添加到可发布版本列表中 + * @param serviceName 服务名称 + * @param version 要添加的版本 + */ +export const addVersionToAvailable = (serviceName: string, version: string) => { + if (mockAvailableVersions[serviceName]) { + // 检查是否已存在 + const existing = mockAvailableVersions[serviceName].items.find(item => item.version === version) + if (!existing) { + mockAvailableVersions[serviceName].items.push({ + version: version, + createTime: new Date().toISOString() + }) + console.log(`已将 ${version} 添加到 ${serviceName} 的可发布版本列表中`) + saveDataToStorage() // 保存到 localStorage + } + } +} + +/** + * 从服务活跃版本中移除指定版本 + * @param serviceName 服务名称 + * @param version 要移除的版本 + */ +export const removeVersionFromServiceActiveVersions = (serviceName: string, version: string) => { + if (mockServiceActiveVersions[serviceName]) { + const index = mockServiceActiveVersions[serviceName].items.findIndex(v => v.version === version) + if (index >= 0) { + mockServiceActiveVersions[serviceName].items.splice(index, 1) + console.log(`已从 ${serviceName} 的服务活跃版本中移除 ${version}`) + saveDataToStorage() // 保存到 localStorage + } + } +} + +/** + * 将版本添加到服务活跃版本中(用于饼状图显示) + * @param serviceName 服务名称 + * @param version 要添加的版本 + */ +export const addVersionToServiceActiveVersions = (serviceName: string, version: string) => { + if (mockServiceActiveVersions[serviceName]) { + // 检查是否已存在 + const existing = mockServiceActiveVersions[serviceName].items.find(v => v.version === version) + if (!existing) { + const now = new Date() + const endTime = new Date(now.getTime() + 2 * 60 * 60 * 
1000) // 两小时后 + + // 计算总实例数,用于确定15%对应的实例数 + const totalInstances = mockServiceActiveVersions[serviceName].items.reduce((sum, item) => sum + item.instances, 0) + const newVersionInstances = Math.max(1, Math.round(totalInstances * 0.15)) // 新版本占15%,最少1个实例 + + // 如果有现有版本,从最早发布的版本(第一个)减少15% + if (mockServiceActiveVersions[serviceName].items.length > 0) { + const earliestVersion = mockServiceActiveVersions[serviceName].items[0] + const reduceInstances = Math.min(earliestVersion.instances, newVersionInstances) + earliestVersion.instances = Math.max(0, earliestVersion.instances - reduceInstances) + console.log(`从最早版本 ${earliestVersion.version} 减少 ${reduceInstances} 个实例`) + } + + mockServiceActiveVersions[serviceName].items.push({ + version: version, + deployID: `deploy-${Date.now()}`, + startTime: now.toISOString(), + estimatedCompletionTime: endTime.toISOString(), + instances: newVersionInstances, // 新版本占15% + health: "Normal" + }) + console.log(`已将 ${version} 添加到 ${serviceName} 的服务活跃版本中,实例数: ${newVersionInstances}`) + saveDataToStorage() // 保存到 localStorage + } + } +} + +/** + * 创建发布任务 - 模拟后端操作 + * @param serviceName 服务名称 + * @param version 要发布的版本 + */ +export const createReleaseTask = (serviceName: string, version: string) => { + console.log(`开始创建发布任务: ${serviceName} -> ${version}`) + console.log('修改前的可发布版本列表:', mockAvailableVersions[serviceName]?.items) + console.log('修改前的服务活跃版本列表:', mockServiceActiveVersions[serviceName]?.items) + + // 1. 从可发布版本列表中移除 + removeVersionFromAvailable(serviceName, version) + + // 2. 添加到服务活跃版本中 + addVersionToServiceActiveVersions(serviceName, version) + + // 3. 
设置服务发布状态为发布中 + setServiceDeploymentStatus(serviceName, 'deploying') + + console.log('修改后的可发布版本列表:', mockAvailableVersions[serviceName]?.items) + console.log('修改后的服务活跃版本列表:', mockServiceActiveVersions[serviceName]?.items) + console.log(`发布任务创建成功: ${serviceName} -> ${version}`) +} + +/** + * 回滚版本 - 模拟后端操作 + * @param serviceName 服务名称 + * @param version 要回滚的版本 + */ +export const rollbackVersion = (serviceName: string, version: string) => { + console.log(`开始回滚版本: ${serviceName} -> ${version}`) + console.log('回滚前的服务活跃版本列表:', mockServiceActiveVersions[serviceName]?.items) + console.log('回滚前的可发布版本列表:', mockAvailableVersions[serviceName]?.items) + + // 1. 从服务活跃版本中移除 + removeVersionFromServiceActiveVersions(serviceName, version) + + // 2. 恢复到可发布版本列表中 + addVersionToAvailable(serviceName, version) + + // 3. 清除服务发布状态 + clearServiceDeploymentStatus(serviceName) + + console.log('回滚后的服务活跃版本列表:', mockServiceActiveVersions[serviceName]?.items) + console.log('回滚后的可发布版本列表:', mockAvailableVersions[serviceName]?.items) + console.log(`版本回滚成功: ${serviceName} -> ${version}`) +} diff --git a/client/src/views/AlertsView.vue b/client/src/views/AlertsView.vue index dd4b429..ce62d21 100644 --- a/client/src/views/AlertsView.vue +++ b/client/src/views/AlertsView.vue @@ -217,7 +217,8 @@ import { Loading } from '@element-plus/icons-vue' import { ElMessage } from 'element-plus' -import { mockApi } from '@/mock/api' +import { apiService } from '@/api' +import { updateServiceAlertStatus, updateServiceVersionAlertStatus } from '@/mock/services' import type { AlertsResponse, AlertIssue, AlertDetail } from '@/mock/services' import { marked } from 'marked' @@ -304,11 +305,13 @@ const getLevelType = (level: string) => { const getStateType = (alertState: string) => { switch (alertState) { + case 'Pending': + return 'danger' + case 'InProcessing': + return 'warning' case 'Restored': case 'AutoRestored': return 'success' - case 'InProcessing': - return 'danger' default: return 'info' } @@ -316,19 +319,21 @@ 
const getStateType = (alertState: string) => { const getStateText = (alertState: string) => { switch (alertState) { + case 'Pending': + return '待处理' + case 'InProcessing': + return '处理中' case 'Restored': return '已恢复' case 'AutoRestored': - return '自然恢复' - case 'InProcessing': - return '处理中' + return '系统自动恢复' default: return alertState } } const canShowAnalysis = (alertState: string) => { - return ['InProcessing', 'Restored', 'AutoRestored'].includes(alertState) + return ['Pending', 'InProcessing', 'Restored', 'AutoRestored'].includes(alertState) } // Markdown渲染方法 @@ -347,11 +352,12 @@ const showAIAnalysis = async (alert: AlertIssue) => { showAnalysisDialog.value = true detailLoading.value = true - // 调用API获取告警详情 - const detail = await mockApi.getAlertDetail(alert.id) - alertDetail.value = detail + // 调用API获取告警详情(真实后端) + const detailResp = await apiService.getAlertDetail(alert.id) + console.log('告警详情响应 data:', detailResp.data) + alertDetail.value = detailResp.data - console.log('告警详情加载成功:', detail) + console.log('告警详情加载成功:', detailResp.data) } catch (err) { console.error('加载告警详情失败:', err) ElMessage.error('加载告警详情失败') @@ -379,21 +385,40 @@ const loadAlerts = async () => { try { loading.value = true error.value = null - - // 根据 filterState 构造 API 参数 - const apiState = filterState.value === 'all' ? undefined : - filterState.value === 'open' ? 
'Open' : 'Closed' - - const response = await mockApi.getAlerts(undefined, 10, apiState) - alerts.value = response.items - - // 如果是首次加载(allAlerts为空),则加载所有数据用于计数 - if (allAlerts.value.length === 0) { - const allResponse = await mockApi.getAlerts(undefined, 100) // 获取更多数据用于计数 - allAlerts.value = allResponse.items + + if (filterState.value === 'all') { + // All 需要同时包含 Open 和 Closed,后端未传 state 时可能默认仅返回 Open + const [openResp, closedResp] = await Promise.all([ + apiService.getAlerts(undefined, 100, 'Open'), + apiService.getAlerts(undefined, 100, 'Closed') + ]) + const merged = [...openResp.data.items, ...closedResp.data.items] + .sort((a: any, b: any) => new Date(b.alertSince).getTime() - new Date(a.alertSince).getTime()) + + alerts.value = merged.slice(0, 10) + allAlerts.value = merged + // 同步拓扑服务状态 + syncServiceAlertStatuses(allAlerts.value) + console.log('告警数据加载成功: All', { total: merged.length }) + } else { + const state = filterState.value === 'open' ? 'Open' : 'Closed' + + // 并行请求当前筛选列表,以及用于右上角计数的全量 Open/Closed + const [listResp, openResp, closedResp] = await Promise.all([ + apiService.getAlerts(undefined, 10, state), + apiService.getAlerts(undefined, 100, 'Open'), + apiService.getAlerts(undefined, 100, 'Closed') + ]) + + alerts.value = listResp.data.items + + const mergedAll = [...openResp.data.items, ...closedResp.data.items] + .sort((a: any, b: any) => new Date(b.alertSince).getTime() - new Date(a.alertSince).getTime()) + allAlerts.value = mergedAll + // 同步拓扑服务状态 + syncServiceAlertStatuses(allAlerts.value) + console.log('告警数据加载成功:', { filter: state, count: alerts.value.length, total: mergedAll.length }) } - - console.log('告警数据加载成功:', response) } catch (err) { console.error('加载告警数据失败:', err) error.value = '加载告警数据失败' @@ -403,6 +428,59 @@ const loadAlerts = async () => { } } +// 将告警状态同步到首页拓扑的服务节点颜色 +const syncServiceAlertStatuses = (issues: AlertIssue[]) => { + // 优先级:Pending > InProcessing > Restored > AutoRestored + const priority: Record = { + Pending: 4, + 
InProcessing: 3, + Restored: 2, + AutoRestored: 1 + } + + // 可能需要从其他标签映射到首页的服务名 + const prophetToServiceMap: Record = { + s3apiv2: 's3' + } + + const latestStateByService = new Map() + + for (const issue of issues) { + // 解析服务名:优先 labels.service,其次 prophet_service 的映射 + const serviceLabel = issue.labels.find(l => l.key === 'service')?.value + const prophetService = issue.labels.find(l => l.key === 'prophet_service')?.value + const mapped = prophetService ? prophetToServiceMap[prophetService] : undefined + const serviceName = serviceLabel || mapped + if (!serviceName) continue + + const ts = new Date(issue.alertSince).getTime() + const prio = priority[issue.alertState] || 0 + const existing = latestStateByService.get(serviceName) + if (!existing || prio > existing.prio || (prio === existing.prio && ts > existing.ts)) { + latestStateByService.set(serviceName, { state: issue.alertState, ts, prio }) + } + + // 同步版本状态(如果存在 service_version 标签) + // 版本标签检测:兼容多种后端命名 + const versionLabel = + issue.labels.find(l => l.key === 'service_version')?.value || + issue.labels.find(l => l.key === 'version')?.value || + issue.labels.find(l => l.key === 'serviceVersion')?.value || + issue.labels.find(l => l.key === 'svc_version')?.value || + issue.labels.find(l => l.key === 'deploy_version')?.value || + issue.labels.find(l => l.key === 'deployVersion')?.value || + issue.labels.find(l => l.key.toLowerCase().includes('version'))?.value + if (versionLabel) { + updateServiceVersionAlertStatus(serviceName, versionLabel, issue.alertState) + } + } + + // 写入共享状态映射(持久化到 localStorage) + latestStateByService.forEach((val, service) => { + updateServiceAlertStatus(service, val.state) + }) +} + // 生命周期 onMounted(() => { loadAlerts() diff --git a/client/src/views/HomeView.vue b/client/src/views/HomeView.vue index 6f05e7a..20b5e68 100644 --- a/client/src/views/HomeView.vue +++ b/client/src/views/HomeView.vue @@ -410,6 +410,7 @@ import * as echarts from 'echarts' import { apiService } from '@/api' 
import { mockApi } from '@/mock/api' import type { ServicesResponse, ServiceDetail, ServiceActiveVersionsResponse, ServiceMetricsResponse, AvailableVersionsResponse, DeploymentPlansResponse, MetricsResponse } from '@/mock/services' +import { createReleaseTask, rollbackVersion as rollbackVersionInMock, updateServiceAlertStatus, getServiceAlertStatus, type ServiceAlertStatus, getServiceDeploymentStatus, type DeploymentStatus, getServiceVersionAlertStatus } from '@/mock/services' const router = useRouter() @@ -475,13 +476,13 @@ const calculateAutoLayout = (services: any[]) => { const reverseGraph = new Map() services.forEach(service => { - dependencyGraph.set(service.name, service.dependencies || []) + dependencyGraph.set(service.name, service.deps || []) reverseGraph.set(service.name, []) }) // 构建反向图 services.forEach(service => { - service.dependencies?.forEach((dep: string) => { + service.deps?.forEach((dep: string) => { if (reverseGraph.has(dep)) { reverseGraph.get(dep)!.push(service.name) } @@ -495,7 +496,7 @@ const calculateAutoLayout = (services: any[]) => { // 计算入度 services.forEach(service => { - inDegree.set(service.name, service.dependencies?.length || 0) + inDegree.set(service.name, service.deps?.length || 0) }) // 找到所有入度为0的节点(根节点) @@ -509,7 +510,6 @@ const calculateAutoLayout = (services: any[]) => { // 分层处理 while (currentLevel.length > 0) { levels.push([...currentLevel]) - console.log(`层级 ${levels.length - 1}:`, currentLevel) const nextLevel: string[] = [] currentLevel.forEach(serviceName => { @@ -530,7 +530,6 @@ const calculateAutoLayout = (services: any[]) => { currentLevel = nextLevel } - console.log('自动布局层级结构:', levels) // 3. 
计算位置 const positions = new Map() @@ -540,21 +539,12 @@ const calculateAutoLayout = (services: any[]) => { const levelWidth = (level.length - 1) * layoutConfig.nodeSpacing const startX = layoutConfig.startX - levelWidth / 2 - console.log(`层级 ${levelIndex} 布局:`, { - level, - levelY, - levelWidth, - startX - }) - level.forEach((serviceName, nodeIndex) => { const x = startX + nodeIndex * layoutConfig.nodeSpacing positions.set(serviceName, { x, y: levelY }) - console.log(` ${serviceName}: (${x}, ${levelY})`) }) }) - console.log('最终位置映射:', positions) return positions } @@ -577,14 +567,14 @@ const transformServiceData = (data: ServicesResponse) => { y: position.y, health: service.health, deployState: service.deployState, - dependencies: service.dependencies, + dependencies: service.deps, // 根据发布状态生成版本信息 versions: generateVersionsFromDeployState(service) } nodes.push(node) // 生成依赖关系边 - service.dependencies.forEach(dep => { + service.deps.forEach(dep => { edges.push({ source: service.name, target: dep @@ -597,7 +587,7 @@ const transformServiceData = (data: ServicesResponse) => { // 根据发布状态生成版本信息 const generateVersionsFromDeployState = (service: any) => { - if (service.deployState === 'InDeploying') { + if (service.deployState === 'deploying') { // 发布中:生成多个版本,其中一个在发布 return [ { label: "v1.0.0", value: 70, eta: "~ 2h 30m", anomalous: false, observing: false }, @@ -624,17 +614,23 @@ const loadServicesData = async () => { error.value = null try { - // 加载服务数据 + // 加载服务数据 - 使用Mock API const servicesResponse = await mockApi.getServices() + servicesData.value = servicesResponse // 转换数据 - const { nodes: transformedNodes, edges: transformedEdges } = transformServiceData(servicesResponse) - nodes.value = transformedNodes - edges.value = transformedEdges + try { + const { nodes: transformedNodes, edges: transformedEdges } = transformServiceData(servicesResponse) + nodes.value = transformedNodes + edges.value = transformedEdges + } catch (transformError) { + console.error('数据转换失败:', 
transformError) + throw transformError + } - console.log('服务数据加载成功:', servicesResponse) + console.log('服务数据加载成功,返回值数据:', servicesResponse) } catch (err) { error.value = '加载服务数据失败' console.error('加载服务数据失败:', err) @@ -645,7 +641,13 @@ const loadServicesData = async () => { } // 数据转换函数:将后端返回的活跃版本数据转换为前端需要的格式 -const transformActiveVersionsToFrontend = (activeVersionsResponse: ServiceActiveVersionsResponse) => { +const transformActiveVersionsToFrontend = (activeVersionsResponse: ServiceActiveVersionsResponse, serviceName: string) => { + // 添加空值检查 + if (!activeVersionsResponse || !activeVersionsResponse.items) { + console.warn('活跃版本数据为空:', activeVersionsResponse) + return [] + } + const totalInstances = activeVersionsResponse.items.reduce((sum, item) => sum + item.instances, 0) return activeVersionsResponse.items.map(item => { @@ -671,12 +673,17 @@ const transformActiveVersionsToFrontend = (activeVersionsResponse: ServiceActive const isAnomalous = item.health === 'Error' const isObserving = item.health === 'Warning' + // 读取版本级别告警状态以驱动颜色 + const versionStatus = getServiceVersionAlertStatus(serviceName, item.version) + const versionAnomalous = versionStatus === 'pending' + const versionObserving = versionStatus === 'processing' + return { label: item.version, value: percentage, eta: eta, - anomalous: isAnomalous, - observing: isObserving, + anomalous: versionAnomalous || isAnomalous, + observing: versionObserving || (!versionAnomalous && isObserving), rolling: isRolling, elapsedMin: elapsedMinutes, remainingMin: remainingMinutes, @@ -692,15 +699,32 @@ const transformActiveVersionsToFrontend = (activeVersionsResponse: ServiceActive // 获取服务详情 - 使用新的API接口 const loadServiceDetail = async (serviceName: string) => { try { - // 调用新的活跃版本API + // 调用活跃版本API - 使用Mock API const activeVersionsResponse = await mockApi.getServiceActiveVersions(serviceName) // 转换数据格式 - const transformedVersions = transformActiveVersionsToFrontend(activeVersionsResponse) + const transformedVersions = 
transformActiveVersionsToFrontend(activeVersionsResponse, serviceName) + + // 获取服务的基础信息(包括原始health状态) + const serviceInfo = nodes.value.find(node => node.name === serviceName) + + // 根据告警状态确定最终的health状态 + const alertStatus = getServiceAlertStatus(serviceName) + let finalHealth = serviceInfo?.health || 'Normal' + + // 如果服务有告警状态,覆盖原始health状态 + if (alertStatus === 'pending') { + finalHealth = 'Error' // 红色 + } else if (alertStatus === 'processing') { + finalHealth = 'Warning' // 黄色 + } return { name: serviceName, - versions: transformedVersions + versions: transformedVersions, + health: finalHealth, + deployState: serviceInfo?.deployState || 'completed', + deps: serviceInfo?.deps || [] } } catch (err) { console.error('获取服务活跃版本失败:', err) @@ -709,15 +733,15 @@ const loadServiceDetail = async (serviceName: string) => { } } -// 获取服务指标数据 - 使用新的API接口 +// 获取服务指标数据 - 使用Mock API(后端暂未实现) const loadServiceMetrics = async (serviceName: string) => { try { - // 调用新的指标API + // 调用Mock API,等待后端实现真实接口 const metricsResponse = await mockApi.getServiceMetrics(serviceName) return metricsResponse } catch (err) { - console.error('获取服务指标数据失败:', err) - ElMessage.error('获取服务指标数据失败') + console.warn(`获取服务 ${serviceName} 指标数据失败:`, err) + // 不显示错误消息,因为某些服务可能没有指标数据 return null } } @@ -725,7 +749,7 @@ const loadServiceMetrics = async (serviceName: string) => { // 获取服务可发布版本列表 - 使用新的API接口 const loadServiceAvailableVersions = async (serviceName: string) => { try { - // 调用新的可发布版本API + // 调用可发布版本API - 使用Mock API const availableVersionsResponse = await mockApi.getServiceAvailableVersions(serviceName) return availableVersionsResponse } catch (err) { @@ -738,7 +762,7 @@ const loadServiceAvailableVersions = async (serviceName: string) => { // 获取服务发布计划列表 - 使用新的API接口 const loadServiceDeploymentPlans = async (serviceName: string) => { try { - // 调用新的发布计划API + // 调用发布计划API - 使用Mock API const deploymentPlansResponse = await mockApi.getServiceDeploymentPlans(serviceName) return deploymentPlansResponse } catch (err) 
{ @@ -751,7 +775,7 @@ const loadServiceDeploymentPlans = async (serviceName: string) => { // 获取服务指标数据 - 使用新的API接口 const loadServiceMetricsData = async (serviceName: string, version: string) => { try { - // 并行获取四大黄金指标数据 + // 并行获取四大黄金指标数据 - 使用Mock API const [latencyData, trafficData, errorsData, saturationData] = await Promise.all([ mockApi.getServiceMetricsData(serviceName, 'latency', version), mockApi.getServiceMetricsData(serviceName, 'traffic', version), @@ -867,7 +891,15 @@ const goToAlerts = () => { } const getNodeStatus = (node: any) => { - // 直接使用后端返回的health状态 + // 首先检查告警状态 + const alertStatus = getServiceAlertStatus(node.name) + + // 如果服务有告警状态,优先使用告警状态 + if (alertStatus !== 'normal') { + return alertStatus + } + + // 否则使用后端返回的health状态 const healthMap: Record = { 'Normal': 'healthy', 'Warning': 'canary', @@ -879,9 +911,11 @@ const getNodeStatus = (node: any) => { const getNodeStatusColor = (node: any) => { const status = getNodeStatus(node) const statusMap: Record = { - healthy: "#10b981", // 绿色 - abnormal: "#f43f5e", // 红色 - canary: "#f59e0b" // 黄色 + healthy: "#10b981", // 绿色 - 正常 + abnormal: "#f43f5e", // 红色 - 异常 + canary: "#f59e0b", // 黄色 - 观察中 + pending: "#ef4444", // 红色 - 待处理 + processing: "#eab308" // 黄色 - 处理中 } return statusMap[status] || "#6b7280" } @@ -891,7 +925,9 @@ const getNodeStatusStroke = (node: any) => { const statusMap: Record = { healthy: "#10b981", abnormal: "#f43f5e", - canary: "#f59e0b" + canary: "#f59e0b", + pending: "#ef4444", + processing: "#eab308" } return statusMap[status] || "#6b7280" } @@ -901,14 +937,17 @@ const getNodeStatusFill = (node: any) => { const statusMap: Record = { healthy: "#10b981", abnormal: "#f43f5e", - canary: "#f59e0b" + canary: "#f59e0b", + pending: "#ef4444", + processing: "#eab308" } return statusMap[status] || "#6b7280" } const hasRollingVersion = (node: any) => { - // 根据deployState判断是否显示灰度发布指示器 - return node.deployState === 'InDeploying' + // 检查发布状态或deployState判断是否显示灰度发布指示器 + const deploymentStatus = 
getServiceDeploymentStatus(node.name) + return deploymentStatus === 'deploying' || node.deployState === 'deploying' } const getNodePosition = (nodeId: string) => { @@ -921,35 +960,41 @@ const handleNodeClick = async (node: any) => { dialogVisible.value = true // 并行加载服务详情、指标数据、可发布版本和发布计划 - const [serviceDetail, metricsData, availableVersionsData, deploymentPlansData] = await Promise.all([ + const [serviceDetail, metricsData, availableVersionsData, deploymentPlansData] = await Promise.allSettled([ loadServiceDetail(node.name), loadServiceMetrics(node.name), loadServiceAvailableVersions(node.name), loadServiceDeploymentPlans(node.name) ]) - if (serviceDetail) { + // 提取成功的结果 + const serviceDetailResult = serviceDetail.status === 'fulfilled' ? serviceDetail.value : null + const metricsDataResult = metricsData.status === 'fulfilled' ? metricsData.value : null + const availableVersionsDataResult = availableVersionsData.status === 'fulfilled' ? availableVersionsData.value : null + const deploymentPlansDataResult = deploymentPlansData.status === 'fulfilled' ? 
deploymentPlansData.value : null + + if (serviceDetailResult) { // 更新节点的版本信息 - selectedNode.value.versions = serviceDetail.versions + selectedNode.value.versions = serviceDetailResult.versions } - if (metricsData) { + if (metricsDataResult) { // 存储指标数据 - currentServiceMetrics.value = metricsData + currentServiceMetrics.value = metricsDataResult } - if (availableVersionsData) { + if (availableVersionsDataResult) { // 存储可发布版本数据 - currentServiceAvailableVersions.value = availableVersionsData + currentServiceAvailableVersions.value = availableVersionsDataResult // 重置选中的版本为第一个可用版本 - if (availableVersionsData.items.length > 0) { - selectedVersion.value = availableVersionsData.items[0].version + if (availableVersionsDataResult.items && availableVersionsDataResult.items.length > 0) { + selectedVersion.value = availableVersionsDataResult.items[0].version } } - if (deploymentPlansData) { + if (deploymentPlansDataResult) { // 存储发布计划数据 - currentServiceDeploymentPlans.value = deploymentPlansData + currentServiceDeploymentPlans.value = deploymentPlansDataResult } nextTick(() => { @@ -971,6 +1016,8 @@ const getNodeStatusText = (status: string) => { case 'healthy': return '服务正常' case 'canary': return '有异常,AI正在观察和分析' case 'abnormal': return '服务有异常' + case 'pending': return '告警待处理' + case 'processing': return '告警处理中' default: return '未知状态' } } @@ -1046,6 +1093,11 @@ const handleCloseDialog = () => { currentServiceMetrics.value = null currentServiceAvailableVersions.value = null currentServiceDeploymentPlans.value = null + // 释放饼图实例 + if (pieChart) { + pieChart.dispose() + pieChart = null + } } // 处理指标弹窗关闭 @@ -1083,38 +1135,83 @@ const createRelease = async () => { } try { - // 准备请求数据 - const requestData: any = { - service: selectedNode.value.name, - version: selectedVersion.value + // 1. 调用mock数据操作函数,修改底层数据 + createReleaseTask(selectedNode.value.name, selectedVersion.value) + + // 2. 
重新加载服务详情数据(这样饼状图会自动更新) + const serviceDetailResult = await loadServiceDetail(selectedNode.value.name) + console.log('重新加载的服务详情数据:', serviceDetailResult) + if (serviceDetailResult) { + selectedNode.value = { ...serviceDetailResult, status: getNodeStatus(serviceDetailResult) } + console.log('更新后的selectedNode.value:', selectedNode.value) } - // 如果有计划时间,转换为ISO格式 - if (scheduledStart.value) { - requestData.scheduleTime = new Date(scheduledStart.value).toISOString() + // 3. 重新加载可发布版本数据(这样下拉框会自动更新) + const availableVersionsResult = await loadServiceAvailableVersions(selectedNode.value.name) + if (availableVersionsResult) { + currentServiceAvailableVersions.value = availableVersionsResult } - // 调用创建部署API - const result = await apiService.createDeployment(requestData) + // 4. 生成合理的黄金指标数值并更新表格 + const generateMetrics = () => { + return { + latency: (Math.random() * 50 + 10).toFixed(1), // 10-60ms + traffic: (Math.random() * 1000 + 100).toFixed(0), // 100-1100 req/s + errorRatio: (Math.random() * 2).toFixed(1), // 0-2% + saturation: (Math.random() * 20 + 60).toFixed(0) // 60-80% + } + } + + const newMetrics = generateMetrics() - if (result.status === 201) { - ElMessage.success('发布计划创建成功') + // 直接更新当前服务的指标数据 + if (currentServiceMetrics.value) { + // 添加新版本的指标数据 + const newVersionMetrics = { + version: selectedVersion.value, + metrics: [ + { name: 'latency', value: parseFloat(newMetrics.latency) }, + { name: 'traffic', value: parseFloat(newMetrics.traffic) }, + { name: 'errorRatio', value: parseFloat(newMetrics.errorRatio) }, + { name: 'saturation', value: parseFloat(newMetrics.saturation) } + ] + } - // 重置表单 - selectedVersion.value = '' - scheduledStart.value = '' + // 检查是否已存在该版本,如果存在则更新,否则添加 + const existingIndex = currentServiceMetrics.value.items.findIndex(item => item.version === selectedVersion.value) + if (existingIndex >= 0) { + currentServiceMetrics.value.items[existingIndex] = newVersionMetrics + } else { + currentServiceMetrics.value.items.push(newVersionMetrics) + } 
- // 刷新相关数据 - await Promise.all([ - loadServiceDeploymentPlans(selectedNode.value.name), - loadServiceDetail(selectedNode.value.name) - ]) - } else { - ElMessage.error('创建发布计划失败') + // 更新summary数据(使用所有版本的平均值) + const allVersions = currentServiceMetrics.value.items + const avgLatency = allVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'latency')?.value || 0), 0) / allVersions.length + const avgTraffic = allVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'traffic')?.value || 0), 0) / allVersions.length + const avgErrorRatio = allVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'errorRatio')?.value || 0), 0) / allVersions.length + const avgSaturation = allVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'saturation')?.value || 0), 0) / allVersions.length + + currentServiceMetrics.value.summary.metrics = [ + { name: 'latency', value: avgLatency }, + { name: 'traffic', value: avgTraffic }, + { name: 'errorRatio', value: avgErrorRatio }, + { name: 'saturation', value: avgSaturation } + ] } + + // 5. 重新加载服务数据以更新拓扑图 + await loadServicesData() + + ElMessage.success('发布任务创建成功') + + // 重置表单 + selectedVersion.value = '' + scheduledStart.value = '' + } catch (error) { - console.error('创建发布计划失败:', error) - ElMessage.error('创建发布计划失败') + console.error('创建发布任务失败:', error) + ElMessage.error('创建发布任务失败') } } @@ -1170,7 +1267,7 @@ const cancelEdit = () => { const confirmCancel = async (plan: any) => { try { - // 调用取消部署计划API + // 调用取消部署计划API - 使用Mock API const result = await mockApi.cancelDeployment(plan.id) if (result.status === 200) { @@ -1199,7 +1296,7 @@ const togglePauseResume = async (plan: any) => { } } - // 根据当前状态调用不同的API + // 根据当前状态调用不同的API - 使用Mock API const result = plan.isPaused ? 
await mockApi.continueDeployment(plan.id) // 继续 : await mockApi.pauseDeployment(plan.id) // 暂停 @@ -1233,7 +1330,7 @@ const togglePauseResume = async (plan: any) => { // 回滚发布 const rollbackRelease = async (plan: any) => { try { - // 调用回滚部署计划API + // 调用回滚部署计划API - 使用Mock API const result = await mockApi.rollbackDeployment(plan.id) if (result.status === 200) { @@ -1275,18 +1372,82 @@ const togglePauseResumeForVersion = async (version: any) => { const rollbackVersion = async (version: any) => { try { - await mockApi.rollbackDeployment(version.deployId) - ElMessage.success('回滚成功') - // 刷新服务详情数据 + // 1. 调用mock数据操作函数,修改底层数据 + rollbackVersionInMock(selectedNode.value?.name || '', version.version) + + // 2. 重新加载服务详情数据(这样饼状图会自动更新) if (selectedNode.value) { - await loadServiceDetail(selectedNode.value.name) + const serviceDetailResult = await loadServiceDetail(selectedNode.value.name) + if (serviceDetailResult) { + selectedNode.value = { ...serviceDetailResult, status: getNodeStatus(serviceDetailResult) } + } } + + // 3. 重新加载可发布版本数据(这样下拉框会自动更新) + if (selectedNode.value) { + const availableVersionsResult = await loadServiceAvailableVersions(selectedNode.value.name) + if (availableVersionsResult) { + currentServiceAvailableVersions.value = availableVersionsResult + } + } + + // 4. 
从指标数据中移除该版本 + if (currentServiceMetrics.value) { + const versionIndex = currentServiceMetrics.value.items.findIndex(item => item.version === version.version) + if (versionIndex >= 0) { + currentServiceMetrics.value.items.splice(versionIndex, 1) + + // 更新summary数据(使用剩余版本的平均值) + const remainingVersions = currentServiceMetrics.value.items + if (remainingVersions.length > 0) { + const avgLatency = remainingVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'latency')?.value || 0), 0) / remainingVersions.length + const avgTraffic = remainingVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'traffic')?.value || 0), 0) / remainingVersions.length + const avgErrorRatio = remainingVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'errorRatio')?.value || 0), 0) / remainingVersions.length + const avgSaturation = remainingVersions.reduce((sum, v) => sum + (v.metrics.find(m => m.name === 'saturation')?.value || 0), 0) / remainingVersions.length + + currentServiceMetrics.value.summary.metrics = [ + { name: 'latency', value: avgLatency }, + { name: 'traffic', value: avgTraffic }, + { name: 'errorRatio', value: avgErrorRatio }, + { name: 'saturation', value: avgSaturation } + ] + } else { + // 如果没有剩余版本,清空summary数据 + currentServiceMetrics.value.summary.metrics = [ + { name: 'latency', value: 0 }, + { name: 'traffic', value: 0 }, + { name: 'errorRatio', value: 0 }, + { name: 'saturation', value: 0 } + ] + } + } + } + + // 5. 
重新加载服务数据以更新拓扑图 + await loadServicesData() + + ElMessage.success('版本回滚成功') + } catch (error) { console.error('回滚失败:', error) ElMessage.error('回滚失败') } } +// 全局函数:更新服务告警状态(供告警界面调用) +const updateServiceAlertStatusFromAlerts = (serviceName: string, alertState: string) => { + const status = updateServiceAlertStatus(serviceName, alertState) + + // 重新加载服务数据以更新拓扑图 + loadServicesData() + + console.log(`服务 ${serviceName} 告警状态已更新为: ${status}`) + return status +} + +// 将函数暴露到全局,供告警界面调用 +;(window as any).updateServiceAlertStatusFromAlerts = updateServiceAlertStatusFromAlerts + // 初始化饼图 let pieChart: echarts.ECharts | null = null @@ -1298,6 +1459,15 @@ let saturationChart: echarts.ECharts | null = null const initPieChart = () => { if (pieChartRef.value && selectedNode.value) { + // 如果该DOM上已有实例,先释放 + const existing = echarts.getInstanceByDom(pieChartRef.value as unknown as HTMLDivElement) + if (existing) { + existing.dispose() + } + if (pieChart) { + pieChart.dispose() + pieChart = null + } pieChart = echarts.init(pieChartRef.value) const option = { @@ -2105,4 +2275,5 @@ const disposeMetricsCharts = () => { justify-content: flex-end; gap: 12px; } + \ No newline at end of file diff --git a/client/vite.config.ts b/client/vite.config.ts index 4217010..cf2a4e3 100644 --- a/client/vite.config.ts +++ b/client/vite.config.ts @@ -15,4 +15,13 @@ export default defineConfig({ '@': fileURLToPath(new URL('./src', import.meta.url)) }, }, + server: { + proxy: { + '/v1': { + target: 'http://127.0.0.1:8080', + changeOrigin: true, + secure: false, + } + } + } }) diff --git a/client/yarn.lock b/client/yarn.lock index d0bb530..76fa708 100644 --- a/client/yarn.lock +++ b/client/yarn.lock @@ -2613,7 +2613,7 @@ vue-echarts@^7.0.3: dependencies: vue-demi "^0.13.11" -vue-eslint-parser@^10.2.0: +vue-eslint-parser@^10, vue-eslint-parser@^10.2.0: version "10.2.0" resolved "https://registry.yarnpkg.com/vue-eslint-parser/-/vue-eslint-parser-10.2.0.tgz#cb53f89b14c7f5bf6a95c9532e3b2961ab619d61" integrity 
sha512-CydUvFOQKD928UzZhTp4pr2vWz1L+H99t7Pkln2QSPdvmURT0MoC4wUccfCnuEaihNsu9aYYyk+bep8rlfkUXw== diff --git a/cmd/zeroops/main.go b/cmd/zeroops/main.go index 9459940..7422e70 100644 --- a/cmd/zeroops/main.go +++ b/cmd/zeroops/main.go @@ -1,7 +1,17 @@ package main import ( + "context" + "fmt" + "os" + "strconv" + "time" + "github.com/fox-gonic/fox" + alertapi "github.com/qiniu/zeroops/internal/alerting/api" + adb "github.com/qiniu/zeroops/internal/alerting/database" + "github.com/qiniu/zeroops/internal/alerting/service/healthcheck" + "github.com/qiniu/zeroops/internal/alerting/service/remediation" "github.com/qiniu/zeroops/internal/config" "github.com/qiniu/zeroops/internal/middleware" servicemanager "github.com/qiniu/zeroops/internal/service_manager" @@ -25,8 +35,47 @@ func main() { serviceManagerSrv.Close() }() + // optional alerting DB for healthcheck and remediation + var alertDB *adb.Database + { + dsn := func() string { + return fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s", + cfg.Database.Host, cfg.Database.Port, cfg.Database.User, cfg.Database.Password, cfg.Database.DBName, cfg.Database.SSLMode) + }() + if db, derr := adb.New(dsn); derr == nil { + alertDB = db + } else { + log.Error().Err(derr).Msg("healthcheck alerting DB init failed; scheduler/consumer will run without DB") + } + } + + // start healthcheck scheduler and remediation consumer + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + interval := parseDuration(os.Getenv("HC_SCAN_INTERVAL"), 10*time.Second) + batch := parseInt(os.Getenv("HC_SCAN_BATCH"), 200) + workers := parseInt(os.Getenv("HC_WORKERS"), 1) + if workers < 1 { + workers = 1 + } + alertChSize := parseInt(os.Getenv("REMEDIATION_ALERT_CHAN_SIZE"), 1024) + alertCh := make(chan healthcheck.AlertMessage, alertChSize) + + for i := 0; i < workers; i++ { + go healthcheck.StartScheduler(ctx, healthcheck.Deps{ + DB: alertDB, + Redis: healthcheck.NewRedisClientFromEnv(), + AlertCh: alertCh, + 
Batch: batch, + Interval: interval, + }) + } + rem := remediation.NewConsumer(alertDB, healthcheck.NewRedisClientFromEnv()) + go rem.Start(ctx, alertCh) + router := fox.New() router.Use(middleware.Authentication) + alertapi.NewApiWithConfig(router, cfg) if err := serviceManagerSrv.UseApi(router); err != nil { log.Fatal().Err(err).Msg("bind serviceManagerApi failed.") } @@ -36,3 +85,23 @@ func main() { } log.Info().Msg("zeroops api server exit...") } + +func parseDuration(s string, d time.Duration) time.Duration { + if s == "" { + return d + } + if v, err := time.ParseDuration(s); err == nil { + return v + } + return d +} + +func parseInt(s string, v int) int { + if s == "" { + return v + } + if n, err := strconv.Atoi(s); err == nil { + return n + } + return v +} diff --git a/docs/alerting/api.md b/docs/alerting/api.md new file mode 100644 index 0000000..df0ddce --- /dev/null +++ b/docs/alerting/api.md @@ -0,0 +1,320 @@ +# API 文档 - Monitoring & Alerting Service + +## 概述 + +本文档描述了监控告警服务的 RESTful API 接口,包括告警列表查询、详情获取等核心功能。 + +人工模拟prometheus调用我们的接受告警接口,收到告警事件 + + +实现状态说明: +- 已实现:接收 Alertmanager Webhook(/v1/integrations/alertmanager/webhook) +- 规划中:告警列表与详情查询接口(本文档描述为对外契约,后续实现) + + +## 基础信息 + +- **Base URL**: `/v1` +- **Content-Type**: `application/json` +- **认证方式**: Webhook 端点可通过环境变量启用 Basic 或 Bearer 认证(见下文)。其他查询接口在实现时将采用 Bearer Token。 + +## 接口列表 + +### 1. 
获取告警列表 + +获取告警问题列表,支持分页和状态筛选。 + +**请求:** +```http +GET /v1/issues?start={start}&limit={limit}[&state={state}] +``` + +**查询参数:** + +| 参数名 | 类型 | 必填 | 说明 | +|--------|------|------|------| +| start | string | 否 | 游标。第一页可不传或传空;翻页使用上次响应的 `next` | +| limit | integer | 是 | 每页返回数量,建议范围:1-100 | +| state | string | 否 | 问题状态筛选:`Open`、`Closed` | + +分页说明:服务采用基于游标(cursor)的分页。首次请求建议省略 `start`;当返回结果较多时,响应体会包含 `next` 字段,表示下一页的游标。继续翻页时,将该 `next` 作为 `start` 传回。 + +**响应示例:** +```json +{ + "items": [ + { + "id": "xxx", + "state": "Closed", + "level": "P0", + "alertState": "Restored", + "title": "yzh S3APIV2s3apiv2.putobject 0_64K上传响应时间95值:50012ms > 450ms", + "labels": [ + {"key": "api", "value": "s3apiv2.putobject"}, + {"key": "idc", "value": "yzh"} + ], + "alertSince": "2025-05-05T11:00:00.000Z" + } + ], + "next": "xxxx" +} +``` + +**状态码:** +- `200 OK`: 成功获取列表 +- `400 Bad Request`: 参数错误 +- `401 Unauthorized`: 认证失败 +- `500 Internal Server Error`: 服务器内部错误 + +### 2. 获取告警详情 + +获取指定告警问题的详细信息,包括处理历史。 + +**请求:** +```http +GET /v1/issues/{issueID} +``` + +**路径参数:** + +| 参数名 | 类型 | 必填 | 说明 | +|--------|------|------|------| +| issueID | string | 是 | 告警问题ID | + +**响应示例:** +```json +{ + "id": "issue_20250505_001", + "state": "Closed", + "level": "P0", + "alertState": "Restored", + "title": "yzh S3APIV2s3apiv2.putobject 0_64K上传响应时间95值:50012ms > 450ms", + "labels": [ + {"key": "api", "value": "s3apiv2.putobject"}, + {"key": "idc", "value": "yzh"}, + {"key": "service", "value": "s3api"} + ], + "alertSince": "2025-05-05T11:00:00.000Z", + "comments": [ + { + "createdAt": "2025-05-05T11:00:30.000Z", + "content": "## 自动分析\n\n检测到 S3 API 响应时间异常,可能原因:\n- 后端存储负载过高\n- 网络延迟增加\n\n## 建议处理\n1. 检查存储节点状态\n2. 
分析网络监控数据" + }, + { + "createdAt": "2025-05-05T11:05:00.000Z", + "content": "## 自动治愈开始\n\n执行治愈策略:重启相关服务实例" + }, + { + "createdAt": "2025-05-05T11:15:00.000Z", + "content": "## 问题已解决\n\n响应时间恢复正常,告警自动关闭" + } + ] +} +``` + +**状态码:** +- `200 OK`: 成功获取详情 +- `404 Not Found`: 告警问题不存在 +- `401 Unauthorized`: 认证失败 +- `500 Internal Server Error`: 服务器内部错误 + +## 数据模型 + +### AlertIssue 对象 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| id | string | 告警问题唯一标识 | +| state | string | 告警工单的生命周期状态状态:`Open`、`Closed` | +| level | string | 告警等级:`P0`、`P1`、`P2`、`Warning` | +| alertState | string | 告警本身的实时状态:`Pending`、`Restored`、`AutoRestored`、`InProcessing` | +| title | string | 告警标题描述 | +| labels | Label[] | 标签数组 | +| alertSince | string | 告警发生时间(ISO 8601格式) | +| comments | Comment[] | 处理评论列表(仅详情接口返回) | + +### Label 对象 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| key | string | 标签键 | +| value | string | 标签值 | + +### Comment 对象 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| createdAt | string | 评论创建时间(ISO 8601格式) | +| content | string | 评论内容(Markdown格式) | + +### 状态语义与映射 + +- **state**:工单生命周期状态,取值:`Open`、`Closed`。 +- **alertState**:告警实时状态,取值:`Pending`、`InProcessing`、`Restored`、`AutoRestored`。 + +典型关系与约定: +- 当 `state = Open` 时,通常表示仍存在未恢复的相关告警,`alertState` 多为 `Pending`或`InProcessing`。 +- 当 `state = Closed` 时,通常表示所有相关告警已恢复,`alertState` 多为 `Restored` 或 `AutoRestored`(自动恢复)。 + +注意:`alertState` 反映的是告警流的当前快照,而 `state` 管理的是问题工单的生命周期。二者并非强绑定,边界时刻可能出现 `state = Open` 但部分告警已恢复的情况。 + +## 错误响应 + +所有接口在出错时返回统一的错误格式: + +```json +{ + "error": { + "code": "INVALID_PARAMETER", + "message": "参数 limit 必须在 1-100 范围内", + "details": { + "field": "limit", + "value": "150" + } + } +} +``` + +### 错误代码 + +| 错误代码 | 说明 | +|----------|------| +| INVALID_PARAMETER | 请求参数错误 | +| UNAUTHORIZED | 认证失败 | +| FORBIDDEN | 权限不足 | +| NOT_FOUND | 资源不存在 | +| INTERNAL_ERROR | 服务器内部错误 | + +## 使用示例 + +### curl 示例 + +```bash +# 获取告警列表 +curl -X GET "https://api.example.com/v1/issues?limit=10&state=Open" \ + -H "Authorization: 
Bearer YOUR_TOKEN" \ + -H "Content-Type: application/json" + +# 获取第二页(使用上一页响应中的 next 作为 start) +curl -X GET "https://api.example.com/v1/issues?start=c_abcdef12345&limit=10&state=Open" \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "Content-Type: application/json" + +# 获取告警详情 +curl -X GET "https://api.example.com/v1/issues/issue_20250505_001" \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "Content-Type: application/json" +``` + +### JavaScript 示例 + +```javascript +// 获取告警列表 +const response = await fetch('/v1/issues?limit=10', { + headers: { + 'Authorization': 'Bearer ' + token, + 'Content-Type': 'application/json' + } +}); +const data = await response.json(); + +// 获取第二页(将上一页的 next 作为 start 传回) +const nextResponse = await fetch(`/v1/issues?start=${data.next}&limit=10`, { + headers: { + 'Authorization': 'Bearer ' + token, + 'Content-Type': 'application/json' + } +}); +const nextPage = await nextResponse.json(); + +// 获取告警详情 +const detailResponse = await fetch(`/v1/issues/${issueId}`, { + headers: { + 'Authorization': 'Bearer ' + token, + 'Content-Type': 'application/json' + } +}); +const detail = await detailResponse.json(); +``` + +### 3. 
接收 Alertmanager Webhook(告警接入) + +用于接收 Alertmanager 推送的告警事件。 + +**请求:** +```http +POST /v1/integrations/alertmanager/webhook +Content-Type: application/json +``` + +**认证:** +- 可选鉴权(通过环境变量开启): + - Basic:设置 `ALERT_WEBHOOK_BASIC_USER` 与 `ALERT_WEBHOOK_BASIC_PASS` + - Bearer:设置 `ALERT_WEBHOOK_BEARER` + - 若上述变量均未设置,则该端点不强制鉴权(开发/测试便捷) + +**请求体(示例 - firing):** +```json +{ + "receiver": "our-webhook", + "status": "firing", + "alerts": [ + { + "status": "firing", + "labels": { + "alertname": "HighRequestLatency", + "service": "serviceA", + "severity": "P1", + "idc": "yzh" + }, + "annotations": { + "summary": "p95 latency over threshold", + "description": "apitime p95 > 450ms" + }, + "startsAt": "2025-05-05T11:00:00Z", + "endsAt": "0001-01-01T00:00:00Z", + "generatorURL": "http://prometheus/graph?g0.expr=...", + "fingerprint": "3b1b7f4e8f0e" + } + ], + "groupLabels": {"alertname": "HighRequestLatency"}, + "commonLabels": {"service": "serviceA", "severity": "P1"}, + "version": "4" +} +``` + +**字段要点:** +- `status`: `firing` | `resolved` +- `alerts[]`: 多条告警,关键字段 `labels`、`annotations`、`startsAt`、`fingerprint` +- `fingerprint + startsAt`:用于应用层幂等 + +**响应:** +- `200 OK {"ok": true, "created": }` 当 `status=firing` 时返回本次创建条数 +- `200 OK {"ok": true, "msg": "ignored (not firing)"}` 当非 `firing` 时快速返回 + +**curl 示例:** +```bash +# firing +curl -X POST http://localhost:8080/v1/integrations/alertmanager/webhook \ + -H 'Content-Type: application/json' \ + -d '{ + "receiver":"our-webhook", + "status":"firing", + "alerts":[{ + "status":"firing", + "labels":{"alertname":"HighRequestLatency","service":"serviceA","severity":"P1","idc":"yzh"}, + "annotations":{"summary":"p95 latency over threshold","description":"apitime p95 > 450ms"}, + "startsAt":"2025-05-05T11:00:00Z", + "endsAt":"0001-01-01T00:00:00Z", + "generatorURL":"http://prometheus/graph?g0.expr=...", + "fingerprint":"3b1b7f4e8f0e" + }], + "groupLabels":{"alertname":"HighRequestLatency"}, + 
"commonLabels":{"service":"serviceA","severity":"P1"}, + "version":"4" + }' +``` + +## 版本历史 + +- **v1.0** (2025-09-11): 初始版本,支持基础的告警列表和详情查询 diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md new file mode 100644 index 0000000..47508d1 --- /dev/null +++ b/docs/alerting/database-design.md @@ -0,0 +1,197 @@ +# 数据库设计 - Monitoring & Alerting Service + +## 概述 + +本文档为最新数据库设计,总计包含 7 张表: + +- alert_issues +- alert_issue_comments +- metric_alert_changes +- alert_rules +- service_alert_metas +- service_metrics +- service_states + +## 数据表设计 + +### 1) alert_issues(告警问题表) + +存储告警问题的主要信息。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| id | varchar(64) PK | 告警 issue ID | +| state | enum(Closed, Open) | 问题状态 | +| level | varchar(32) | 告警等级:如 P0/P1/Px | +| alert_state | enum(Pending, Restored, AutoRestored, InProcessing) | 处理状态 | +| title | varchar(255) | 告警标题 | +| labels | json | 标签,格式:[{key, value}] | +| alert_since | TIMESTAMP(6) | 告警首次发生时间 | + +**索引建议:** +- PRIMARY KEY: `id` +- INDEX: `(state, level, alert_since)` +- INDEX: `(alert_state, alert_since)` + +--- + +### 2) alert_issue_comments(告警评论/处理记录表) + +记录 AI/系统/人工在处理告警过程中的动作与备注。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| issue_id | varchar(64) FK | 对应 `alert_issues.id` | +| create_at | TIMESTAMP(6) | 评论创建时间 | +| content | text | Markdown 内容 | + +**索引建议:** +- PRIMARY KEY: `(issue_id, create_at)` +- FOREIGN KEY: `issue_id` REFERENCES `alert_issues(id)` + +--- + +### 3) metric_alert_changes(指标告警规则变更记录表) + +用于追踪指标类告警规则或参数的变更历史。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| id | varchar(64) PK | 变更记录 ID | +| change_time | TIMESTAMP(6) | 变更时间 | +| alert_name | varchar(255) | 告警名称/规则名 | +| change_items | json | 变更项数组:[{key, old_value, new_value}] | + +**索引建议:** +- PRIMARY KEY: `id` +- INDEX: `(change_time)` +- INDEX: `(alert_name, change_time)` + +--- + +### 4) alert_rules(告警规则表) + +定义可复用的规则表达式,支持作用域绑定。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| id | varchar(255) PK | 规则 ID(可与 K8s 资源 
ID 对应或做映射) | +| name | varchar(255) | 规则名称,表达式可读的名称 | +| scopes | varchar(255) | 作用域,例:"services:svc1,svc2" | +| expr | text | 规则表达式(可含占位符) | + +**索引建议:** +- PRIMARY KEY: `id` +- INDEX: `(name)` +- INDEX: `(scopes)` + +--- + +### 5) service_alert_metas(服务告警元数据表) + +按服务维度存放参数化配置,用于渲染具体规则。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| service | varchar(255) | 服务名 | +| key | varchar(255) | 参数名(如 `apitime_threshold`) | +| value | varchar(255) | 参数值(如 `50`) | + +**索引建议:** +- PRIMARY KEY: `(service, key)` +- INDEX: `(service)` + +--- + +### 6) service_metrics(服务指标清单表) + +记录服务所关注的指标清单(可用于 UI 侧展示或校验)。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| service | varchar(255) PK | 服务名 | +| metrics | json | 指标名数组:["metric1", "metric2", ...] | + +**索引建议:** +- PRIMARY KEY: `service` + +--- + +### 7) service_states(服务状态表) + +追踪服务在某一版本上的健康状态与处置进度。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| service | varchar(255) PK | 服务名 | +| version | varchar(255) PK | 版本号 | +| report_at | TIMESTAMP(6) | 同步alert_issue_ids中,alert_issue中alert_state=InProcessing状态的alert_since的最早时间 | +| resolved_at | TIMESTAMP(6) | 解决时间(可空) | +| health_state | enum(Normal,Warning,Error) | 处置阶段 | +| alert_issue_ids | [] alert_issue_id | 关联alert_issues表的id | + +**索引建议:** +- PRIMARY KEY: `(service, version)` + +## 数据关系(ER) + +```mermaid +erDiagram + alert_issues ||--o{ alert_issue_comments : "has comments" + + alert_rules { + varchar id PK + varchar name + varchar scopes + text expr + } + + service_alert_metas { + varchar service PK + varchar key PK + varchar value + } + + service_metrics { + varchar service PK + json metrics + } + + service_states { + varchar service PK + varchar version PK + + text detail + timestamp report_at + timestamp resolved_at + varchar health_state + varchar correlation_id + } + + alert_issues { + varchar id PK + enum state + varchar level + enum alert_state + varchar title + json labels + timestamp alert_since + } + + alert_issue_comments { + varchar issue_id FK + timestamp 
create_at + text content + } + + %% 通过 service 逻辑关联 + service_alert_metas ||..|| service_metrics : "by service" + service_states ||..|| service_alert_metas : "by service" +``` + +## 数据流转 + +1. 以 `alert_rules` 为模版,结合 `service_alert_metas` 渲染出面向具体服务的规则。 +2. 指标或规则参数发生调整时,记录到 `metric_alert_changes`。 +3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 +4. 面向服务的整体健康态以 `service_states` 记录和推进(health_state:Normal → Warning/Error → 恢复后回到 Normal,并配合 report_at/resolved_at 推进)。 \ No newline at end of file diff --git a/env_example.txt b/env_example.txt index 8c41e9f..a492403 100644 --- a/env_example.txt +++ b/env_example.txt @@ -84,3 +84,69 @@ NO_FORCE_TERMINAL=false # - 定期轮换 API 密钥 # - 在生产环境中使用环境变量而非文件 # - 使用最小权限原则配置 API 密钥 + +# ============================================================================= +# Alerting 服务配置(数据库连接 + Webhook 鉴权) +# ============================================================================= + +# 数据库连接(用于 Alertmanager Webhook 入库) +# 示例使用本机 Docker Postgres(见 internal/alerting/service/receiver/README.md 验证步骤) +DB_HOST=localhost +DB_PORT=5432 +DB_USER=postgres +DB_PASSWORD=postgres +DB_NAME=zeroops +DB_SSLMODE=disable + +# Webhook 鉴权(与 Alertmanager http_config 对齐,二选一) +# 1) Basic Auth +ALERT_WEBHOOK_BASIC_USER=alert +ALERT_WEBHOOK_BASIC_PASS=REDACTED +# 2) Bearer Token(如使用该方式,注释掉上面的 Basic) +# ALERT_WEBHOOK_BEARER=your_token_here + +# ============================================================================= +# Alerting 查询 API 配置(Redis 连接) +# ============================================================================= + +# Redis 连接(用于 /v1/issues 与 /v1/issues/{issueID} 从缓存读取) +# 示例使用本机 Docker Redis(见 internal/alerting/service/receiver/README.md 验证步骤) +REDIS_ADDR=localhost:6379 +REDIS_PASSWORD="" +REDIS_DB=0 + +# ============================================================================= +# 服务监听配置 +# ============================================================================= + +# API 服务监听地址(默认 0.0.0.0:8080) +SERVER_BIND_ADDR=0.0.0.0:8080 + +# 
============================================================================= +# Healthcheck 扫描任务(Pending 告警扫描与分发) +# ============================================================================= + +# 扫描间隔,默认 10s(示例:1s/5s/10s/30s/1m) +HC_SCAN_INTERVAL=10s +# 每次扫描的最大 Pending 数量(建议 50-500) +HC_SCAN_BATCH=200 +# 并发 worker 数(建议 1-4) +HC_WORKERS=1 + +# 预留:未来切换到消息队列时启用 +# ALERT_QUEUE_KIND=redis_stream|kafka|nats +# ALERT_QUEUE_DSN=redis://localhost:6379/0 +# ALERT_QUEUE_TOPIC=alerts.pending + +# ============================================================================= +# Remediation 自动化回滚(消费者) +# ============================================================================= + +# 通道容量(healthcheck → remediation) +REMEDIATION_ALERT_CHAN_SIZE=1024 + +# 回滚接口(Mock 用;真实场景可指向部署系统) +REMEDIATION_ROLLBACK_URL=http://localhost:8080/v1/deployments/%s/rollback + +# 回滚等待时间(用于演示观察 InProcessing → Restored 的间隔) +REMEDIATION_ROLLBACK_SLEEP=30s diff --git a/go.mod b/go.mod index 1c04c14..6094f9c 100644 --- a/go.mod +++ b/go.mod @@ -4,14 +4,19 @@ go 1.24 require ( github.com/fox-gonic/fox v0.0.6 + github.com/google/uuid v1.6.0 + github.com/jackc/pgx/v5 v5.5.5 github.com/lib/pq v1.10.9 + github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 ) require ( github.com/bytedance/sonic v1.13.3 // indirect github.com/bytedance/sonic/loader v0.2.4 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/gabriel-vasile/mimetype v1.4.9 // indirect github.com/gin-contrib/cors v1.7.6 // indirect github.com/gin-contrib/sse v1.1.0 // indirect @@ -20,6 +25,9 @@ require ( github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/validator/v10 v10.27.0 // indirect github.com/goccy/go-json v0.10.5 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgservicefile 
v0.0.0-20221227161230-091c0ba34f0a // indirect + github.com/jackc/puddle/v2 v2.2.1 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/cpuid/v2 v2.2.10 // indirect github.com/kr/text v0.2.0 // indirect @@ -36,6 +44,7 @@ require ( golang.org/x/arch v0.18.0 // indirect golang.org/x/crypto v0.39.0 // indirect golang.org/x/net v0.41.0 // indirect + golang.org/x/sync v0.15.0 // indirect golang.org/x/sys v0.33.0 // indirect golang.org/x/text v0.26.0 // indirect google.golang.org/protobuf v1.36.6 // indirect diff --git a/go.sum b/go.sum index 7f6360e..04e9b56 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,16 @@ github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/bytedance/sonic v1.13.3 h1:MS8gmaH16Gtirygw7jV91pDCN33NyMrPbN7qiYhEsF0= github.com/bytedance/sonic v1.13.3/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1+KgkJhz4= github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY= github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4= github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= github.com/cloudwego/iasm v0.2.0/go.mod 
h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= @@ -14,6 +20,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/fox-gonic/fox v0.0.6 h1:Otz6bTpiboGfCoAp4bTDZOTxI6PQw1uEID/VZRlklws= github.com/fox-gonic/fox v0.0.6/go.mod h1:l1C0zu5H44YV60tEq6rbNRvv0z14hnlpsl8lMlzqpFg= github.com/gabriel-vasile/mimetype v1.4.9 h1:5k+WDwEsD9eTLL8Tz3L0VnmVh9QxGjRmjBvAG7U/oYY= @@ -38,6 +46,16 @@ github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5x github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a h1:bbPeKD0xmW/Y25WS6cokEszi5g+S0QxI/d45PkRi7Nk= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgx/v5 v5.5.5 h1:amBjrZVmksIdNjxGW/IiIMzxMKZFelXbUoPNb+8sjQw= 
+github.com/jackc/pgx/v5 v5.5.5/go.mod h1:ez9gk+OAat140fv9ErkZDYFWmXLfV+++K0uAOiwgm1A= +github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk= +github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= @@ -74,6 +92,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/redis/go-redis/v9 v9.5.1 h1:H1X4D3yHPaYrkL5X06Wh6xNVM/pX0Ft4RV0vMGvLBh8= +github.com/redis/go-redis/v9 v9.5.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= @@ -99,6 +119,8 @@ golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= +golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/internal/alerting/README.md b/internal/alerting/README.md index 32895f1..208d371 100644 --- a/internal/alerting/README.md +++ b/internal/alerting/README.md @@ -127,4 +127,119 @@ flowchart TD ## 相关文档 - [数据库设计文档](../../docs/alerting/database-design.md) - 详细的表结构和索引设计 -- [API 文档](../../docs/alerting/api.md) - RESTful API 接口规范 \ No newline at end of file +- [API 文档](../../docs/alerting/api.md) - RESTful API 接口规范 + +--- + +## 联调流程(使用 .env 加载环境变量) + +以下步骤演示从告警接收到自动治愈的完整链路,且通过 .env 文件加载环境变量(不使用逐条 export)。 + +### 0) 准备 .env + +```bash +cp env_example.txt .env +# 按需编辑 .env,至少确认: +# - DB_* 指向本机 Postgres +# - REDIS_* 指向本机 Redis +# - ALERT_WEBHOOK_BASIC_USER / ALERT_WEBHOOK_BASIC_PASS +# - HC_SCAN_INTERVAL/HC_SCAN_BATCH/HC_WORKERS(可选:例如 1s/50/1 便于观察) +# - REMEDIATION_ROLLBACK_SLEEP(建议 30s,便于观察 InProcessing→Restored) +``` + +### 1) 启动依赖容器 + +```bash +docker rm -f zeroops-redis zeroops-pg 2>/dev/null || true +docker run -d --name zeroops-redis -p 6379:6379 redis:7-alpine +docker run -d --name zeroops-pg \ + -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=zeroops \ + -p 5432:5432 postgres:16 +until docker exec zeroops-redis redis-cli ping >/dev/null 2>&1; do sleep 0.5; done +until docker exec zeroops-pg pg_isready -U postgres >/dev/null 2>&1; do sleep 0.5; done +``` + +初始化/校验最小表(不存在则创建): + +```bash +docker exec -i zeroops-pg psql -U postgres -d zeroops -c \ + "CREATE TABLE IF NOT EXISTS alert_issues (id text primary key, state text, level text, alert_state text, title text, labels json, alert_since timestamp);" +docker exec -i zeroops-pg psql -U postgres -d zeroops -c \ + "CREATE TABLE IF NOT EXISTS service_states (service text, version text, report_at timestamp, resolved_at timestamp, health_state text, alert_issue_ids text[], PRIMARY KEY(service,version));" +docker exec -i zeroops-pg 
psql -U postgres -d zeroops -c \ + "CREATE TABLE IF NOT EXISTS alert_issue_comments (issue_id text, create_at timestamp, content text, PRIMARY KEY(issue_id, create_at));" +``` + +### 2) 清空数据库与缓存(可选,保证从空开始) + +```bash +docker exec -i zeroops-pg psql -U postgres -d zeroops -c "TRUNCATE TABLE alert_issue_comments, service_states, alert_issues;" +docker exec -i zeroops-redis redis-cli --raw DEL $(docker exec -i zeroops-redis redis-cli --raw KEYS 'alert:*' | tr '\n' ' ') 2>/dev/null || true +docker exec -i zeroops-redis redis-cli --raw DEL $(docker exec -i zeroops-redis redis-cli --raw KEYS 'service_state:*' | tr '\n' ' ') 2>/dev/null || true +``` + +### 3) 启动服务(使用 .env 加载) + +```bash +set -a; . ./.env; set +a +nohup go run ./cmd/zeroops -- 1>/tmp/zeroops.out 2>&1 & echo $! +# 日志:tail -f /tmp/zeroops.out +``` + +### 4) 触发告警(Webhook) + +```bash +export ALERT_WEBHOOK_BASIC_USER=alert +export ALERT_WEBHOOK_BASIC_PASS=REDACTED +``` +```bash +curl -s -u "${ALERT_WEBHOOK_BASIC_USER}:${ALERT_WEBHOOK_BASIC_PASS}" -H 'Content-Type: application/json' \ + -X POST http://localhost:8080/v1/integrations/alertmanager/webhook -d '{ + "receiver":"our-webhook", + "status":"firing", + "alerts":[{ + "status":"firing", + "labels":{"alertname":"HighRequestLatency","service":"stg","service_version":"v1.0.4","severity":"P1","idc":"yzh","deploy_id":"deploy-001"}, + "annotations":{"summary":"p95 latency over threshold","description":"apitime p95 > 450ms"}, + "startsAt":"2025-09-15T11:00:00Z", + "endsAt":"0001-09-16T00:00:00Z", + "generatorURL":"http://prometheus/graph?g0.expr=...", + "fingerprint":"manual-fp-001" + }], + "groupLabels":{"alertname":"HighRequestLatency"}, + "commonLabels":{"service":"stg","severity":"P1"}, + "version":"4" +}' +``` + +### 5) 验证 Pending → InProcessing(healthcheck) + +```bash +ISSUE_ID=$(docker exec -i zeroops-pg psql -U postgres -d zeroops -t -A -c "SELECT id FROM alert_issues LIMIT 1;"); echo ISSUE_ID=$ISSUE_ID +docker exec -i zeroops-redis redis-cli --raw GET 
alert:issue:${ISSUE_ID} | jq .alertState +docker exec -i zeroops-redis redis-cli --raw SMEMBERS alert:index:alert_state:InProcessing | grep -c "${ISSUE_ID}" || true +``` + +### 6) 验证 InProcessing → Restored(remediation) + +等待 `REMEDIATION_ROLLBACK_SLEEP` 指定的时间(建议 30s),然后: + +```bash +# DB +docker exec -i zeroops-pg psql -U postgres -d zeroops -c "SELECT id,alert_state FROM alert_issues WHERE id='${ISSUE_ID}';" +docker exec -i zeroops-pg psql -U postgres -d zeroops -c "SELECT service,version,health_state,to_char(resolved_at,'YYYY-MM-DD HH24:MI:SS') FROM service_states WHERE service='stg' AND version='v1.0.4';" +docker exec -i zeroops-pg psql -U postgres -d zeroops -c "SELECT to_char(create_at,'YYYY-MM-DD HH24:MI:SS') AS ts, substr(content,1,80) FROM alert_issue_comments WHERE issue_id='${ISSUE_ID}' ORDER BY create_at DESC LIMIT 1;" + +# Redis(service/version 与第 4 步 Webhook 示例中的 labels 保持一致:stg / v1.0.4) +docker exec -i zeroops-redis redis-cli --raw GET alert:issue:${ISSUE_ID} | jq .alertState +docker exec -i zeroops-redis redis-cli --raw GET service_state:stg:v1.0.4 | jq '.health_state, .resolved_at' +``` + +### 7) 查询 API + +```bash +curl -s -H 'Authorization: Bearer test' "http://localhost:8080/v1/issues/${ISSUE_ID}" | jq . +curl -s -H 'Authorization: Bearer test' "http://localhost:8080/v1/issues?limit=10&state=Open" | jq . 
+``` + +> 提示:如需更容易观察 InProcessing 状态,可将 `REMEDIATION_ROLLBACK_SLEEP` 调大(如 30s+),或适当增大 `HC_SCAN_INTERVAL`。 \ No newline at end of file diff --git a/internal/alerting/api/api.go b/internal/alerting/api/api.go new file mode 100644 index 0000000..ccdfbde --- /dev/null +++ b/internal/alerting/api/api.go @@ -0,0 +1,42 @@ +package api + +import ( + "fmt" + + "github.com/fox-gonic/fox" + adb "github.com/qiniu/zeroops/internal/alerting/database" + "github.com/qiniu/zeroops/internal/alerting/service/healthcheck" + receiver "github.com/qiniu/zeroops/internal/alerting/service/receiver" + "github.com/qiniu/zeroops/internal/config" +) + +type Api struct{} + +func NewApi(router *fox.Engine) *Api { return NewApiWithConfig(router, nil) } + +func NewApiWithConfig(router *fox.Engine, cfg *config.Config) *Api { + api := &Api{} + api.setupRouters(router, cfg) + return api +} + +func (api *Api) setupRouters(router *fox.Engine, cfg *config.Config) { + var h *receiver.Handler + var alertDB *adb.Database + if cfg != nil { + dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s", + cfg.Database.Host, cfg.Database.Port, cfg.Database.User, cfg.Database.Password, cfg.Database.DBName, cfg.Database.SSLMode) + if db, err := adb.New(dsn); err == nil { + alertDB = db + h = receiver.NewHandlerWithCache(receiver.NewPgDAO(db), receiver.NewCacheFromEnv()) + } else { + h = receiver.NewHandler(receiver.NewNoopDAO()) + } + } else { + h = receiver.NewHandler(receiver.NewNoopDAO()) + } + receiver.RegisterReceiverRoutes(router, h) + + // Issues query API (reads from Redis cache and loads comments from DB) + RegisterIssueRoutes(router, healthcheck.NewRedisClientFromEnv(), alertDB) +} diff --git a/internal/alerting/api/issues_api.go b/internal/alerting/api/issues_api.go new file mode 100644 index 0000000..f364759 --- /dev/null +++ b/internal/alerting/api/issues_api.go @@ -0,0 +1,265 @@ +package api + +import ( + "context" + "encoding/json" + "net/http" + "os" + "strconv" + "strings" + 
"time" + + "github.com/fox-gonic/fox" + adb "github.com/qiniu/zeroops/internal/alerting/database" + "github.com/redis/go-redis/v9" +) + +type IssueAPI struct { + R *redis.Client + DB *adb.Database +} + +// RegisterIssueRoutes registers issue query routes. If rdb is nil, a client is created from env. +// db can be nil; when nil, comments will be empty. +func RegisterIssueRoutes(router *fox.Engine, rdb *redis.Client, db *adb.Database) { + if rdb == nil { + rdb = newRedisFromEnv() + } + api := &IssueAPI{R: rdb, DB: db} + router.GET("/v1/issues/:issueID", api.GetIssueByID) + router.GET("/v1/issues", api.ListIssues) +} + +func newRedisFromEnv() *redis.Client { + addr := os.Getenv("REDIS_ADDR") + pass := os.Getenv("REDIS_PASSWORD") + var db int + if v := os.Getenv("REDIS_DB"); v != "" { + if d, err := strconv.Atoi(v); err == nil { + db = d + } + } + if addr == "" { + addr = "localhost:6379" + } + return redis.NewClient(&redis.Options{Addr: addr, Password: pass, DB: db}) +} + +type labelKV struct { + Key string `json:"key"` + Value string `json:"value"` +} + +type issueCacheRecord struct { + ID string `json:"id"` + State string `json:"state"` + Level string `json:"level"` + AlertState string `json:"alertState"` + Title string `json:"title"` + Labels json.RawMessage `json:"labels"` + AlertSince string `json:"alertSince"` +} + +type issueDetailResponse struct { + ID string `json:"id"` + State string `json:"state"` + Level string `json:"level"` + AlertState string `json:"alertState"` + Title string `json:"title"` + Labels []labelKV `json:"labels"` + AlertSince string `json:"alertSince"` + Comments []comment `json:"comments"` +} + +type comment struct { + CreatedAt string `json:"createdAt"` + Content string `json:"content"` +} + +func (api *IssueAPI) GetIssueByID(c *fox.Context) { + issueID := c.Param("issueID") + if issueID == "" { + c.JSON(http.StatusBadRequest, map[string]any{"error": map[string]any{"code": "INVALID_PARAMETER", "message": "missing issueID"}}) + return + } 
+ ctx := context.Background() + key := "alert:issue:" + issueID + val, err := api.R.Get(ctx, key).Result() + if err == redis.Nil || val == "" { + c.JSON(http.StatusNotFound, map[string]any{"error": map[string]any{"code": "NOT_FOUND", "message": "issue not found"}}) + return + } + if err != nil { + c.JSON(http.StatusInternalServerError, map[string]any{"error": map[string]any{"code": "INTERNAL_ERROR", "message": err.Error()}}) + return + } + + var record issueCacheRecord + if uerr := json.Unmarshal([]byte(val), &record); uerr != nil { + c.JSON(http.StatusInternalServerError, map[string]any{"error": map[string]any{"code": "INTERNAL_ERROR", "message": "invalid cache format"}}) + return + } + + var labels []labelKV + if len(record.Labels) > 0 { + _ = json.Unmarshal(record.Labels, &labels) + } + + resp := issueDetailResponse{ + ID: record.ID, + State: record.State, + Level: record.Level, + AlertState: record.AlertState, + Title: record.Title, + Labels: labels, + AlertSince: normalizeTimeString(record.AlertSince), + Comments: api.fetchComments(c.Request.Context(), record.ID), + } + c.JSON(http.StatusOK, resp) +} + +func normalizeTimeString(s string) string { + if s == "" { + return s + } + if t, err := time.Parse(time.RFC3339Nano, s); err == nil { + return t.UTC().Format(time.RFC3339Nano) + } + if t, err := time.Parse(time.RFC3339, s); err == nil { + return t.UTC().Format(time.RFC3339Nano) + } + return s +} + +func (api *IssueAPI) fetchComments(ctx context.Context, issueID string) []comment { + if api.DB == nil || issueID == "" { + return []comment{} + } + const q = `SELECT create_at, content FROM alert_issue_comments WHERE issue_id=$1 ORDER BY create_at ASC` + rows, err := api.DB.QueryContext(ctx, q, issueID) + if err != nil { + return []comment{} + } + defer rows.Close() + out := make([]comment, 0, 4) + for rows.Next() { + var t time.Time + var content string + if err := rows.Scan(&t, &content); err != nil { + continue + } + out = append(out, comment{CreatedAt: 
t.UTC().Format(time.RFC3339Nano), Content: content}) + } + return out +} + +type listResponse struct { + Items []issueListItem `json:"items"` + Next string `json:"next,omitempty"` +} + +type issueListItem struct { + ID string `json:"id"` + State string `json:"state"` + Level string `json:"level"` + AlertState string `json:"alertState"` + Title string `json:"title"` + Labels []labelKV `json:"labels"` + AlertSince string `json:"alertSince"` +} + +func (api *IssueAPI) ListIssues(c *fox.Context) { + start := strings.TrimSpace(c.Query("start")) + limitStr := strings.TrimSpace(c.Query("limit")) + if limitStr == "" { + c.JSON(http.StatusBadRequest, map[string]any{"error": map[string]any{"code": "INVALID_PARAMETER", "message": "limit is required"}}) + return + } + limit, err := strconv.Atoi(limitStr) + if err != nil || limit < 1 || limit > 100 { + c.JSON(http.StatusBadRequest, map[string]any{"error": map[string]any{"code": "INVALID_PARAMETER", "message": "limit must be 1-100"}}) + return + } + + state := strings.TrimSpace(c.Query("state")) + idxKey := "alert:index:open" + if state != "" { + if strings.EqualFold(state, "Open") { + idxKey = "alert:index:open" + } else if strings.EqualFold(state, "Closed") { + idxKey = "alert:index:closed" + } else { + c.JSON(http.StatusBadRequest, map[string]any{"error": map[string]any{"code": "INVALID_PARAMETER", "message": "state must be Open or Closed"}}) + return + } + } + + var cursor uint64 + if start != "" { + if cv, err := strconv.ParseUint(start, 10, 64); err == nil { + cursor = cv + } + } + + ctx := context.Background() + ids, nextCursor, err := api.R.SScan(ctx, idxKey, cursor, "", int64(limit)).Result() + if err != nil && err != redis.Nil { + c.JSON(http.StatusInternalServerError, map[string]any{"error": map[string]any{"code": "INTERNAL_ERROR", "message": err.Error()}}) + return + } + + if len(ids) == 0 { + c.JSON(http.StatusOK, listResponse{Items: []issueListItem{}, Next: ""}) + return + } + + keys := make([]string, 0, len(ids)) 
+ for _, id := range ids { + if id == "" { + continue + } + keys = append(keys, "alert:issue:"+id) + } + + vals, err := api.R.MGet(ctx, keys...).Result() + if err != nil && err != redis.Nil { + c.JSON(http.StatusInternalServerError, map[string]any{"error": map[string]any{"code": "INTERNAL_ERROR", "message": err.Error()}}) + return + } + + items := make([]issueListItem, 0, len(vals)) + for _, v := range vals { + if v == nil { + continue + } + var rec issueCacheRecord + switch t := v.(type) { + case string: + _ = json.Unmarshal([]byte(t), &rec) + case []byte: + _ = json.Unmarshal(t, &rec) + default: + b, _ := json.Marshal(t) + _ = json.Unmarshal(b, &rec) + } + var labels []labelKV + if len(rec.Labels) > 0 { + _ = json.Unmarshal(rec.Labels, &labels) + } + items = append(items, issueListItem{ + ID: rec.ID, + State: rec.State, + Level: rec.Level, + AlertState: rec.AlertState, + Title: rec.Title, + Labels: labels, + AlertSince: normalizeTimeString(rec.AlertSince), + }) + } + + resp := listResponse{Items: items} + if nextCursor != 0 { + resp.Next = strconv.FormatUint(nextCursor, 10) + } + c.JSON(http.StatusOK, resp) +} diff --git a/internal/alerting/database/database.go b/internal/alerting/database/database.go new file mode 100644 index 0000000..e6ee504 --- /dev/null +++ b/internal/alerting/database/database.go @@ -0,0 +1,40 @@ +package database + +import ( + "context" + "database/sql" + "fmt" + + _ "github.com/jackc/pgx/v5/stdlib" +) + +type Database struct { + db *sql.DB +} + +func New(connString string) (*Database, error) { + db, err := sql.Open("pgx", connString) + if err != nil { + return nil, fmt.Errorf("open db: %w", err) + } + if err := db.Ping(); err != nil { + return nil, fmt.Errorf("ping db: %w", err) + } + return &Database{db: db}, nil +} + +func (d *Database) Close() error { + if d == nil || d.db == nil { + return nil + } + return d.db.Close() +} + +func (d *Database) ExecContext(ctx context.Context, q string, args ...any) (sql.Result, error) { + return 
d.db.ExecContext(ctx, q, args...) +} + +// QueryContext exposes database/sql QueryContext for SELECT queries. +func (d *Database) QueryContext(ctx context.Context, q string, args ...any) (*sql.Rows, error) { + return d.db.QueryContext(ctx, q, args...) +} diff --git a/internal/alerting/service/healthcheck/README.md b/internal/alerting/service/healthcheck/README.md new file mode 100644 index 0000000..7f7033f --- /dev/null +++ b/internal/alerting/service/healthcheck/README.md @@ -0,0 +1,235 @@ +# healthcheck — Pending 告警扫描与分发任务 + +本包提供一个定时任务: +- 周期性扫描 Pending 状态的告警 +- 将告警投递到 channel(供下游处理器消费),后续再接入消息队列 +- 成功投递后,原子地把缓存中的状态更新: + - `alert:issue:{id}` 的 `alertState`:Pending → InProcessing + - `service_state:{service}:{version}` 的 `health_state`:由告警等级推导(P0→Error;P1/P2→Warning) + +此任务默认只更新缓存,不直接更新数据库,以降低耦合与避免与业务处理竞争。数据库状态可由下游处理器在处理开始时回写,或由后续补偿任务兜底。 + +—— + +## 1. 触发与频率 + +- 间隔:默认每 10s 扫描一次(可配置) +- 批量:每次最多处理 200 条 Pending(可配置) +- 并发:串行或小并发(<= 4),避免重复投递 + +环境变量建议: +``` +HC_SCAN_INTERVAL=10s +HC_SCAN_BATCH=200 +HC_WORKERS=1 +``` + +—— + +## 2. 数据来源与过滤 + +优先以数据库为准,结合缓存加速: + +- 数据库查询(推荐) + ```sql + SELECT id, level, title, labels, alert_since + FROM alert_issues + WHERE alert_state = 'Pending' + ORDER BY alert_since ASC + LIMIT $1; + ``` + +当告警切换为 InProcessing 时,需要更新对应 `service_states.report_at` 为该 service/version 关联的 `alert_issue_ids` 中,所有 alert_issues 里 alert_state=InProcessing 的 `alert_since` 最早时间(min)。可通过下游处理器或本任务的补充逻辑回填: + +```sql +UPDATE service_states ss +SET report_at = sub.min_since +FROM ( + SELECT si.service, si.version, MIN(ai.alert_since) AS min_since + FROM service_states si + JOIN alert_issues ai ON ai.id = ANY(si.alert_issue_ids) + WHERE ai.alert_state = 'InProcessing' + GROUP BY si.service, si.version +) AS sub +WHERE ss.service = sub.service AND ss.version = sub.version; +``` + +- 或仅用缓存(可选): + - 维护集合 `alert:index:alert_state:Pending`(若未维护,可临时 SCAN `alert:issue:*` 并过滤 JSON 中的 `alertState`,但不推荐在大规模下使用 SCAN)。 + +—— + +## 3. 
通道(channel) + +现阶段通过进程内 channel 向下游处理器传递告警消息,后续再平滑切换为消息队列(Kafka/NATS 等)。 + +消息格式保留为 `AlertMessage`: +```go +type AlertMessage struct { + ID string `json:"id"` + Service string `json:"service"` + Version string `json:"version,omitempty"` + Level string `json:"level"` + Title string `json:"title"` + AlertSince time.Time `json:"alert_since"` + Labels map[string]string `json:"labels"` +} +``` + +发布样例(避免阻塞可用非阻塞写): +```go +func publishToChannel(ctx context.Context, ch chan<- AlertMessage, m AlertMessage) error { + select { + case ch <- m: + return nil + default: + return fmt.Errorf("alert channel full") + } +} +``` + +配置:当前无需队列相关配置。未来切换到消息队列时,可启用以下配置项: +``` +# ALERT_QUEUE_KIND=redis_stream|kafka|nats +# ALERT_QUEUE_DSN=redis://localhost:6379/0 +# ALERT_QUEUE_TOPIC=alerts.pending +``` + +—— + +## 4. 缓存键与原子更新 + +现有(或建议)键: +- 告警:`alert:issue:{id}` → JSON,字段包含 `alertState` +- 指数(可选):`alert:index:alert_state:{Pending|InProcessing|...}` +- 服务态:`service_state:{service}:{version}` → JSON,字段包含 `health_state` +- 指数:`service_state:index:health:{Error|Warning|...}` + +为避免并发写冲突,建议使用 Lua CAS(Compare-And-Set)脚本原子修改值与索引: + +```lua +-- KEYS[1] = alert key, ARGV[1] = expected, ARGV[2] = next, KEYS[2] = idx:old, KEYS[3] = idx:new, ARGV[3] = id +local v = redis.call('GET', KEYS[1]) +if not v then return 0 end +local obj = cjson.decode(v) +if obj.alertState ~= ARGV[1] then return -1 end +obj.alertState = ARGV[2] +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') +if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[3]) end +if KEYS[3] ~= '' then redis.call('SADD', KEYS[3], ARGV[3]) end +return 1 +``` + +服务态类似(示例将态切换到推导的新态): +```lua +-- KEYS[1] = service_state key, ARGV[1] = expected(optional), ARGV[2] = next, KEYS[2] = idx:old(optional), KEYS[3] = idx:new, ARGV[3] = member +local v = redis.call('GET', KEYS[1]) +if not v then return 0 end +local obj = cjson.decode(v) +if ARGV[1] ~= '' and obj.health_state ~= ARGV[1] then return -1 end +obj.health_state = ARGV[2] +redis.call('SET', 
KEYS[1], cjson.encode(obj), 'KEEPTTL') +if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[3]) end +if KEYS[3] ~= '' then redis.call('SADD', KEYS[3], ARGV[3]) end +return 1 +``` + +—— + +## 5. 任务流程(伪代码) + +```go +func runOnce(ctx context.Context, db *Database, rdb *redis.Client, ch chan<- AlertMessage, batch int) error { + rows := queryPendingFromDB(ctx, db, batch) // id, level, title, labels(JSON), alert_since + for _, it := range rows { + svc := it.Labels["service"] + ver := it.Labels["service_version"] + // 1) 投递消息到 channel(非阻塞) + select { + case ch <- AlertMessage{ID: it.ID, Service: svc, Version: ver, Level: it.Level, Title: it.Title, AlertSince: it.AlertSince, Labels: it.Labels}: + // ok + default: + // 投递失败:通道已满,跳过状态切换,计数并继续 + continue + } + // 2) 缓存状态原子切换(告警) + alertKey := "alert:issue:" + it.ID + rdb.Eval(ctx, alertCAS, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing"}, "Pending", "InProcessing", it.ID) + // 3) 缓存状态原子切换(服务态:按告警等级推导) + if svc != "" { // version 可空 + target := deriveHealth(it.Level) // P0->Error; P1/P2->Warning; else Warning + svcKey := "service_state:" + svc + ":" + ver + // 可按需指定旧态索引,否则留空 + localOld := "" + newIdx := "service_state:index:health:" + target + member := svcKey + rdb.Eval(ctx, svcCAS, []string{svcKey, localOld, newIdx}, "", target, member) + } + } + return nil +} + +func StartScheduler(ctx context.Context, deps Deps) { + t := time.NewTicker(deps.Interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): return + case <-t.C: + _ = runOnce(ctx, deps.DB, deps.Redis, deps.AlertCh, deps.Batch) + } + } +} +``` + +—— + +## 6. 可观测与重试 + +- 指标:扫描次数、选出数量、成功投递数量、CAS 成功/失败数量、用时分位 +- 日志:每批开始/结束、首尾 ID、错误明细 +- 重试: + - 消息投递失败:不更改缓存状态,等待下次扫描重试 + - CAS 返回 -1(状态被他处更改):记录并跳过 + +—— + +## 7. 
本地验证 + +1) 准备 Redis 与 DB(见 receiver/README.md) + +2) 造数据:插入一条 `alert_issues.alert_state='Pending'` 且缓存中存在 `alert:issue:{id}` 的 JSON。 + +3) 启动任务:观察日志/指标。 + +4) 验证缓存: +```bash +redis-cli --raw GET alert:issue:<id> | jq +redis-cli --raw SMEMBERS alert:index:alert_state:InProcessing | head -n 20 +redis-cli --raw GET service_state:<service>:<version> | jq +redis-cli --raw SMEMBERS service_state:index:health:Warning | head -n 20 +``` + +5) 验证 channel:在消费端确认是否收到消息。 + +—— + +## 8. 配置汇总 + +``` +# 扫描任务 +HC_SCAN_INTERVAL=10s +HC_SCAN_BATCH=200 +HC_WORKERS=1 + +# 通道 +# 当前无需额外配置 +# 预留(未来切换到消息队列时启用): +# ALERT_QUEUE_KIND=redis_stream|kafka|nats +# ALERT_QUEUE_DSN=redis://localhost:6379/0 +# ALERT_QUEUE_TOPIC=alerts.pending +``` + +—— + + diff --git a/internal/alerting/service/healthcheck/scheduler.go b/internal/alerting/service/healthcheck/scheduler.go new file mode 100644 index 0000000..d137adf --- /dev/null +++ b/internal/alerting/service/healthcheck/scheduler.go @@ -0,0 +1,174 @@ +package healthcheck + +import ( + "context" + "encoding/json" + "os" + "strconv" + "time" + + adb "github.com/qiniu/zeroops/internal/alerting/database" + "github.com/redis/go-redis/v9" + "github.com/rs/zerolog/log" +) + +type Deps struct { + DB *adb.Database + Redis *redis.Client + AlertCh chan<- AlertMessage + Batch int + Interval time.Duration +} + +// NewRedisClientFromEnv constructs a redis client from env. 
+func NewRedisClientFromEnv() *redis.Client { + db, _ := strconv.Atoi(os.Getenv("REDIS_DB")) + return redis.NewClient(&redis.Options{ + Addr: os.Getenv("REDIS_ADDR"), + Password: os.Getenv("REDIS_PASSWORD"), + DB: db, + }) +} + +func StartScheduler(ctx context.Context, deps Deps) { + if deps.Interval <= 0 { + deps.Interval = 10 * time.Second + } + if deps.Batch <= 0 { + deps.Batch = 200 + } + t := time.NewTicker(deps.Interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if err := runOnce(ctx, deps.DB, deps.Redis, deps.AlertCh, deps.Batch); err != nil { + log.Error().Err(err).Msg("healthcheck runOnce failed") + } + } + } +} + +type pendingRow struct { + ID string + Level string + Title string + AlertSince time.Time + LabelsJSON string +} + +func runOnce(ctx context.Context, db *adb.Database, rdb *redis.Client, ch chan<- AlertMessage, batch int) error { + rows, err := queryPendingFromDB(ctx, db, batch) + if err != nil { + return err + } + for _, it := range rows { + labels := parseLabels(it.LabelsJSON) + svc := labels["service"] + ver := labels["service_version"] + // 1) publish to channel (non-blocking) + if ch != nil { + select { + case ch <- AlertMessage{ID: it.ID, Service: svc, Version: ver, Level: it.Level, Title: it.Title, AlertSince: it.AlertSince, Labels: labels}: + default: + // channel full, skip state change + continue + } + } + // 2) alert state CAS: Pending -> InProcessing + _ = alertStateCAS(ctx, rdb, it.ID, "Pending", "InProcessing") + // 3) service state CAS by derived level + if svc != "" { + target := deriveHealth(it.Level) + _ = serviceStateCAS(ctx, rdb, svc, ver, target) + } + } + return nil +} + +func queryPendingFromDB(ctx context.Context, db *adb.Database, limit int) ([]pendingRow, error) { + if db == nil { + return []pendingRow{}, nil + } + const q = `SELECT id, level, title, labels, alert_since +FROM alert_issues +WHERE alert_state = 'Pending' +ORDER BY alert_since ASC +LIMIT $1` + rows, err := 
db.QueryContext(ctx, q, limit) + if err != nil { + return nil, err + } + defer rows.Close() + out := make([]pendingRow, 0, limit) + for rows.Next() { + var it pendingRow + if err := rows.Scan(&it.ID, &it.Level, &it.Title, &it.LabelsJSON, &it.AlertSince); err != nil { + return nil, err + } + out = append(out, it) + } + return out, rows.Err() +} + +func alertStateCAS(ctx context.Context, rdb *redis.Client, id, expected, next string) error { + if rdb == nil { + return nil + } + key := "alert:issue:" + id + script := redis.NewScript(` +local v = redis.call('GET', KEYS[1]) +if not v then return 0 end +local obj = cjson.decode(v) +if obj.alertState ~= ARGV[1] then return -1 end +obj.alertState = ARGV[2] +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') +redis.call('SREM', KEYS[2], ARGV[3]) +redis.call('SADD', KEYS[3], ARGV[3]) +return 1 +`) + _, _ = script.Run(ctx, rdb, []string{key, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing"}, expected, next, id).Result() + return nil +} + +func serviceStateCAS(ctx context.Context, rdb *redis.Client, service, version, target string) error { + if rdb == nil { + return nil + } + key := "service_state:" + service + ":" + version + script := redis.NewScript(` +local v = redis.call('GET', KEYS[1]) +if not v then v = '{}'; end +local obj = cjson.decode(v) +obj.health_state = ARGV[2] +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') +if ARGV[2] ~= '' then redis.call('SADD', KEYS[3], KEYS[1]) end +return 1 +`) + _, _ = script.Run(ctx, rdb, []string{key, "", "service_state:index:health:" + target}, "", target, key).Result() + return nil +} + +// parseLabels supports either flat map {"k":"v"} or array [{"key":"k","value":"v"}] +func parseLabels(s string) map[string]string { + m := map[string]string{} + if s == "" { + return m + } + // try map form + if json.Unmarshal([]byte(s), &m) == nil && len(m) > 0 { + return m + } + // try array form + var arr []struct{ Key, Value string } + if 
json.Unmarshal([]byte(s), &arr) == nil { + out := make(map[string]string, len(arr)) + for _, kv := range arr { + out[kv.Key] = kv.Value + } + return out + } + return map[string]string{} +} diff --git a/internal/alerting/service/healthcheck/types.go b/internal/alerting/service/healthcheck/types.go new file mode 100644 index 0000000..766528a --- /dev/null +++ b/internal/alerting/service/healthcheck/types.go @@ -0,0 +1,28 @@ +package healthcheck + +import ( + "time" +) + +// AlertMessage is the payload sent to downstream processors. +type AlertMessage struct { + ID string `json:"id"` + Service string `json:"service"` + Version string `json:"version,omitempty"` + Level string `json:"level"` + Title string `json:"title"` + AlertSince time.Time `json:"alert_since"` + Labels map[string]string `json:"labels"` +} + +// deriveHealth maps alert level to service health state. +func deriveHealth(level string) string { + switch level { + case "P0": + return "Error" + case "P1", "P2": + return "Warning" + default: + return "Warning" + } +} diff --git a/internal/alerting/service/receiver/README.md b/internal/alerting/service/receiver/README.md new file mode 100644 index 0000000..edd7eff --- /dev/null +++ b/internal/alerting/service/receiver/README.md @@ -0,0 +1,523 @@ +🧭 端到端验证(Docker Postgres + Redis + 本服务) + +以下步骤演示从 Alertmanager Webhook 到数据库落库的完整链路验证: + +1) 启动 Postgres(Docker) + +```bash +docker run --name zeroops-pg \ + -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=zeroops \ + -p 5432:5432 -d postgres:16 +``` + +1b) 启动 Redis(Docker) + +```bash +docker run --name zeroops-redis -p 6379:6379 -d redis:7-alpine +``` + +2) 初始化告警相关表 +运行集成测试(需 Postgres 实例与 `-tags=integration`)可验证插入成功: +```bash +go test ./internal/alerting/service/receiver -tags=integration -run TestPgDAO_InsertAlertIssue -v +``` + +3) 配置环境变量并启动服务(另开一个 shell 后台运行) + +```bash +export DB_HOST=localhost DB_PORT=5432 DB_USER=postgres DB_PASSWORD=postgres DB_NAME=zeroops DB_SSLMODE=disable +export 
ALERT_WEBHOOK_BASIC_USER=alert ALERT_WEBHOOK_BASIC_PASS=REDACTED +export REDIS_ADDR=localhost:6379 REDIS_PASSWORD="" REDIS_DB=0 +nohup go run ./cmd/zeroops -- 1>/tmp/zeroops.out 2>&1 & +``` + +4) 用 curl 模拟 Alertmanager 发送 firing 事件 + +```bash +curl -u alert:REDACTED -H 'Content-Type: application/json' \ + -X POST http://localhost:8080/v1/integrations/alertmanager/webhook -d '{ + "receiver":"our-webhook", + "status":"firing", + "alerts":[{ + "status":"firing", + "labels":{"alertname":"HighRequestLatency","service":"serviceA","severity":"P1","idc":"yzh"}, + "annotations":{"summary":"p95 latency over threshold","description":"apitime p95 > 450ms"}, + "startsAt":"2025-05-05T11:00:00Z", + "endsAt":"0001-01-01T00:00:00Z", + "generatorURL":"http://prometheus/graph?g0.expr=...", + "fingerprint":"3b1b7f4e8f0e" + }], + "groupLabels":{"alertname":"HighRequestLatency"}, + "commonLabels":{"service":"serviceA","severity":"P1"}, + "version":"4" +}' +``` + +5) 在数据库中验证(应看到一行 Open/P1/Pending 且标题匹配的记录) + +```bash +docker exec -i zeroops-pg psql -U postgres -d zeroops -c \ + "SELECT id,state,level,alert_state,title,alert_since FROM alert_issues WHERE title='p95 latency over threshold' AND alert_since='2025-05-05 11:00:00'::timestamp;" +``` +```bash +# 更易读(格式化 JSON)labels +docker exec -i zeroops-pg psql -U postgres -d zeroops -c \ + "SELECT jsonb_pretty(labels::jsonb) AS label FROM alert_issues WHERE title='p95 latency over threshold' AND alert_since='2025-05-05 11:00:00'::timestamp;" + +``` + +6)(可选)运行带集成标签的最小 DAO 测试 + +```bash +DB_HOST=localhost DB_PORT=5432 DB_USER=postgres DB_PASSWORD=postgres DB_NAME=zeroops DB_SSLMODE=disable \ +go test ./internal/alerting/service/receiver -tags=integration -run TestPgDAO_InsertAlertIssue -v +``` + + +receiver/ — 从 Alertmanager Webhook 到 alert_issues 入库的实施计划 + +目标:当 Alertmanager 向本服务发起 POST JSON 时,第一次创建告警记录并落表 alert_issues,字段规则: + • state 默认 Open + • alertState 默认 Pending + • 其余字段按 webhook 请求体解析、校验后写入 + +本计划仅覆盖「首次创建」逻辑;resolved(恢复)更新逻辑可在后续补充(例如切换 
state=Closed、alertState=Restored)。 + +⸻ + +① 目录与文件准备 + +在 alerting/service/receiver/ 下新建如下文件(按模块职责划分): + +alerting/ +└─ service/ + └─ receiver/ + ├─ README.md # ← 就放本文档 + ├─ router.go # 注册路由:POST /v1/integrations/alertmanager/webhook + ├─ handler.go # HTTP 入口,接收与整体编排 + ├─ dto.go # 入参(Alertmanager Webhook)与内部 DTO 定义 + ├─ validator.go # 字段校验(必填/枚举/时间格式等) + ├─ mapper.go # 映射:AM payload → alert_issues 行记录 + ├─ dao.go # DB 访问(Insert/Query/事务/重试) + ├─ cache.go # Redis 客户端与写通缓存(Write-through) + ├─ idempotency.go # 幂等键生成与“已处理”快速判断(应用层) + └─ errors.go # 统一错误定义(参数错误/DB错误等) + +若你的 DB 连接封装在 alerting/database/,dao.go 里直接引入公用的 db 客户端即可。 + +⸻ + +② 路由与入口 + +router.go + +// package receiver +func RegisterReceiverRoutes(r *gin.Engine, h *Handler) { + r.POST("/v1/integrations/alertmanager/webhook", h.AlertmanagerWebhook) +} + +handler.go + +type Handler struct { + dao *DAO + cache *Cache // Redis 写通 +} + +func NewHandler(dao *DAO, cache *Cache) *Handler { return &Handler{dao: dao, cache: cache} } + +func (h *Handler) AlertmanagerWebhook(c *gin.Context) { + var req AMWebhook // dto.go 中定义的 Alertmanager 请求体结构 + if err := c.BindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"ok": false, "error": "invalid JSON"}) + return + } + + // 1) 基本字段校验 + if err := ValidateAMWebhook(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"ok": false, "error": err.Error()}) + return + } + + // 2) 仅处理 status == "firing" 的首次创建 + if strings.ToLower(req.Status) != "firing" { + c.JSON(http.StatusOK, gin.H{"ok": true, "msg": "ignored (not firing)"}) + return + } + + // 3) 对每条 alert 做落库(可能一批多个) + // 幂等键建议:fingerprint + startsAt(同一告警起始时间视为同一事件) + created := 0 + for _, a := range req.Alerts { + key := BuildIdempotencyKey(a) // idempotency.go + if AlreadySeen(key) { // 应用层短路(可选) + continue + } + + row, mapErr := MapToAlertIssueRow(&req, &a) // mapper.go → 组装 alert_issues 行 + if mapErr != nil { + // 单条失败不影响其它,记录日志即可 + continue + } + + // 4) 插入 DB(第一次创建强制 state=Open, alertState=Pending) + if 
err := h.dao.InsertAlertIssue(c, row); err != nil { + // 若唯一约束冲突/网络抖动等,记录后继续 + continue + } + // 5) 同步写入 service_states(health_state 由 level 推导;resolved_at 留空;alert_issue_ids 追加新 issue id) + // 规则:P0→Error;P1/P2→Warning;其他→Warning(可按需调整) + // service 从 labels.service 取;version 可从 labels.service_version 取(可空) + derived := func(l string) string { if l == "P0" { return "Error" }; if l == "P1" || l == "P2" { return "Warning" }; return "Warning" }(row.Level) + // report_at 此处暂不写,由 healthcheck 定时任务在 alert_issues 进入 InProcessing 后回填为最早的 alert_since + if err := h.dao.UpsertServiceState(c, a.Labels["service"], a.Labels["service_version"], nil, derived, row.ID); err != nil { + // 仅记录错误,不阻断主流程 + } + // 6) 写通到 Redis(不阻塞主流程,失败仅记录日志) + // alert_issues + if err := h.cache.WriteIssue(c, row, a); err != nil { + // 仅记录错误,避免影响 Alertmanager 重试逻辑 + } + // service_states(使用同样的推导态) + _ = h.cache.WriteServiceState(c, a.Labels["service"], a.Labels["service_version"], "", derived) + MarkSeen(key) // 记忆幂等键 + created++ + } + + c.JSON(http.StatusOK, gin.H{"ok": true, "created": created}) +} + + +⸻ + +③ 入参 DTO 与内部结构 + +dto.go(Alertmanager Webhook 载荷 + 内部插入结构) + +type KV map[string]string + +// 来自 Alertmanager 的单条告警 +type AMAlert struct { + Status string `json:"status"` // firing|resolved + Labels KV `json:"labels"` // 包含 alertname、service、severity 等 + Annotations KV `json:"annotations"` // 包含 summary/description 等 + StartsAt time.Time `json:"startsAt"` + EndsAt time.Time `json:"endsAt"` + GeneratorURL string `json:"generatorURL"` + Fingerprint string `json:"fingerprint"` // 用于幂等 +} + +// Webhook 根对象 +type AMWebhook struct { + Receiver string `json:"receiver"` + Status string `json:"status"` // firing|resolved + Alerts []AMAlert `json:"alerts"` + GroupLabels KV `json:"groupLabels"` + CommonLabels KV `json:"commonLabels"` + CommonAnnotations KV `json:"commonAnnotations"` + ExternalURL string `json:"externalURL"` + Version string `json:"version"` + GroupKey string `json:"groupKey"` +} + +// 准备插入 
alert_issues 的行(与表字段一一对应) +type AlertIssueRow struct { + ID string // uuid + State string // enum: Open/Closed (首次固定 Open) + Level string // varchar(32): P0/P1/P2/Warning + AlertState string // enum: Pending/InProcessing/Restored/AutoRestored(首次固定 Pending) + Title string // varchar(255) + LabelJSON json.RawMessage // json: 标准化后的 [{key,value}] + AlertSince time.Time // timestamp: 用 StartsAt +} + + +⸻ + +④ 字段校验(validator) + +validator.go + +func ValidateAMWebhook(w *AMWebhook) error { + if w == nil { return errors.New("nil payload") } + if len(w.Alerts) == 0 { return errors.New("alerts empty") } + // 可加大小限制:len(alerts) <= N;防巨量 payload + for i := range w.Alerts { + a := &w.Alerts[i] + if a.StartsAt.IsZero() { return fmt.Errorf("alerts[%d].startsAt empty", i) } + // 允许空 annotations.summary,但后续会用回退规则生成 title + if a.Status == "" { a.Status = "firing" } // 容错 + } + return nil +} + +var allowedLevels = map[string]bool{"P0":true,"P1":true,"P2":true,"Warning":true} + +func NormalizeLevel(sev string) string { + s := strings.ToUpper(strings.TrimSpace(sev)) + if allowedLevels[s] { return s } + // 若为空/不合法,可设置默认或交给 severity 模块再评估 + return "Warning" +} + + +⸻ + +⑤ 映射规则(mapper) + +目标:将 Alertmanager 的单条 AMAlert → AlertIssueRow。 + • id:uuid.NewString() + • state:Open(首次创建强制) + • alertState:Pending(首次创建强制) + • level:NormalizeLevel(alert.Labels["severity"]) + • title:优先 annotations.summary,否则拼:{idc} {service} {alertname} ... 
+ • label:把 labels 展平成 [{key,value}](额外加上一些关键来源信息:am_fingerprint、generatorURL、groupKey) + • alertSince:StartsAt(统一转 UTC) + +mapper.go + +func MapToAlertIssueRow(w *AMWebhook, a *AMAlert) (*AlertIssueRow, error) { + // 1) Title + title := strings.TrimSpace(a.Annotations["summary"]) + if title == "" { + // fallback:尽量信息量大且≤255 + title = fmt.Sprintf("%s %s %s", + a.Labels["idc"], a.Labels["service"], a.Labels["alertname"]) + title = strings.TrimSpace(title) + if title == "" { title = "Alert from Alertmanager" } + } + if len(title) > 255 { title = title[:255] } + + // 2) Level + level := NormalizeLevel(a.Labels["severity"]) + + // 3) Labels → []{key,value} + // 附加指纹等方便后续查询/对账 + flat := make([]map[string]string, 0, len(a.Labels)+3) + for k, v := range a.Labels { + flat = append(flat, map[string]string{"key": k, "value": v}) + } + if a.Fingerprint != "" { + flat = append(flat, map[string]string{"key": "am_fingerprint", "value": a.Fingerprint}) + } + if g := strings.TrimSpace(a.GeneratorURL); g != "" { + flat = append(flat, map[string]string{"key": "generatorURL", "value": g}) + } + if w.GroupKey != "" { + flat = append(flat, map[string]string{"key": "groupKey", "value": w.GroupKey}) + } + b, _ := json.Marshal(flat) + + // 4) Row + return &AlertIssueRow{ + ID: uuid.NewString(), + State: "Open", + AlertState: "Pending", + Level: level, + Title: title, + LabelJSON: b, + AlertSince: a.StartsAt.UTC(), // 建议统一 UTC + }, nil +} + + +⸻ + +⑥ 幂等(idempotency) + +虽然本步骤主要描述“首次创建”,但为了避免重复插入,建议引入应用层幂等(无须改表结构): + +idempotency.go + +func BuildIdempotencyKey(a AMAlert) string { + return a.Fingerprint + "|" + a.StartsAt.UTC().Format(time.RFC3339Nano) +} + +// 可以用内存 LRU/Redis;或入库前先按 (am_fingerprint + startsAt) 查询是否存在 +func AlreadySeen(key string) bool { /* TODO */ return false } +func MarkSeen(key string) { /* TODO */ } + +若后续允许调整表结构,可把 am_fingerprint 单列化并与 alertSince 组成唯一索引,幂等更稳。 + +⸻ + +⑦ 数据访问(DAO) + +dao.go(示例使用 pgx / database/sql,重点是参数化与事务) + +type DAO struct{ DB *pgxpool.Pool } + +func 
(d *DAO) InsertAlertIssue(ctx context.Context, r *AlertIssueRow) error { + const q = ` + INSERT INTO alert_issues + (id, state, level, alert_state, title, labels, alert_since) + VALUES + ($1, $2, $3, $4, $5, $6, $7) + ` + _, err := d.DB.Exec(ctx, q, + r.ID, r.State, r.Level, r.AlertState, r.Title, r.LabelJSON, r.AlertSince) + return err +} + +注意: + • label 列类型为 json(建议实际使用 jsonb),此处用 json.RawMessage 参数化写入即可。 + • 使用 Exec/Prepare 都可,确保不拼接字符串,防注入。 + • 生产建议增加:重试策略、插入耗时监控、错误分级(唯一冲突 vs 网络抖动)。 + +⸻ + +⑧ Redis 缓存写通(Write-through)与分布式幂等 + +目标:在成功写入 PostgreSQL 后,将关键数据写入 Redis,既为前端查询提供加速缓存,也为后续定时任务提供快速读取能力;同时用 Redis 提供跨实例幂等控制。 + +依赖: + +```bash +go get github.com/redis/go-redis/v9 +``` + +配置(环境变量): + +``` +REDIS_ADDR=localhost:6379 +REDIS_PASSWORD="" +REDIS_DB=0 +``` + +key 设计与 TTL: + +- alert:issue:{id} → JSON(AlertIssueRow + 补充字段),TTL 3d +- alert:idemp:{fingerprint}|{startsAtRFC3339Nano} → "1",TTL 10m(用于分布式幂等 SETNX) +- alert:index:open → Set(issues...),无 TTL(恢复时再移除) +- alert:index:svc:{service}:open → Set(issues...),无 TTL +// service_states 缓存 +- service_state:{service}:{version} → JSON(service/version/report_at/health_state),TTL 3d +- service_state:index:service:{service} → Set(keys) +- service_state:index:health:{health_state} → Set(keys) + +cache.go(示例): + +```go +type Cache struct{ R *redis.Client } + +func NewCacheFromEnv() *Cache { + db, _ := strconv.Atoi(os.Getenv("REDIS_DB")) + c := redis.NewClient(&redis.Options{Addr: os.Getenv("REDIS_ADDR"), Password: os.Getenv("REDIS_PASSWORD"), DB: db}) + return &Cache{R: c} +} + +// 写通:issue 主键对象 + 索引集合 +func (c *Cache) WriteIssue(ctx context.Context, r *AlertIssueRow, a AMAlert) error { + if c == nil || c.R == nil { return nil } + key := "alert:issue:" + r.ID + payload := map[string]any{ + "id": r.ID, "state": r.State, "level": r.Level, "alertState": r.AlertState, + "title": r.Title, "labels": json.RawMessage(r.LabelJSON), "alertSince": r.AlertSince, + "fingerprint": a.Fingerprint, "service": a.Labels["service"], "alertname": 
a.Labels["alertname"], + } + b, _ := json.Marshal(payload) + svc := strings.TrimSpace(a.Labels["service"]) + pipe := c.R.Pipeline() + pipe.Set(ctx, key, b, 72*time.Hour) + pipe.SAdd(ctx, "alert:index:open", r.ID) + if svc != "" { + pipe.SAdd(ctx, "alert:index:svc:"+svc+":open", r.ID) + } + _, err := pipe.Exec(ctx) + return err +} + +// 分布式幂等:SETNX + TTL +func (c *Cache) TryMarkIdempotent(ctx context.Context, a AMAlert) (bool, error) { + if c == nil || c.R == nil { return true, nil } + k := "alert:idemp:" + a.Fingerprint + "|" + a.StartsAt.UTC().Format(time.RFC3339Nano) + ok, err := c.R.SetNX(ctx, k, "1", 10*time.Minute).Result() + return ok, err +} +``` + +在 handler 中接入(伪码): + +```go +// 幂等短路(跨实例) +if ok, _ := h.cache.TryMarkIdempotent(c, a); !ok { + continue +} +// DB 成功后写通 Redis +_ = h.cache.WriteIssue(c, row, a) +``` + +失败处理:Redis 失败不影响 HTTP 主流程(Alertmanager 侧重试依赖 2xx),但需要日志打点与告警;后续可在定时任务做补偿(扫描最近 N 分钟的 DB 记录回填 Redis)。 + +快速验证: + +```bash +# 触发一次 webhook 后在 Redis 查看 +redis-cli --raw keys 'alert:*' +redis-cli --raw get alert:issue: +redis-cli --raw smembers alert:index:open | head -n 10 +redis-cli ttl alert:issue: +redis-cli --raw keys 'service_state:*' +redis-cli --raw get service_state:serviceA:v1.3.7 +redis-cli --raw smembers service_state:index:health:Error +``` + +⸻ + +⑨ 成功/失败返回与日志 + • 返回:统一 200 {"ok": true, "created": },即使个别记录失败也快速返回,避免 Alertmanager 阻塞重试。 + • 日志:按 alertname/service/severity/fingerprint 打点;错误包含 SQLSTATE/堆栈;统计接收/解析/插入耗时分位。 + +⸻ + +⑩ 最小联调(人工模拟) + +firing 模拟: + +curl -X POST http://localhost:8080/v1/integrations/alertmanager/webhook \ + -H 'Content-Type: application/json' \ + -d '{ + "receiver":"our-webhook", + "status":"firing", + "alerts":[ + { + "status":"firing", + "labels":{ + "alertname":"HighRequestLatency", + "service":"serviceA", + "severity":"P1", + "idc":"yzh", + "service_version": "v1.3.7" + }, + "annotations":{"summary":"p95 latency over threshold","description":"apitime p95 > 450ms"}, + "startsAt":"2025-05-05T11:00:00Z", + 
"endsAt":"0001-01-01T00:00:00Z", + "generatorURL":"http://prometheus/graph?g0.expr=...", + "fingerprint":"3b1b7f4e8f0e" + } + ], + "groupLabels":{"alertname":"HighRequestLatency"}, + "commonLabels":{"service":"serviceA","severity":"P1"}, + "version":"4" + }' + +入库后,alert_issues 里应看到: + • state=Open + • alertState=Pending + • level=P1 + • title="p95 latency over threshold" + • label 中包含 am_fingerprint/generatorURL/groupKey/... + • alertSince=2025-05-05 11:00:00+00 + +同时,service_states 里应看到/更新(按 service+version): + • service=serviceA + • version=(若 labels 中有 service_version 则为其值,否则为空字符串) + • report_at=与 alert_since 一致(若已存在则保留更早的 report_at) + • health_state=Warning(因本示例 level=P1) + • alert_issue_ids 包含刚插入的 alert_issues.id + +Redis 中应看到: + • key: alert:issue: 值为 JSON 且 TTL≈3 天 + • 集合 alert:index:open 中包含 + • 若有 service=serviceA,则 alert:index:svc:serviceA:open 包含 + +⸻ \ No newline at end of file diff --git a/internal/alerting/service/receiver/auth.go b/internal/alerting/service/receiver/auth.go new file mode 100644 index 0000000..e9f03d9 --- /dev/null +++ b/internal/alerting/service/receiver/auth.go @@ -0,0 +1,42 @@ +package receiver + +import ( + "net/http" + "os" + + "github.com/fox-gonic/fox" +) + +func authEnabled() bool { + return os.Getenv("ALERT_WEBHOOK_BASIC_USER") != "" || + os.Getenv("ALERT_WEBHOOK_BASIC_PASS") != "" || + os.Getenv("ALERT_WEBHOOK_BEARER") != "" +} + +// AuthMiddleware returns false if unauthorized and writes a 401 response. 
+func AuthMiddleware(c *fox.Context) bool { + if !authEnabled() { + return true + } + + user := os.Getenv("ALERT_WEBHOOK_BASIC_USER") + pass := os.Getenv("ALERT_WEBHOOK_BASIC_PASS") + bearer := os.Getenv("ALERT_WEBHOOK_BEARER") + + if user != "" || pass != "" { + u, p, ok := c.Request.BasicAuth() + if !ok || u != user || p != pass { + c.JSON(http.StatusUnauthorized, map[string]any{"ok": false, "error": "unauthorized"}) + return false + } + return true + } + + if bearer != "" { + if c.GetHeader("Authorization") != "Bearer "+bearer { + c.JSON(http.StatusUnauthorized, map[string]any{"ok": false, "error": "unauthorized"}) + return false + } + } + return true +} diff --git a/internal/alerting/service/receiver/cache.go b/internal/alerting/service/receiver/cache.go new file mode 100644 index 0000000..ee088ab --- /dev/null +++ b/internal/alerting/service/receiver/cache.go @@ -0,0 +1,120 @@ +package receiver + +import ( + "context" + "encoding/json" + "os" + "strconv" + "strings" + "time" + + "github.com/redis/go-redis/v9" +) + +// AlertIssueCache defines the minimal cache contract used by the handler. +type AlertIssueCache interface { + WriteIssue(ctx context.Context, r *AlertIssueRow, a AMAlert) error + TryMarkIdempotent(ctx context.Context, a AMAlert) (bool, error) + WriteServiceState(ctx context.Context, service, version string, reportAt time.Time, healthState string) error +} + +// NoopCache is a no-op implementation of AlertIssueCache. +type NoopCache struct{} + +func (NoopCache) WriteIssue(ctx context.Context, r *AlertIssueRow, a AMAlert) error { return nil } +func (NoopCache) TryMarkIdempotent(ctx context.Context, a AMAlert) (bool, error) { return true, nil } +func (NoopCache) WriteServiceState(ctx context.Context, service, version string, reportAt time.Time, healthState string) error { + return nil +} + +// Cache implements AlertIssueCache using Redis. +type Cache struct{ R *redis.Client } + +// NewCacheFromEnv constructs a Redis client using environment variables. 
+// REDIS_ADDR, REDIS_PASSWORD, REDIS_DB +func NewCacheFromEnv() *Cache { + db, _ := strconv.Atoi(os.Getenv("REDIS_DB")) + addr := os.Getenv("REDIS_ADDR") + if strings.TrimSpace(addr) == "" { + addr = "localhost:6379" + } + c := redis.NewClient(&redis.Options{ + Addr: addr, + Password: os.Getenv("REDIS_PASSWORD"), + DB: db, + }) + return &Cache{R: c} +} + +// WriteIssue writes the alert issue into Redis as a JSON blob and updates a few indices. +// Best-effort: failure should not block the main flow. +func (c *Cache) WriteIssue(ctx context.Context, r *AlertIssueRow, a AMAlert) error { + if c == nil || c.R == nil { + return nil + } + key := "alert:issue:" + r.ID + payload := map[string]any{ + "id": r.ID, + "state": r.State, + "level": r.Level, + "alertState": r.AlertState, + "title": r.Title, + "labels": json.RawMessage(r.LabelJSON), + "alertSince": r.AlertSince, + "fingerprint": a.Fingerprint, + "service": a.Labels["service"], + "alertname": a.Labels["alertname"], + } + b, _ := json.Marshal(payload) + svc := strings.TrimSpace(a.Labels["service"]) + pipe := c.R.Pipeline() + pipe.Set(ctx, key, b, 72*time.Hour) + pipe.SAdd(ctx, "alert:index:open", r.ID) + if svc != "" { + pipe.SAdd(ctx, "alert:index:svc:"+svc+":open", r.ID) + } + _, err := pipe.Exec(ctx) + return err +} + +// TryMarkIdempotent marks an alert event as processed using Redis SETNX + TTL. +// Returns false if the key already exists (duplicate). +func (c *Cache) TryMarkIdempotent(ctx context.Context, a AMAlert) (bool, error) { + if c == nil || c.R == nil { + return true, nil + } + k := "alert:idemp:" + a.Fingerprint + "|" + a.StartsAt.UTC().Format(time.RFC3339Nano) + ok, err := c.R.SetNX(ctx, k, "1", 10*time.Minute).Result() + if err != nil { + // Best-effort: treat Redis errors as non-blocking and allow processing + return true, nil + } + return ok, nil +} + +// WriteServiceState writes the service state snapshot into Redis and maintains simple indices. 
+func (c *Cache) WriteServiceState(ctx context.Context, service, version string, reportAt time.Time, healthState string) error { + if c == nil || c.R == nil { + return nil + } + s := strings.TrimSpace(service) + v := strings.TrimSpace(version) + key := "service_state:" + s + ":" + v + payload := map[string]any{ + "service": s, + "version": v, + "report_at": reportAt, + "health_state": healthState, + } + b, _ := json.Marshal(payload) + pipe := c.R.Pipeline() + pipe.Set(ctx, key, b, 72*time.Hour) + if s != "" { + pipe.SAdd(ctx, "service_state:index:service:"+s, key) + } + if healthState != "" { + pipe.SAdd(ctx, "service_state:index:health:"+healthState, key) + } + _, err := pipe.Exec(ctx) + return err +} diff --git a/internal/alerting/service/receiver/change_record.md b/internal/alerting/service/receiver/change_record.md new file mode 100644 index 0000000..8849062 --- /dev/null +++ b/internal/alerting/service/receiver/change_record.md @@ -0,0 +1,43 @@ +# receiver 变更记录(与数据库/API 文档同步) + +时间:2025-09-12 + +## 变更摘要 + +- 对接最新 `docs/alerting/database-design.md` 与 `docs/alerting/api.md`,统一列命名与时间精度。 +- 更新 DAO 插入语句、测试建表语句与 README 中的 SQL 片段。 + +## 具体修改 + +1) `dao.go` +- 将 `INSERT INTO alert_issues` 的列名由旧版驼峰改为蛇形: + - `alertState` → `alert_state` + - `label` → `labels` + - `alertSince` → `alert_since` + +2) `dao_integration_test.go` +- 初始化表结构同步到最新设计: + - `id varchar(64)`(原为 varchar(255)) + - 列名改为 `alert_state`、`labels`、`alert_since`;`alert_since` 使用 `timestamp(6)`。 + - 索引列同步:`(state, level, alert_since)` 与 `(alert_state, alert_since)`。 + +3) `README.md` +- 所有 SQL 示例与查询示例同步上述列名与类型调整,避免误导联调。 +- 注明查询样例中选择列为 `alert_state` 与 `alert_since`。 + +## 变更原因 + +- 数据库设计文件已经更新为 7 张表版本,并统一命名规范为 snake_case;为避免字段不匹配导致插入失败,代码与文档需保持一致。 +- API 文档已统一时间格式为 ISO 8601,数据库侧采用 `timestamp(6)` 存储精度,与接收端 `time.Time` 保持纳秒到毫秒的合理折衷。 + +## 验证 + +- 运行集成测试(需 Postgres 实例与 `-tags=integration`)可验证插入成功: + - `go test ./internal/alerting/service/receiver -tags=integration -run TestPgDAO_InsertAlertIssue -v` +- 按 
`README.md` 的 Docker 步骤进行端到端验证,可看到 `alert_issues` 正确落库且列名匹配。 + +## 兼容性 + +- 若已有旧表结构,需要执行迁移(ALTER TABLE 重命名列,或重建表)。本次改动不改变业务语义,仅为命名与精度统一。 + + diff --git a/internal/alerting/service/receiver/dao.go b/internal/alerting/service/receiver/dao.go new file mode 100644 index 0000000..c848321 --- /dev/null +++ b/internal/alerting/service/receiver/dao.go @@ -0,0 +1,70 @@ +package receiver + +import ( + "context" + "fmt" + "time" + + adb "github.com/qiniu/zeroops/internal/alerting/database" +) + +type AlertIssueDAO interface { + InsertAlertIssue(ctx context.Context, r *AlertIssueRow) error +} + +// ServiceStateWriter optionally allows writing to service_states table. +type ServiceStateWriter interface { + UpsertServiceState(ctx context.Context, service, version string, reportAt *time.Time, healthState string, issueID string) error +} + +type NoopDAO struct{} + +func NewNoopDAO() *NoopDAO { return &NoopDAO{} } + +func (d *NoopDAO) InsertAlertIssue(ctx context.Context, r *AlertIssueRow) error { return nil } + +func (d *NoopDAO) UpsertServiceState(ctx context.Context, service, version string, reportAt *time.Time, healthState string, issueID string) error { + return nil +} + +type PgDAO struct{ DB *adb.Database } + +func NewPgDAO(db *adb.Database) *PgDAO { return &PgDAO{DB: db} } + +func (d *PgDAO) InsertAlertIssue(ctx context.Context, r *AlertIssueRow) error { + const q = ` + INSERT INTO alert_issues + (id, state, level, alert_state, title, labels, alert_since) + VALUES + ($1, $2, $3, $4, $5, $6, $7) + ` + if _, err := d.DB.ExecContext(ctx, q, r.ID, r.State, r.Level, r.AlertState, r.Title, r.LabelJSON, r.AlertSince); err != nil { + return fmt.Errorf("insert alert_issue: %w", err) + } + return nil +} + +// UpsertServiceState inserts or updates service_states with health_state and alert_issue_ids. +// report_at is not updated here except at insert-time if provided (may be NULL). 
+func (d *PgDAO) UpsertServiceState(ctx context.Context, service, version string, reportAt *time.Time, healthState string, issueID string) error { + const q = ` + INSERT INTO service_states (service, version, report_at, health_state, alert_issue_ids) + VALUES ($1, $2, $3, $4, ARRAY[$5]::text[]) + ON CONFLICT (service, version) DO UPDATE + SET health_state = EXCLUDED.health_state, + alert_issue_ids = CASE + WHEN NOT ($5 = ANY(service_states.alert_issue_ids)) THEN array_append(service_states.alert_issue_ids, $5) + ELSE service_states.alert_issue_ids + END + ` + var reportAtVal any + if reportAt != nil { + reportAtVal = *reportAt + } else { + reportAtVal = nil + } + if _, err := d.DB.ExecContext(ctx, q, service, version, reportAtVal, healthState, issueID); err != nil { + return fmt.Errorf("upsert service_state: %w", err) + } + return nil +} diff --git a/internal/alerting/service/receiver/dao_integration_test.go b/internal/alerting/service/receiver/dao_integration_test.go new file mode 100644 index 0000000..f8daaf7 --- /dev/null +++ b/internal/alerting/service/receiver/dao_integration_test.go @@ -0,0 +1,64 @@ +//go:build integration + +package receiver + +import ( + "context" + "os" + "testing" + "time" + + "github.com/google/uuid" + adb "github.com/qiniu/zeroops/internal/alerting/database" +) + +func ensureSchema(t *testing.T, db *adb.Database) { + t.Helper() + const schema = ` +CREATE TABLE IF NOT EXISTS alert_issues ( + id varchar(64) PRIMARY KEY, + state varchar(16) NOT NULL, + level varchar(32) NOT NULL, + alert_state varchar(32) NOT NULL, + title varchar(255) NOT NULL, + labels json NOT NULL, + alert_since timestamp(6) NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_alert_issues_state_level_since ON alert_issues(state, level, alert_since); +CREATE INDEX IF NOT EXISTS idx_alert_issues_alertstate_since ON alert_issues(alert_state, alert_since); +` + if _, err := db.ExecContext(context.Background(), schema); err != nil { + t.Fatalf("init schema: %v", err) + } +} + 
+func TestPgDAO_InsertAlertIssue(t *testing.T) { + dsn := "host=" + os.Getenv("DB_HOST") + + " port=" + os.Getenv("DB_PORT") + + " user=" + os.Getenv("DB_USER") + + " password=" + os.Getenv("DB_PASSWORD") + + " dbname=" + os.Getenv("DB_NAME") + + " sslmode=" + os.Getenv("DB_SSLMODE") + + db, err := adb.New(dsn) + if err != nil { + t.Fatalf("db connect: %v", err) + } + defer db.Close() + + ensureSchema(t, db) + + dao := NewPgDAO(db) + row := &AlertIssueRow{ + ID: uuid.NewString(), + State: "Open", + Level: "P1", + AlertState: "Pending", + Title: "integration insert", + LabelJSON: []byte(`[{"key":"k","value":"v"}]`), + AlertSince: time.Now().UTC(), + } + if err := dao.InsertAlertIssue(context.Background(), row); err != nil { + t.Fatalf("insert: %v", err) + } +} diff --git a/internal/alerting/service/receiver/dto.go b/internal/alerting/service/receiver/dto.go new file mode 100644 index 0000000..48539a9 --- /dev/null +++ b/internal/alerting/service/receiver/dto.go @@ -0,0 +1,43 @@ +package receiver + +import ( + "encoding/json" + "time" +) + +type KV map[string]string + +// AMAlert represents a single alert from Alertmanager +type AMAlert struct { + Status string `json:"status"` + Labels KV `json:"labels"` + Annotations KV `json:"annotations"` + StartsAt time.Time `json:"startsAt"` + EndsAt time.Time `json:"endsAt"` + GeneratorURL string `json:"generatorURL"` + Fingerprint string `json:"fingerprint"` +} + +// AMWebhook is the root webhook payload from Alertmanager +type AMWebhook struct { + Receiver string `json:"receiver"` + Status string `json:"status"` + Alerts []AMAlert `json:"alerts"` + GroupLabels KV `json:"groupLabels"` + CommonLabels KV `json:"commonLabels"` + CommonAnnotations KV `json:"commonAnnotations"` + ExternalURL string `json:"externalURL"` + Version string `json:"version"` + GroupKey string `json:"groupKey"` +} + +// AlertIssueRow represents the row to insert into alert_issues +type AlertIssueRow struct { + ID string + State string + Level string + 
AlertState string + Title string + LabelJSON json.RawMessage + AlertSince time.Time +} diff --git a/internal/alerting/service/receiver/errors.go b/internal/alerting/service/receiver/errors.go new file mode 100644 index 0000000..d77bda3 --- /dev/null +++ b/internal/alerting/service/receiver/errors.go @@ -0,0 +1,7 @@ +package receiver + +import "errors" + +var ( + ErrInvalidPayload = errors.New("invalid payload") +) diff --git a/internal/alerting/service/receiver/handler.go b/internal/alerting/service/receiver/handler.go new file mode 100644 index 0000000..c69d924 --- /dev/null +++ b/internal/alerting/service/receiver/handler.go @@ -0,0 +1,86 @@ +package receiver + +import ( + "net/http" + "strings" + "time" + + "github.com/fox-gonic/fox" +) + +type Handler struct { + dao AlertIssueDAO + cache AlertIssueCache +} + +// NewHandler keeps backward compatibility and uses a NoopCache by default. +func NewHandler(dao AlertIssueDAO) *Handler { return &Handler{dao: dao, cache: NoopCache{}} } + +// NewHandlerWithCache allows injecting a real cache implementation. +func NewHandlerWithCache(dao AlertIssueDAO, cache AlertIssueCache) *Handler { + if cache == nil { + cache = NoopCache{} + } + return &Handler{dao: dao, cache: cache} +} + +func (h *Handler) AlertmanagerWebhook(c *fox.Context) { + if !AuthMiddleware(c) { + return + } + var req AMWebhook + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, map[string]any{"ok": false, "error": "invalid JSON"}) + return + } + + if err := ValidateAMWebhook(&req); err != nil { + c.JSON(http.StatusBadRequest, map[string]any{"ok": false, "error": err.Error()}) + return + } + + if strings.ToLower(req.Status) != "firing" { + c.JSON(http.StatusOK, map[string]any{"ok": true, "msg": "ignored (not firing)"}) + return + } + + created := 0 + for _, a := range req.Alerts { + key := BuildIdempotencyKey(a) + // Distributed idempotency (best-effort). If key exists, skip. 
+ if ok, _ := h.cache.TryMarkIdempotent(c.Request.Context(), a); !ok { + continue + } + if AlreadySeen(key) { + continue + } + row, err := MapToAlertIssueRow(&req, &a) + if err != nil { + continue + } + if err := h.dao.InsertAlertIssue(c.Request.Context(), row); err != nil { + continue + } + + if w, ok := h.dao.(ServiceStateWriter); ok { + service := strings.TrimSpace(a.Labels["service"]) + version := strings.TrimSpace(a.Labels["service_version"]) // optional + if service != "" { + derived := "Warning" + if row.Level == "P0" { + derived = "Error" + } else if row.Level == "P1" || row.Level == "P2" { + derived = "Warning" + } + _ = w.UpsertServiceState(c.Request.Context(), service, version, nil, derived, row.ID) + _ = h.cache.WriteServiceState(c.Request.Context(), service, version, time.Time{}, derived) + } + } + // Write-through to cache. Errors are ignored to avoid impacting webhook ack. + _ = h.cache.WriteIssue(c.Request.Context(), row, a) + MarkSeen(key) + created++ + } + + c.JSON(http.StatusOK, map[string]any{"ok": true, "created": created}) +} diff --git a/internal/alerting/service/receiver/handler_test.go b/internal/alerting/service/receiver/handler_test.go new file mode 100644 index 0000000..74ef05e --- /dev/null +++ b/internal/alerting/service/receiver/handler_test.go @@ -0,0 +1,37 @@ +package receiver + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/fox-gonic/fox" +) + +type mockDAO struct{ calls int } + +func (m *mockDAO) InsertAlertIssue(_ context.Context, _ *AlertIssueRow) error { m.calls++; return nil } + +func TestHandlerCreatesIssues(t *testing.T) { + r := fox.New() + m := &mockDAO{} + h := NewHandler(m) + RegisterReceiverRoutes(r, h) + + payload := AMWebhook{ + Status: "firing", + Alerts: []AMAlert{{Status: "firing", StartsAt: time.Now()}}, + } + b, _ := json.Marshal(payload) + req := httptest.NewRequest(http.MethodPost, "/v1/integrations/alertmanager/webhook", 
bytes.NewReader(b)) + req.Header.Set("Content-Type", "application/json") + resp := httptest.NewRecorder() + r.ServeHTTP(resp, req) + if resp.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.Code) + } +} diff --git a/internal/alerting/service/receiver/idempotency.go b/internal/alerting/service/receiver/idempotency.go new file mode 100644 index 0000000..1f6eed3 --- /dev/null +++ b/internal/alerting/service/receiver/idempotency.go @@ -0,0 +1,33 @@ +package receiver + +import ( + "sync" + "time" +) + +var ( + idempMu sync.Mutex + idempMap = make(map[string]time.Time) +) + +func BuildIdempotencyKey(a AMAlert) string { + return a.Fingerprint + "|" + a.StartsAt.UTC().Format(time.RFC3339Nano) +} + +func AlreadySeen(key string) bool { + idempMu.Lock() + defer idempMu.Unlock() + if t, ok := idempMap[key]; ok { + if time.Since(t) < 30*time.Minute { + return true + } + delete(idempMap, key) + } + return false +} + +func MarkSeen(key string) { + idempMu.Lock() + defer idempMu.Unlock() + idempMap[key] = time.Now() +} diff --git a/internal/alerting/service/receiver/idempotency_test.go b/internal/alerting/service/receiver/idempotency_test.go new file mode 100644 index 0000000..2541aeb --- /dev/null +++ b/internal/alerting/service/receiver/idempotency_test.go @@ -0,0 +1,25 @@ +package receiver + +import ( + "testing" + "time" +) + +func TestBuildIdempotencyKey(t *testing.T) { + a := AMAlert{Fingerprint: "fp", StartsAt: time.Unix(0, 123).UTC()} + key := BuildIdempotencyKey(a) + if key == "" || key[:2] != "fp" { + t.Fatalf("unexpected key: %s", key) + } +} + +func TestAlreadySeenAndMarkSeen(t *testing.T) { + key := "k|t" + if AlreadySeen(key) { + t.Fatal("should not be seen initially") + } + MarkSeen(key) + if !AlreadySeen(key) { + t.Fatal("should be seen after MarkSeen") + } +} diff --git a/internal/alerting/service/receiver/mapper.go b/internal/alerting/service/receiver/mapper.go new file mode 100644 index 0000000..3a8db6c --- /dev/null +++ 
b/internal/alerting/service/receiver/mapper.go @@ -0,0 +1,51 @@ +package receiver + +import ( + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/google/uuid" +) + +func MapToAlertIssueRow(w *AMWebhook, a *AMAlert) (*AlertIssueRow, error) { + // Title + title := strings.TrimSpace(a.Annotations["summary"]) + if title == "" { + title = strings.TrimSpace(fmt.Sprintf("%s %s %s", a.Labels["idc"], a.Labels["service"], a.Labels["alertname"])) + if title == "" { + title = "Alert from Alertmanager" + } + } + if len(title) > 255 { + title = title[:255] + } + + level := NormalizeLevel(a.Labels["severity"]) + + flat := make([]map[string]string, 0, len(a.Labels)+3) + for k, v := range a.Labels { + flat = append(flat, map[string]string{"key": k, "value": v}) + } + if a.Fingerprint != "" { + flat = append(flat, map[string]string{"key": "am_fingerprint", "value": a.Fingerprint}) + } + if g := strings.TrimSpace(a.GeneratorURL); g != "" { + flat = append(flat, map[string]string{"key": "generatorURL", "value": g}) + } + if w.GroupKey != "" { + flat = append(flat, map[string]string{"key": "groupKey", "value": w.GroupKey}) + } + b, _ := json.Marshal(flat) + + return &AlertIssueRow{ + ID: uuid.NewString(), + State: "Open", + AlertState: "Pending", + Level: level, + Title: title, + LabelJSON: b, + AlertSince: a.StartsAt.UTC().Truncate(time.Second), + }, nil +} diff --git a/internal/alerting/service/receiver/mapper_test.go b/internal/alerting/service/receiver/mapper_test.go new file mode 100644 index 0000000..7b27045 --- /dev/null +++ b/internal/alerting/service/receiver/mapper_test.go @@ -0,0 +1,40 @@ +package receiver + +import ( + "encoding/json" + "testing" + "time" +) + +func TestMapToAlertIssueRow(t *testing.T) { + w := &AMWebhook{GroupKey: "gk"} + a := &AMAlert{ + Annotations: KV{"summary": "a very very very long title that should be accepted"}, + Labels: KV{"severity": "P1", "alertname": "X", "service": "svc", "idc": "idc"}, + StartsAt: time.Now(), + Fingerprint: "fp", 
+ GeneratorURL: "http://gen", + } + row, err := MapToAlertIssueRow(w, a) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if row.State != "Open" || row.AlertState != "Pending" { + t.Fatal("unexpected state mapping") + } + var flat []map[string]string + if err := json.Unmarshal(row.LabelJSON, &flat); err != nil { + t.Fatalf("invalid label json: %v", err) + } + // ensure am_fingerprint present + found := false + for _, kv := range flat { + if kv["key"] == "am_fingerprint" { + found = true + break + } + } + if !found { + t.Fatal("am_fingerprint not found in label json") + } +} diff --git a/internal/alerting/service/receiver/router.go b/internal/alerting/service/receiver/router.go new file mode 100644 index 0000000..a668ba5 --- /dev/null +++ b/internal/alerting/service/receiver/router.go @@ -0,0 +1,7 @@ +package receiver + +import "github.com/fox-gonic/fox" + +func RegisterReceiverRoutes(r *fox.Engine, h *Handler) { + r.POST("/v1/integrations/alertmanager/webhook", h.AlertmanagerWebhook) +} diff --git a/internal/alerting/service/receiver/validator.go b/internal/alerting/service/receiver/validator.go new file mode 100644 index 0000000..e3c6fed --- /dev/null +++ b/internal/alerting/service/receiver/validator.go @@ -0,0 +1,36 @@ +package receiver + +import ( + "errors" + "fmt" + "strings" +) + +func ValidateAMWebhook(w *AMWebhook) error { + if w == nil { + return ErrInvalidPayload + } + if len(w.Alerts) == 0 { + return errors.New("alerts empty") + } + for i := range w.Alerts { + a := &w.Alerts[i] + if a.StartsAt.IsZero() { + return fmt.Errorf("alerts[%d].startsAt empty", i) + } + if a.Status == "" { + a.Status = "firing" + } + } + return nil +} + +var allowedLevels = map[string]bool{"P0": true, "P1": true, "P2": true, "WARNING": true} + +func NormalizeLevel(sev string) string { + s := strings.ToUpper(strings.TrimSpace(sev)) + if allowedLevels[s] { + return s + } + return "Warning" +} diff --git a/internal/alerting/service/receiver/validator_test.go 
b/internal/alerting/service/receiver/validator_test.go new file mode 100644 index 0000000..0a4b8ae --- /dev/null +++ b/internal/alerting/service/receiver/validator_test.go @@ -0,0 +1,28 @@ +package receiver + +import ( + "testing" + "time" +) + +func TestValidateAMWebhook(t *testing.T) { + if err := ValidateAMWebhook(&AMWebhook{}); err == nil { + t.Fatal("expected error for empty alerts") + } + + w := &AMWebhook{Alerts: []AMAlert{{}}} + if err := ValidateAMWebhook(w); err == nil { + t.Fatal("expected error for empty startsAt") + } + + w = &AMWebhook{Alerts: []AMAlert{{StartsAt: time.Now()}}} + if err := ValidateAMWebhook(w); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestNormalizeLevel(t *testing.T) { + if NormalizeLevel("p1") != "P1" && NormalizeLevel("p1") != "Warning" { + t.Fatal("unexpected level normalization") + } +} diff --git a/internal/alerting/service/remediation/README.md b/internal/alerting/service/remediation/README.md new file mode 100644 index 0000000..de5ca08 --- /dev/null +++ b/internal/alerting/service/remediation/README.md @@ -0,0 +1,207 @@ +# remediation — 通道消费与自动回滚(Mock) + +本包规划一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,模拟执行“自动回滚”,回滚成功后将相关告警与服务态标记为恢复。 + +—— + +## 1. 目标 + +- 订阅 `healthcheck` 的 `AlertMessage`(进程内 channel) +- 对每条消息: + 1) Mock 调用回滚接口 `POST /v1/deployments/:deployID/rollback` + 2) `sleep 30s` 后返回“回滚成功”的模拟响应 + 3) 若成功,则更新 DB 与缓存: + - `alert_issues.alert_state = 'Restored'` + - `alert_issues.state = 'Closed'` + - `service_states.health_state = 'Normal'` + - `service_states.resolved_at = NOW()`(当前时间) + - 同时在 `alert_issue_comments` 中追加一条 AI 分析评论(见下文内容模板) + +> 说明:本阶段仅实现消费与 Mock,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 + +—— + +## 2. 
输入消息(与 healthcheck 对齐) + +```go +// healthcheck/types.go +// 由 healthcheck 投递到 channel + type AlertMessage struct { + ID string `json:"id"` + Service string `json:"service"` + Version string `json:"version,omitempty"` + Level string `json:"level"` + Title string `json:"title"` + AlertSince time.Time `json:"alert_since"` + Labels map[string]string `json:"labels"` + } +``` + +- deployID 的来源(用于构造回滚 URL): + - Mock 阶段:可从 `Labels["deploy_id"]`(若存在)读取;若为空,可按 `{service}:{version}` 组装一个占位 ID。 + +—— + +## 3. 运行方式与配置 + +- 进程内消费者: + - 在 `cmd/zeroops/main.go` 中创建 `make(chan AlertMessage, N)` 并同时传给 `healthcheck` 与 `remediation`,形成发布-订阅。 + - 当前 README 仅描述,具体接线可在实现阶段加入。 + +- 环境变量建议: +``` +# 通道容量 +REMEDIATION_ALERT_CHAN_SIZE=1024 + +# 回滚接口(Mock) +REMEDIATION_ROLLBACK_URL=http://localhost:8080/v1/deployments/%s/rollback +REMEDIATION_ROLLBACK_SLEEP=30s + +# DB/Redis 复用已有:DB_* 与 REDIS_* +``` + +—— + +## 4. 流程(伪代码) + +```go +func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rdb *redis.Client) { + for { + select { + case <-ctx.Done(): + return + case m := <-ch: + // 1) 组装回滚 URL(Mock) + deployID := m.Labels["deploy_id"] + if deployID == "" { + // 仅 Mock:用 service:version 兜底 + deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) + } + url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deployID) + + // 2) 发起回滚(Mock):sleep 指定时间再判为成功 + sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + // TODO: 如需真实 HTTP 调用,可在此发起 POST 并根据响应判断 + + // 3) 成功后,先写入 AI 分析评论,再更新 DB 与缓存状态 + _ = addAIAnalysisComment(ctx, db, m) + _ = markRestoredInDB(ctx, db, m) + _ = markRestoredInCache(ctx, rdb, m) + } + } +} +``` + +—— + +## 5. 
DB 更新(SQL 建议) + +- 告警状态: +```sql +UPDATE alert_issues +SET alert_state = 'Restored' +WHERE id = $1; +``` + +- 服务态: +```sql +UPDATE service_states +SET health_state = 'Normal', + resolved_at = NOW() +WHERE service = $1 AND version = $2; +``` + +- 评论写入(AI 分析结果)(`alert_issue_comments.issue_id`对应 `alert_issues.id`): +```sql +INSERT INTO alert_issue_comments (issue_id, create_at, content) +VALUES ( + $1, + NOW(), + $2 +); +``` + +评论内容模板(Markdown,多行): +``` +## AI分析结果 +**问题类型**:非发版本导致的问题 +**根因分析**:数据库连接池配置不足,导致大量请求无法获取数据库连接 +**处理建议**: +- 增加数据库连接池大小 +- 优化数据库连接管理 +- 考虑读写分离缓解压力 +**执行状态**:正在处理中,等待指标恢复正常 +``` + +> 说明:若 `service_states` 不存在对应行,可按需 `INSERT ... ON CONFLICT`;或沿用 `receiver.PgDAO.UpsertServiceState` 的写入策略。 + +—— + +## 6. 缓存更新(Redis,Lua CAS 建议) + +- 告警缓存 `alert:issue:{id}`: +```lua +-- KEYS[1] = alert key +-- KEYS[2] = idx:old1 (例如 alert:index:alert_state:Pending) +-- KEYS[3] = idx:old2 (例如 alert:index:alert_state:InProcessing) +-- KEYS[4] = idx:new (alert:index:alert_state:Restored) +-- ARGV[1] = next ('Restored'), ARGV[2] = id +local v = redis.call('GET', KEYS[1]) +if not v then return 0 end +local obj = cjson.decode(v) +obj.alertState = ARGV[1] +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') +if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[2]) end +if KEYS[3] ~= '' then redis.call('SREM', KEYS[3], ARGV[2]) end +if KEYS[4] ~= '' then redis.call('SADD', KEYS[4], ARGV[2]) end +return 1 +``` + +- 服务态缓存 `service_state:{service}:{version}`: +```lua +-- KEYS[1] = service_state key +-- KEYS[2] = idx:new (service_state:index:health:Normal) +-- ARGV[1] = next ('Normal'), ARGV[2] = member (key 本身) +local v = redis.call('GET', KEYS[1]) +if not v then v = '{}' end +local obj = cjson.decode(v) +obj.health_state = ARGV[1] +obj.resolved_at = redis.call('TIME')[1] -- 可选:秒级时间戳;或由上层填充分辨率更高的时间串 +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') +if KEYS[2] ~= '' then redis.call('SADD', KEYS[2], KEYS[1]) end +return 1 +``` + +- 建议键: + - 
`alert:index:alert_state:Pending|InProcessing|Restored` + - `service_state:index:health:Normal|Warning|Error` + +—— + +## 7. 幂等与重试 + +- 幂等:同一 `AlertMessage.ID` 的回滚处理应具备幂等性,重复消费不应产生额外副作用。 +- 重试:Mock 模式下可忽略;接入真实接口后,对 5xx/网络错误考虑重试与退避,最终写入失败应有告警与补偿。 + +—— + +## 8. 验证步骤(与 healthcheck E2E 相衔接) + +1) 启动 Redis/Postgres 与 API(参考 `healthcheck/E2E_VALIDATION.md` 与 `env_example.txt`) +2) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)` +3) `curl` 触发 Webhook,`alert_issues` 入库为 `Pending` +4) 等待 `healthcheck` 将缓存态切到 `InProcessing` +5) 等待 `remediation` mock 回滚完成 → DB 与缓存更新: + - `alert_issues.alert_state = 'Restored'` + - `service_states.health_state = 'Normal'` + - `service_states.resolved_at = NOW()` +6) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新(comments 仍为 mock) + +—— + +## 9. 后续计划 + +- 接入真实部署系统回滚接口与鉴权 +- 将进程内 channel 平滑切换为 MQ(Kafka/NATS) +- 完善指标与可观测:事件消费速率、成功率、时延分位、回滚结果等 +- 增加补偿任务:对“回滚成功但缓存/DB 未一致”的场景进行对账修复 diff --git a/internal/alerting/service/remediation/consumer.go b/internal/alerting/service/remediation/consumer.go new file mode 100644 index 0000000..07cae6b --- /dev/null +++ b/internal/alerting/service/remediation/consumer.go @@ -0,0 +1,186 @@ +package remediation + +import ( + "context" + "fmt" + "os" + "strconv" + "time" + + adb "github.com/qiniu/zeroops/internal/alerting/database" + "github.com/qiniu/zeroops/internal/alerting/service/healthcheck" + "github.com/redis/go-redis/v9" + "github.com/rs/zerolog/log" +) + +type Consumer struct { + DB *adb.Database + Redis *redis.Client + + // sleepFn allows overriding for tests + sleepFn func(time.Duration) +} + +func NewConsumer(db *adb.Database, rdb *redis.Client) *Consumer { + return &Consumer{DB: db, Redis: rdb, sleepFn: time.Sleep} +} + +// Start consumes alert messages and performs a mocked rollback then marks restored. 
+func (c *Consumer) Start(ctx context.Context, ch <-chan healthcheck.AlertMessage) { + if ch == nil { + log.Warn().Msg("remediation consumer started without channel; no-op") + return + } + sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + for { + select { + case <-ctx.Done(): + return + case m := <-ch: + // 1) Mock rollback: optional URL composition (unused) + _ = fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(&m)) + // 2) Sleep to simulate rollback time + if c.sleepFn != nil { + c.sleepFn(sleepDur) + } + // 3) On success: add AI analysis comment, update DB and cache + if err := c.addAIAnalysisComment(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") + } + if err := c.markRestoredInDB(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") + } + if err := c.markRestoredInCache(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") + } + } + } +} + +func deriveDeployID(m *healthcheck.AlertMessage) string { + if m == nil { + return "" + } + if v := m.Labels["deploy_id"]; v != "" { + return v + } + return fmt.Sprintf("%s:%s", m.Service, m.Version) +} + +func (c *Consumer) addAIAnalysisComment(ctx context.Context, m *healthcheck.AlertMessage) error { + if c.DB == nil || m == nil { + return nil + } + const existsQ = `SELECT 1 FROM alert_issue_comments WHERE issue_id=$1 AND content=$2 LIMIT 1` + const insertQ = `INSERT INTO alert_issue_comments (issue_id, create_at, content) VALUES ($1, NOW(), $2)` + content := "## AI分析结果\n" + + "**问题类型**:非发版本导致的问题\n" + + "**根因分析**:数据库连接池配置不足,导致大量请求无法获取数据库连接\n" + + "**处理建议**:\n" + + "- 增加数据库连接池大小\n" + + "- 优化数据库连接管理\n" + + "- 考虑读写分离缓解压力\n" + + "**执行状态**:正在处理中,等待指标恢复正常" + if rows, err := c.DB.QueryContext(ctx, existsQ, m.ID, content); err == nil { + defer rows.Close() + if rows.Next() { + return nil + } + } + _, err := c.DB.ExecContext(ctx, insertQ, 
m.ID, content) + return err +} + +func (c *Consumer) markRestoredInDB(ctx context.Context, m *healthcheck.AlertMessage) error { + if c.DB == nil || m == nil { + return nil + } + // alert_issues + if _, err := c.DB.ExecContext(ctx, `UPDATE alert_issues SET alert_state = 'Restored' , state = 'Closed' WHERE id = $1`, m.ID); err != nil { + return err + } + // service_states (upsert) + if m.Service != "" { + const upsert = ` +INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) +VALUES ($1, $2, NULL, NOW(), 'Normal', ARRAY[$3]::text[]) +ON CONFLICT (service, version) DO UPDATE +SET health_state = 'Normal', + resolved_at = NOW(); +` + if _, err := c.DB.ExecContext(ctx, upsert, m.Service, m.Version, m.ID); err != nil { + return err + } + } + return nil +} + +func (c *Consumer) markRestoredInCache(ctx context.Context, m *healthcheck.AlertMessage) error { + if c.Redis == nil || m == nil { + return nil + } + // 1) alert:issue:{id} → alertState=Restored; state=Closed; move indices + alertKey := "alert:issue:" + m.ID + script := redis.NewScript(` +local v = redis.call('GET', KEYS[1]) +if not v then return 0 end +local obj = cjson.decode(v) +obj.alertState = ARGV[1] +obj.state = ARGV[3] +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') +if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[2]) end +if KEYS[3] ~= '' then redis.call('SREM', KEYS[3], ARGV[2]) end +if KEYS[4] ~= '' then redis.call('SADD', KEYS[4], ARGV[2]) end +-- move open→closed indices +if KEYS[5] ~= '' then redis.call('SREM', KEYS[5], ARGV[2]) end +if KEYS[6] ~= '' then redis.call('SADD', KEYS[6], ARGV[2]) end +-- service scoped indices if service exists in payload +local svc = obj['service'] +if svc and svc ~= '' then + local openSvcKey = 'alert:index:svc:' .. svc .. ':open' + local closedSvcKey = 'alert:index:svc:' .. svc .. 
':closed' + redis.call('SREM', openSvcKey, ARGV[2]) + redis.call('SADD', closedSvcKey, ARGV[2]) +end +return 1 +`) + _, _ = script.Run(ctx, c.Redis, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing", "alert:index:alert_state:Restored", "alert:index:open", "alert:index:closed"}, "Restored", m.ID, "Closed").Result() + + // 2) service_state:{service}:{version} → health_state=Normal; resolved_at=now; add to Normal index + if m.Service != "" { + svcKey := "service_state:" + m.Service + ":" + m.Version + now := time.Now().UTC().Format(time.RFC3339Nano) + svcScript := redis.NewScript(` +local v = redis.call('GET', KEYS[1]) +if not v then v = '{}' end +local obj = cjson.decode(v) +obj.health_state = ARGV[1] +obj.resolved_at = ARGV[2] +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') +if KEYS[2] ~= '' then redis.call('SADD', KEYS[2], KEYS[1]) end +return 1 +`) + _, _ = svcScript.Run(ctx, c.Redis, []string{svcKey, "service_state:index:health:Normal"}, "Normal", now).Result() + } + return nil +} + +func parseDuration(s string, d time.Duration) time.Duration { + if s == "" { + return d + } + if v, err := time.ParseDuration(s); err == nil { + return v + } + return d +} + +func parseInt(s string, v int) int { + if s == "" { + return v + } + if n, err := strconv.Atoi(s); err == nil { + return n + } + return v +} diff --git a/internal/middleware/auth.go b/internal/middleware/auth.go index 741c05e..582ce3d 100644 --- a/internal/middleware/auth.go +++ b/internal/middleware/auth.go @@ -1,5 +1,11 @@ package middleware -func Authentication() { +import ( + "github.com/fox-gonic/fox" +) +// Authentication is a placeholder global middleware. It currently allows all requests. +// Per-alerting webhook uses its own path-scoped auth. +func Authentication(c *fox.Context) { + c.Next() }