Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions client/src/Hooks/useMonitorForm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ const getBaseDefaults = (data?: Monitor | null) => ({
description: data?.description || "",
interval: data?.interval || 60000,
notifications: data?.notifications || [],
escalationNotifications: data?.escalationNotifications || [],
escalationWaitMinutes: data?.escalationWaitMinutes ?? 0,
statusWindowSize: data?.statusWindowSize || 5,
statusWindowThreshold: data?.statusWindowThreshold || 60,
geoCheckEnabled: data?.geoCheckEnabled ?? false,
Expand Down
83 changes: 83 additions & 0 deletions client/src/Pages/CreateMonitor/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,89 @@ const CreateMonitorPage = () => {
}
/>

<ConfigBox
// Escalation section of the monitor form: one control for the wait time
// (`escalationWaitMinutes`) and one for the channels to alert (`escalationNotifications`).
title={t("pages.createMonitor.form.escalation.title")}
subtitle={t("pages.createMonitor.form.escalation.description")}
rightContent={
<Stack spacing={theme.spacing(LAYOUT.MD)}>
<Controller
// Numeric input bound to the `escalationWaitMinutes` form field.
name="escalationWaitMinutes"
control={control}
render={({ field, fieldState }) => (
<TextField
type="number"
label={t("pages.createMonitor.form.escalation.waitTimeLabel")}
value={field.value}
// Coerce the raw input value to a number before storing it in the form
// (Number("") is 0, so a cleared field stores 0).
onChange={(e) => field.onChange(Number(e.target.value))}
// Passes the validation message (if any) rather than a boolean.
// NOTE(review): confirm this TextField's `error` prop accepts a string.
error={fieldState.error?.message}
inputProps={{ min: 0 }}
/>
)}
/>
<Controller
// Multi-select bound to the `escalationNotifications` form field (an array of ids).
name="escalationNotifications"
control={control}
render={({ field }) => {
// Copy each notification and expose its `notificationName` as `name`,
// which is what `getOptionLabel` reads below.
const notificationOptions = (notifications ?? []).map((n) => ({
...n,
name: n.notificationName,
}));
// The form value holds ids only; resolve them back to option objects for display.
// Ids with no matching notification are silently left out of the selection.
const selectedEscalations = notificationOptions.filter((n) =>
(field.value ?? []).includes(n.id)
);
return (
<Stack spacing={theme.spacing(LAYOUT.MD)}>
<Autocomplete
multiple
options={notificationOptions}
value={selectedEscalations}
getOptionLabel={(option) => option.name}
onChange={(_: unknown, newValue: typeof notificationOptions) => {
// Store the selection as an array of ids, not option objects.
field.onChange(newValue.map((n) => n.id));
}}
isOptionEqualToValue={(option, value) => option.id === value.id}
/>
{selectedEscalations.length > 0 && (
<Stack
flex={1}
width="100%"
>
{selectedEscalations.map((notification, index) => (
<Stack
// One row per selected channel: its name followed by a remove button.
direction="row"
alignItems="center"
key={notification.id}
width="100%"
>
<Typography flexGrow={1}>
{notification.notificationName}
</Typography>
<IconButton
size="small"
onClick={() => {
// Remove this channel's id from the stored selection.
field.onChange(
(field.value ?? []).filter(
(id: string) => id !== notification.id
)
);
}}
// NOTE(review): this label is a hard-coded English string, unlike the t(...) labels above.
aria-label="Remove escalation notification"
>
<Trash2 size={16} />
</IconButton>
{index < selectedEscalations.length - 1 && <Divider />}
</Stack>
))}
</Stack>
)}
</Stack>
);
}}
/>
</Stack>
}
/>

{(watchedType === "http" ||
watchedType === "grpc" ||
watchedType === "websocket") && (
Expand Down
3 changes: 3 additions & 0 deletions client/src/Types/Monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ export interface Monitor {
geoCheckEnabled?: boolean;
geoCheckLocations?: GeoContinent[];
geoCheckInterval?: number;
escalationNotifications: string[];
escalationWaitMinutes: number;
escalationSent: boolean;
recentChecks: CheckSnapshot[];
createdAt: string;
updatedAt: string;
Expand Down
2 changes: 2 additions & 0 deletions client/src/Validation/monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ const baseSchema = z.object({
description: z.string().optional(),
interval: z.number().min(15000, "Interval must be at least 15 seconds"),
notifications: z.array(z.string()),
escalationNotifications: z.array(z.string()),
escalationWaitMinutes: z.number().min(0, "Wait time must be at least 0 minutes"),
statusWindowSize: z
.number({ message: "Status window size is required" })
.min(1, "Status window size must be at least 1")
Expand Down
6 changes: 6 additions & 0 deletions client/src/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,12 @@
"description": "Select the notification channels you want to use",
"title": "Notifications"
},
"escalation": {
"title": "Escalation notifications",
"description": "If a monitor stays down, send additional alerts to other channels after a specified wait time",
"waitTimeLabel": "Escalation wait time (minutes)",
"channelsLabel": "Escalation channels"
},
"type": {
"description": "Select the type of check to perform",
"optionDockerDescription": "Use Docker to monitor if a container is running.",
Expand Down
17 changes: 16 additions & 1 deletion server/src/db/models/Monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@ type CheckSnapshotDocument = Omit<CheckSnapshot, "createdAt"> & { createdAt: Dat

/**
 * Base shape of a stored monitor document.
 *
 * Starts from `Monitor`, drops the keys listed in the `Omit`, and re-declares
 * some of them with storage-side types: notification references become
 * `Types.ObjectId[]`, and `recentChecks` uses `CheckSnapshotDocument`.
 *
 * NOTE(review): the two consecutive key-list lines below look like the
 * before/after sides of a diff; presumably only the one that includes
 * "escalationNotifications" belongs in the real file — confirm.
 */
type MonitorDocumentBase = Omit<
Monitor,
"id" | "userId" | "teamId" | "notifications" | "selectedDisks" | "statusWindow" | "recentChecks" | "createdAt" | "updatedAt"
"id" | "userId" | "teamId" | "notifications" | "escalationNotifications" | "selectedDisks" | "statusWindow" | "recentChecks" | "createdAt" | "updatedAt"
> & {
statusWindow: boolean[];
recentChecks: CheckSnapshotDocument[];
// Notification references are stored as ObjectIds rather than string ids.
notifications: Types.ObjectId[];
escalationNotifications: Types.ObjectId[];
selectedDisks: string[];
matchMethod?: MonitorMatchMethod;
};
Expand Down Expand Up @@ -351,6 +352,20 @@ const MonitorSchema = new Schema<MonitorDocument>(
type: Number,
default: 300000,
},
escalationNotifications: [
{
type: Schema.Types.ObjectId,
ref: "Notification",
},
],
escalationWaitMinutes: {
type: Number,
default: 0,
},
escalationSent: {
type: Boolean,
default: false,
},
recentChecks: {
type: [checkSnapshotSchema],
default: [],
Expand Down
8 changes: 8 additions & 0 deletions server/src/repositories/monitors/MongoMonitorsRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ class MongoMonitorsRepository implements IMonitorsRepository {
};

const notificationIds = (doc.notifications ?? []).map((notification) => toStringId(notification));
const escalationNotificationIds = (doc.escalationNotifications ?? []).map((notification) => toStringId(notification));

return {
id: toStringId(doc._id),
Expand All @@ -374,6 +375,9 @@ class MongoMonitorsRepository implements IMonitorsRepository {
interval: doc.interval,
uptimePercentage: doc.uptimePercentage ?? undefined,
notifications: notificationIds,
escalationNotifications: escalationNotificationIds,
escalationWaitMinutes: doc.escalationWaitMinutes ?? 0,
escalationSent: doc.escalationSent ?? false,
secret: doc.secret ?? undefined,
cpuAlertThreshold: doc.cpuAlertThreshold,
cpuAlertCounter: doc.cpuAlertCounter,
Expand Down Expand Up @@ -410,6 +414,7 @@ class MongoMonitorsRepository implements IMonitorsRepository {
};

const notificationIds = (doc.notifications ?? []).map((notification: unknown) => toStringId(notification));
const escalationNotificationIds = ((doc as Record<string, unknown>).escalationNotifications as unknown[] ?? []).map((notification: unknown) => toStringId(notification));

return {
id: toStringId(doc._id),
Expand All @@ -433,6 +438,9 @@ class MongoMonitorsRepository implements IMonitorsRepository {
interval: doc.interval,
uptimePercentage: doc.uptimePercentage ?? undefined,
notifications: notificationIds,
escalationNotifications: escalationNotificationIds,
escalationWaitMinutes: (doc as Record<string, unknown>).escalationWaitMinutes as number ?? 0,
escalationSent: (doc as Record<string, unknown>).escalationSent as boolean ?? false,
secret: doc.secret ?? undefined,
cpuAlertThreshold: doc.cpuAlertThreshold,
cpuAlertCounter: doc.cpuAlertCounter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,52 @@ export class SuperSimpleQueueHelper implements ISuperSimpleQueueHelper {
stack: error instanceof Error ? error.stack : undefined,
});
});

// Step 8. Handle escalation notifications
//
// `escalationSent` is a per-incident latch on the monitor: it is cleared when
// the incident resolves and set once an escalation has gone out, so that at
// most one escalation is sent per continuous outage.
const currentMonitor = statusChangeResult.monitor;

// Re-arm escalation when the monitor recovers. Best effort: a failure here is
// logged and does not fail the job.
if (decision.shouldResolveIncident && currentMonitor.escalationSent) {
	this.monitorsRepository.updateById(monitorId, teamId, { escalationSent: false }).catch((error: unknown) => {
		this.logger.warn({
			message: `Error resetting escalation flag for monitor ${monitorId}: ${error instanceof Error ? error.message : "Unknown error"}`,
			service: SERVICE_NAME,
			method: "getMonitorJob",
		});
	});
}

// Escalate only when the monitor is still down (not a fresh transition),
// nothing has been sent for this outage yet, and escalation is configured
// (a wait of 0 minutes or an empty channel list means "disabled").
if (
	!statusChangeResult.statusChanged &&
	currentMonitor.status === "down" &&
	!currentMonitor.escalationSent &&
	currentMonitor.escalationWaitMinutes > 0 &&
	(currentMonitor.escalationNotifications?.length ?? 0) > 0
) {
	const activeIncident = await this.incidentsRepository.findActiveByMonitorId(monitorId, teamId);
	if (activeIncident) {
		// Downtime is measured from the start of the active incident.
		const incidentStart = new Date(activeIncident.startTime);
		const downtimeMinutes = (Date.now() - incidentStart.getTime()) / 60000;
		if (downtimeMinutes >= currentMonitor.escalationWaitMinutes) {
			// Set the latch only after the dispatch promise fulfils. Previously the
			// flag was written unconditionally in parallel with the send, so a failed
			// send still marked the escalation as done and it was never retried for
			// the rest of the outage. Now a rejected send leaves the flag unset and
			// the next check attempts the escalation again.
			// NOTE(review): this assumes handleEscalationNotifications rejects on
			// failure — confirm. A send that outlasts the check interval could be
			// attempted twice.
			this.notificationsService.handleEscalationNotifications(currentMonitor, status).then(
				() =>
					this.monitorsRepository.updateById(monitorId, teamId, { escalationSent: true }).catch((error: unknown) => {
						this.logger.warn({
							message: `Error setting escalation flag for monitor ${monitorId}: ${error instanceof Error ? error.message : "Unknown error"}`,
							service: SERVICE_NAME,
							method: "getMonitorJob",
						});
					}),
				(error: unknown) => {
					this.logger.error({
						message: `Error sending escalation notifications for monitor ${monitorId}: ${error instanceof Error ? error.message : "Unknown error"}`,
						service: SERVICE_NAME,
						method: "getMonitorJob",
						stack: error instanceof Error ? error.stack : undefined,
					});
				}
			);
		}
	}
}
} catch (error: unknown) {
this.logger.warn({
message: error instanceof Error ? error.message : "Unknown error",
Expand Down
57 changes: 57 additions & 0 deletions server/src/service/infrastructure/notificationMessageBuilder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ export interface INotificationMessageBuilder {
decision: MonitorActionDecision,
clientHost: string
): NotificationMessage;
/**
 * Builds the notification message for an escalation alert on a monitor.
 *
 * @param monitor - Monitor the escalation is about.
 * @param monitorStatusResponse - Status response used to build the message content.
 * @param clientHost - Client host value carried on the resulting message.
 * @returns The escalation notification message.
 */
buildEscalationMessage(
monitor: Monitor,
monitorStatusResponse: MonitorStatusResponse,
clientHost: string
): NotificationMessage;
extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse): ThresholdBreach[];
}

Expand Down Expand Up @@ -52,6 +57,34 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder {
};
}

/**
 * Assembles the notification message for an escalation alert.
 *
 * The message type is always "escalation". Severity and content are derived
 * from that type through `determineSeverity` and `buildContent`.
 *
 * @param monitor - Monitor the escalation is about; a subset of its fields is copied onto the message.
 * @param monitorStatusResponse - Status response handed to the content builder.
 * @param clientHost - Passed through unchanged on the message.
 * @returns The assembled escalation message.
 */
buildEscalationMessage(
	monitor: Monitor,
	monitorStatusResponse: MonitorStatusResponse,
	clientHost: string
): NotificationMessage {
	const escalationType: NotificationType = "escalation";
	const escalationSeverity = this.determineSeverity(escalationType);
	const escalationContent = this.buildContent(escalationType, monitor, monitorStatusResponse);

	// Only the identifying fields of the monitor travel with the message.
	const { id, name, url, type, status, teamId } = monitor;

	return {
		type: escalationType,
		severity: escalationSeverity,
		monitor: { id, name, url, type, status },
		content: escalationContent,
		clientHost,
		metadata: {
			teamId,
			notificationReason: "escalation",
		},
	};
}

private determineNotificationType(decision: MonitorActionDecision, monitor: Monitor): NotificationType {
// Down status has highest priority (critical)
if (monitor.status === "down") {
Expand Down Expand Up @@ -80,6 +113,7 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder {
private determineSeverity(type: NotificationType): NotificationSeverity {
switch (type) {
case "monitor_down":
case "escalation":
return "critical";
case "threshold_breach":
return "warning";
Expand All @@ -103,6 +137,8 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder {
return this.buildThresholdBreachContent(monitor, monitorStatusResponse as MonitorStatusResponse<HardwareStatusPayload>);
case "threshold_resolved":
return this.buildThresholdResolvedContent(monitor);
case "escalation":
return this.buildEscalationContent(monitor, monitorStatusResponse);
default:
return this.buildDefaultContent(monitor);
}
Expand Down Expand Up @@ -182,6 +218,27 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder {
};
}

/**
 * Produces the title, summary and detail lines for an escalation alert.
 *
 * The detail lines always contain the monitor's URL, a fixed "Down" status
 * and the monitor type. The response code and error message are appended
 * only when the status response carries truthy values for them.
 *
 * @param monitor - Monitor whose name, URL and type appear in the text.
 * @param monitorStatusResponse - Source of the optional response code and error message.
 * @returns Content stamped with the current time.
 */
private buildEscalationContent(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse): NotificationContent {
	const heading = `Escalation: Monitor ${monitor.name} still down`;
	const overview = `Monitor "${monitor.name}" has been down for an extended period and requires attention.`;

	const detailLines = [
		`URL: ${monitor.url}`,
		`Status: Down`,
		`Type: ${monitor.type}`,
		// Optional lines: spread in zero or one element depending on presence.
		...(monitorStatusResponse.code ? [`Response Code: ${monitorStatusResponse.code}`] : []),
		...(monitorStatusResponse.message ? [`Error: ${monitorStatusResponse.message}`] : []),
	];

	return {
		title: heading,
		summary: overview,
		details: detailLines,
		timestamp: new Date(),
	};
}

public extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse<HardwareStatusPayload>): ThresholdBreach[] {
const breaches: ThresholdBreach[] = [];

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ export class EmailProvider implements INotificationProvider {
return `Monitor ${message.monitor.name} threshold exceeded`;
case "threshold_resolved":
return `Monitor ${message.monitor.name} thresholds resolved`;
case "escalation":
return `Escalation: Monitor ${message.monitor.name} still down`;
default:
return `Alert: ${message.monitor.name}`;
}
Expand Down
Loading