Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Serve] [Dashboard] Add serve controller metrics to serve system dashboard page #43797

10 changes: 8 additions & 2 deletions dashboard/client/src/pages/serve/ServeDeploymentsListPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ import { HelpInfo } from "../../components/Tooltip";
import { useServeDeployments } from "./hook/useServeApplications";
import { ServeApplicationRows } from "./ServeApplicationRow";
import { ServeEntityLogViewer } from "./ServeEntityLogViewer";
import { ServeMetricsSection } from "./ServeMetricsSection";
import {
APPS_METRICS_CONFIG,
ServeMetricsSection,
} from "./ServeMetricsSection";
import { ServeSystemPreview } from "./ServeSystemDetails";

const useStyles = makeStyles((theme) =>
Expand Down Expand Up @@ -172,7 +175,10 @@ export const ServeDeploymentsListPage = () => {
</CollapsibleSection>
</React.Fragment>
)}
<ServeMetricsSection className={classes.section} />
<ServeMetricsSection
className={classes.section}
metricsConfig={APPS_METRICS_CONFIG}
/>
</div>
);
};
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { render, screen, waitFor } from "@testing-library/react";
import React, { PropsWithChildren } from "react";
import { GlobalContext } from "../../App";
import { ServeMetricsSection } from "./ServeMetricsSection";
import {
APPS_METRICS_CONFIG,
SERVE_SYSTEM_METRICS_CONFIG,
ServeMetricsSection,
} from "./ServeMetricsSection";

const Wrapper = ({ children }: PropsWithChildren<{}>) => {
return (
Expand Down Expand Up @@ -54,10 +58,12 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => {
};

describe("ServeMetricsSection", () => {
it("renders", async () => {
it("renders app metrics", async () => {
expect.assertions(4);

render(<ServeMetricsSection />, { wrapper: Wrapper });
render(<ServeMetricsSection metricsConfig={APPS_METRICS_CONFIG} />, {
wrapper: Wrapper,
});
await screen.findByText(/View in Grafana/);
expect(screen.getByText(/5 minutes/)).toBeVisible();
expect(screen.getByTitle("QPS per application")).toBeInTheDocument();
Expand All @@ -67,10 +73,34 @@ describe("ServeMetricsSection", () => {
).toBeInTheDocument();
});

// Verifies that passing SERVE_SYSTEM_METRICS_CONFIG makes the section render
// one titled Grafana embed per configured serve-system panel.
it("renders serve system metrics", async () => {
// One assertion per panel title checked below.
expect.assertions(6);

render(
<ServeMetricsSection metricsConfig={SERVE_SYSTEM_METRICS_CONFIG} />,
{
wrapper: Wrapper,
},
);
// Wait for the "View in Grafana" text to appear before checking the embeds.
await screen.findByText(/View in Grafana/);
// The titles below must match the `title` fields in SERVE_SYSTEM_METRICS_CONFIG.
expect(screen.getByTitle("Ongoing HTTP Requests")).toBeInTheDocument();
expect(screen.getByTitle("Ongoing gRPC Requests")).toBeInTheDocument();
expect(screen.getByTitle("Scheduling Tasks")).toBeInTheDocument();
expect(
screen.getByTitle("Scheduling Tasks in Backoff"),
).toBeInTheDocument();
expect(
screen.getByTitle("Controller Control Loop Duration"),
).toBeInTheDocument();
expect(screen.getByTitle("Number of Control Loops")).toBeInTheDocument();
});

it("renders nothing when grafana is not available", async () => {
expect.assertions(5);

render(<ServeMetricsSection />, { wrapper: MetricsDisabledWrapper });
render(<ServeMetricsSection metricsConfig={APPS_METRICS_CONFIG} />, {
wrapper: MetricsDisabledWrapper,
});
// Wait .1 seconds for render to finish
await waitFor(() => new Promise((r) => setTimeout(r, 100)));

Expand Down
37 changes: 34 additions & 3 deletions dashboard/client/src/pages/serve/ServeMetricsSection.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ const useStyles = makeStyles((theme) =>
);

// NOTE: please keep the titles here in sync with dashboard/modules/metrics/dashboards/serve_dashboard_panels.py
const METRICS_CONFIG: MetricConfig[] = [
export const APPS_METRICS_CONFIG: MetricConfig[] = [
{
title: "QPS per application",
pathParams: "orgId=1&theme=light&panelId=7",
Expand All @@ -75,10 +75,41 @@ const METRICS_CONFIG: MetricConfig[] = [
},
];

type ServeMetricsSectionProps = ClassNameProps;
// NOTE: please keep the titles here in sync with dashboard/modules/metrics/dashboards/serve_dashboard_panels.py
/**
 * Grafana panels embedded for the Serve system view.
 *
 * Each entry pairs the panel title with the query-string fragment that
 * selects that panel. The numeric id must match the `id` of the panel with
 * the same title in serve_dashboard_panels.py.
 */
export const SERVE_SYSTEM_METRICS_CONFIG: MetricConfig[] = (
  [
    ["Ongoing HTTP Requests", 20],
    ["Ongoing gRPC Requests", 21],
    ["Scheduling Tasks", 22],
    ["Scheduling Tasks in Backoff", 23],
    ["Controller Control Loop Duration", 24],
    ["Number of Control Loops", 25],
  ] as const
).map(([title, panelId]) => ({
  title,
  // Same fixed org/theme prefix for every panel; only the panel id varies.
  pathParams: `orgId=1&theme=light&panelId=${panelId}`,
}));

/**
 * Props accepted by ServeMetricsSection: everything in ClassNameProps plus
 * the list of Grafana panels the section should embed.
 */
type ServeMetricsSectionProps = ClassNameProps & {
/** Panels to render; each entry's `title` and `pathParams` are used to build one Grafana embed. */
metricsConfig: MetricConfig[];
};

export const ServeMetricsSection = ({
className,
metricsConfig,
}: ServeMetricsSectionProps) => {
const classes = useStyles();
const { grafanaHost, prometheusHealth, dashboardUids, dashboardDatasource } =
Expand Down Expand Up @@ -131,7 +162,7 @@ export const ServeMetricsSection = ({
</TextField>
</Paper>
<div className={classes.grafanaEmbedsContainer}>
{METRICS_CONFIG.map(({ title, pathParams }) => {
{metricsConfig.map(({ title, pathParams }) => {
const path =
`/d-solo/${grafanaServeDashboardUid}?${pathParams}` +
`&refresh${timeRangeParams}&var-datasource=${dashboardDatasource}`;
Expand Down
12 changes: 11 additions & 1 deletion dashboard/client/src/pages/serve/ServeSystemDetailPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ import { Outlet } from "react-router-dom";
import Loading from "../../components/Loading";
import { MainNavPageInfo } from "../layout/mainNavContext";
import { useServeDeployments } from "./hook/useServeApplications";
import {
SERVE_SYSTEM_METRICS_CONFIG,
ServeMetricsSection,
} from "./ServeMetricsSection";
import { ServeSystemDetails } from "./ServeSystemDetails";

const useStyles = makeStyles((theme) =>
createStyles({
root: {
Expand All @@ -15,6 +18,9 @@ const useStyles = makeStyles((theme) =>
serveInstanceWarning: {
marginBottom: theme.spacing(2),
},
section: {
marginTop: theme.spacing(4),
},
}),
);

Expand Down Expand Up @@ -53,6 +59,10 @@ export const ServeSystemDetailPage = () => {
setPage={setProxiesPage}
/>
)}
<ServeMetricsSection
className={classes.section}
metricsConfig={SERVE_SYSTEM_METRICS_CONFIG}
/>
</div>
);
};
Expand Down
78 changes: 78 additions & 0 deletions dashboard/modules/metrics/dashboards/serve_dashboard_panels.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,84 @@
stack=False,
grid_pos=GridPos(16, 5, 8, 8),
),
Panel(
id=20,
title="Ongoing HTTP Requests",
GeneDer marked this conversation as resolved.
Show resolved Hide resolved
description="The number of ongoing requests in the HTTP Proxy.",
unit="requests",
targets=[
Target(
expr="ray_serve_num_ongoing_http_requests{{{global_filters}}}",
legend="Ongoing HTTP Requests",
),
],
grid_pos=GridPos(0, 6, 8, 8),
),
# Serve system panels. The dashboard UI embeds these by panel id
# (SERVE_SYSTEM_METRICS_CONFIG in ServeMetricsSection.tsx uses panelId=21..25),
# so keep each `id` and `title` in sync with that file.
# NOTE(review): `{{{global_filters}}}` is presumably expanded later via
# str.format into `{<filters>}` — confirm against the dashboard generator.
Panel(
id=21,
title="Ongoing gRPC Requests",
description="The number of ongoing requests in the gRPC Proxy.",
unit="requests",
targets=[
Target(
expr="ray_serve_num_ongoing_grpc_requests{{{global_filters}}}",
legend="Ongoing gRPC Requests",
),
],
grid_pos=GridPos(8, 6, 8, 8),
),
# Router-side request scheduling tasks.
Panel(
id=22,
title="Scheduling Tasks",
description="The number of request scheduling tasks in the router.",
unit="tasks",
targets=[
Target(
expr="ray_serve_num_scheduling_tasks{{{global_filters}}}",
legend="Scheduling Tasks",
),
],
grid_pos=GridPos(16, 6, 8, 8),
),
# Subset of router scheduling tasks that are currently backing off.
Panel(
id=23,
title="Scheduling Tasks in Backoff",
description="The number of request scheduling tasks in the router that are undergoing backoff.",
unit="tasks",
targets=[
Target(
expr="ray_serve_num_scheduling_tasks_in_backoff{{{global_filters}}}",
legend="Scheduling Tasks in Backoff",
),
],
grid_pos=GridPos(0, 7, 8, 8),
),
# Duration of the controller's most recent control loop, in seconds.
Panel(
id=24,
title="Controller Control Loop Duration",
description="The duration of the last control loop.",
unit="seconds",
targets=[
Target(
expr="ray_serve_controller_control_loop_duration_s{{{global_filters}}}",
legend="Control Loop Duration",
),
],
grid_pos=GridPos(8, 7, 8, 8),
),
# Count of control loops run by the controller; plotted as the raw metric value.
Panel(
id=25,
title="Number of Control Loops",
description="The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.",
unit="loops",
targets=[
Target(
expr="ray_serve_controller_num_control_loops{{{global_filters}}}",
legend="Control Loops",
),
],
grid_pos=GridPos(16, 7, 8, 8),
),
]

ids = []
Expand Down
2 changes: 1 addition & 1 deletion doc/source/serve/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ The following metrics are exposed by Ray Serve:
* replica
* application
* model_id
- The mutlplexed model ID registered on the current replica.
- The multiplexed model ID registered on the current replica.
* - ``ray_serve_multiplexed_get_model_requests_counter``
- * deployment
* replica
Expand Down
2 changes: 1 addition & 1 deletion python/ray/serve/_private/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ async def __init__(
# Track the number of times the controller has started
metrics.Counter(
"serve_controller_num_starts",
description="The number of times that controller has started.",
description="The number of times the controller has started.",
).inc()

def reconfigure_global_logging_config(self, global_logging_config: LoggingConfig):
Expand Down
Loading