From 5d936f8f4ea6dc14225db9dc4be4ed7d6ac59d08 Mon Sep 17 00:00:00 2001 From: Buildkite Date: Tue, 4 Nov 2025 14:27:02 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=96=20sync'ing=20generated=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/admin/observability/alerts.mdx | 3224 ++-- docs/admin/observability/dashboards.mdx | 15543 +++++++--------- docs/cli/references/api.mdx | 5 +- docs/cli/references/batch/apply.mdx | 12 +- docs/cli/references/batch/new.mdx | 8 +- docs/cli/references/batch/preview.mdx | 12 +- docs/cli/references/batch/remote.mdx | 8 +- docs/cli/references/batch/repositories.mdx | 8 +- docs/cli/references/batch/validate.mdx | 8 +- docs/cli/references/code-intel/upload.mdx | 32 +- docs/cli/references/config/edit.mdx | 5 +- docs/cli/references/config/get.mdx | 5 +- docs/cli/references/config/list.mdx | 5 +- docs/cli/references/extensions/copy.mdx | 5 +- docs/cli/references/extensions/delete.mdx | 5 +- docs/cli/references/extensions/get.mdx | 5 +- docs/cli/references/extensions/list.mdx | 5 +- docs/cli/references/extensions/publish.mdx | 5 +- docs/cli/references/extsvc/create.mdx | 7 +- docs/cli/references/extsvc/edit.mdx | 5 +- docs/cli/references/extsvc/list.mdx | 5 +- docs/cli/references/index.mdx | 10 +- docs/cli/references/login.mdx | 5 +- docs/cli/references/orgs/create.mdx | 5 +- docs/cli/references/orgs/delete.mdx | 5 +- docs/cli/references/orgs/get.mdx | 5 +- docs/cli/references/orgs/list.mdx | 5 +- docs/cli/references/orgs/members/add.mdx | 5 +- docs/cli/references/orgs/members/remove.mdx | 5 +- docs/cli/references/repos/add-metadata.mdx | 5 +- docs/cli/references/repos/delete-metadata.mdx | 5 +- docs/cli/references/repos/delete.mdx | 5 +- docs/cli/references/repos/get.mdx | 5 +- docs/cli/references/repos/list.mdx | 5 +- docs/cli/references/repos/update-metadata.mdx | 5 +- docs/cli/references/search.mdx | 7 +- docs/cli/references/serve-git.mdx | 3 +- docs/cli/references/users/create.mdx | 5 +- docs/cli/references/users/delete.mdx | 5 +- docs/cli/references/users/get.mdx | 5 +- docs/cli/references/users/list.mdx | 5 +- docs/cli/references/users/prune.mdx | 9 +- docs/cli/references/users/tag.mdx | 7 +- docs/cli/references/validate.mdx | 3 +- docs/cli/references/version.mdx | 5 +- 45 files changed, 8868 insertions(+), 10178 deletions(-) diff --git a/docs/admin/observability/alerts.mdx b/docs/admin/observability/alerts.mdx index 0426c8eff..208649a2a 100644 --- a/docs/admin/observability/alerts.mdx +++ b/docs/admin/observability/alerts.mdx @@ -2,10 +2,10 @@ {/* DO NOT EDIT: generated via: bazel run //doc/admin/observability:write_monitoring_docs */} -This document contains a complete reference of all alerts in Sourcegraph's monitoring and the next steps for finding alerts that are firing. -If your alert isn't mentioned here, or if the next steps don't help, contact us at `support@sourcegraph.com` for assistance. +This document contains a complete reference of all alerts in Sourcegraph's monitoring, and next steps for when you find alerts that are firing. +If your alert isn't mentioned here, or if the next steps don't help, [contact us](mailto:support@sourcegraph.com) for assistance. -To learn more about Sourcegraph's alerting and how to set up alerts, see [our alerting guide](/admin/observability/alerting). +To learn more about Sourcegraph's alerting and how to set up alerts, see [our alerting guide](https://docs.sourcegraph.com/admin/observability/alerting). 
## frontend: 99th_percentile_search_request_duration @@ -75,22 +75,22 @@ Generated query for warning alert: `max((histogram_quantile(0.9, sum by (le) (ra
-## frontend: hard_timeout_search_responses +## frontend: timeout_search_responses -

hard timeout search responses every 5m

+

timeout search responses every 5m

**Descriptions** -- warning frontend: 2%+ hard timeout search responses every 5m for 15m0s +- warning frontend: 2%+ timeout search responses every 5m for 15m0s **Next steps** -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#frontend-hard-timeout-search-responses). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#frontend-timeout-search-responses). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_frontend_hard_timeout_search_responses" + "warning_frontend_timeout_search_responses" ] ``` @@ -99,7 +99,7 @@ Generated query for warning alert: `max((histogram_quantile(0.9, sum by (le) (ra
Technical details -Generated query for warning alert: `max(((sum(increase(src_graphql_search_response{request_name!="CodeIntelSearch",source="browser",status="timeout"\}[5m])) + sum(increase(src_graphql_search_response\{alert_type="timed_out",request_name!="CodeIntelSearch",source="browser",status="alert"\}[5m]))) / sum(increase(src_graphql_search_response\{request_name!="CodeIntelSearch",source="browser"}[5m])) * 100) >= 2)` +Generated query for warning alert: `max((sum(increase(src_search_streaming_response{source="browser",status=~"timeout\\|partial_timeout"\}[5m])) / sum(increase(src_search_streaming_response\{source="browser"}[5m])) * 100) >= 2)`
@@ -129,28 +129,29 @@ Generated query for warning alert: `max(((sum(increase(src_graphql_search_respon
Technical details -Generated query for warning alert: `max((sum by (status) (increase(src_graphql_search_response{request_name!="CodeIntelSearch",source="browser",status=~"error"\}[5m])) / ignoring (status) group_left () sum(increase(src_graphql_search_response\{request_name!="CodeIntelSearch",source="browser"}[5m])) * 100) >= 2)` +Generated query for warning alert: `max((sum(increase(src_search_streaming_response{source="browser",status="error"\}[5m])) / sum(increase(src_search_streaming_response\{source="browser"}[5m])) * 100) >= 2)`
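The silencing entries shown throughout this page can be combined into a single `observability.silenceAlerts` list in the site configuration. As an illustrative sketch (these two alert names are taken from sections on this page; silence only alerts you have actually triaged):

```json
"observability.silenceAlerts": [
  "warning_frontend_timeout_search_responses",
  "warning_frontend_search_no_results"
]
```

Each entry follows the `<level>_<service>_<alert_name>` pattern used by every alert in this reference.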

-## frontend: partial_timeout_search_responses +## frontend: search_no_results -

partial timeout search responses every 5m

+

searches with no results every 5m

**Descriptions**

-- warning frontend: 5%+ partial timeout search responses every 5m for 15m0s
+- warning frontend: 5%+ searches with no results every 5m for 15m0s

**Next steps**

-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#frontend-partial-timeout-search-responses).
+- A sudden increase in this metric could indicate a problem with search indexing, or a shift in search behavior that is causing fewer users to find the results they're looking for.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#frontend-search-no-results).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_frontend_partial_timeout_search_responses"
+  "warning_frontend_search_no_results"
]
```

@@ -159,7 +160,7 @@
Technical details -Generated query for warning alert: `max((sum by (status) (increase(src_graphql_search_response{request_name!="CodeIntelSearch",source="browser",status="partial_timeout"\}[5m])) / ignoring (status) group_left () sum(increase(src_graphql_search_response\{request_name!="CodeIntelSearch",source="browser"}[5m])) * 100) >= 5)` +Generated query for warning alert: `max((sum(increase(src_search_streaming_response{source="browser",status="no_results"\}[5m])) / sum(increase(src_search_streaming_response\{source="browser"}[5m])) * 100) >= 5)`
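Besides silencing, alert notifications can be routed to a receiver with the `observability.alerts` site configuration key covered in the alerting guide linked at the top of this page. A minimal sketch, assuming a Slack incoming webhook (the URL is a placeholder to replace with your own):

```json
"observability.alerts": [
  {
    "level": "critical",
    "notifier": {
      "type": "slack",
      "url": "https://hooks.slack.com/services/<your-webhook-path>"
    }
  }
]
```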
@@ -190,7 +191,7 @@ Generated query for warning alert: `max((sum by (status) (increase(src_graphql_s
Technical details -Generated query for warning alert: `max((sum by (alert_type) (increase(src_graphql_search_response{alert_type!~"timed_out\\|no_results__suggest_quotes",request_name!="CodeIntelSearch",source="browser",status="alert"\}[5m])) / ignoring (alert_type) group_left () sum(increase(src_graphql_search_response\{request_name!="CodeIntelSearch",source="browser"}[5m])) * 100) >= 5)` +Generated query for warning alert: `max((sum by (alert_type) (increase(src_search_streaming_response{alert_type!~"timed_out",source="browser",status="alert"\}[5m])) / ignoring (alert_type) group_left () sum(increase(src_search_streaming_response\{source="browser"}[5m])) * 100) >= 5)`
@@ -399,7 +400,7 @@ Generated query for warning alert: `max((sum by (status) (increase(src_graphql_s **Next steps** -- This indicates a bug in Sourcegraph, please [contact us](https://sourcegraph.com/contact). +- This indicates a bug in Sourcegraph, please [open an issue](https://github.com/sourcegraph/sourcegraph/issues/new/choose). - Learn more about the related dashboard panel in the [dashboards reference](dashboards#frontend-search-codeintel-alert-user-suggestions). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: @@ -573,7 +574,7 @@ Generated query for warning alert: `max((sum(increase(src_graphql_search_respons
Technical details -Generated query for warning alert: `max((sum by (alert_type) (increase(src_graphql_search_response{alert_type!~"timed_out\\|no_results__suggest_quotes",source="other",status="alert"\}[5m])) / ignoring (alert_type) group_left () sum(increase(src_graphql_search_response\{source="other",status="alert"}[5m]))) >= 5)` +Generated query for warning alert: `max((sum by (alert_type) (increase(src_graphql_search_response{alert_type!~"timed_out",source="other",status="alert"\}[5m])) / ignoring (alert_type) group_left () sum(increase(src_graphql_search_response\{source="other",status="alert"}[5m]))) >= 5)`
@@ -830,6 +831,72 @@ Generated query for critical alert: `max((sum(increase(src_cloudkms_cryptographi
+## frontend: goroutine_error_rate + +

error rate for periodic goroutine executions

+
+**Descriptions**
+
+- warning frontend: 0.01reqps+ error rate for periodic goroutine executions for 15m0s
+
+**Next steps**
+
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Look for recent changes to the routine's code or configuration
+- More help interpreting this metric is available in the [dashboards reference](dashboards#frontend-goroutine-error-rate).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_goroutine_error_rate"
+]
+```
+
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+
+
+Technical details + +Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*frontend.*"}[5m]))) >= 0.01)` + +
+ +
+ +## frontend: goroutine_error_percentage + +

percentage of periodic goroutine executions that result in errors

+
+**Descriptions**
+
+- warning frontend: 5%+ percentage of periodic goroutine executions that result in errors
+
+**Next steps**
+
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Consider temporarily disabling the routine if it's non-critical and causing cascading issues
+- More help interpreting this metric is available in the [dashboards reference](dashboards#frontend-goroutine-error-percentage).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_goroutine_error_percentage"
+]
+```
+
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+
+
+Technical details + +Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*frontend.*"\}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total\{job=~".*frontend.*"}[5m]) > 0) * 100) >= 5)` + +
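Each `Generated query` in these Technical details sections is plain PromQL, so it can be pasted into the bundled Prometheus UI (or evaluated via `GET /api/v1/query` on the Prometheus HTTP API) to check whether the alert condition currently returns anything. A sketch of the response shape, assuming the standard Prometheus HTTP API; the timestamp and value are illustrative:

```json
{
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": [
      { "metric": {}, "value": [1730730000, "7.5"] }
    ]
  }
}
```

An empty `result` array means the threshold comparison at the end of the query (for example `>= 5`) filtered every sample out, i.e. the alert condition does not currently hold.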
+ +
+ ## frontend: mean_blocked_seconds_per_conn_request

mean blocked seconds per conn request

@@ -867,6 +934,68 @@ Generated query for critical alert: `max((sum by (app_name, db_name) (increase(s
+## frontend: cpu_usage_percentage + +

CPU usage

+ +**Descriptions** + +- warning frontend: 95%+ CPU usage for 10m0s + +**Next steps** + +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#frontend-cpu-usage-percentage). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_cpu_usage_percentage" +] +``` + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* + +
+Technical details + +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend\\|sourcegraph-frontend).*"}) >= 95)` + +
+ +
+ +## frontend: memory_rss + +

memory (RSS)

+ +**Descriptions** + +- warning frontend: 90%+ memory (RSS) for 10m0s + +**Next steps** + +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#frontend-memory-rss). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_memory_rss" +] +``` + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* + +
+Technical details + +Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^(frontend\\|sourcegraph-frontend).*"\} / container_spec_memory_limit_bytes\{name=~"^(frontend\\|sourcegraph-frontend).*"}) * 100) >= 90)` + +
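The `cpu_usage_percentage` and `memory_rss` alerts above both measure usage against the container's configured limits, so the usual remediation is to raise those limits (or add replicas). A hedged sketch of the corresponding Kubernetes container `resources` block, in JSON form; the quantities are placeholders to size for your deployment, not recommendations:

```json
"resources": {
  "limits": {
    "cpu": "4",
    "memory": "8Gi"
  }
}
```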
+ +
+ ## frontend: container_cpu_usage

container cpu usage total (1m average) across all cores by instance

@@ -1369,25 +1498,26 @@ Generated query for critical alert: `max((histogram_quantile(0.9, sum by (le) (l
-## gitserver: cpu_throttling_time +## gitserver: disk_space_remaining -

container CPU throttling time %

+

disk space remaining

**Descriptions** -- warning gitserver: 75%+ container CPU throttling time % for 2m0s -- critical gitserver: 90%+ container CPU throttling time % for 5m0s +- warning gitserver: less than 15% disk space remaining +- critical gitserver: less than 10% disk space remaining for 10m0s **Next steps** -- - Consider increasing the CPU limit for the container. -- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-cpu-throttling-time). +- On a warning alert, you may want to provision more disk space: Disk pressure may result in decreased performance, users having to wait for repositories to clone, etc. +- On a critical alert, you need to provision more disk space. Running out of disk space will result in decreased performance, or complete service outage. +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-disk-space-remaining). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_gitserver_cpu_throttling_time", - "critical_gitserver_cpu_throttling_time" + "warning_gitserver_disk_space_remaining", + "critical_gitserver_disk_space_remaining" ] ``` @@ -1396,34 +1526,33 @@ Generated query for critical alert: `max((histogram_quantile(0.9, sum by (le) (l
Technical details -Generated query for warning alert: `max((sum by (container_label_io_kubernetes_pod_name) ((rate(container_cpu_cfs_throttled_periods_total{container_label_io_kubernetes_container_name="gitserver"\}[5m]) / rate(container_cpu_cfs_periods_total\{container_label_io_kubernetes_container_name="gitserver"}[5m])) * 100)) >= 75)` +Generated query for warning alert: `min(((src_gitserver_disk_space_available / src_gitserver_disk_space_total) * 100) < 15)` -Generated query for critical alert: `max((sum by (container_label_io_kubernetes_pod_name) ((rate(container_cpu_cfs_throttled_periods_total{container_label_io_kubernetes_container_name="gitserver"\}[5m]) / rate(container_cpu_cfs_periods_total\{container_label_io_kubernetes_container_name="gitserver"}[5m])) * 100)) >= 90)` +Generated query for critical alert: `min(((src_gitserver_disk_space_available / src_gitserver_disk_space_total) * 100) < 10)`
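Note that the thresholds are relative to volume size: with the critical alert firing below 10% remaining, a 1 TiB gitserver volume alerts once free space drops under roughly 100 GiB. On Kubernetes, provisioning more space is typically a PersistentVolumeClaim resize (supported when the storage class allows volume expansion); a sketch of the field to grow, as a JSON fragment with a placeholder size:

```json
"spec": {
  "resources": {
    "requests": {
      "storage": "2Ti"
    }
  }
}
```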

-## gitserver: disk_space_remaining +## gitserver: cpu_throttling_time -

disk space remaining

+

container CPU throttling time %

**Descriptions**

-- warning gitserver: less than 15% disk space remaining
-- critical gitserver: less than 10% disk space remaining for 10m0s
+- warning gitserver: 75%+ container CPU throttling time % for 2m0s
+- critical gitserver: 90%+ container CPU throttling time % for 5m0s

**Next steps**

-- On a warning alert, you may want to provision more disk space: Disk pressure may result in decreased performance, users having to wait for repositories to clone, etc.
-- On a critical alert, you need to provision more disk space. Running out of disk space will result in decreased performance, or complete service outage.
-- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-disk-space-remaining).
+- Consider increasing the CPU limit for the container.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-cpu-throttling-time).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_gitserver_disk_space_remaining",
-  "critical_gitserver_disk_space_remaining"
+  "warning_gitserver_cpu_throttling_time",
+  "critical_gitserver_cpu_throttling_time"
]
```

@@ -1432,9 +1561,9 @@
Technical details -Generated query for warning alert: `min(((src_gitserver_disk_space_available / src_gitserver_disk_space_total) * 100) < 15)` +Generated query for warning alert: `max((sum by (container_label_io_kubernetes_pod_name) ((rate(container_cpu_cfs_throttled_periods_total{container_label_io_kubernetes_container_name="gitserver"\}[5m]) / rate(container_cpu_cfs_periods_total\{container_label_io_kubernetes_container_name="gitserver"}[5m])) * 100)) >= 75)` -Generated query for critical alert: `min(((src_gitserver_disk_space_available / src_gitserver_disk_space_total) * 100) < 10)` +Generated query for critical alert: `max((sum by (container_label_io_kubernetes_pod_name) ((rate(container_cpu_cfs_throttled_periods_total{container_label_io_kubernetes_container_name="gitserver"\}[5m]) / rate(container_cpu_cfs_periods_total\{container_label_io_kubernetes_container_name="gitserver"}[5m])) * 100)) >= 90)`
@@ -1484,7 +1613,6 @@ Generated query for critical alert: `max((sum by (instance, cmd) (src_gitserver_ **Descriptions** - warning gitserver: 0.02s+ echo test command duration for 30s -- critical gitserver: 1s+ echo test command duration for 1m0s **Next steps** @@ -1496,8 +1624,7 @@ Generated query for critical alert: `max((sum by (instance, cmd) (src_gitserver_ ```json "observability.silenceAlerts": [ - "warning_gitserver_echo_command_duration_test", - "critical_gitserver_echo_command_duration_test" + "warning_gitserver_echo_command_duration_test" ] ``` @@ -1508,8 +1635,6 @@ Generated query for critical alert: `max((sum by (instance, cmd) (src_gitserver_ Generated query for warning alert: `max((max(src_gitserver_echo_duration_seconds)) >= 0.02)` -Generated query for critical alert: `max((max(src_gitserver_echo_duration_seconds)) >= 1)` -
@@ -1577,125 +1702,123 @@ Generated query for warning alert: `max((sum(src_gitserver_clone_queue)) >= 2
-## gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance +## gitserver: git_command_retry_attempts_rate -

maximum duration since last successful site configuration update (all "gitserver" instances)

+

rate of git command corruption retry attempts over 5m

**Descriptions** -- critical gitserver: 300s+ maximum duration since last successful site configuration update (all "gitserver" instances) +- warning gitserver: 0.1reqps+ rate of git command corruption retry attempts over 5m for 5m0s **Next steps** -- This indicates that one or more "gitserver" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself. -- Check for relevant errors in the "gitserver" logs, as well as frontend`s logs. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-gitserver-site-configuration-duration-since-last-successful-update-by-instance). +- Investigate the underlying cause of corruption errors in git commands. +- Check disk health and I/O performance. +- Monitor for patterns in specific git operations that trigger retries. +- Consider adjusting retry configuration if retries are too frequent. +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-git-command-retry-attempts-rate). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_gitserver_gitserver_site_configuration_duration_since_last_successful_update_by_instance" + "warning_gitserver_git_command_retry_attempts_rate" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details -Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*gitserver"}[1m]))) >= 300)` +Generated query for warning alert: `max((sum(rate(src_gitserver_retry_attempts_total[5m]))) >= 0.1)`

-## gitserver: mean_blocked_seconds_per_conn_request +## gitserver: goroutine_error_rate -

mean blocked seconds per conn request

+

error rate for periodic goroutine executions

**Descriptions**

-- warning gitserver: 0.1s+ mean blocked seconds per conn request for 10m0s
-- critical gitserver: 0.5s+ mean blocked seconds per conn request for 10m0s
+- warning gitserver: 0.01reqps+ error rate for periodic goroutine executions for 15m0s

**Next steps**

-- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed
-- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf)
-- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-mean-blocked-seconds-per-conn-request).
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Look for recent changes to the routine's code or configuration
+- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-goroutine-error-rate).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_gitserver_mean_blocked_seconds_per_conn_request",
-  "critical_gitserver_mean_blocked_seconds_per_conn_request"
+  "warning_gitserver_goroutine_error_rate"
]
```

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="gitserver"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="gitserver"}[5m]))) >= 0.1)` - -Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="gitserver"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="gitserver"}[5m]))) >= 0.5)` +Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*gitserver.*"}[5m]))) >= 0.01)`

-## gitserver: container_cpu_usage +## gitserver: goroutine_error_percentage -

container cpu usage total (1m average) across all cores by instance

+

percentage of periodic goroutine executions that result in errors

**Descriptions**

-- warning gitserver: 99%+ container cpu usage total (1m average) across all cores by instance
+- warning gitserver: 5%+ percentage of periodic goroutine executions that result in errors

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-container-cpu-usage).
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Consider temporarily disabling the routine if it's non-critical and causing cascading issues
+- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-goroutine-error-percentage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_gitserver_container_cpu_usage"
+  "warning_gitserver_goroutine_error_percentage"
]
```

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}) >= 99)` +Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*gitserver.*"\}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total\{job=~".*gitserver.*"}[5m]) > 0) * 100) >= 5)`

-## gitserver: container_memory_usage +## gitserver: cpu_usage_percentage -

container memory usage by instance

+

CPU usage

**Descriptions** -- warning gitserver: 99%+ container memory usage by instance +- warning gitserver: 95%+ CPU usage for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of gitserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-container-memory-usage). +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-cpu-usage-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_gitserver_container_memory_usage" + "warning_gitserver_cpu_usage_percentage" ] ``` @@ -1704,30 +1827,29 @@ Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage
Technical details -Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}) >= 99)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}) >= 95)`

-## gitserver: provisioning_container_cpu_usage_long_term +## gitserver: memory_rss -

container cpu usage total (90th percentile over 1d) across all cores by instance

+

memory (RSS)

**Descriptions** -- warning gitserver: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s +- warning gitserver: 90%+ memory (RSS) for 10m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the gitserver service. -- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-provisioning-container-cpu-usage-long-term). +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-memory-rss). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_gitserver_provisioning_container_cpu_usage_long_term" + "warning_gitserver_memory_rss" ] ``` @@ -1736,62 +1858,65 @@ Generated query for warning alert: `max((cadvisor_container_memory_usage_percent
Technical details -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[1d])) >= 80)` +Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^gitserver.*"\} / container_spec_memory_limit_bytes\{name=~"^gitserver.*"}) * 100) >= 90)`

-## gitserver: provisioning_container_cpu_usage_short_term +## gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance -

container cpu usage total (5m maximum) across all cores by instance

+

maximum duration since last successful site configuration update (all "gitserver" instances)

**Descriptions**

-- warning gitserver: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s
+- critical gitserver: 300s+ maximum duration since last successful site configuration update (all "gitserver" instances)

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-provisioning-container-cpu-usage-short-term).
+- This indicates that one or more "gitserver" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "gitserver" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-gitserver-site-configuration-duration-since-last-successful-update-by-instance).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_gitserver_provisioning_container_cpu_usage_short_term"
+  "critical_gitserver_gitserver_site_configuration_duration_since_last_successful_update_by_instance"
]
```

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[5m])) >= 90)` +Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*gitserver"}[1m]))) >= 300)`

-## gitserver: container_oomkill_events_total +## gitserver: mean_blocked_seconds_per_conn_request -

container OOMKILL events total by instance

+

mean blocked seconds per conn request

**Descriptions** -- warning gitserver: 1+ container OOMKILL events total by instance +- warning gitserver: 0.1s+ mean blocked seconds per conn request for 10m0s +- critical gitserver: 0.5s+ mean blocked seconds per conn request for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of gitserver container in `docker-compose.yml`. -- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-container-oomkill-events-total). +- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed +- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf) +- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-mean-blocked-seconds-per-conn-request). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_gitserver_container_oomkill_events_total" + "warning_gitserver_mean_blocked_seconds_per_conn_request", + "critical_gitserver_mean_blocked_seconds_per_conn_request" ] ``` @@ -1800,28 +1925,32 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_us
Technical details -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^gitserver.*"})) >= 1)` +Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="gitserver"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="gitserver"}[5m]))) >= 0.1)` + +Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="gitserver"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="gitserver"}[5m]))) >= 0.5)`
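`SRC_PGSQL_MAX_OPEN` is an environment variable on the affected service, so raising it is a change to the container spec rather than to site configuration. A minimal sketch of the `env` entry in Kubernetes JSON form (the value is illustrative; scale it together with database memory per the scaling guide above):

```json
{
  "name": "SRC_PGSQL_MAX_OPEN",
  "value": "30"
}
```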

-## gitserver: go_goroutines +## gitserver: container_cpu_usage -

maximum active goroutines

+

container cpu usage total (1m average) across all cores by instance

**Descriptions**

-- warning gitserver: 10000+ maximum active goroutines for 10m0s
+- warning gitserver: 99%+ container cpu usage total (1m average) across all cores by instance

**Next steps**

-- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-go-goroutines).
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-container-cpu-usage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_gitserver_go_goroutines"
+  "warning_gitserver_container_cpu_usage"
]
```

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details -Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*gitserver"})) >= 10000)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}) >= 99)`

-## gitserver: go_gc_duration_seconds +## gitserver: container_memory_usage -

maximum go garbage collection duration

+

container memory usage by instance

**Descriptions** -- warning gitserver: 2s+ maximum go garbage collection duration +- warning gitserver: 99%+ container memory usage by instance **Next steps** -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-go-gc-duration-seconds). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of gitserver container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-container-memory-usage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_gitserver_go_gc_duration_seconds" + "warning_gitserver_container_memory_usage" ] ``` @@ -1860,7 +1991,163 @@ Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~"
Technical details -Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*gitserver"})) >= 2)` +Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}) >= 99)` + +
+ +
+ +## gitserver: provisioning_container_cpu_usage_long_term + +

container cpu usage total (90th percentile over 1d) across all cores by instance

+ +**Descriptions** + +- warning gitserver: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s + +**Next steps** + +- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the gitserver service. +- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-provisioning-container-cpu-usage-long-term). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_gitserver_provisioning_container_cpu_usage_long_term" +] +``` + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* + +
+Technical details + +Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[1d])) >= 80)` + +
+ +
+ +## gitserver: provisioning_container_cpu_usage_short_term + +

container cpu usage total (5m maximum) across all cores by instance

+
+**Descriptions**
+
+- warning gitserver: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s
+
+**Next steps**
+
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-provisioning-container-cpu-usage-short-term).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_provisioning_container_cpu_usage_short_term"
+]
+```
+
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+
+
+Technical details + +Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[5m])) >= 90)` + +
+ +
+ +## gitserver: container_oomkill_events_total + +

container OOMKILL events total by instance

+ +**Descriptions** + +- warning gitserver: 1+ container OOMKILL events total by instance + +**Next steps** + +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of gitserver container in `docker-compose.yml`. +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-container-oomkill-events-total). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_gitserver_container_oomkill_events_total" +] +``` + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* + +
+Technical details + +Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^gitserver.*"})) >= 1)` + +
+ +
+ +## gitserver: go_goroutines + +

maximum active goroutines

+ +**Descriptions** + +- warning gitserver: 10000+ maximum active goroutines for 10m0s + +**Next steps** + +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-go-goroutines). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_gitserver_go_goroutines" +] +``` + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* + +
+Technical details + +Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*gitserver"})) >= 10000)` + +
+ +
+ +## gitserver: go_gc_duration_seconds + +

maximum go garbage collection duration

+ +**Descriptions** + +- warning gitserver: 2s+ maximum go garbage collection duration + +**Next steps** + +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-go-gc-duration-seconds). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_gitserver_go_gc_duration_seconds" +] +``` + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* + +
+Technical details + +Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*gitserver"})) >= 2)`
@@ -2318,71 +2605,100 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*(pgsql\\|code
-## precise-code-intel-worker: codeintel_upload_queued_max_age +## precise-code-intel-worker: mean_blocked_seconds_per_conn_request -

unprocessed upload record queue longest time in queue

+

mean blocked seconds per conn request

**Descriptions** -- warning precise-code-intel-worker: 18000s+ unprocessed upload record queue longest time in queue +- warning precise-code-intel-worker: 0.1s+ mean blocked seconds per conn request for 10m0s +- critical precise-code-intel-worker: 0.5s+ mean blocked seconds per conn request for 10m0s **Next steps** -- An alert here could be indicative of a few things: an upload surfacing a pathological performance characteristic, -precise-code-intel-worker being underprovisioned for the required upload processing throughput, or a higher replica -count being required for the volume of uploads. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#precise-code-intel-worker-codeintel-upload-queued-max-age). +- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed +- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf) +- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#precise-code-intel-worker-mean-blocked-seconds-per-conn-request). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_precise-code-intel-worker_codeintel_upload_queued_max_age" + "warning_precise-code-intel-worker_mean_blocked_seconds_per_conn_request", + "critical_precise-code-intel-worker_mean_blocked_seconds_per_conn_request" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max(src_codeintel_upload_queued_duration_seconds_total{job=~"^precise-code-intel-worker.*"})) >= 18000)` +Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="precise-code-intel-worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="precise-code-intel-worker"}[5m]))) >= 0.1)` + +Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="precise-code-intel-worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="precise-code-intel-worker"}[5m]))) >= 0.5)`

-## precise-code-intel-worker: mean_blocked_seconds_per_conn_request +## precise-code-intel-worker: cpu_usage_percentage -

mean blocked seconds per conn request

+

CPU usage

**Descriptions** -- warning precise-code-intel-worker: 0.1s+ mean blocked seconds per conn request for 10m0s -- critical precise-code-intel-worker: 0.5s+ mean blocked seconds per conn request for 10m0s +- warning precise-code-intel-worker: 95%+ CPU usage for 10m0s **Next steps** -- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed -- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf) -- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#precise-code-intel-worker-mean-blocked-seconds-per-conn-request). +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#precise-code-intel-worker-cpu-usage-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_precise-code-intel-worker_mean_blocked_seconds_per_conn_request", - "critical_precise-code-intel-worker_mean_blocked_seconds_per_conn_request" + "warning_precise-code-intel-worker_cpu_usage_percentage" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details -Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="precise-code-intel-worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="precise-code-intel-worker"}[5m]))) >= 0.1)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"}) >= 95)` -Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="precise-code-intel-worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="precise-code-intel-worker"}[5m]))) >= 0.5)` +
+ +
+ +## precise-code-intel-worker: memory_rss + +

memory (RSS)

+ +**Descriptions** + +- warning precise-code-intel-worker: 90%+ memory (RSS) for 10m0s + +**Next steps** + +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#precise-code-intel-worker-memory-rss). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_memory_rss" +] +``` + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* + +
+Technical details + +Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^precise-code-intel-worker.*"\} / container_spec_memory_limit_bytes\{name=~"^precise-code-intel-worker.*"}) * 100) >= 90)`
@@ -2704,23 +3020,27 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*precise-code-
-## redis: redis-store_up +## syntactic-indexing: mean_blocked_seconds_per_conn_request -

redis-store availability

+

mean blocked seconds per conn request

**Descriptions** -- critical redis: less than 1 redis-store availability for 10s +- warning syntactic-indexing: 0.1s+ mean blocked seconds per conn request for 10m0s +- critical syntactic-indexing: 0.5s+ mean blocked seconds per conn request for 10m0s **Next steps** -- Ensure redis-store is running -- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-redis-store-up). +- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed +- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf) +- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-mean-blocked-seconds-per-conn-request). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_redis_redis-store_up" + "warning_syntactic-indexing_mean_blocked_seconds_per_conn_request", + "critical_syntactic-indexing_mean_blocked_seconds_per_conn_request" ] ``` @@ -2729,449 +3049,126 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*precise-code-
Technical details -Generated query for critical alert: `min((redis_up{app="redis-store"}) < 1)` +Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="syntactic-code-intel-worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="syntactic-code-intel-worker"}[5m]))) >= 0.1)` + +Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="syntactic-code-intel-worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="syntactic-code-intel-worker"}[5m]))) >= 0.5)`

-## redis: redis-cache_up +## syntactic-indexing: cpu_usage_percentage -

redis-cache availability

+

CPU usage

**Descriptions** -- critical redis: less than 1 redis-cache availability for 10s +- warning syntactic-indexing: 95%+ CPU usage for 10m0s **Next steps** -- Ensure redis-cache is running -- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-redis-cache-up). +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-cpu-usage-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_redis_redis-cache_up" + "warning_syntactic-indexing_cpu_usage_percentage" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details -Generated query for critical alert: `min((redis_up{app="redis-cache"}) < 1)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}) >= 95)`

-## redis: provisioning_container_cpu_usage_long_term +## syntactic-indexing: memory_rss -

container cpu usage total (90th percentile over 1d) across all cores by instance

+

memory (RSS)

**Descriptions** -- warning redis: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s +- warning syntactic-indexing: 90%+ memory (RSS) for 10m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the redis-cache service. -- **Docker Compose:** Consider increasing `cpus:` of the redis-cache container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-long-term). +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#syntactic-indexing-memory-rss). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_redis_provisioning_container_cpu_usage_long_term" + "warning_syntactic-indexing_memory_rss" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[1d])) >= 80)` +Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^syntactic-code-intel-worker.*"\} / container_spec_memory_limit_bytes\{name=~"^syntactic-code-intel-worker.*"}) * 100) >= 90)`

-## redis: provisioning_container_memory_usage_long_term +## syntactic-indexing: container_cpu_usage -

container memory usage (1d maximum) by instance

+

container cpu usage total (1m average) across all cores by instance

**Descriptions**

-- warning redis: 80%+ container memory usage (1d maximum) by instance for 336h0m0s
+- warning syntactic-indexing: 99%+ container cpu usage total (1m average) across all cores by instance

**Next steps**

-- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the redis-cache service.
-- **Docker Compose:** Consider increasing `memory:` of the redis-cache container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-long-term).
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the syntactic-code-intel-worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-container-cpu-usage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_redis_provisioning_container_memory_usage_long_term"
+  "warning_syntactic-indexing_container_cpu_usage"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[1d])) >= 80)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}) >= 99)`
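For the Docker Compose remediation, `cpus:` caps the cores available to the container. A sketch assuming a Compose v2-style file such as Sourcegraph's `docker-compose.yml`; the image tag and values are placeholders:

```yaml
# Hypothetical docker-compose.yml fragment; tag and values are placeholders.
services:
  syntactic-code-intel-worker:
    image: 'index.docker.io/sourcegraph/syntactic-code-intel-worker:insiders'  # tag assumed
    cpus: 4          # raise if the 1m average CPU sits near 99% of the allocation
    mem_limit: '4g'  # memory ceiling; this is what the memory alerts measure against
```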

-## redis: provisioning_container_cpu_usage_short_term +## syntactic-indexing: container_memory_usage -

container cpu usage total (5m maximum) across all cores by instance

+

container memory usage by instance

**Descriptions** -- warning redis: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s - -**Next steps** - -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the redis-cache container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-short-term). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_provisioning_container_cpu_usage_short_term" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[5m])) >= 90)` - -
- -
- -## redis: provisioning_container_memory_usage_short_term - -

container memory usage (5m maximum) by instance

- -**Descriptions** - -- warning redis: 90%+ container memory usage (5m maximum) by instance - -**Next steps** - -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of redis-cache container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-short-term). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_provisioning_container_memory_usage_short_term" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[5m])) >= 90)` - -
- -
- -## redis: container_oomkill_events_total - -

container OOMKILL events total by instance

- -**Descriptions** - -- warning redis: 1+ container OOMKILL events total by instance - -**Next steps** - -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of redis-cache container in `docker-compose.yml`. -- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-container-oomkill-events-total). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_container_oomkill_events_total" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^redis-cache.*"})) >= 1)` - -
- -
- -## redis: provisioning_container_cpu_usage_long_term - -

container cpu usage total (90th percentile over 1d) across all cores by instance

- -**Descriptions** - -- warning redis: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s - -**Next steps** - -- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the redis-store service. -- **Docker Compose:** Consider increasing `cpus:` of the redis-store container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-long-term). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_provisioning_container_cpu_usage_long_term" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[1d])) >= 80)` - -
- -
- -## redis: provisioning_container_memory_usage_long_term - -

container memory usage (1d maximum) by instance

- -**Descriptions** - -- warning redis: 80%+ container memory usage (1d maximum) by instance for 336h0m0s - -**Next steps** - -- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the redis-store service. -- **Docker Compose:** Consider increasing `memory:` of the redis-store container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-long-term). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_provisioning_container_memory_usage_long_term" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[1d])) >= 80)` - -
- -
- -## redis: provisioning_container_cpu_usage_short_term - -

container cpu usage total (5m maximum) across all cores by instance

- -**Descriptions** - -- warning redis: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s - -**Next steps** - -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the redis-store container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-short-term). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_provisioning_container_cpu_usage_short_term" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[5m])) >= 90)` - -
- -
- -## redis: provisioning_container_memory_usage_short_term - -

container memory usage (5m maximum) by instance

- -**Descriptions** - -- warning redis: 90%+ container memory usage (5m maximum) by instance - -**Next steps** - -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of redis-store container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-short-term). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_provisioning_container_memory_usage_short_term" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[5m])) >= 90)` - -
- -
- -## redis: container_oomkill_events_total - -

container OOMKILL events total by instance

- -**Descriptions** - -- warning redis: 1+ container OOMKILL events total by instance +- warning syntactic-indexing: 99%+ container memory usage by instance **Next steps** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of redis-store container in `docker-compose.yml`. -- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-container-oomkill-events-total). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_redis_container_oomkill_events_total" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^redis-store.*"})) >= 1)` - -
- -
- -## redis: pods_available_percentage - -

percentage pods available

- -**Descriptions** - -- critical redis: less than 90% percentage pods available for 10m0s - -**Next steps** - -- Determine if the pod was OOM killed using `kubectl describe pod redis-cache` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. -- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p redis-cache`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-pods-available-percentage). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "critical_redis_pods_available_percentage" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for critical alert: `min((sum by (app) (up{app=~".*redis-cache"\}) / count by (app) (up\{app=~".*redis-cache"}) * 100) <= 90)` - -
- -
- -## redis: pods_available_percentage - -

percentage pods available

- -**Descriptions** - -- critical redis: less than 90% percentage pods available for 10m0s - -**Next steps** - -- Determine if the pod was OOM killed using `kubectl describe pod redis-store` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. -- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p redis-store`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-pods-available-percentage). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "critical_redis_pods_available_percentage" -] -``` - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Generated query for critical alert: `min((sum by (app) (up{app=~".*redis-store"\}) / count by (app) (up\{app=~".*redis-store"}) * 100) <= 90)` - -
- -
- -## worker: worker_job_codeintel-upload-janitor_count - -

number of worker instances running the codeintel-upload-janitor job

- -**Descriptions** - -- warning worker: less than 1 number of worker instances running the codeintel-upload-janitor job for 1m0s -- critical worker: less than 1 number of worker instances running the codeintel-upload-janitor job for 5m0s - -**Next steps** - -- Ensure your instance defines a worker container such that: - - `WORKER_JOB_ALLOWLIST` contains "codeintel-upload-janitor" (or "all"), and - - `WORKER_JOB_BLOCKLIST` does not contain "codeintel-upload-janitor" -- Ensure that such a container is not failing to start or stay active -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-job-codeintel-upload-janitor-count). +- **Docker Compose:** Consider increasing `memory:` of syntactic-code-intel-worker container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-container-memory-usage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_worker_job_codeintel-upload-janitor_count", - "critical_worker_worker_job_codeintel-upload-janitor_count" + "warning_syntactic-indexing_container_memory_usage" ] ``` @@ -3180,74 +3177,30 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*redis-store"\
Technical details

-Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) == 1)`
-
-Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) == 1)`
+Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}) >= 99)`


-## worker: worker_job_codeintel-commitgraph-updater_count
-
-number of worker instances running the codeintel-commitgraph-updater job
-
+## syntactic-indexing: provisioning_container_cpu_usage_long_term

-**Descriptions**
-
-- warning worker: less than 1 number of worker instances running the codeintel-commitgraph-updater job for 1m0s
-- critical worker: less than 1 number of worker instances running the codeintel-commitgraph-updater job for 5m0s
-
-**Next steps**
-
-- Ensure your instance defines a worker container such that:
-  - `WORKER_JOB_ALLOWLIST` contains "codeintel-commitgraph-updater" (or "all"), and
-  - `WORKER_JOB_BLOCKLIST` does not contain "codeintel-commitgraph-updater"
-- Ensure that such a container is not failing to start or stay active
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-job-codeintel-commitgraph-updater-count).
-- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
-
-```json
-"observability.silenceAlerts": [
-  "warning_worker_worker_job_codeintel-commitgraph-updater_count",
-  "critical_worker_worker_job_codeintel-commitgraph-updater_count"
-]
-```
-
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
-
-Technical details
-
-Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) == 1)`
-
-Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) == 1)`
-
-
-## worker: worker_job_codeintel-autoindexing-scheduler_count
-
-number of worker instances running the codeintel-autoindexing-scheduler job
-
+container cpu usage total (90th percentile over 1d) across all cores by instance

**Descriptions**

-- warning worker: less than 1 number of worker instances running the codeintel-autoindexing-scheduler job for 1m0s
-- critical worker: less than 1 number of worker instances running the codeintel-autoindexing-scheduler job for 5m0s
+- warning syntactic-indexing: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s

**Next steps**

-- Ensure your instance defines a worker container such that:
-  - `WORKER_JOB_ALLOWLIST` contains "codeintel-autoindexing-scheduler" (or "all"), and
-  - `WORKER_JOB_BLOCKLIST` does not contain "codeintel-autoindexing-scheduler"
-- Ensure that such a container is not failing to start or stay active
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-job-codeintel-autoindexing-scheduler-count).
+- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the syntactic-code-intel-worker service.
+- **Docker Compose:** Consider increasing `cpus:` of the syntactic-code-intel-worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-provisioning-container-cpu-usage-long-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_worker_worker_job_codeintel-autoindexing-scheduler_count",
-  "critical_worker_worker_job_codeintel-autoindexing-scheduler_count"
+  "warning_syntactic-indexing_provisioning_container_cpu_usage_long_term"
]
```

*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

@@ -3256,32 +3209,30 @@
Technical details

-Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) == 1)`
-
-Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) == 1)`
+Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[1d])) >= 80)`
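The long-term CPU alert is about provisioned headroom rather than a transient spike, so the usual fix is a larger `cpu` request and limit in the `Deployment.yaml` for the syntactic-code-intel-worker service. An illustrative `resources` fragment (the sizes are assumptions, not recommendations):

```yaml
# Hypothetical resources block for the syntactic-code-intel-worker container.
resources:
  requests:
    cpu: "2"  # scheduler reservation; keep near typical daily usage
  limits:
    cpu: "4"  # the alert warns when the 90th percentile over 1d is >= 80% of the allocation
```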

-## worker: codeintel_commit_graph_queued_max_age +## syntactic-indexing: provisioning_container_memory_usage_long_term -

repository queue longest time in queue

+

container memory usage (1d maximum) by instance

**Descriptions** -- warning worker: 3600s+ repository queue longest time in queue +- warning syntactic-indexing: 80%+ container memory usage (1d maximum) by instance for 336h0m0s **Next steps** -- An alert here is generally indicative of either underprovisioned worker instance(s) and/or -an underprovisioned main postgres instance. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-codeintel-commit-graph-queued-max-age). +- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the syntactic-code-intel-worker service. +- **Docker Compose:** Consider increasing `memory:` of the syntactic-code-intel-worker container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-provisioning-container-memory-usage-long-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_codeintel_commit_graph_queued_max_age" + "warning_syntactic-indexing_provisioning_container_memory_usage_long_term" ] ``` @@ -3290,193 +3241,186 @@ an underprovisioned main postgres instance.
Technical details -Generated query for warning alert: `max((max(src_codeintel_commit_graph_queued_duration_seconds_total{job=~"^worker.*"})) >= 3600)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[1d])) >= 80)`

-## worker: perms_syncer_outdated_perms +## syntactic-indexing: provisioning_container_cpu_usage_short_term -

number of entities with outdated permissions

+

container cpu usage total (5m maximum) across all cores by instance

**Descriptions** -- warning worker: 100+ number of entities with outdated permissions for 5m0s +- warning syntactic-indexing: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s **Next steps** -- **Enabled permissions for the first time:** Wait for few minutes and see if the number goes down. -- **Otherwise:** Increase the API rate limit to [GitHub](https://sourcegraph.com/docs/admin/code_hosts/github#github-com-rate-limits), [GitLab](https://sourcegraph.com/docs/admin/code_hosts/gitlab#internal-rate-limits) or [Bitbucket Server](https://sourcegraph.com/docs/admin/code_hosts/bitbucket_server#internal-rate-limits). -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-perms-syncer-outdated-perms). +- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `cpus:` of the syntactic-code-intel-worker container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-provisioning-container-cpu-usage-short-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_perms_syncer_outdated_perms" + "warning_syntactic-indexing_provisioning_container_cpu_usage_short_term" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `max((max by (type) (src_repo_perms_syncer_outdated_perms)) >= 100)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[5m])) >= 90)`

-## worker: perms_syncer_sync_duration +## syntactic-indexing: provisioning_container_memory_usage_short_term -

95th permissions sync duration

+

container memory usage (5m maximum) by instance

**Descriptions** -- warning worker: 30s+ 95th permissions sync duration for 5m0s +- warning syntactic-indexing: 90%+ container memory usage (5m maximum) by instance **Next steps** -- Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-perms-syncer-sync-duration). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of syntactic-code-intel-worker container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-provisioning-container-memory-usage-short-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_perms_syncer_sync_duration" + "warning_syntactic-indexing_provisioning_container_memory_usage_short_term" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `max((histogram_quantile(0.95, max by (le, type) (rate(src_repo_perms_syncer_sync_duration_seconds_bucket[1m])))) >= 30)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[5m])) >= 90)`

-## worker: perms_syncer_sync_errors +## syntactic-indexing: container_oomkill_events_total -

permissions sync error rate

+

container OOMKILL events total by instance

**Descriptions** -- critical worker: 1+ permissions sync error rate for 1m0s +- warning syntactic-indexing: 1+ container OOMKILL events total by instance **Next steps** -- Check the network connectivity the Sourcegraph and the code host. -- Check if API rate limit quota is exhausted on the code host. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-perms-syncer-sync-errors). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of syntactic-code-intel-worker container in `docker-compose.yml`. +- More help interpreting this metric is available in the [dashboards reference](dashboards#syntactic-indexing-container-oomkill-events-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_worker_perms_syncer_sync_errors" + "warning_syntactic-indexing_container_oomkill_events_total" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for critical alert: `max((max by (type) (ceil(rate(src_repo_perms_syncer_sync_errors_total[1m])))) >= 1)` +Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^syntactic-code-intel-worker.*"})) >= 1)`
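On Kubernetes, an OOM kill is recorded in the pod status, so it can be confirmed without trawling logs. A sketch of what `kubectl get pod <pod-name> -o yaml` might show for an affected syntactic-code-intel-worker pod (field values are illustrative; `lastState.terminated.reason` is the part to look for):

```yaml
# Illustrative excerpt of pod status output; only the shape matters.
status:
  containerStatuses:
    - name: syntactic-code-intel-worker
      restartCount: 3
      lastState:
        terminated:
          reason: OOMKilled  # confirms the container hit its memory limit
          exitCode: 137
```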

-## worker: insights_queue_unutilized_size +## syntactic-indexing: go_goroutines -

insights queue size that is not utilized (not processing)

+

maximum active goroutines

**Descriptions** -- warning worker: 0+ insights queue size that is not utilized (not processing) for 30m0s +- warning syntactic-indexing: 10000+ maximum active goroutines for 10m0s **Next steps** -- Verify code insights worker job has successfully started. Restart worker service and monitoring startup logs, looking for worker panics. -- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-insights-queue-unutilized-size). +- More help interpreting this metric is available in the [dashboards reference](dashboards#syntactic-indexing-go-goroutines). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_insights_queue_unutilized_size" + "warning_syntactic-indexing_go_goroutines" ] ``` -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `max((max(src_query_runner_worker_total{job=~"^worker.*"\}) > 0 and on (job) sum by (op) (increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total\{job=~"^worker.*",op="Dequeue"}[5m])) < 1) > 0)` +Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*syntactic-code-intel-worker"})) >= 10000)`

-## worker: mean_blocked_seconds_per_conn_request +## syntactic-indexing: go_gc_duration_seconds -

mean blocked seconds per conn request

+

maximum go garbage collection duration

**Descriptions** -- warning worker: 0.1s+ mean blocked seconds per conn request for 10m0s -- critical worker: 0.5s+ mean blocked seconds per conn request for 10m0s +- warning syntactic-indexing: 2s+ maximum go garbage collection duration **Next steps** -- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed -- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf) -- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-mean-blocked-seconds-per-conn-request). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-go-gc-duration-seconds). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_mean_blocked_seconds_per_conn_request", - "critical_worker_mean_blocked_seconds_per_conn_request" + "warning_syntactic-indexing_go_gc_duration_seconds" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="worker"}[5m]))) >= 0.1)` - -Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="worker"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="worker"}[5m]))) >= 0.5)` +Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*syntactic-code-intel-worker"})) >= 2)`

-## worker: container_cpu_usage +## syntactic-indexing: pods_available_percentage -

container cpu usage total (1m average) across all cores by instance

+

percentage pods available

**Descriptions** -- warning worker: 99%+ container cpu usage total (1m average) across all cores by instance +- critical syntactic-indexing: less than 90% percentage pods available for 10m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the worker container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-container-cpu-usage). +- Determine if the pod was OOM killed using `kubectl describe pod syntactic-code-intel-worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. +- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p syntactic-code-intel-worker`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntactic-indexing-pods-available-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_container_cpu_usage" + "critical_syntactic-indexing_pods_available_percentage" ] ``` @@ -3485,314 +3429,316 @@ Generated query for critical alert: `max((sum by (app_name, db_name) (increase(s
Technical details -Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}) >= 99)` +Generated query for critical alert: `min((sum by (app) (up{app=~".*syntactic-code-intel-worker"\}) / count by (app) (up\{app=~".*syntactic-code-intel-worker"}) * 100) <= 90)`

-## worker: container_memory_usage +## redis: redis-store_up -

container memory usage by instance

+

redis-store availability

**Descriptions** -- warning worker: 99%+ container memory usage by instance +- critical redis: less than 1 redis-store availability for 10s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of worker container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-container-memory-usage). +- Ensure redis-store is running +- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-redis-store-up). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_container_memory_usage" + "critical_redis_redis-store_up" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}) >= 99)` +Generated query for critical alert: `min((redis_up{app="redis-store"}) < 1)`
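"Ensure redis-store is running" can be made self-checking in Docker Compose deployments with a healthcheck, so an unavailable Redis surfaces in `docker ps` as well as in this alert. A sketch assuming stock Redis image conventions; the image tag and interval values are placeholders:

```yaml
# Hypothetical healthcheck for the redis-store service; values are placeholders.
services:
  redis-store:
    image: 'index.docker.io/sourcegraph/redis-store:insiders'  # tag assumed
    healthcheck:
      test: ['CMD', 'redis-cli', 'ping']  # healthy when Redis answers PONG
      interval: 10s
      timeout: 5s
      retries: 5
```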

-## worker: provisioning_container_cpu_usage_long_term +## redis: redis-cache_up -

container cpu usage total (90th percentile over 1d) across all cores by instance

+

redis-cache availability

**Descriptions** -- warning worker: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s +- critical redis: less than 1 redis-cache availability for 10s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the worker service. -- **Docker Compose:** Consider increasing `cpus:` of the worker container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-cpu-usage-long-term). +- Ensure redis-cache is running +- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-redis-cache-up). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_provisioning_container_cpu_usage_long_term" + "critical_redis_redis-cache_up" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[1d])) >= 80)` +Generated query for critical alert: `min((redis_up{app="redis-cache"}) < 1)`

-## worker: provisioning_container_memory_usage_long_term +## redis: provisioning_container_cpu_usage_long_term -

container memory usage (1d maximum) by instance

+

container cpu usage total (90th percentile over 1d) across all cores by instance

**Descriptions** -- warning worker: 80%+ container memory usage (1d maximum) by instance for 336h0m0s +- warning redis: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s **Next steps** -- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the worker service. -- **Docker Compose:** Consider increasing `memory:` of the worker container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-memory-usage-long-term). +- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the redis-cache service. +- **Docker Compose:** Consider increasing `cpus:` of the redis-cache container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-long-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_provisioning_container_memory_usage_long_term" + "warning_redis_provisioning_container_cpu_usage_long_term" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[1d])) >= 80)` +Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[1d])) >= 80)`

-## worker: provisioning_container_cpu_usage_short_term +## redis: provisioning_container_memory_usage_long_term -

container cpu usage total (5m maximum) across all cores by instance

+

container memory usage (1d maximum) by instance

**Descriptions** -- warning worker: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s +- warning redis: 80%+ container memory usage (1d maximum) by instance for 336h0m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the worker container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-cpu-usage-short-term). +- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the redis-cache service. +- **Docker Compose:** Consider increasing `memory:` of the redis-cache container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-long-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_provisioning_container_cpu_usage_short_term" + "warning_redis_provisioning_container_memory_usage_long_term" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[5m])) >= 90)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[1d])) >= 80)`

-## worker: provisioning_container_memory_usage_short_term +## redis: provisioning_container_cpu_usage_short_term -

container memory usage (5m maximum) by instance

+

container cpu usage total (5m maximum) across all cores by instance

**Descriptions** -- warning worker: 90%+ container memory usage (5m maximum) by instance +- warning redis: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of worker container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-memory-usage-short-term). +- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `cpus:` of the redis-cache container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-short-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_provisioning_container_memory_usage_short_term" + "warning_redis_provisioning_container_cpu_usage_short_term" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[5m])) >= 90)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[5m])) >= 90)`

-## worker: container_oomkill_events_total +## redis: provisioning_container_memory_usage_short_term -

container OOMKILL events total by instance

+

container memory usage (5m maximum) by instance

**Descriptions** -- warning worker: 1+ container OOMKILL events total by instance +- warning redis: 90%+ container memory usage (5m maximum) by instance **Next steps** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of worker container in `docker-compose.yml`. -- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-container-oomkill-events-total). +- **Docker Compose:** Consider increasing `memory:` of redis-cache container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-short-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_container_oomkill_events_total" + "warning_redis_provisioning_container_memory_usage_short_term" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^worker.*"})) >= 1)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[5m])) >= 90)`

-## worker: go_goroutines +## redis: container_oomkill_events_total -

maximum active goroutines

+

container OOMKILL events total by instance

**Descriptions** -- warning worker: 10000+ maximum active goroutines for 10m0s +- warning redis: 1+ container OOMKILL events total by instance **Next steps** -- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-go-goroutines). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of redis-cache container in `docker-compose.yml`. +- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-container-oomkill-events-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_go_goroutines" + "warning_redis_container_oomkill_events_total" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*worker"})) >= 10000)` +Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^redis-cache.*"})) >= 1)`

-## worker: go_gc_duration_seconds +## redis: provisioning_container_cpu_usage_long_term -

maximum go garbage collection duration

+

container cpu usage total (90th percentile over 1d) across all cores by instance

**Descriptions** -- warning worker: 2s+ maximum go garbage collection duration +- warning redis: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s **Next steps** -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-go-gc-duration-seconds). +- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the redis-store service. +- **Docker Compose:** Consider increasing `cpus:` of the redis-store container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-long-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_worker_go_gc_duration_seconds" + "warning_redis_provisioning_container_cpu_usage_long_term" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*worker"})) >= 2)` +Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[1d])) >= 80)`

-## worker: pods_available_percentage +## redis: provisioning_container_memory_usage_long_term -

percentage pods available

+

container memory usage (1d maximum) by instance

**Descriptions** -- critical worker: less than 90% percentage pods available for 10m0s +- warning redis: 80%+ container memory usage (1d maximum) by instance for 336h0m0s **Next steps** -- Determine if the pod was OOM killed using `kubectl describe pod worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. -- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p worker`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-pods-available-percentage). +- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the redis-store service. +- **Docker Compose:** Consider increasing `memory:` of the redis-store container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-long-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_worker_pods_available_percentage" + "warning_redis_provisioning_container_memory_usage_long_term" ] ``` -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for critical alert: `min((sum by (app) (up{app=~".*worker"\}) / count by (app) (up\{app=~".*worker"}) * 100) <= 90)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[1d])) >= 80)`

-## worker: worker_site_configuration_duration_since_last_successful_update_by_instance +## redis: provisioning_container_cpu_usage_short_term -

maximum duration since last successful site configuration update (all "worker" instances)

+

container cpu usage total (5m maximum) across all cores by instance

**Descriptions** -- critical worker: 300s+ maximum duration since last successful site configuration update (all "worker" instances) +- warning redis: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s **Next steps** -- This indicates that one or more "worker" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself. -- Check for relevant errors in the "worker" logs, as well as frontend`s logs. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-site-configuration-duration-since-last-successful-update-by-instance). +- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `cpus:` of the redis-store container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-cpu-usage-short-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_worker_worker_site_configuration_duration_since_last_successful_update_by_instance" + "warning_redis_provisioning_container_cpu_usage_short_term" ] ``` @@ -3801,262 +3747,277 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*worker"\}) /
Technical details -Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~"^worker.*"}[1m]))) >= 300)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[5m])) >= 90)`

-## repo-updater: src_repoupdater_max_sync_backoff +## redis: provisioning_container_memory_usage_short_term -

time since oldest sync

+

container memory usage (5m maximum) by instance

**Descriptions** -- critical repo-updater: 32400s+ time since oldest sync for 10m0s +- warning redis: 90%+ container memory usage (5m maximum) by instance **Next steps** -- An alert here indicates that no code host connections have synced in at least 9h0m0s. This indicates that there could be a configuration issue -with your code hosts connections or networking issues affecting communication with your code hosts. -- Check the code host status indicator (cloud icon in top right of Sourcegraph homepage) for errors. -- Make sure external services do not have invalid tokens by navigating to them in the web UI and clicking save. If there are no errors, they are valid. -- Check the repo-updater logs for errors about syncing. -- Confirm that outbound network connections are allowed where repo-updater is deployed. -- Check back in an hour to see if the issue has resolved itself. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-src-repoupdater-max-sync-backoff). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of redis-store container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-provisioning-container-memory-usage-short-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_src_repoupdater_max_sync_backoff" + "warning_redis_provisioning_container_memory_usage_short_term" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for critical alert: `max((max(src_repoupdater_max_sync_backoff)) >= 32400)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[5m])) >= 90)`

-## repo-updater: src_repoupdater_syncer_sync_errors_total +## redis: container_oomkill_events_total -

site level external service sync error rate

+

container OOMKILL events total by instance

**Descriptions** -- warning repo-updater: 0.5+ site level external service sync error rate for 10m0s -- critical repo-updater: 1+ site level external service sync error rate for 10m0s +- warning redis: 1+ container OOMKILL events total by instance **Next steps** -- An alert here indicates errors syncing site level repo metadata with code hosts. This indicates that there could be a configuration issue -with your code hosts connections or networking issues affecting communication with your code hosts. -- Check the code host status indicator (cloud icon in top right of Sourcegraph homepage) for errors. -- Make sure external services do not have invalid tokens by navigating to them in the web UI and clicking save. If there are no errors, they are valid. -- Check the repo-updater logs for errors about syncing. -- Confirm that outbound network connections are allowed where repo-updater is deployed. -- Check back in an hour to see if the issue has resolved itself. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-src-repoupdater-syncer-sync-errors-total). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of redis-store container in `docker-compose.yml`. +- More help interpreting this metric is available in the [dashboards reference](dashboards#redis-container-oomkill-events-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_src_repoupdater_syncer_sync_errors_total", - "critical_repo-updater_src_repoupdater_syncer_sync_errors_total" + "warning_redis_container_oomkill_events_total" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="internal_rate_limit",reason!="invalid_npm_path"}[5m]))) > 0.5)` - -Generated query for critical alert: `max((max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="internal_rate_limit",reason!="invalid_npm_path"}[5m]))) > 1)` +Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^redis-store.*"})) >= 1)`

-## repo-updater: syncer_sync_start +## redis: pods_available_percentage -

repo metadata sync was started

+

percentage pods available

**Descriptions** -- warning repo-updater: less than 0 repo metadata sync was started for 9h0m0s +- critical redis: less than 90% percentage pods available for 10m0s **Next steps** -- Check repo-updater logs for errors. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-syncer-sync-start). +- Determine if the pod was OOM killed using `kubectl describe pod redis-cache` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. +- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p redis-cache`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-pods-available-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_syncer_sync_start" + "critical_redis_pods_available_percentage" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `min((max by (family) (rate(src_repoupdater_syncer_start_sync{family="Syncer.SyncExternalService"}[9h]))) <= 0)` +Generated query for critical alert: `min((sum by (app) (up{app=~".*redis-cache"}) / count by (app) (up{app=~".*redis-cache"}) * 100) <= 90)`

-## repo-updater: syncer_sync_duration +## redis: pods_available_percentage -

95th repositories sync duration

+

percentage pods available

**Descriptions** -- warning repo-updater: 30s+ 95th repositories sync duration for 5m0s +- critical redis: less than 90% percentage pods available for 10m0s **Next steps** -- Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-syncer-sync-duration). +- Determine if the pod was OOM killed using `kubectl describe pod redis-store` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. +- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p redis-store`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#redis-pods-available-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_syncer_sync_duration" + "critical_redis_pods_available_percentage" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((histogram_quantile(0.95, max by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))) >= 30)` +Generated query for critical alert: `min((sum by (app) (up{app=~".*redis-store"}) / count by (app) (up{app=~".*redis-store"}) * 100) <= 90)`

-## repo-updater: source_duration +## worker: worker_job_codeintel-upload-janitor_count -

95th repositories source duration

+

number of worker instances running the codeintel-upload-janitor job

**Descriptions** -- warning repo-updater: 30s+ 95th repositories source duration for 5m0s +- warning worker: less than 1 number of worker instances running the codeintel-upload-janitor job for 1m0s +- critical worker: less than 1 number of worker instances running the codeintel-upload-janitor job for 5m0s **Next steps** -- Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-source-duration). +- Ensure your instance defines a worker container such that: + - `WORKER_JOB_ALLOWLIST` contains "codeintel-upload-janitor" (or "all"), and + - `WORKER_JOB_BLOCKLIST` does not contain "codeintel-upload-janitor" +- Ensure that such a container is not failing to start or stay active +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-job-codeintel-upload-janitor-count). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_source_duration" + "warning_worker_worker_job_codeintel-upload-janitor_count", + "critical_worker_worker_job_codeintel-upload-janitor_count" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `max((histogram_quantile(0.95, max by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))) >= 30)` +Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) == 1)` + +Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) == 1)`
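+
+To make the allowlist/blocklist steps above concrete: the worker reads both settings from environment variables on its container. The fragment below is a hypothetical sketch of the corresponding `env` entries in a Kubernetes container spec (shown in JSON; the values are assumptions, and `"all"` is the catch-all documented above):
+
+```json
+{
+  "env": [
+    { "name": "WORKER_JOB_ALLOWLIST", "value": "all" },
+    { "name": "WORKER_JOB_BLOCKLIST", "value": "" }
+  ]
+}
+```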

-## repo-updater: syncer_synced_repos +## worker: worker_job_codeintel-commitgraph-updater_count -

repositories synced

+

number of worker instances running the codeintel-commitgraph-updater job

**Descriptions** -- warning repo-updater: less than 0 repositories synced for 9h0m0s +- warning worker: less than 1 number of worker instances running the codeintel-commitgraph-updater job for 1m0s +- critical worker: less than 1 number of worker instances running the codeintel-commitgraph-updater job for 5m0s **Next steps** -- Check network connectivity to code hosts -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-syncer-synced-repos). +- Ensure your instance defines a worker container such that: + - `WORKER_JOB_ALLOWLIST` contains "codeintel-commitgraph-updater" (or "all"), and + - `WORKER_JOB_BLOCKLIST` does not contain "codeintel-commitgraph-updater" +- Ensure that such a container is not failing to start or stay active +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-job-codeintel-commitgraph-updater-count). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_syncer_synced_repos" + "warning_worker_worker_job_codeintel-commitgraph-updater_count", + "critical_worker_worker_job_codeintel-commitgraph-updater_count" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `max((max(rate(src_repoupdater_syncer_synced_repos_total[1m]))) <= 0)` +Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) == 1)` + +Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) == 1)`

-## repo-updater: sourced_repos +## worker: worker_job_codeintel-autoindexing-scheduler_count -

repositories sourced

+

number of worker instances running the codeintel-autoindexing-scheduler job

**Descriptions** -- warning repo-updater: less than 0 repositories sourced for 9h0m0s +- warning worker: less than 1 number of worker instances running the codeintel-autoindexing-scheduler job for 1m0s +- critical worker: less than 1 number of worker instances running the codeintel-autoindexing-scheduler job for 5m0s **Next steps** -- Check network connectivity to code hosts -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-sourced-repos). +- Ensure your instance defines a worker container such that: + - `WORKER_JOB_ALLOWLIST` contains "codeintel-autoindexing-scheduler" (or "all"), and + - `WORKER_JOB_BLOCKLIST` does not contain "codeintel-autoindexing-scheduler" +- Ensure that such a container is not failing to start or stay active +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-job-codeintel-autoindexing-scheduler-count). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_sourced_repos" + "warning_worker_worker_job_codeintel-autoindexing-scheduler_count", + "critical_worker_worker_job_codeintel-autoindexing-scheduler_count" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details -Generated query for warning alert: `min((max(rate(src_repoupdater_source_repos_total[1m]))) <= 0)` +Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) == 1)` + +Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) == 1)`

-## repo-updater: purge_failed +## worker: src_repoupdater_max_sync_backoff -

repositories purge failed

+

time since oldest sync

**Descriptions** -- warning repo-updater: 0+ repositories purge failed for 5m0s +- critical worker: 32400s+ time since oldest sync for 10m0s **Next steps** -- Check repo-updater`s connectivity with gitserver and gitserver logs -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-purge-failed). +- An alert here indicates that no code host connections have synced in at least 9h0m0s. This indicates that there could be a configuration issue +with your code host connections or networking issues affecting communication with your code hosts. +- Check the code host status indicator (cloud icon in the top right of the Sourcegraph homepage) for errors. +- Make sure external services do not have invalid tokens by navigating to them in the web UI and clicking save. If there are no errors, they are valid. +- Check the worker logs for errors about syncing. +- Confirm that outbound network connections are allowed where worker is deployed. +- Check back in an hour to see if the issue has resolved itself. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-src-repoupdater-max-sync-backoff). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_purge_failed" + "critical_worker_src_repoupdater_max_sync_backoff" ] ```
Technical details -Generated query for warning alert: `max((max(rate(src_repoupdater_purge_failed[1m]))) > 0)` +Generated query for critical alert: `max((max(src_repoupdater_max_sync_backoff)) >= 32400)`

-## repo-updater: sched_auto_fetch +## worker: src_repoupdater_syncer_sync_errors_total -

repositories scheduled due to hitting a deadline

+

site level external service sync error rate

**Descriptions** -- warning repo-updater: less than 0 repositories scheduled due to hitting a deadline for 9h0m0s +- warning worker: 0.5+ site level external service sync error rate for 10m0s +- critical worker: 1+ site level external service sync error rate for 10m0s **Next steps** -- Check repo-updater logs. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-sched-auto-fetch). +- An alert here indicates errors syncing site level repo metadata with code hosts. This indicates that there could be a configuration issue +with your code host connections or networking issues affecting communication with your code hosts. +- Check the code host status indicator (cloud icon in the top right of the Sourcegraph homepage) for errors. +- Make sure external services do not have invalid tokens by navigating to them in the web UI and clicking save. If there are no errors, they are valid. +- Check the worker logs for errors about syncing. +- Confirm that outbound network connections are allowed where worker is deployed. +- Check back in an hour to see if the issue has resolved itself. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-src-repoupdater-syncer-sync-errors-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_sched_auto_fetch" + "warning_worker_src_repoupdater_syncer_sync_errors_total", + "critical_worker_src_repoupdater_syncer_sync_errors_total" ] ```
Technical details -Generated query for warning alert: `min((max(rate(src_repoupdater_sched_auto_fetch[1m]))) <= 0)` +Generated query for warning alert: `max((max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="internal_rate_limit",reason!="invalid_npm_path"}[5m]))) > 0.5)` + +Generated query for critical alert: `max((max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="internal_rate_limit",reason!="invalid_npm_path"}[5m]))) > 1)`

-## repo-updater: sched_known_repos +## worker: syncer_sync_start -

repositories managed by the scheduler

+

repo metadata sync was started

**Descriptions** -- warning repo-updater: less than 0 repositories managed by the scheduler for 10m0s +- warning worker: less than 0 repo metadata sync was started for 9h0m0s **Next steps** -- Check repo-updater logs. This is expected to fire if there are no user added code hosts -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-sched-known-repos). +- Check worker logs for errors. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-syncer-sync-start). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_sched_known_repos" + "warning_worker_syncer_sync_start" ] ``` @@ -4127,29 +4098,29 @@ Generated query for warning alert: `min((max(rate(src_repoupdater_sched_auto_fet
Technical details -Generated query for warning alert: `min((max(src_repoupdater_sched_known_repos)) <= 0)` +Generated query for warning alert: `min((max by (family) (rate(src_repoupdater_syncer_start_sync{family="Syncer.SyncExternalService"}[9h]))) <= 0)`

-## repo-updater: sched_update_queue_length +## worker: syncer_sync_duration -

rate of growth of update queue length over 5 minutes

+

95th repositories sync duration

**Descriptions** -- critical repo-updater: 0+ rate of growth of update queue length over 5 minutes for 2h0m0s +- warning worker: 30s+ 95th repositories sync duration for 5m0s **Next steps** -- Check repo-updater logs for indications that the queue is not being processed. The queue length should trend downwards over time as items are sent to GitServer -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-sched-update-queue-length). +- Check that the network latency between Sourcegraph and the code host is reasonable (<50ms) +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-syncer-sync-duration). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_sched_update_queue_length" + "warning_worker_syncer_sync_duration" ] ```
Technical details -Generated query for critical alert: `max((max(deriv(src_repoupdater_sched_update_queue_length[5m]))) > 0)` +Generated query for warning alert: `max((histogram_quantile(0.95, max by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))) >= 30)`

-## repo-updater: sched_loops +## worker: source_duration -

scheduler loops

+

95th repositories source duration

**Descriptions** -- warning repo-updater: less than 0 scheduler loops for 9h0m0s +- warning worker: 30s+ 95th repositories source duration for 5m0s **Next steps** -- Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-sched-loops). +- Check that the network latency between Sourcegraph and the code host is reasonable (<50ms) +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-source-duration). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_sched_loops" + "warning_worker_source_duration" ] ```
Technical details -Generated query for warning alert: `min((max(rate(src_repoupdater_sched_loops[1m]))) <= 0)` +Generated query for warning alert: `max((histogram_quantile(0.95, max by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))) >= 30)`

-## repo-updater: src_repoupdater_stale_repos +## worker: syncer_synced_repos -

repos that haven't been fetched in more than 8 hours

+

repositories synced

**Descriptions** -- warning repo-updater: 1+ repos that haven't been fetched in more than 8 hours for 25m0s +- warning worker: less than 0 repositories synced for 9h0m0s **Next steps** -- Check repo-updater logs for errors. - Check for rows in gitserver_repos where LastError is not an empty string. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-src-repoupdater-stale-repos). +- Check network connectivity to code hosts +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-syncer-synced-repos). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_src_repoupdater_stale_repos" + "warning_worker_syncer_synced_repos" ] ``` @@ -4221,29 +4191,29 @@ Generated query for warning alert: `min((max(rate(src_repoupdater_sched_loops[1m
Technical details -Generated query for warning alert: `max((max(src_repoupdater_stale_repos)) >= 1)` +Generated query for warning alert: `max((max(rate(src_repoupdater_syncer_synced_repos_total[1m]))) <= 0)`

-## repo-updater: sched_error +## worker: sourced_repos -

repositories schedule error rate

+

repositories sourced

**Descriptions** -- critical repo-updater: 1+ repositories schedule error rate for 25m0s +- warning worker: less than 0 repositories sourced for 9h0m0s **Next steps** -- Check repo-updater logs for errors -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-sched-error). +- Check network connectivity to code hosts +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-sourced-repos). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_sched_error" + "warning_worker_sourced_repos" ] ``` @@ -4252,29 +4222,29 @@ Generated query for warning alert: `max((max(src_repoupdater_stale_repos)) >=
Technical details -Generated query for critical alert: `max((max(rate(src_repoupdater_sched_error[1m]))) >= 1)` +Generated query for warning alert: `min((max(rate(src_repoupdater_source_repos_total[1m]))) <= 0)`

-## repo-updater: src_repoupdater_external_services_total +## worker: sched_auto_fetch -

the total number of external services

+

repositories scheduled due to hitting a deadline

**Descriptions** -- critical repo-updater: 20000+ the total number of external services for 1h0m0s +- warning worker: less than 0 repositories scheduled due to hitting a deadline for 9h0m0s **Next steps** -- Check for spikes in external services, could be abuse -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-src-repoupdater-external-services-total). +- Check worker logs. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-sched-auto-fetch). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_src_repoupdater_external_services_total" + "warning_worker_sched_auto_fetch" ] ``` @@ -4283,30 +4253,29 @@ Generated query for critical alert: `max((max(rate(src_repoupdater_sched_error[1
Technical details -Generated query for critical alert: `max((max(src_repoupdater_external_services_total)) >= 20000)` +Generated query for warning alert: `min((max(rate(src_repoupdater_sched_auto_fetch[1m]))) <= 0)`

-## repo-updater: repoupdater_queued_sync_jobs_total +## worker: sched_loops -

the total number of queued sync jobs

+

scheduler loops

**Descriptions** -- warning repo-updater: 100+ the total number of queued sync jobs for 1h0m0s +- warning worker: less than 0 scheduler loops for 9h0m0s **Next steps** -- **Check if jobs are failing to sync:** "SELECT * FROM external_service_sync_jobs WHERE state = `errored`"; -- **Increase the number of workers** using the `repoConcurrentExternalServiceSyncers` site config. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-repoupdater-queued-sync-jobs-total). +- Check worker logs for errors. This is expected to fire if there are no user-added code hosts +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-sched-loops). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_repoupdater_queued_sync_jobs_total" + "warning_worker_sched_loops" ] ```
Technical details -Generated query for warning alert: `max((max(src_repoupdater_queued_sync_jobs_total)) >= 100)` +Generated query for warning alert: `min((max(rate(src_repoupdater_sched_loops[1m]))) <= 0)`

-## repo-updater: repoupdater_completed_sync_jobs_total +## worker: src_repoupdater_stale_repos -

the total number of completed sync jobs

+

repos that haven't been fetched in more than 8 hours

**Descriptions** -- warning repo-updater: 100000+ the total number of completed sync jobs for 1h0m0s +- warning worker: 1+ repos that haven't been fetched in more than 8 hours for 25m0s **Next steps** -- Check repo-updater logs. Jobs older than 1 day should have been removed. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-repoupdater-completed-sync-jobs-total). +- Check worker logs for errors. + Check for rows in `gitserver_repos` where `LastError` is not an empty string. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-src-repoupdater-stale-repos). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_repoupdater_completed_sync_jobs_total" + "warning_worker_src_repoupdater_stale_repos" ] ```
Technical details -Generated query for warning alert: `max((max(src_repoupdater_completed_sync_jobs_total)) >= 100000)` +Generated query for warning alert: `max((max(src_repoupdater_stale_repos)) >= 1)`

-## repo-updater: repoupdater_errored_sync_jobs_percentage +## worker: sched_error -

the percentage of external services that have failed their most recent sync

+

repositories schedule error rate

**Descriptions** -- warning repo-updater: 10%+ the percentage of external services that have failed their most recent sync for 1h0m0s +- critical worker: 1+ repositories schedule error rate for 25m0s **Next steps** -- Check repo-updater logs. Check code host connectivity -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-repoupdater-errored-sync-jobs-percentage). +- Check worker logs for errors +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-sched-error). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_repoupdater_errored_sync_jobs_percentage" + "critical_worker_sched_error" ] ``` @@ -4377,29 +4347,29 @@ Generated query for warning alert: `max((max(src_repoupdater_completed_sync_jobs
Technical details -Generated query for warning alert: `max((max(src_repoupdater_errored_sync_jobs_percentage)) > 10)` +Generated query for critical alert: `max((max(rate(src_repoupdater_sched_error[1m]))) >= 1)`

-## repo-updater: github_graphql_rate_limit_remaining +## worker: src_repoupdater_external_services_total -

remaining calls to GitHub graphql API before hitting the rate limit

+

the total number of external services

**Descriptions** -- warning repo-updater: less than 250 remaining calls to GitHub graphql API before hitting the rate limit +- critical worker: 20000+ the total number of external services for 1h0m0s **Next steps** -- Consider creating a new token for the indicated resource (the `name` label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-github-graphql-rate-limit-remaining). +- Check for spikes in external services; this could indicate abuse +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-src-repoupdater-external-services-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_github_graphql_rate_limit_remaining" + "critical_worker_src_repoupdater_external_services_total" ] ```
Technical details -Generated query for warning alert: `min((max by (name) (src_github_rate_limit_remaining_v2{resource="graphql"})) <= 250)` +Generated query for critical alert: `max((max(src_repoupdater_external_services_total)) >= 20000)`

-## repo-updater: github_rest_rate_limit_remaining +## worker: repoupdater_queued_sync_jobs_total -

remaining calls to GitHub rest API before hitting the rate limit

+

the total number of queued sync jobs

**Descriptions** -- warning repo-updater: less than 250 remaining calls to GitHub rest API before hitting the rate limit +- warning worker: 100+ the total number of queued sync jobs for 1h0m0s **Next steps** -- Consider creating a new token for the indicated resource (the `name` label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-github-rest-rate-limit-remaining). +- **Check if jobs are failing to sync:** `SELECT * FROM external_service_sync_jobs WHERE state = 'errored';` +- **Increase the number of workers** using the `repoConcurrentExternalServiceSyncers` site config. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-repoupdater-queued-sync-jobs-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_github_rest_rate_limit_remaining" + "warning_worker_repoupdater_queued_sync_jobs_total" ] ```
Technical details -Generated query for warning alert: `min((max by (name) (src_github_rate_limit_remaining_v2{resource="rest"})) <= 250)` +Generated query for warning alert: `max((max(src_repoupdater_queued_sync_jobs_total)) >= 100)`
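+
+The `repoConcurrentExternalServiceSyncers` setting referenced above lives in the same site configuration as `observability.silenceAlerts`. A hedged example of raising it (the value `4` is an arbitrary illustration, not a recommendation):
+
+```json
+"repoConcurrentExternalServiceSyncers": 4
+```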

-## repo-updater: github_search_rate_limit_remaining +## worker: repoupdater_completed_sync_jobs_total -

remaining calls to GitHub search API before hitting the rate limit

+

the total number of completed sync jobs

**Descriptions** -- warning repo-updater: less than 5 remaining calls to GitHub search API before hitting the rate limit +- warning worker: 100000+ the total number of completed sync jobs for 1h0m0s **Next steps** -- Consider creating a new token for the indicated resource (the `name` label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-github-search-rate-limit-remaining). +- Check worker logs. Jobs older than 1 day should have been removed. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-repoupdater-completed-sync-jobs-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_github_search_rate_limit_remaining" + "warning_worker_repoupdater_completed_sync_jobs_total" ] ``` @@ -4470,29 +4441,29 @@ Generated query for warning alert: `min((max by (name) (src_github_rate_limit_re
Technical details -Generated query for warning alert: `min((max by (name) (src_github_rate_limit_remaining_v2{resource="search"})) <= 5)` +Generated query for warning alert: `max((max(src_repoupdater_completed_sync_jobs_total)) >= 100000)`

-## repo-updater: gitlab_rest_rate_limit_remaining +## worker: repoupdater_errored_sync_jobs_percentage -

remaining calls to GitLab rest API before hitting the rate limit

+

the percentage of external services that have failed their most recent sync

**Descriptions** -- critical repo-updater: less than 30 remaining calls to GitLab rest API before hitting the rate limit +- warning worker: 10%+ the percentage of external services that have failed their most recent sync for 1h0m0s **Next steps** -- Try restarting the pod to get a different public IP. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-gitlab-rest-rate-limit-remaining). +- Check worker logs. Check code host connectivity +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-repoupdater-errored-sync-jobs-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_gitlab_rest_rate_limit_remaining" + "warning_worker_repoupdater_errored_sync_jobs_percentage" ] ``` @@ -4501,65 +4472,60 @@ Generated query for warning alert: `min((max by (name) (src_github_rate_limit_re
Technical details -Generated query for critical alert: `min((max by (name) (src_gitlab_rate_limit_remaining{resource="rest"})) <= 30)` +Generated query for warning alert: `max((max(src_repoupdater_errored_sync_jobs_percentage)) > 10)`

-## repo-updater: repo_updater_site_configuration_duration_since_last_successful_update_by_instance +## worker: github_graphql_rate_limit_remaining -

maximum duration since last successful site configuration update (all "repo_updater" instances)

+

remaining calls to GitHub graphql API before hitting the rate limit

**Descriptions** -- critical repo-updater: 300s+ maximum duration since last successful site configuration update (all "repo_updater" instances) +- warning worker: less than 250 remaining calls to GitHub graphql API before hitting the rate limit **Next steps** -- This indicates that one or more "repo_updater" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself. -- Check for relevant errors in the "repo_updater" logs, as well as frontend`s logs. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-repo-updater-site-configuration-duration-since-last-successful-update-by-instance). +- Consider creating a new token for the indicated resource (the `name` label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-github-graphql-rate-limit-remaining). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_repo_updater_site_configuration_duration_since_last_successful_update_by_instance" + "warning_worker_github_graphql_rate_limit_remaining" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details -Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*repo-updater"}[1m]))) >= 300)` +Generated query for warning alert: `min((max by (name) (src_github_rate_limit_remaining_v2{resource="graphql"})) <= 250)`

-## repo-updater: mean_blocked_seconds_per_conn_request +## worker: github_rest_rate_limit_remaining -

mean blocked seconds per conn request

+

remaining calls to GitHub rest API before hitting the rate limit

**Descriptions** -- warning repo-updater: 0.1s+ mean blocked seconds per conn request for 10m0s -- critical repo-updater: 0.5s+ mean blocked seconds per conn request for 10m0s +- warning worker: less than 250 remaining calls to GitHub rest API before hitting the rate limit **Next steps** -- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed -- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf) -- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-mean-blocked-seconds-per-conn-request). +- Consider creating a new token for the indicated resource (the `name` label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-github-rest-rate-limit-remaining). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_mean_blocked_seconds_per_conn_request", - "critical_repo-updater_mean_blocked_seconds_per_conn_request" + "warning_worker_github_rest_rate_limit_remaining" ] ``` @@ -4568,32 +4534,29 @@ Generated query for critical alert: `max((max(max_over_time(src_conf_client_time
Technical details -Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="repo-updater"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="repo-updater"}[5m]))) >= 0.1)` - -Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="repo-updater"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="repo-updater"}[5m]))) >= 0.5)` +Generated query for warning alert: `min((max by (name) (src_github_rate_limit_remaining_v2{resource="rest"})) <= 250)`

-## repo-updater: container_cpu_usage +## worker: github_search_rate_limit_remaining -

container cpu usage total (1m average) across all cores by instance

+

remaining calls to GitHub search API before hitting the rate limit

**Descriptions** -- warning repo-updater: 99%+ container cpu usage total (1m average) across all cores by instance +- warning worker: less than 5 remaining calls to GitHub search API before hitting the rate limit **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the repo-updater container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-container-cpu-usage). +- Consider creating a new token for the indicated resource (the `name` label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-github-search-rate-limit-remaining). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_container_cpu_usage" + "warning_worker_github_search_rate_limit_remaining" ] ``` @@ -4602,30 +4565,29 @@ Generated query for critical alert: `max((sum by (app_name, db_name) (increase(s
Technical details -Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^repo-updater.*"}) >= 99)` +Generated query for warning alert: `min((max by (name) (src_github_rate_limit_remaining_v2{resource="search"})) <= 5)`

-## repo-updater: container_memory_usage +## worker: gitlab_rest_rate_limit_remaining -

container memory usage by instance

+

remaining calls to GitLab rest API before hitting the rate limit

**Descriptions** -- critical repo-updater: 90%+ container memory usage by instance for 10m0s +- critical worker: less than 30 remaining calls to GitLab rest API before hitting the rate limit **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of repo-updater container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-container-memory-usage). +- Try restarting the pod to get a different public IP. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-gitlab-rest-rate-limit-remaining). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_container_memory_usage" + "critical_worker_gitlab_rest_rate_limit_remaining" ] ``` @@ -4634,30 +4596,30 @@ Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage
Technical details -Generated query for critical alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^repo-updater.*"}) >= 90)` +Generated query for critical alert: `min((max by (name) (src_gitlab_rate_limit_remaining{resource="rest"})) <= 30)`

-## repo-updater: provisioning_container_cpu_usage_long_term +## worker: perms_syncer_outdated_perms -

container cpu usage total (90th percentile over 1d) across all cores by instance

+

number of entities with outdated permissions

**Descriptions** -- critical repo-updater: 90%+ container memory usage by instance for 10m0s +- warning worker: 100+ number of entities with outdated permissions for 5m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of repo-updater container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-container-memory-usage). +- **Enabled permissions for the first time:** Wait for a few minutes and see if the number goes down. +- **Otherwise:** Increase the API rate limit for [GitHub](https://sourcegraph.com/docs/admin/code_hosts/github#github-com-rate-limits), [GitLab](https://sourcegraph.com/docs/admin/code_hosts/gitlab#internal-rate-limits) or [Bitbucket Server](https://sourcegraph.com/docs/admin/code_hosts/bitbucket_server#internal-rate-limits). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-perms-syncer-outdated-perms). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_repo-updater_container_memory_usage" + "warning_worker_perms_syncer_outdated_perms" ] ```
Technical details -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^repo-updater.*"}[1d])) >= 80)` +Generated query for warning alert: `max((max by (type) (src_repo_perms_syncer_outdated_perms)) >= 100)`

-## repo-updater: provisioning_container_memory_usage_long_term +## worker: perms_syncer_sync_duration -

container memory usage (1d maximum) by instance

+

95th permissions sync duration

**Descriptions** -- warning repo-updater: 80%+ container memory usage (1d maximum) by instance for 336h0m0s +- warning worker: 30s+ 95th permissions sync duration for 5m0s **Next steps** -- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the repo-updater service. -- **Docker Compose:** Consider increasing `memory:` of the repo-updater container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-provisioning-container-memory-usage-long-term). +- Check that the network latency between Sourcegraph and the code host is reasonable (<50ms). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-perms-syncer-sync-duration). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_provisioning_container_memory_usage_long_term" + "warning_worker_perms_syncer_sync_duration" ] ```
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^repo-updater.*"}[1d])) >= 80)` +Generated query for warning alert: `max((histogram_quantile(0.95, max by (le, type) (rate(src_repo_perms_syncer_sync_duration_seconds_bucket[1m])))) >= 30)`

-## repo-updater: provisioning_container_cpu_usage_short_term +## worker: perms_syncer_sync_errors -

container cpu usage total (5m maximum) across all cores by instance

+

permissions sync error rate

**Descriptions** -- warning repo-updater: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s +- critical worker: 1+ permissions sync error rate for 1m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the repo-updater container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-provisioning-container-cpu-usage-short-term). +- Check the network connectivity between Sourcegraph and the code host. +- Check whether the API rate limit quota is exhausted on the code host. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-perms-syncer-sync-errors). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_provisioning_container_cpu_usage_short_term" + "critical_worker_perms_syncer_sync_errors" ] ```
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^repo-updater.*"}[5m])) >= 90)` +Generated query for critical alert: `max((max by (type) (ceil(rate(src_repo_perms_syncer_sync_errors_total[1m])))) >= 1)`

-## repo-updater: provisioning_container_memory_usage_short_term +## worker: completioncredits_aggregator_errors_total -

container memory usage (5m maximum) by instance

+

completion credits entitlement usage aggregator operation errors every 30m

**Descriptions** -- warning repo-updater: 90%+ container memory usage (5m maximum) by instance +- warning worker: 0+ completion credits entitlement usage aggregator operation errors every 30m **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of repo-updater container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-provisioning-container-memory-usage-short-term). +- Failures indicate that aggregation of completion credits usage against entitlements is failing. +- This may affect completion credits entitlement enforcement. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-completioncredits-aggregator-errors-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_provisioning_container_memory_usage_short_term" + "warning_worker_completioncredits_aggregator_errors_total" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^repo-updater.*"}[5m])) >= 90)` +Generated query for warning alert: `max((sum(increase(src_completioncredits_aggregator_errors_total{job=~"^worker.*"}[30m]))) > 0)`

-## repo-updater: container_oomkill_events_total +## worker: goroutine_error_rate -

container OOMKILL events total by instance

+

error rate for periodic goroutine executions

**Descriptions** -- warning repo-updater: 1+ container OOMKILL events total by instance +- warning worker: 0.01reqps+ error rate for periodic goroutine executions for 15m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of repo-updater container in `docker-compose.yml`. -- More help interpreting this metric is available in the [dashboards reference](dashboards#repo-updater-container-oomkill-events-total). +- Check service logs for error details related to the failing periodic routine +- Check if the routine depends on external services that may be unavailable +- Look for recent changes to the routine's code or configuration +- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-goroutine-error-rate). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_container_oomkill_events_total" + "warning_worker_goroutine_error_rate" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^repo-updater.*"})) >= 1)` +Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*worker.*"}[5m]))) >= 0.01)`
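+
+To identify which routine is firing this alert, the generated query above can be run directly in Prometheus or Grafana Explore; grouping by `name` (as the query already does) surfaces the offending goroutine, whose name can then be matched against the service logs. For example, a narrowed variant such as `sum by (name) (rate(src_periodic_goroutine_errors_total{job=~".*worker.*"}[5m])) > 0` (an adaptation, not generated documentation) lists only the routines currently erroring.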

-## repo-updater: go_goroutines +## worker: goroutine_error_percentage -

maximum active goroutines

+

percentage of periodic goroutine executions that result in errors

**Descriptions** -- warning repo-updater: 10000+ maximum active goroutines for 10m0s +- warning worker: 5%+ percentage of periodic goroutine executions that result in errors **Next steps** -- More help interpreting this metric is available in the [dashboards reference](dashboards#repo-updater-go-goroutines). +- Check service logs for error details related to the failing periodic routine +- Check if the routine depends on external services that may be unavailable +- Consider temporarily disabling the routine if it's non-critical and causing cascading issues +- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-goroutine-error-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_go_goroutines" + "warning_worker_goroutine_error_percentage" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*repo-updater"})) >= 10000)` +Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*worker.*"}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*worker.*"}[5m]) > 0) * 100) >= 5)`

-## repo-updater: go_gc_duration_seconds +## worker: mean_blocked_seconds_per_conn_request -

maximum go garbage collection duration

+

mean blocked seconds per conn request

**Descriptions** -- warning repo-updater: 2s+ maximum go garbage collection duration +- warning worker: 0.1s+ mean blocked seconds per conn request for 10m0s +- critical worker: 0.5s+ mean blocked seconds per conn request for 10m0s **Next steps** -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-go-gc-duration-seconds). +- Increase `SRC_PGSQL_MAX_OPEN` and, if needed, give the database more memory +- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf) +- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-mean-blocked-seconds-per-conn-request). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_go_gc_duration_seconds" + "warning_worker_mean_blocked_seconds_per_conn_request", + "critical_worker_mean_blocked_seconds_per_conn_request" ] ``` -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*repo-updater"})) >= 2)` +Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="worker"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="worker"}[5m]))) >= 0.1)` + +Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="worker"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="worker"}[5m]))) >= 0.5)`
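+
+`SRC_PGSQL_MAX_OPEN` in the next steps above is an environment variable on the worker container. A hypothetical sketch of the corresponding Kubernetes `env` entry (shown in JSON; the value `30` is an assumed example, not Sourcegraph's documented default):
+
+```json
+{
+  "env": [
+    { "name": "SRC_PGSQL_MAX_OPEN", "value": "30" }
+  ]
+}
+```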

-## repo-updater: pods_available_percentage
+## worker: cpu_usage_percentage

-percentage pods available
+CPU usage

**Descriptions**

-- critical repo-updater: less than 90% percentage pods available for 10m0s
+- warning worker: 95%+ CPU usage for 10m0s

**Next steps**

-- Determine if the pod was OOM killed using `kubectl describe pod repo-updater` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
-- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p repo-updater`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#repo-updater-pods-available-percentage).
+- Consider increasing CPU limits or scaling out.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-cpu-usage-percentage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "critical_repo-updater_pods_available_percentage"
+  "warning_worker_cpu_usage_percentage"
]
```

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*

Technical details

-Generated query for critical alert: `min((sum by (app) (up{app=~".*repo-updater"\}) / count by (app) (up\{app=~".*repo-updater"}) * 100) <= 90)`
+Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}) >= 95)`
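Where "increasing CPU limits" is the chosen remediation, a sketch of what that could look like for the worker in Kubernetes (the figures are illustrative placeholders, not sizing guidance; derive real values from the dashboard panel referenced above):

```yaml
# Hypothetical resources block for the worker container in its Deployment.yaml.
resources:
  requests:
    cpu: "2" # placeholder
  limits:
    cpu: "4" # placeholder
```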

-## searcher: replica_traffic
+## worker: memory_rss

-requests per second per replica over 10m
+memory (RSS)

**Descriptions**

-- warning searcher: 5+ requests per second per replica over 10m
+- warning worker: 90%+ memory (RSS) for 10m0s

**Next steps**

-- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-replica-traffic).
+- Consider increasing memory limits or scaling out.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-memory-rss).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_replica_traffic"
+  "warning_worker_memory_rss"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*

Technical details

-Generated query for warning alert: `max((sum by (instance) (rate(searcher_service_request_total[10m]))) >= 5)`
+Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^worker.*"\} / container_spec_memory_limit_bytes\{name=~"^worker.*"}) * 100) >= 90)`

-## searcher: unindexed_search_request_errors
+## worker: container_cpu_usage

-unindexed search request errors every 5m by code
+container cpu usage total (1m average) across all cores by instance

**Descriptions**

-- warning searcher: 5%+ unindexed search request errors every 5m by code for 5m0s
+- warning worker: 99%+ container cpu usage total (1m average) across all cores by instance

**Next steps**

-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-unindexed-search-request-errors).
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-container-cpu-usage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_unindexed_search_request_errors"
+  "warning_worker_container_cpu_usage"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((sum by (code) (increase(searcher_service_request_total{code!="200",code!="canceled"}[5m])) / ignoring (code) group_left () sum(increase(searcher_service_request_total[5m])) * 100) >= 5)`
+Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}) >= 99)`

-## searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance
+## worker: container_memory_usage

-maximum duration since last successful site configuration update (all "searcher" instances)
+container memory usage by instance

**Descriptions**

-- critical searcher: 300s+ maximum duration since last successful site configuration update (all "searcher" instances)
+- warning worker: 99%+ container memory usage by instance

**Next steps**

-- This indicates that one or more "searcher" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
-- Check for relevant errors in the "searcher" logs, as well as frontend`s logs.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-searcher-site-configuration-duration-since-last-successful-update-by-instance).
+- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `memory:` of worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-container-memory-usage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "critical_searcher_searcher_site_configuration_duration_since_last_successful_update_by_instance"
+  "warning_worker_container_memory_usage"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*searcher"}[1m]))) >= 300)`
+Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}) >= 99)`

-## searcher: mean_blocked_seconds_per_conn_request
+## worker: provisioning_container_cpu_usage_long_term

-mean blocked seconds per conn request
+container cpu usage total (90th percentile over 1d) across all cores by instance

**Descriptions**

-- warning searcher: 0.1s+ mean blocked seconds per conn request for 10m0s
-- critical searcher: 0.5s+ mean blocked seconds per conn request for 10m0s
+- warning worker: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s

**Next steps**

-- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed
-- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf)
-- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-mean-blocked-seconds-per-conn-request).
+- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the worker service.
+- **Docker Compose:** Consider increasing `cpus:` of the worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-cpu-usage-long-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_mean_blocked_seconds_per_conn_request",
-  "critical_searcher_mean_blocked_seconds_per_conn_request"
+  "warning_worker_provisioning_container_cpu_usage_long_term"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="searcher"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="searcher"}[5m]))) >= 0.1)`
-
-Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="searcher"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="searcher"}[5m]))) >= 0.5)`
+Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[1d])) >= 80)`

-## searcher: container_cpu_usage
+## worker: provisioning_container_memory_usage_long_term

-container cpu usage total (1m average) across all cores by instance
+container memory usage (1d maximum) by instance

**Descriptions**

-- warning searcher: 99%+ container cpu usage total (1m average) across all cores by instance
+- warning worker: 80%+ container memory usage (1d maximum) by instance for 336h0m0s

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-container-cpu-usage).
+- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the worker service.
+- **Docker Compose:** Consider increasing `memory:` of the worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-memory-usage-long-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_container_cpu_usage"
+  "warning_worker_provisioning_container_memory_usage_long_term"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}) >= 99)`
+Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[1d])) >= 80)`

-## searcher: container_memory_usage
+## worker: provisioning_container_cpu_usage_short_term

-container memory usage by instance
+container cpu usage total (5m maximum) across all cores by instance

**Descriptions**

-- warning searcher: 99%+ container memory usage by instance
+- warning worker: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-container-memory-usage).
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-cpu-usage-short-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_container_memory_usage"
+  "warning_worker_provisioning_container_cpu_usage_short_term"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}) >= 99)`
+Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[5m])) >= 90)`

-## searcher: provisioning_container_cpu_usage_long_term
+## worker: provisioning_container_memory_usage_short_term

-container cpu usage total (90th percentile over 1d) across all cores by instance
+container memory usage (5m maximum) by instance

**Descriptions**

-- warning searcher: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s
+- warning worker: 90%+ container memory usage (5m maximum) by instance

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the searcher service.
-- **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-cpu-usage-long-term).
+- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `memory:` of worker container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-provisioning-container-memory-usage-short-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_provisioning_container_cpu_usage_long_term"
+  "warning_worker_provisioning_container_memory_usage_short_term"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[1d])) >= 80)`
+Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[5m])) >= 90)`

-## searcher: provisioning_container_memory_usage_long_term
+## worker: container_oomkill_events_total

-container memory usage (1d maximum) by instance
+container OOMKILL events total by instance

**Descriptions**

-- warning searcher: 80%+ container memory usage (1d maximum) by instance for 336h0m0s
+- warning worker: 1+ container OOMKILL events total by instance

**Next steps**

-- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the searcher service.
-- **Docker Compose:** Consider increasing `memory:` of the searcher container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-memory-usage-long-term).
+- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `memory:` of worker container in `docker-compose.yml`.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-container-oomkill-events-total).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_provisioning_container_memory_usage_long_term"
+  "warning_worker_container_oomkill_events_total"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[1d])) >= 80)`
+Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^worker.*"})) >= 1)`
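For the Docker Compose variant of the memory remediation above, a hedged sketch (the `4g` figure is a placeholder; depending on your Compose file version the key may be `mem_limit:` at the service level rather than `deploy.resources.limits.memory:`):

```yaml
# Hypothetical excerpt from docker-compose.yml for the worker service.
worker:
  deploy:
    resources:
      limits:
        memory: 4g # placeholder value
```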

-## searcher: provisioning_container_cpu_usage_short_term
+## worker: go_goroutines

-container cpu usage total (5m maximum) across all cores by instance
+maximum active goroutines

**Descriptions**

-- warning searcher: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s
+- warning worker: 10000+ maximum active goroutines for 10m0s

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-cpu-usage-short-term).
+- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-go-goroutines).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_provisioning_container_cpu_usage_short_term"
+  "warning_worker_go_goroutines"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[5m])) >= 90)`
+Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*worker"})) >= 10000)`

-## searcher: provisioning_container_memory_usage_short_term
+## worker: go_gc_duration_seconds

-container memory usage (5m maximum) by instance
+maximum go garbage collection duration

**Descriptions**

-- warning searcher: 90%+ container memory usage (5m maximum) by instance
+- warning worker: 2s+ maximum go garbage collection duration

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-memory-usage-short-term).
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-go-gc-duration-seconds).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_provisioning_container_memory_usage_short_term"
+  "warning_worker_go_gc_duration_seconds"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[5m])) >= 90)`
+Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*worker"})) >= 2)`

-## searcher: container_oomkill_events_total
+## worker: pods_available_percentage

-container OOMKILL events total by instance
+percentage pods available

**Descriptions**

-- warning searcher: 1+ container OOMKILL events total by instance
+- critical worker: less than 90% percentage pods available for 10m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`.
-- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-container-oomkill-events-total).
+- Determine if the pod was OOM killed using `kubectl describe pod worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
+- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p worker`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-pods-available-percentage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_container_oomkill_events_total"
+  "critical_worker_pods_available_percentage"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

-Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^searcher.*"})) >= 1)`
+Generated query for critical alert: `min((sum by (app) (up{app=~".*worker"\}) / count by (app) (up\{app=~".*worker"}) * 100) <= 90)`
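If `kubectl describe pod worker` does report `OOMKilled: true`, the suggested fix above is a memory-limit bump in the relevant `Deployment.yaml`; a minimal, hypothetical sketch with placeholder values:

```yaml
# Hypothetical resources block; size against the memory panels in the dashboards reference.
resources:
  requests:
    memory: 4Gi # placeholder
  limits:
    memory: 8Gi # placeholder
```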

-## searcher: go_goroutines
+## worker: worker_site_configuration_duration_since_last_successful_update_by_instance

-maximum active goroutines
+maximum duration since last successful site configuration update (all "worker" instances)

**Descriptions**

-- warning searcher: 10000+ maximum active goroutines for 10m0s
+- critical worker: 300s+ maximum duration since last successful site configuration update (all "worker" instances)

**Next steps**

-- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-go-goroutines).
+- This indicates that one or more "worker" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "worker" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#worker-worker-site-configuration-duration-since-last-successful-update-by-instance).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_go_goroutines"
+  "critical_worker_worker_site_configuration_duration_since_last_successful_update_by_instance"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*

Technical details

-Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*searcher"})) >= 10000)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~"^worker.*"}[1m]))) >= 300)`

-## searcher: go_gc_duration_seconds
+## searcher: replica_traffic

-maximum go garbage collection duration
+requests per second per replica over 10m

**Descriptions**

-- warning searcher: 2s+ maximum go garbage collection duration
+- warning searcher: 5+ requests per second per replica over 10m

**Next steps**

-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-go-gc-duration-seconds).
+- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-replica-traffic).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_searcher_go_gc_duration_seconds"
+  "warning_searcher_replica_traffic"
]
```

@@ -5299,30 +5266,28 @@ Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~"

Technical details

-Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*searcher"})) >= 2)`
+Generated query for warning alert: `max((sum by (instance) (rate(searcher_service_request_total[10m]))) >= 5)`

-## searcher: pods_available_percentage
+## searcher: unindexed_search_request_errors

-percentage pods available
+unindexed search request errors every 5m by code

**Descriptions**

-- critical searcher: less than 90% percentage pods available for 10m0s
+- warning searcher: 5%+ unindexed search request errors every 5m by code for 5m0s

**Next steps**

-- Determine if the pod was OOM killed using `kubectl describe pod searcher` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
-- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p searcher`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-pods-available-percentage).
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-unindexed-search-request-errors).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "critical_searcher_pods_available_percentage"
+  "warning_searcher_unindexed_search_request_errors"
]
```

@@ -5331,30 +5296,30 @@ Generated query for warning alert: `max((max by (instance) (go_gc_duration_secon

Technical details

-Generated query for critical alert: `min((sum by (app) (up{app=~".*searcher"\}) / count by (app) (up\{app=~".*searcher"}) * 100) <= 90)`
+Generated query for warning alert: `max((sum by (code) (increase(searcher_service_request_total{code!="200",code!="canceled"}[5m])) / ignoring (code) group_left () sum(increase(searcher_service_request_total[5m])) * 100) >= 5)`

-## symbols: symbols_site_configuration_duration_since_last_successful_update_by_instance
+## searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance

-maximum duration since last successful site configuration update (all "symbols" instances)
+maximum duration since last successful site configuration update (all "searcher" instances)

**Descriptions**

-- critical symbols: 300s+ maximum duration since last successful site configuration update (all "symbols" instances)
+- critical searcher: 300s+ maximum duration since last successful site configuration update (all "searcher" instances)

**Next steps**

-- This indicates that one or more "symbols" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
-- Check for relevant errors in the "symbols" logs, as well as frontend`s logs.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-symbols-site-configuration-duration-since-last-successful-update-by-instance).
+- This indicates that one or more "searcher" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "searcher" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-searcher-site-configuration-duration-since-last-successful-update-by-instance).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "critical_symbols_symbols_site_configuration_duration_since_last_successful_update_by_instance"
+  "critical_searcher_searcher_site_configuration_duration_since_last_successful_update_by_instance"
]
```

@@ -5363,33 +5328,31 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*searcher"\})

Technical details

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*symbols"}[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*searcher"}[1m]))) >= 300)`

-## symbols: mean_blocked_seconds_per_conn_request
+## searcher: goroutine_error_rate

-mean blocked seconds per conn request
+error rate for periodic goroutine executions

**Descriptions**

-- warning symbols: 0.1s+ mean blocked seconds per conn request for 10m0s
-- critical symbols: 0.5s+ mean blocked seconds per conn request for 10m0s
+- warning searcher: 0.01reqps+ error rate for periodic goroutine executions for 15m0s

**Next steps**

-- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed
-- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf)
-- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-mean-blocked-seconds-per-conn-request).
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Look for recent changes to the routine's code or configuration
+- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-goroutine-error-rate).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_mean_blocked_seconds_per_conn_request",
-  "critical_symbols_mean_blocked_seconds_per_conn_request"
+  "warning_searcher_goroutine_error_rate"
]
```

@@ -5398,540 +5361,540 @@ Generated query for critical alert: `max((max(max_over_time(src_conf_client_time

Technical details

-Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="symbols"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="symbols"}[5m]))) >= 0.1)`
-
-Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="symbols"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="symbols"}[5m]))) >= 0.5)`
+Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*searcher.*"}[5m]))) >= 0.01)`

-## symbols: container_cpu_usage
+## searcher: goroutine_error_percentage

-container cpu usage total (1m average) across all cores by instance
+percentage of periodic goroutine executions that result in errors

**Descriptions**

-- warning symbols: 99%+ container cpu usage total (1m average) across all cores by instance
+- warning searcher: 5%+ percentage of periodic goroutine executions that result in errors

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `cpus:` of the symbols container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-container-cpu-usage).
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Consider temporarily disabling the routine if it's non-critical and causing cascading issues
+- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-goroutine-error-percentage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_container_cpu_usage"
+  "warning_searcher_goroutine_error_percentage"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*

Technical details

-Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^symbols.*"}) >= 99)`
+Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*searcher.*"\}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total\{job=~".*searcher.*"}[5m]) > 0) * 100) >= 5)`

-## symbols: container_memory_usage
+## searcher: mean_blocked_seconds_per_conn_request

-container memory usage by instance
+mean blocked seconds per conn request

**Descriptions**

-- warning symbols: 99%+ container memory usage by instance
+- warning searcher: 0.1s+ mean blocked seconds per conn request for 10m0s
+- critical searcher: 0.5s+ mean blocked seconds per conn request for 10m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of symbols container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-container-memory-usage).
+- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed
+- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf)
+- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-mean-blocked-seconds-per-conn-request).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_container_memory_usage"
+  "warning_searcher_mean_blocked_seconds_per_conn_request",
+  "critical_searcher_mean_blocked_seconds_per_conn_request"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*

Technical details

-Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^symbols.*"}) >= 99)`
+Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="searcher"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="searcher"}[5m]))) >= 0.1)`
+
+Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="searcher"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="searcher"}[5m]))) >= 0.5)`

-## symbols: provisioning_container_cpu_usage_long_term
+## searcher: cpu_usage_percentage

-container cpu usage total (90th percentile over 1d) across all cores by instance
+CPU usage

**Descriptions**

-- warning symbols: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s
+- warning searcher: 95%+ CPU usage for 10m0s

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the symbols service.
-- **Docker Compose:** Consider increasing `cpus:` of the symbols container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-provisioning-container-cpu-usage-long-term).
+- Consider increasing CPU limits or scaling out.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-cpu-usage-percentage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_provisioning_container_cpu_usage_long_term"
+  "warning_searcher_cpu_usage_percentage"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*

Technical details

-Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^symbols.*"}[1d])) >= 80)`
+Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}) >= 95)`

-## symbols: provisioning_container_memory_usage_long_term
+## searcher: memory_rss

-container memory usage (1d maximum) by instance
+memory (RSS)

**Descriptions**

-- warning symbols: 80%+ container memory usage (1d maximum) by instance for 336h0m0s
+- warning searcher: 90%+ memory (RSS) for 10m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the symbols service.
-- **Docker Compose:** Consider increasing `memory:` of the symbols container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-provisioning-container-memory-usage-long-term).
+- Consider increasing memory limits or scaling out.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-memory-rss).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_provisioning_container_memory_usage_long_term"
+  "warning_searcher_memory_rss"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*

Technical details

-Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^symbols.*"}[1d])) >= 80)`
+Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^searcher.*"\} / container_spec_memory_limit_bytes\{name=~"^searcher.*"}) * 100) >= 90)`

-## symbols: provisioning_container_cpu_usage_short_term
+## searcher: container_cpu_usage

-container cpu usage total (5m maximum) across all cores by instance
+container cpu usage total (1m average) across all cores by instance

**Descriptions**

-- warning symbols: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s
+- warning searcher: 99%+ container cpu usage total (1m average) across all cores by instance

**Next steps**

- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `cpus:` of the symbols container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-provisioning-container-cpu-usage-short-term).
+- **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-container-cpu-usage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_provisioning_container_cpu_usage_short_term"
+  "warning_searcher_container_cpu_usage"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^symbols.*"}[5m])) >= 90)`
+Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}) >= 99)`

-## symbols: provisioning_container_memory_usage_short_term
+## searcher: container_memory_usage

-container memory usage (5m maximum) by instance
+container memory usage by instance

**Descriptions**

-- warning symbols: 90%+ container memory usage (5m maximum) by instance
+- warning searcher: 99%+ container memory usage by instance

**Next steps**

- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of symbols container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-provisioning-container-memory-usage-short-term).
+- **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-container-memory-usage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_provisioning_container_memory_usage_short_term"
+  "warning_searcher_container_memory_usage"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^symbols.*"}[5m])) >= 90)`
+Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}) >= 99)`

-## symbols: container_oomkill_events_total
+## searcher: provisioning_container_cpu_usage_long_term

-container OOMKILL events total by instance
+container cpu usage total (90th percentile over 1d) across all cores by instance

**Descriptions**

-- warning symbols: 1+ container OOMKILL events total by instance
+- warning searcher: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of symbols container in `docker-compose.yml`.
-- More help interpreting this metric is available in the [dashboards reference](dashboards#symbols-container-oomkill-events-total).
+- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the searcher service.
+- **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-cpu-usage-long-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_container_oomkill_events_total"
+  "warning_searcher_provisioning_container_cpu_usage_long_term"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^symbols.*"})) >= 1)`
+Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[1d])) >= 80)`

-## symbols: go_goroutines
+## searcher: provisioning_container_memory_usage_long_term

-maximum active goroutines
+container memory usage (1d maximum) by instance

**Descriptions**

-- warning symbols: 10000+ maximum active goroutines for 10m0s
+- warning searcher: 80%+ container memory usage (1d maximum) by instance for 336h0m0s

**Next steps**

-- More help interpreting this metric is available in the [dashboards reference](dashboards#symbols-go-goroutines).
+- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the searcher service.
+- **Docker Compose:** Consider increasing `memory:` of the searcher container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-memory-usage-long-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_go_goroutines"
+  "warning_searcher_provisioning_container_memory_usage_long_term"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*symbols"})) >= 10000)`
+Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[1d])) >= 80)`

-## symbols: go_gc_duration_seconds
+## searcher: provisioning_container_cpu_usage_short_term

-maximum go garbage collection duration
+container cpu usage total (5m maximum) across all cores by instance

**Descriptions**

-- warning symbols: 2s+ maximum go garbage collection duration
+- warning searcher: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s

**Next steps**

-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-go-gc-duration-seconds).
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-cpu-usage-short-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_symbols_go_gc_duration_seconds"
+  "warning_searcher_provisioning_container_cpu_usage_short_term"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*symbols"})) >= 2)`
+Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[5m])) >= 90)`

-## symbols: pods_available_percentage
+## searcher: provisioning_container_memory_usage_short_term

-percentage pods available
+container memory usage (5m maximum) by instance

**Descriptions**

-- critical symbols: less than 90% percentage pods available for 10m0s
+- warning searcher: 90%+ container memory usage (5m maximum) by instance

**Next steps**

-- Determine if the pod was OOM killed using `kubectl describe pod symbols` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
-- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p symbols`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#symbols-pods-available-percentage).
+- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-provisioning-container-memory-usage-short-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "critical_symbols_pods_available_percentage"
+  "warning_searcher_provisioning_container_memory_usage_short_term"
]
```

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for critical alert: `min((sum by (app) (up{app=~".*symbols"\}) / count by (app) (up\{app=~".*symbols"}) * 100) <= 90)`
+Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[5m])) >= 90)`

-## syntect-server: container_cpu_usage
+## searcher: container_oomkill_events_total

-container cpu usage total (1m average) across all cores by instance
+container OOMKILL events total by instance

**Descriptions**

-- warning syntect-server: 99%+ container cpu usage total (1m average) across all cores by instance
+- warning searcher: 1+ container OOMKILL events total by instance

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-container-cpu-usage).
+- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-container-oomkill-events-total).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_syntect-server_container_cpu_usage"
+  "warning_searcher_container_oomkill_events_total"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"}) >= 99)`
+Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^searcher.*"})) >= 1)`

-## syntect-server: container_memory_usage
+## searcher: go_goroutines

-container memory usage by instance
+maximum active goroutines

**Descriptions**

-- warning syntect-server: 99%+ container memory usage by instance
+- warning searcher: 10000+ maximum active goroutines for 10m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-container-memory-usage).
+- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-go-goroutines).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_syntect-server_container_memory_usage"
+  "warning_searcher_go_goroutines"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^syntect-server.*"}) >= 99)`
+Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*searcher"})) >= 10000)`

-## syntect-server: provisioning_container_cpu_usage_long_term
+## searcher: go_gc_duration_seconds

-container cpu usage total (90th percentile over 1d) across all cores by instance
+maximum go garbage collection duration

**Descriptions**

-- warning syntect-server: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s
+- warning searcher: 2s+ maximum go garbage collection duration

**Next steps**

-- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the syntect-server service.
-- **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-cpu-usage-long-term).
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-go-gc-duration-seconds).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_syntect-server_provisioning_container_cpu_usage_long_term"
+  "warning_searcher_go_gc_duration_seconds"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"}[1d])) >= 80)`
+Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*searcher"})) >= 2)`

-## syntect-server: provisioning_container_memory_usage_long_term
+## searcher: pods_available_percentage

-container memory usage (1d maximum) by instance
+percentage pods available

**Descriptions**

-- warning syntect-server: 80%+ container memory usage (1d maximum) by instance for 336h0m0s
+- critical searcher: less than 90% percentage pods available for 10m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the syntect-server service.
-- **Docker Compose:** Consider increasing `memory:` of the syntect-server container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-memory-usage-long-term).
+- Determine if the pod was OOM killed using `kubectl describe pod searcher` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
+- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p searcher`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#searcher-pods-available-percentage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_syntect-server_provisioning_container_memory_usage_long_term"
+  "critical_searcher_pods_available_percentage"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

Technical details

-Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntect-server.*"}[1d])) >= 80)`
+Generated query for critical alert: `min((sum by (app) (up{app=~".*searcher"\}) / count by (app) (up\{app=~".*searcher"}) * 100) <= 90)`

-## syntect-server: provisioning_container_cpu_usage_short_term +## syntect-server: cpu_usage_percentage -

container cpu usage total (5m maximum) across all cores by instance

+

CPU usage

**Descriptions** -- warning syntect-server: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s +- warning syntect-server: 95%+ CPU usage for 10m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-cpu-usage-short-term). +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-cpu-usage-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_syntect-server_provisioning_container_cpu_usage_short_term" + "warning_syntect-server_cpu_usage_percentage" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"}[5m])) >= 90)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"}) >= 95)`
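+
+As a rough guide for the "increase CPU limits or scale out" step above, here is a minimal, illustrative sketch of what that change can look like in a Kubernetes `Deployment.yaml` (the replica count and resource values are placeholders, not sizing recommendations):
+
+```yaml
+# Fragment of a syntect-server Deployment (illustrative values only)
+spec:
+  replicas: 2                  # scale out: run more than one replica
+  template:
+    spec:
+      containers:
+        - name: syntect-server
+          resources:
+            limits:
+              cpu: "4"         # raise the CPU limit for this container
+              memory: 6Gi
+```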

-## syntect-server: provisioning_container_memory_usage_short_term +## syntect-server: memory_rss -

container memory usage (5m maximum) by instance

+

memory (RSS)

**Descriptions** -- warning syntect-server: 90%+ container memory usage (5m maximum) by instance +- warning syntect-server: 90%+ memory (RSS) for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-memory-usage-short-term). +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#syntect-server-memory-rss). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_syntect-server_provisioning_container_memory_usage_short_term" + "warning_syntect-server_memory_rss" ] ``` -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntect-server.*"}[5m])) >= 90)` +Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^syntect-server.*"\} / container_spec_memory_limit_bytes\{name=~"^syntect-server.*"}) * 100) >= 90)`

-## syntect-server: container_oomkill_events_total +## syntect-server: container_cpu_usage -

container OOMKILL events total by instance

+

container cpu usage total (1m average) across all cores by instance

**Descriptions**

-- warning syntect-server: 1+ container OOMKILL events total by instance
+- warning syntect-server: 99%+ container cpu usage total (1m average) across all cores by instance

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`.
-- More help interpreting this metric is available in the [dashboards reference](dashboards#syntect-server-container-oomkill-events-total).
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-container-cpu-usage).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_syntect-server_container_oomkill_events_total"
+  "warning_syntect-server_container_cpu_usage"
]
```

@@ -5940,30 +5903,30 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_memory
Technical details -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^syntect-server.*"})) >= 1)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"}) >= 99)`
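+
+For Docker Compose deployments, the same change is made on the service entry in `docker-compose.yml`. A minimal, illustrative sketch (values are placeholders; depending on your Compose file version, the limits may instead live under `deploy.resources.limits` as `cpus:` and `memory:`):
+
+```yaml
+# Fragment of docker-compose.yml (illustrative values only)
+services:
+  syntect-server:
+    cpus: 4            # raise the CPU limit
+    mem_limit: '6g'    # raise the memory limit
+```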

-## syntect-server: pods_available_percentage +## syntect-server: container_memory_usage -

percentage pods available

+

container memory usage by instance

**Descriptions** -- critical syntect-server: less than 90% percentage pods available for 10m0s +- warning syntect-server: 99%+ container memory usage by instance **Next steps** -- Determine if the pod was OOM killed using `kubectl describe pod syntect-server` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. -- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p syntect-server`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-pods-available-percentage). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-container-memory-usage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_syntect-server_pods_available_percentage" + "warning_syntect-server_container_memory_usage" ] ``` @@ -5972,237 +5935,220 @@ Generated query for warning alert: `max((max by (name) (container_oom_events_tot
Technical details -Generated query for critical alert: `min((sum by (app) (up{app=~".*syntect-server"\}) / count by (app) (up\{app=~".*syntect-server"}) * 100) <= 90)` +Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^syntect-server.*"}) >= 99)`

-## zoekt: average_resolve_revision_duration +## syntect-server: provisioning_container_cpu_usage_long_term -

average resolve revision duration over 5m

+

container cpu usage total (90th percentile over 1d) across all cores by instance

**Descriptions** -- warning zoekt: 15s+ average resolve revision duration over 5m +- warning syntect-server: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s **Next steps** -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-average-resolve-revision-duration). +- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the syntect-server service. +- **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-cpu-usage-long-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_average_resolve_revision_duration" + "warning_syntect-server_provisioning_container_cpu_usage_long_term" ] ``` -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((sum(rate(resolve_revision_seconds_sum[5m])) / sum(rate(resolve_revision_seconds_count[5m]))) >= 15)` +Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"}[1d])) >= 80)`

-## zoekt: get_index_options_error_increase +## syntect-server: provisioning_container_memory_usage_long_term -

the number of repositories we failed to get indexing options over 5m

+

container memory usage (1d maximum) by instance

**Descriptions** -- warning zoekt: 100+ the number of repositories we failed to get indexing options over 5m for 5m0s -- critical zoekt: 100+ the number of repositories we failed to get indexing options over 5m for 35m0s +- warning syntect-server: 80%+ container memory usage (1d maximum) by instance for 336h0m0s **Next steps** -- View error rates on gitserver and frontend to identify root cause. -- Rollback frontend/gitserver deployment if due to a bad code change. -- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-get-index-options-error-increase). +- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the syntect-server service. +- **Docker Compose:** Consider increasing `memory:` of the syntect-server container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-memory-usage-long-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_get_index_options_error_increase", - "critical_zoekt_get_index_options_error_increase" + "warning_syntect-server_provisioning_container_memory_usage_long_term" ] ``` -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((sum(increase(get_index_options_error_total[5m]))) >= 100)` - -Generated query for critical alert: `max((sum(increase(get_index_options_error_total[5m]))) >= 100)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntect-server.*"}[1d])) >= 80)`

-## zoekt: indexed_search_request_errors +## syntect-server: provisioning_container_cpu_usage_short_term -

indexed search request errors every 5m by code

+

container cpu usage total (5m maximum) across all cores by instance

**Descriptions**

-- warning zoekt: 5%+ indexed search request errors every 5m by code for 5m0s
+- warning syntect-server: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s

**Next steps**

-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-indexed-search-request-errors).
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-cpu-usage-short-term).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_zoekt_indexed_search_request_errors"
+  "warning_syntect-server_provisioning_container_cpu_usage_short_term"
]
```

-*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((sum by (code) (increase(src_zoekt_request_duration_seconds_count{code!~"2.."}[5m])) / ignoring (code) group_left () sum(increase(src_zoekt_request_duration_seconds_count[5m])) * 100) >= 5)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"}[5m])) >= 90)`

-## zoekt: memory_map_areas_percentage_used +## syntect-server: provisioning_container_memory_usage_short_term -

process memory map areas percentage used (per instance)

+

container memory usage (5m maximum) by instance

**Descriptions** -- warning zoekt: 60%+ process memory map areas percentage used (per instance) -- critical zoekt: 80%+ process memory map areas percentage used (per instance) +- warning syntect-server: 90%+ container memory usage (5m maximum) by instance **Next steps** -- If you are running out of memory map areas, you could resolve this by: - - - Enabling shard merging for Zoekt: Set SRC_ENABLE_SHARD_MERGING="1" for zoekt-indexserver. Use this option -if your corpus of repositories has a high percentage of small, rarely updated repositories. See -[documentation](https://sourcegraph.com/docs/code-search/features#shard-merging). - - Creating additional Zoekt replicas: This spreads all the shards out amongst more replicas, which -means that each _individual_ replica will have fewer shards. This, in turn, decreases the -amount of memory map areas that a _single_ replica can create (in order to load the shards into memory). - - Increasing the virtual memory subsystem`s "max_map_count" parameter which defines the upper limit of memory areas -a process can use. The default value of max_map_count is usually 65536. We recommend to set this value to 2x the number -of repos to be indexed per Zoekt instance. This means, if you want to index 240k repositories with 3 Zoekt instances, -set max_map_count to (240000 / 3) * 2 = 160000. The exact instructions for tuning this parameter can differ depending -on your environment. See https://kernel.org/doc/Documentation/sysctl/vm.txt for more information. -- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-memory-map-areas-percentage-used). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-provisioning-container-memory-usage-short-term). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_memory_map_areas_percentage_used", - "critical_zoekt_memory_map_areas_percentage_used" + "warning_syntect-server_provisioning_container_memory_usage_short_term" ] ``` -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max(((proc_metrics_memory_map_current_count / proc_metrics_memory_map_max_limit) * 100) >= 60)` - -Generated query for critical alert: `max(((proc_metrics_memory_map_current_count / proc_metrics_memory_map_max_limit) * 100) >= 80)` +Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntect-server.*"}[5m])) >= 90)`

-## zoekt: container_cpu_usage +## syntect-server: container_oomkill_events_total -

container cpu usage total (1m average) across all cores by instance

+

container OOMKILL events total by instance

**Descriptions** -- warning zoekt: 99%+ container cpu usage total (1m average) across all cores by instance +- warning syntect-server: 1+ container OOMKILL events total by instance **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the zoekt-indexserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-container-cpu-usage). +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`. +- More help interpreting this metric is available in the [dashboards reference](dashboards#syntect-server-container-oomkill-events-total). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_container_cpu_usage" + "warning_syntect-server_container_oomkill_events_total" ] ``` -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}) >= 99)` +Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^syntect-server.*"})) >= 1)`

-## zoekt: container_memory_usage +## syntect-server: pods_available_percentage -

container memory usage by instance

+

percentage pods available

**Descriptions** -- warning zoekt: 99%+ container memory usage by instance +- critical syntect-server: less than 90% percentage pods available for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of zoekt-indexserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-container-memory-usage). +- Determine if the pod was OOM killed using `kubectl describe pod syntect-server` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. +- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p syntect-server`. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#syntect-server-pods-available-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_container_memory_usage" + "critical_syntect-server_pods_available_percentage" ] ``` -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"}) >= 99)` +Generated query for critical alert: `min((sum by (app) (up{app=~".*syntect-server"\}) / count by (app) (up\{app=~".*syntect-server"}) * 100) <= 90)`

-## zoekt: container_cpu_usage +## zoekt: average_resolve_revision_duration -

container cpu usage total (1m average) across all cores by instance

+

average resolve revision duration over 5m

**Descriptions** -- warning zoekt: 99%+ container cpu usage total (1m average) across all cores by instance +- warning zoekt: 15s+ average resolve revision duration over 5m **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-container-cpu-usage). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-average-resolve-revision-duration). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_container_cpu_usage" + "warning_zoekt_average_resolve_revision_duration" ] ``` @@ -6211,30 +6157,33 @@ Generated query for warning alert: `max((cadvisor_container_memory_usage_percent
Technical details -Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"}) >= 99)` +Generated query for warning alert: `max((sum(rate(resolve_revision_seconds_sum[5m])) / sum(rate(resolve_revision_seconds_count[5m]))) >= 15)`

-## zoekt: container_memory_usage +## zoekt: get_index_options_error_increase -

container memory usage by instance

+

the number of repositories we failed to get indexing options over 5m

**Descriptions**

-- warning zoekt: 99%+ container memory usage by instance
+- warning zoekt: 100+ the number of repositories we failed to get indexing options over 5m for 5m0s
+- critical zoekt: 100+ the number of repositories we failed to get indexing options over 5m for 35m0s

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of zoekt-webserver container in `docker-compose.yml`.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-container-memory-usage).
+- View error rates on gitserver and frontend to identify the root cause.
+- Roll back the frontend/gitserver deployment if the errors are due to a bad code change.
+- View error logs for `getIndexOptions` via the net/trace debug interface. For example, click on an `indexed-search-indexer-` instance on https://sourcegraph.com/-/debug/, then click on Traces. Replace sourcegraph.com with your instance address.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-get-index-options-error-increase).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_zoekt_container_memory_usage"
+  "warning_zoekt_get_index_options_error_increase",
+  "critical_zoekt_get_index_options_error_increase"
]
```

@@ -6243,30 +6192,31 @@ Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage
Technical details -Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-webserver.*"}) >= 99)` +Generated query for warning alert: `max((sum(increase(get_index_options_error_total[5m]))) >= 100)` + +Generated query for critical alert: `max((sum(increase(get_index_options_error_total[5m]))) >= 100)`

-## zoekt: provisioning_container_cpu_usage_long_term +## zoekt: cpu_usage_percentage -

container cpu usage total (90th percentile over 1d) across all cores by instance

+

CPU usage

**Descriptions** -- warning zoekt: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s +- warning zoekt: 95%+ CPU usage for 10m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the zoekt-indexserver service. -- **Docker Compose:** Consider increasing `cpus:` of the zoekt-indexserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-cpu-usage-long-term). +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-cpu-usage-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_cpu_usage_long_term" + "warning_zoekt_cpu_usage_percentage" ] ``` @@ -6275,30 +6225,29 @@ Generated query for warning alert: `max((cadvisor_container_memory_usage_percent
Technical details -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}[1d])) >= 80)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}) >= 95)`

-## zoekt: provisioning_container_memory_usage_long_term +## zoekt: memory_rss -

container memory usage (1d maximum) by instance

+

memory (RSS)

**Descriptions** -- warning zoekt: 80%+ container memory usage (1d maximum) by instance for 336h0m0s +- warning zoekt: 90%+ memory (RSS) for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the zoekt-indexserver service. -- **Docker Compose:** Consider increasing `memory:` of the zoekt-indexserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-memory-usage-long-term). +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-memory-rss). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_memory_usage_long_term" + "warning_zoekt_memory_rss" ] ``` @@ -6307,30 +6256,29 @@ Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_contai
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"}[1d])) >= 80)` +Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^zoekt-indexserver.*"\} / container_spec_memory_limit_bytes\{name=~"^zoekt-indexserver.*"}) * 100) >= 90)`

-## zoekt: provisioning_container_cpu_usage_short_term +## zoekt: cpu_usage_percentage -

container cpu usage total (5m maximum) across all cores by instance

+

CPU usage

**Descriptions** -- warning zoekt: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s +- warning zoekt: 95%+ CPU usage for 10m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the zoekt-indexserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-cpu-usage-short-term). +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-cpu-usage-percentage). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_cpu_usage_short_term" + "warning_zoekt_cpu_usage_percentage" ] ``` @@ -6339,30 +6287,29 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_memory
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}[5m])) >= 90)` +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"}) >= 95)`

-## zoekt: provisioning_container_memory_usage_short_term +## zoekt: memory_rss -

container memory usage (5m maximum) by instance

+

memory (RSS)

**Descriptions** -- warning zoekt: 90%+ container memory usage (5m maximum) by instance +- warning zoekt: 90%+ memory (RSS) for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of zoekt-indexserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-memory-usage-short-term). +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-memory-rss). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_memory_usage_short_term" + "warning_zoekt_memory_rss" ] ``` @@ -6371,30 +6318,43 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_us
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"}[5m])) >= 90)` +Generated query for warning alert: `max((max by (name) (container_memory_rss{name=~"^zoekt-webserver.*"\} / container_spec_memory_limit_bytes\{name=~"^zoekt-webserver.*"}) * 100) >= 90)`

-## zoekt: container_oomkill_events_total +## zoekt: memory_map_areas_percentage_used -

container OOMKILL events total by instance

+

process memory map areas percentage used (per instance)

**Descriptions**

-- warning zoekt: 1+ container OOMKILL events total by instance
+- warning zoekt: 60%+ process memory map areas percentage used (per instance)
+- critical zoekt: 80%+ process memory map areas percentage used (per instance)

**Next steps**

-- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
-- **Docker Compose:** Consider increasing `memory:` of zoekt-indexserver container in `docker-compose.yml`.
-- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-container-oomkill-events-total).
+- If you are running out of memory map areas, you could resolve this in one of the following ways (a configuration sketch follows the technical details below):
+
+  - Enabling shard merging for Zoekt: Set SRC_ENABLE_SHARD_MERGING="1" for zoekt-indexserver. Use this option
+if your corpus of repositories has a high percentage of small, rarely updated repositories. See
+[documentation](https://sourcegraph.com/docs/code-search/features#shard-merging).
+  - Creating additional Zoekt replicas: This spreads all the shards out amongst more replicas, which
+means that each _individual_ replica will have fewer shards. This, in turn, decreases the
+amount of memory map areas that a _single_ replica can create (in order to load the shards into memory).
+  - Increasing the virtual memory subsystem's "max_map_count" parameter, which defines the upper limit of memory areas
+a process can use. The default value of max_map_count is usually 65536. We recommend setting this value to 2x the number
+of repos to be indexed per Zoekt instance. For example, if you want to index 240k repositories with 3 Zoekt instances,
+set max_map_count to (240000 / 3) * 2 = 160000. The exact instructions for tuning this parameter can differ depending
+on your environment. See https://kernel.org/doc/Documentation/sysctl/vm.txt for more information.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-memory-map-areas-percentage-used).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_zoekt_container_oomkill_events_total"
+  "warning_zoekt_memory_map_areas_percentage_used",
+  "critical_zoekt_memory_map_areas_percentage_used"
]
```

@@ -6403,30 +6363,30 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_memory
Technical details -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^zoekt-indexserver.*"})) >= 1)` +Generated query for warning alert: `max(((proc_metrics_memory_map_current_count / proc_metrics_memory_map_max_limit) * 100) >= 60)` + +Generated query for critical alert: `max(((proc_metrics_memory_map_current_count / proc_metrics_memory_map_max_limit) * 100) >= 80)`
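+
+As referenced in the list above, here is a minimal Kubernetes sketch combining the shard merging and max_map_count options (illustrative only: the init container runs privileged on the node, and 160000 is just the worked example from above, not a recommendation):
+
+```yaml
+# Fragment of a zoekt-indexserver Deployment/StatefulSet (illustrative values only)
+spec:
+  template:
+    spec:
+      initContainers:
+        - name: set-max-map-count
+          image: busybox
+          # (240000 repos / 3 instances) * 2 = 160000, per the worked example above
+          command: ["sysctl", "-w", "vm.max_map_count=160000"]
+          securityContext:
+            privileged: true
+      containers:
+        - name: zoekt-indexserver
+          env:
+            - name: SRC_ENABLE_SHARD_MERGING   # enable shard merging
+              value: "1"
+```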

-## zoekt: provisioning_container_cpu_usage_long_term +## zoekt: indexed_search_request_errors -

container cpu usage total (90th percentile over 1d) across all cores by instance

+

indexed search request errors every 5m by code

**Descriptions** -- warning zoekt: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s +- warning zoekt: 5%+ indexed search request errors every 5m by code for 5m0s **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the zoekt-webserver service. -- **Docker Compose:** Consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-cpu-usage-long-term). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-indexed-search-request-errors). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_cpu_usage_long_term" + "warning_zoekt_indexed_search_request_errors" ] ``` @@ -6435,30 +6395,28 @@ Generated query for warning alert: `max((max by (name) (container_oom_events_tot
Technical details -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"}[1d])) >= 80)` +Generated query for warning alert: `max((sum by (code) (increase(src_zoekt_request_duration_seconds_count{code!~"2.."}[5m])) / ignoring (code) group_left () sum(increase(src_zoekt_request_duration_seconds_count[5m])) * 100) >= 5)`

-## zoekt: provisioning_container_memory_usage_long_term +## zoekt: go_goroutines -

container memory usage (1d maximum) by instance

+

maximum active goroutines

**Descriptions** -- warning zoekt: 80%+ container memory usage (1d maximum) by instance for 336h0m0s +- warning zoekt: 10000+ maximum active goroutines for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the zoekt-webserver service. -- **Docker Compose:** Consider increasing `memory:` of the zoekt-webserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-memory-usage-long-term). +- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-go-goroutines). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_memory_usage_long_term" + "warning_zoekt_go_goroutines" ] ``` @@ -6467,30 +6425,28 @@ Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_contai
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-webserver.*"}[1d])) >= 80)` +Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*indexed-search-indexer"})) >= 10000)`

-## zoekt: provisioning_container_cpu_usage_short_term +## zoekt: go_gc_duration_seconds -

container cpu usage total (5m maximum) across all cores by instance

+

maximum go garbage collection duration

**Descriptions** -- warning zoekt: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s +- warning zoekt: 2s+ maximum go garbage collection duration **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-cpu-usage-short-term). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-go-gc-duration-seconds). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_cpu_usage_short_term" + "warning_zoekt_go_gc_duration_seconds" ] ``` @@ -6499,30 +6455,28 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_memory
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"}[5m])) >= 90)` +Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*indexed-search-indexer"})) >= 2)`

-## zoekt: provisioning_container_memory_usage_short_term +## zoekt: go_goroutines -

container memory usage (5m maximum) by instance

+

maximum active goroutines

**Descriptions** -- warning zoekt: 90%+ container memory usage (5m maximum) by instance +- warning zoekt: 10000+ maximum active goroutines for 10m0s **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of zoekt-webserver container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-provisioning-container-memory-usage-short-term). +- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-go-goroutines). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_provisioning_container_memory_usage_short_term" + "warning_zoekt_go_goroutines" ] ``` @@ -6531,30 +6485,28 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_us
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-webserver.*"}[5m])) >= 90)` +Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*indexed-search"})) >= 10000)`

-## zoekt: container_oomkill_events_total +## zoekt: go_gc_duration_seconds -

container OOMKILL events total by instance

+

maximum go garbage collection duration

**Descriptions** -- warning zoekt: 1+ container OOMKILL events total by instance +- warning zoekt: 2s+ maximum go garbage collection duration **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of zoekt-webserver container in `docker-compose.yml`. -- More help interpreting this metric is available in the [dashboards reference](dashboards#zoekt-container-oomkill-events-total). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#zoekt-go-gc-duration-seconds). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_zoekt_container_oomkill_events_total" + "warning_zoekt_go_gc_duration_seconds" ] ``` @@ -6563,7 +6515,7 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_memory
Technical details -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^zoekt-webserver.*"})) >= 1)` +Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*indexed-search"})) >= 2)`
@@ -6613,7 +6565,7 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*indexed-searc

- Check the Container monitoring (not available on server) panels and try increasing resources for Prometheus if necessary.
- If the rule group taking a long time to evaluate belongs to `/sg_prometheus_addons`, try reducing the complexity of any custom Prometheus rules provided.
-- If the rule group taking a long time to evaluate belongs to `/sg_config_prometheus`, please [contact us](https://help.sourcegraph.com/hc/en-us/requests/new).
+- If the rule group taking a long time to evaluate belongs to `/sg_config_prometheus`, please [open an issue](https://github.com/sourcegraph/sourcegraph/issues/new?assignees=&labels=&template=bug_report.md&title=).
- More help interpreting this metric is available in the [dashboards reference](dashboards#prometheus-prometheus-rule-eval-duration).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

@@ -6646,7 +6598,7 @@ Generated query for warning alert: `max((sum by (rule_group) (avg_over_time(prom

- Check Prometheus logs for messages related to rule group evaluation (generally with log field `component="rule manager"`).
- If the rule group failing to evaluate belongs to `/sg_prometheus_addons`, ensure any custom Prometheus configuration provided is valid.
-- If the rule group taking a long time to evaluate belongs to `/sg_config_prometheus`, please [contact us](https://help.sourcegraph.com/hc/en-us/requests/new).
+- If the rule group failing to evaluate belongs to `/sg_config_prometheus`, please [open an issue](https://github.com/sourcegraph/sourcegraph/issues/new?assignees=&labels=&template=bug_report.md&title=).
- More help interpreting this metric is available in the [dashboards reference](dashboards#prometheus-prometheus-rule-eval-failures).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

@@ -7271,46 +7223,14 @@ Generated query for warning alert: `max((max by (sg_instance) (go_gc_duration_se
-## codeintel-uploads: codeintel_commit_graph_queued_max_age - -

repository queue longest time in queue

- -**Descriptions** - -- warning codeintel-uploads: 3600s+ repository queue longest time in queue - -**Next steps** - -- An alert here is generally indicative of either underprovisioned worker instance(s) and/or -an underprovisioned main postgres instance. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#codeintel-uploads-codeintel-commit-graph-queued-max-age). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_codeintel-uploads_codeintel_commit_graph_queued_max_age" -] -``` - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Generated query for warning alert: `max((max(src_codeintel_commit_graph_queued_duration_seconds_total)) >= 3600)` - -
- -
- ## telemetry: telemetry_gateway_exporter_queue_growth -

rate of growth of export queue over 30m

+

rate of growth of events export queue over 30m

**Descriptions** -- warning telemetry: 1+ rate of growth of export queue over 30m for 1h0m0s -- critical telemetry: 1+ rate of growth of export queue over 30m for 36h0m0s +- warning telemetry: 1+ rate of growth of events export queue over 30m for 1h0m0s +- critical telemetry: 1+ rate of growth of events export queue over 30m for 36h0m0s **Next steps** @@ -7375,11 +7295,11 @@ Generated query for warning alert: `max((sum(increase(src_telemetrygatewayexport ## telemetry: telemetrygatewayexporter_queue_cleanup_errors_total -

export queue cleanup operation errors every 30m

+

events export queue cleanup operation errors every 30m

**Descriptions** -- warning telemetry: 0+ export queue cleanup operation errors every 30m +- warning telemetry: 0+ events export queue cleanup operation errors every 30m **Next steps** @@ -7407,11 +7327,11 @@ Generated query for warning alert: `max((sum(increase(src_telemetrygatewayexport ## telemetry: telemetrygatewayexporter_queue_metrics_reporter_errors_total -

export backlog metrics reporting operation errors every 30m

+

events export backlog metrics reporting operation errors every 30m

**Descriptions** -- warning telemetry: 0+ export backlog metrics reporting operation errors every 30m +- warning telemetry: 0+ events export backlog metrics reporting operation errors every 30m **Next steps** @@ -7437,54 +7357,97 @@ Generated query for warning alert: `max((sum(increase(src_telemetrygatewayexport
-## telemetry: telemetry_job_error_rate +## telemetry: telemetry_v2_export_queue_write_failures -

usage data exporter operation error rate over 5m

+

failed writes to events export queue over 5m

**Descriptions** -- warning telemetry: 0%+ usage data exporter operation error rate over 5m for 30m0s +- warning telemetry: 1%+ failed writes to events export queue over 5m +- critical telemetry: 2.5%+ failed writes to events export queue over 5m for 5m0s **Next steps** -- Involved cloud team to inspect logs of the managed instance to determine error sources. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#telemetry-telemetry-job-error-rate). +- Look for error logs related to `inserting telemetry events`. +- Look for error attributes on `telemetryevents.QueueForExport` trace spans. +- More help interpreting this metric is available in the [dashboards reference](dashboards#telemetry-telemetry-v2-export-queue-write-failures). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_telemetry_telemetry_job_error_rate" + "warning_telemetry_telemetry_v2_export_queue_write_failures", + "critical_telemetry_telemetry_v2_export_queue_write_failures" ] ``` -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* + +
+Technical details + +Generated query for warning alert: `max(((sum(increase(src_telemetry_export_store_queued_events{failed="true"}[5m])) / sum(increase(src_telemetry_export_store_queued_events[5m]))) * 100) > 1)` + +Generated query for critical alert: `max(((sum(increase(src_telemetry_export_store_queued_events{failed="true"}[5m])) / sum(increase(src_telemetry_export_store_queued_events[5m]))) * 100) > 2.5)` + +
+ +
+ +## telemetry: telemetry_v2_event_logs_write_failures + +

failed write V2 events to V1 'event_logs' over 5m

+ +**Descriptions** + +- warning telemetry: 5%+ failed write V2 events to V1 'event_logs' over 5m +- critical telemetry: 10%+ failed write V2 events to V1 'event_logs' over 5m for 10m0s + +**Next steps** + +- Error details are only persisted in trace metadata as it is considered non-critical. +- To diagnose, enable trace sampling across all requests and look for error attributes on `telemetrystore.v1teewrite` spans. +- More help interpreting this metric is available in the [dashboards reference](dashboards#telemetry-telemetry-v2-event-logs-write-failures). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_telemetry_telemetry_v2_event_logs_write_failures", + "critical_telemetry_telemetry_v2_event_logs_write_failures" +] +``` + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details -Generated query for warning alert: `max((sum by (op) (increase(src_telemetry_job_errors_total{job=~"^worker.*"\}[5m])) / (sum by (op) (increase(src_telemetry_job_total\{job=~"^worker.*"\}[5m])) + sum by (op) (increase(src_telemetry_job_errors_total\{job=~"^worker.*"}[5m]))) * 100) > 0)` +Generated query for warning alert: `max(((sum(increase(src_telemetry_teestore_v1_events{failed="true"}[5m])) / sum(increase(src_telemetry_teestore_v1_events[5m]))) * 100) > 5)` + +Generated query for critical alert: `max(((sum(increase(src_telemetry_teestore_v1_events{failed="true"}[5m])) / sum(increase(src_telemetry_teestore_v1_events[5m]))) * 100) > 10)`

-## telemetry: telemetry_job_utilized_throughput +## telemetry: telemetrygatewayexporter_usermetadata_exporter_errors_total -

utilized percentage of maximum throughput

+

(off by default) user metadata exporter operation errors every 30m

**Descriptions**

-- warning telemetry: 90%+ utilized percentage of maximum throughput for 30m0s
+- warning telemetry: 0+ (off by default) user metadata exporter operation errors every 30m

**Next steps**

-- Throughput utilization is high. This could be a signal that this instance is producing too many events for the export job to keep up. Configure more throughput using the maxBatchSize option.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#telemetry-telemetry-job-utilized-throughput).
+- Failures indicate that exporting of telemetry events from Sourcegraph is failing. This may affect the performance of the database as the backlog grows.
+- See worker logs in the `worker.telemetrygateway-exporter` log scope for more details. If logs only indicate that exports failed, reach out to Sourcegraph with relevant log entries, as this may be an issue in Sourcegraph's Telemetry Gateway service.
+- This exporter is DISABLED by default.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#telemetry-telemetrygatewayexporter-usermetadata-exporter-errors-total).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_telemetry_telemetry_job_utilized_throughput"
+  "warning_telemetry_telemetrygatewayexporter_usermetadata_exporter_errors_total"
]
```

@@ -7493,7 +7456,7 @@
Technical details -Generated query for warning alert: `max((rate(src_telemetry_job_total{op="SendEvents"}[1h]) / on () group_right () src_telemetry_job_max_throughput * 100) > 90)` +Generated query for warning alert: `max((sum(increase(src_telemetrygatewayexporter_usermetadata_exporter_errors_total{job=~"^worker.*"}[30m]))) > 0)`
@@ -7719,387 +7682,320 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*otel-collecto
-## embeddings: embeddings_site_configuration_duration_since_last_successful_update_by_instance +## completions: completion_credits_check_entitlement_duration_p95 -

maximum duration since last successful site configuration update (all "embeddings" instances)

+

95th percentile completion credits entitlement check duration

**Descriptions**

-- critical embeddings: 300s+ maximum duration since last successful site configuration update (all "embeddings" instances)
+- warning completions: 10ms+ 95th percentile completion credits entitlement check duration for 10m0s

**Next steps**

-- This indicates that one or more "embeddings" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
-- Check for relevant errors in the "embeddings" logs, as well as frontend`s logs.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-embeddings-site-configuration-duration-since-last-successful-update-by-instance).
+- This metric tracks pre-completion-request latency for checking if completion credits entitlement has been exceeded.
+- If this value is high, this latency may be noticeable to users.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#completions-completion-credits-check-entitlement-duration-p95).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "critical_embeddings_embeddings_site_configuration_duration_since_last_successful_update_by_instance"
+  "warning_completions_completion_credits_check_entitlement_duration_p95"
]
```

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details -Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*embeddings"}[1m]))) >= 300)` +Generated query for warning alert: `max((histogram_quantile(0.95, sum by (le) (rate(src_completion_credits_check_entitlement_duration_ms_bucket[5m])))) > 10)`

-## embeddings: mean_blocked_seconds_per_conn_request +## completions: completion_credits_consume_credits_duration_p95 -

mean blocked seconds per conn request

+

95th percentile completion credits consume duration

**Descriptions**

-- warning embeddings: 0.1s+ mean blocked seconds per conn request for 10m0s
-- critical embeddings: 0.5s+ mean blocked seconds per conn request for 10m0s
+- warning completions: 20ms+ 95th percentile completion credits consume duration for 10m0s

**Next steps**

-- Increase SRC_PGSQL_MAX_OPEN together with giving more memory to the database if needed
-- Scale up Postgres memory/cpus - [see our scaling guide](https://sourcegraph.com/docs/admin/config/postgres-conf)
-- If using GCP Cloud SQL, check for high lock waits or CPU usage in query insights
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-mean-blocked-seconds-per-conn-request).
+- This metric tracks post-completion-request latency for committing consumed completion credits.
+- If high, this latency may be noticeable for non-streaming completions.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#completions-completion-credits-consume-credits-duration-p95).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

```json
"observability.silenceAlerts": [
-  "warning_embeddings_mean_blocked_seconds_per_conn_request",
-  "critical_embeddings_mean_blocked_seconds_per_conn_request"
+  "warning_completions_completion_credits_consume_credits_duration_p95"
]
```

-*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).*
+*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details -Generated query for warning alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="embeddings"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="embeddings"}[5m]))) >= 0.1)` - -Generated query for critical alert: `max((sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="embeddings"\}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for\{app_name="embeddings"}[5m]))) >= 0.5)` +Generated query for warning alert: `max((histogram_quantile(0.95, sum by (le) (rate(src_completion_credits_consume_duration_ms_bucket[5m])))) > 20)`

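+A possible follow-up query, sketched from the same histogram as the generated alert (it assumes the series carries the standard Prometheus `instance` label), to check whether slow credit commits are isolated to a single replica:
+
+```
+# p95 consume duration broken down by instance
+histogram_quantile(0.95, sum by (le, instance) (rate(src_completion_credits_consume_duration_ms_bucket[5m])))
+```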
-## embeddings: container_cpu_usage +## background-jobs: error_percentage_by_method -

container cpu usage total (1m average) across all cores by instance

+

percentage of operations resulting in error by method

**Descriptions** -- warning embeddings: 99%+ container cpu usage total (1m average) across all cores by instance +- warning background-jobs: 5%+ percentage of operations resulting in error by method +- critical background-jobs: 50%+ percentage of operations resulting in error by method **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the embeddings container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-container-cpu-usage). +- Review logs for the specific operation to identify patterns in errors. Check database connectivity and schema. If a particular method is consistently failing, investigate potential issues with that operation's SQL query or transaction handling. See the example query below to rank methods by error rate. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-error-percentage-by-method). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_embeddings_container_cpu_usage" + "warning_background-jobs_error_percentage_by_method", + "critical_background-jobs_error_percentage_by_method" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details -Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^embeddings.*"}) >= 99)` - -
- -
- -## embeddings: container_memory_usage - -

container memory usage by instance

- -**Descriptions** - -- warning embeddings: 99%+ container memory usage by instance - -**Next steps** - -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of embeddings container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-container-memory-usage). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_embeddings_container_memory_usage" -] -``` - -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +Generated query for warning alert: `max(((sum by (op) (rate(src_workerutil_dbworker_store_errors_total[5m])) / sum by (op) (rate(src_workerutil_dbworker_store_total[5m]))) * 100) >= 5)` -
-Technical details - -Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^embeddings.*"}) >= 99)` +Generated query for critical alert: `max(((sum by (op) (rate(src_workerutil_dbworker_store_errors_total[5m])) / sum by (op) (rate(src_workerutil_dbworker_store_total[5m]))) * 100) >= 50)`

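+To rank methods by error rate directly in Prometheus, one option is to wrap the same expression the generated queries use in `topk`, for example:
+
+```
+# top 5 operations by error percentage over the last 5m
+topk(5, sum by (op) (rate(src_workerutil_dbworker_store_errors_total[5m])) / sum by (op) (rate(src_workerutil_dbworker_store_total[5m])) * 100)
+```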
-## embeddings: provisioning_container_cpu_usage_long_term +## background-jobs: error_percentage_by_domain -

container cpu usage total (90th percentile over 1d) across all cores by instance

+

percentage of operations resulting in error by domain

**Descriptions** -- warning embeddings: 80%+ container cpu usage total (90th percentile over 1d) across all cores by instance for 336h0m0s +- warning background-jobs: 5%+ percentage of operations resulting in error by domain +- critical background-jobs: 50%+ percentage of operations resulting in error by domain **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the embeddings service. -- **Docker Compose:** Consider increasing `cpus:` of the embeddings container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-provisioning-container-cpu-usage-long-term). +- Review logs for the specific domain to identify patterns in errors. Check database connectivity and schema. See the example query below to rank domains by error rate. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-error-percentage-by-domain). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_embeddings_provisioning_container_cpu_usage_long_term" + "warning_background-jobs_error_percentage_by_domain", + "critical_background-jobs_error_percentage_by_domain" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details -Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^embeddings.*"}[1d])) >= 80)` - -
- -
- -## embeddings: provisioning_container_memory_usage_long_term - -

container memory usage (1d maximum) by instance

- -**Descriptions** - -- warning embeddings: 80%+ container memory usage (1d maximum) by instance for 336h0m0s - -**Next steps** - -- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the embeddings service. -- **Docker Compose:** Consider increasing `memory:` of the embeddings container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-provisioning-container-memory-usage-long-term). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_embeddings_provisioning_container_memory_usage_long_term" -] -``` - -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* - -
-Technical details +Generated query for warning alert: `max(((sum by (domain) (rate(src_workerutil_dbworker_store_errors_total[5m])) / sum by (domain) (rate(src_workerutil_dbworker_store_total[5m]))) * 100) >= 5)` -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^embeddings.*"}[1d])) >= 80)` +Generated query for critical alert: `max(((sum by (domain) (rate(src_workerutil_dbworker_store_errors_total[5m])) / sum by (domain) (rate(src_workerutil_dbworker_store_total[5m]))) * 100) >= 50)`

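+To rank domains by error rate, the same `topk` pattern applies to the domain-level series from the generated queries:
+
+```
+# top 5 domains by error percentage over the last 5m
+topk(5, sum by (domain) (rate(src_workerutil_dbworker_store_errors_total[5m])) / sum by (domain) (rate(src_workerutil_dbworker_store_total[5m])) * 100)
+```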
-## embeddings: provisioning_container_cpu_usage_short_term +## background-jobs: resetter_duration -

container cpu usage total (5m maximum) across all cores by instance

+

time spent running the resetter

**Descriptions** -- warning embeddings: 90%+ container cpu usage total (5m maximum) across all cores by instance for 30m0s +- warning background-jobs: 10s+ time spent running the resetter **Next steps** -- **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `cpus:` of the embeddings container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-provisioning-container-cpu-usage-short-term). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-resetter-duration). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_embeddings_provisioning_container_cpu_usage_short_term" + "warning_background-jobs_resetter_duration" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^embeddings.*"}[5m])) >= 90)` +Generated query for warning alert: `max((histogram_quantile(0.95, sum by (le, domain) (rate(src_dbworker_resetter_duration_seconds_bucket[5m])))) >= 10)`

-## embeddings: provisioning_container_memory_usage_short_term +## background-jobs: resetter_failures -

container memory usage (5m maximum) by instance

+

number of times the resetter failed to run

**Descriptions** -- warning embeddings: 90%+ container memory usage (5m maximum) by instance +- warning background-jobs: 1 req/s+ number of times the resetter failed to run **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of embeddings container in `docker-compose.yml`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-provisioning-container-memory-usage-short-term). +- Check application logs for the failing domain to look for errors. High failure rates indicate a bug in the code handling the job, or a pod frequently dying. See the example query below to find the affected domain. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-resetter-failures). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_embeddings_provisioning_container_memory_usage_short_term" + "warning_background-jobs_resetter_failures" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details -Generated query for warning alert: `max((max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^embeddings.*"}[5m])) >= 90)` +Generated query for warning alert: `max((sum by (domain) (increase(src_dbworker_resetter_errors_total[5m]))) >= 1)`

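+An example query for finding the affected domain, built from the same counter as the alert but over a longer window:
+
+```
+# resetter failures by domain over the last hour
+sum by (domain) (increase(src_dbworker_resetter_errors_total[1h]))
+```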
-## embeddings: container_oomkill_events_total +## background-jobs: failed_records -

container OOMKILL events total by instance

+

number of stalled records marked as 'failed'

**Descriptions** -- warning embeddings: 1+ container OOMKILL events total by instance +- warning background-jobs: 50+ number of stalled records marked as 'failed' **Next steps** -- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. -- **Docker Compose:** Consider increasing `memory:` of embeddings container in `docker-compose.yml`. -- More help interpreting this metric is available in the [dashboards reference](dashboards#embeddings-container-oomkill-events-total). +- Check application logs for the failing domain to look for errors. High failure rates indicate a bug in the code handling the job, or a pod frequently dying. See the example query below to find the affected domain. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-failed-records). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_embeddings_container_oomkill_events_total" + "warning_background-jobs_failed_records" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details -Generated query for warning alert: `max((max by (name) (container_oom_events_total{name=~"^embeddings.*"})) >= 1)` +Generated query for warning alert: `max((sum by (domain) (increase(src_dbworker_resetter_record_reset_failures_total[5m]))) >= 50)`

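+An example query for finding the affected domain, using the same counter as the alert over a longer window:
+
+```
+# records marked as failed, by domain, over the last hour
+sum by (domain) (increase(src_dbworker_resetter_record_reset_failures_total[1h]))
+```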
-## embeddings: go_goroutines +## background-jobs: stall_duration_p90 -

maximum active goroutines

+

90th percentile of stall duration

**Descriptions** -- warning embeddings: 10000+ maximum active goroutines for 10m0s +- warning background-jobs: 300s+ 90th percentile of stall duration **Next steps** -- More help interpreting this metric is available in the [dashboards reference](dashboards#embeddings-go-goroutines). +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-stall-duration-p90). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_embeddings_go_goroutines" + "warning_background-jobs_stall_duration_p90" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details -Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~".*embeddings"})) >= 10000)` +Generated query for warning alert: `max((histogram_quantile(0.9, sum by (le, domain) (rate(src_dbworker_resetter_stall_duration_seconds_bucket[5m])))) >= 300)`

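+A sketch for ranking domains by stall duration, built from the same histogram as the generated query:
+
+```
+# top 5 domains by p90 stall duration
+topk(5, histogram_quantile(0.9, sum by (le, domain) (rate(src_dbworker_resetter_stall_duration_seconds_bucket[5m]))))
+```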
-## embeddings: go_gc_duration_seconds +## background-jobs: aggregate_queue_size -

maximum go garbage collection duration

+

total number of jobs queued across all domains

**Descriptions** -- warning embeddings: 2s+ maximum go garbage collection duration +- warning background-jobs: 1e+06+ total number of jobs queued across all domains **Next steps** -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-go-gc-duration-seconds). +- Check for stuck workers or investigate the specific domains with high queue depth. Check worker logs for errors and the database for high load. See the example query below to rank domains by queue depth. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-aggregate-queue-size). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_embeddings_go_gc_duration_seconds" + "warning_background-jobs_aggregate_queue_size" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details -Generated query for warning alert: `max((max by (instance) (go_gc_duration_seconds{job=~".*embeddings"})) >= 2)` +Generated query for warning alert: `max((sum(max by (domain) (src_workerutil_queue_depth))) >= 1000000)`

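+An example query to rank domains by queue depth, using the same gauge as the generated alert:
+
+```
+# top 10 domains by current queue depth
+topk(10, max by (domain) (src_workerutil_queue_depth))
+```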
-## embeddings: pods_available_percentage +## background-jobs: max_queue_duration -

percentage pods available

+

maximum time a job has been in queue across all domains

**Descriptions** -- critical embeddings: less than 90% percentage pods available for 10m0s +- warning background-jobs: 86400s+ maximum time a job has been in queue across all domains **Next steps** -- Determine if the pod was OOM killed using `kubectl describe pod embeddings` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. -- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p embeddings`. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#embeddings-pods-available-percentage). +- Investigate which domain has jobs stuck in the queue (see the example query below). If the queue is growing, consider scaling up worker instances. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#background-jobs-max-queue-duration). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "critical_embeddings_pods_available_percentage" + "warning_background-jobs_max_queue_duration" ] ``` -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details -Generated query for critical alert: `min((sum by (app) (up{app=~".*embeddings"\}) / count by (app) (up\{app=~".*embeddings"}) * 100) <= 90)` +Generated query for warning alert: `max((max(src_workerutil_queue_duration_seconds)) >= 86400)`
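+An example query to locate the stuck domain (a sketch: it assumes `src_workerutil_queue_duration_seconds` carries the same `domain` label as `src_workerutil_queue_depth`):
+
+```
+# age of the oldest queued job per domain, in seconds
+max by (domain) (src_workerutil_queue_duration_seconds)
+```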
-
+
\ No newline at end of file diff --git a/docs/admin/observability/dashboards.mdx b/docs/admin/observability/dashboards.mdx index 5191b74dc..c38ee3996 100644 --- a/docs/admin/observability/dashboards.mdx +++ b/docs/admin/observability/dashboards.mdx @@ -58,11 +58,11 @@ histogram_quantile(0.90, sum by (le)(rate(src_search_streaming_latency_seconds_b
-#### frontend: hard_timeout_search_responses +#### frontend: timeout_search_responses -

Hard timeout search responses every 5m

+

Timeout search responses every 5m

-Refer to the [alerts reference](alerts#frontend-hard-timeout-search-responses) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#frontend-timeout-search-responses) for 1 alert related to this panel. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100010` on your Sourcegraph instance. @@ -74,7 +74,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100010` Query: ``` -(sum(increase(src_graphql_search_response{status="timeout",source="browser",request_name!="CodeIntelSearch"}[5m])) + sum(increase(src_graphql_search_response{status="alert",alert_type="timed_out",source="browser",request_name!="CodeIntelSearch"}[5m]))) / sum(increase(src_graphql_search_response{source="browser",request_name!="CodeIntelSearch"}[5m])) * 100 +sum(increase(src_search_streaming_response{status=~"timeout|partial_timeout",source="browser"}[5m])) / sum(increase(src_search_streaming_response{source="browser"}[5m])) * 100 ``` @@ -96,17 +96,17 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100011` Query: ``` -sum by (status)(increase(src_graphql_search_response{status=~"error",source="browser",request_name!="CodeIntelSearch"}[5m])) / ignoring(status) group_left sum(increase(src_graphql_search_response{source="browser",request_name!="CodeIntelSearch"}[5m])) * 100 +sum(increase(src_search_streaming_response{status="error",source="browser"}[5m])) / sum(increase(src_search_streaming_response{source="browser"}[5m])) * 100 ```
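+Since the new queries above all aggregate `src_search_streaming_response` by status, a quick way to see the full status breakdown (an example built from the same series and labels) is:
+
+```
+# 5m increase of browser search responses, broken down by status
+sum by (status) (increase(src_search_streaming_response{source="browser"}[5m]))
+```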
-#### frontend: partial_timeout_search_responses +#### frontend: search_no_results -

Partial timeout search responses every 5m

+

Searches with no results every 5m

-Refer to the [alerts reference](alerts#frontend-partial-timeout-search-responses) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#frontend-search-no-results) for 1 alert related to this panel. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100012` on your Sourcegraph instance. @@ -118,7 +118,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100012` Query: ``` -sum by (status)(increase(src_graphql_search_response{status="partial_timeout",source="browser",request_name!="CodeIntelSearch"}[5m])) / ignoring(status) group_left sum(increase(src_graphql_search_response{source="browser",request_name!="CodeIntelSearch"}[5m])) * 100 +sum(increase(src_search_streaming_response{status="no_results",source="browser"}[5m])) / sum(increase(src_search_streaming_response{source="browser"}[5m])) * 100 ``` @@ -140,7 +140,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100013` Query: ``` -sum by (alert_type)(increase(src_graphql_search_response{status="alert",alert_type!~"timed_out|no_results__suggest_quotes",source="browser",request_name!="CodeIntelSearch"}[5m])) / ignoring(alert_type) group_left sum(increase(src_graphql_search_response{source="browser",request_name!="CodeIntelSearch"}[5m])) * 100 +sum by (alert_type)(increase(src_search_streaming_response{status="alert",alert_type!~"timed_out",source="browser"}[5m])) / ignoring(alert_type) group_left sum(increase(src_search_streaming_response{source="browser"}[5m])) * 100 ``` @@ -408,7 +408,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100212` Query: ``` -sum by (alert_type)(increase(src_graphql_search_response{status="alert",alert_type!~"timed_out|no_results__suggest_quotes",source="other"}[5m])) / ignoring(alert_type) group_left sum(increase(src_graphql_search_response{status="alert",source="other"}[5m])) +sum by (alert_type)(increase(src_graphql_search_response{status="alert",alert_type!~"timed_out",source="other"}[5m])) / ignoring(alert_type) group_left sum(increase(src_graphql_search_response{status="alert",source="other"}[5m])) ``` @@ -998,7 +998,7 @@ sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^(frontend|s ### Frontend: Workerutil: lsif_indexes dbworker/store stats -#### frontend: workerutil_dbworker_store_codeintel_index_total +#### frontend: workerutil_dbworker_store_total

Store operations every 5m

@@ -1014,13 +1014,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100700` Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_index_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_workerutil_dbworker_store_total{domain='codeintel_index_jobs',job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: workerutil_dbworker_store_codeintel_index_99th_percentile_duration +#### frontend: workerutil_dbworker_store_99th_percentile_duration

Aggregate successful store operation duration distribution over 5m

@@ -1036,13 +1036,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100701` Query: ``` -sum by (le)(rate(src_workerutil_dbworker_store_codeintel_index_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='codeintel_index_jobs',job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: workerutil_dbworker_store_codeintel_index_errors_total +#### frontend: workerutil_dbworker_store_errors_total

Store operation errors every 5m

@@ -1058,13 +1058,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100702` Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_index_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='codeintel_index_jobs',job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: workerutil_dbworker_store_codeintel_index_error_rate +#### frontend: workerutil_dbworker_store_error_rate

Store operation error rate over 5m

@@ -1080,7 +1080,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100703` Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_index_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_codeintel_index_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_workerutil_dbworker_store_codeintel_index_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_workerutil_dbworker_store_errors_total{domain='codeintel_index_jobs',job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_total{domain='codeintel_index_jobs',job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_workerutil_dbworker_store_errors_total{domain='codeintel_index_jobs',job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ``` @@ -1266,7 +1266,7 @@ sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^(fronte ### Frontend: Codeintel: gitserver client -#### frontend: codeintel_gitserver_total +#### frontend: gitserver_client_total

Aggregate client operations every 5m

@@ -1282,13 +1282,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100900` Query: ``` -sum(increase(src_codeintel_gitserver_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: codeintel_gitserver_99th_percentile_duration +#### frontend: gitserver_client_99th_percentile_duration

Aggregate successful client operation duration distribution over 5m

@@ -1304,13 +1304,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100901` Query: ``` -sum by (le)(rate(src_codeintel_gitserver_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: codeintel_gitserver_errors_total +#### frontend: gitserver_client_errors_total

Aggregate client operation errors every 5m

@@ -1326,13 +1326,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100902` Query: ``` -sum(increase(src_codeintel_gitserver_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: codeintel_gitserver_error_rate +#### frontend: gitserver_client_error_rate

Aggregate client operation error rate over 5m

@@ -1348,13 +1348,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100903` Query: ``` -sum(increase(src_codeintel_gitserver_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_codeintel_gitserver_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_codeintel_gitserver_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```
-#### frontend: codeintel_gitserver_total +#### frontend: gitserver_client_total

Client operations every 5m

@@ -1370,13 +1370,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100910` Query: ``` -sum by (op)(increase(src_codeintel_gitserver_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: codeintel_gitserver_99th_percentile_duration +#### frontend: gitserver_client_99th_percentile_duration

99th percentile successful client operation duration over 5m

@@ -1392,13 +1392,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100911` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_gitserver_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) ```
-#### frontend: codeintel_gitserver_errors_total +#### frontend: gitserver_client_errors_total

Client operation errors every 5m

@@ -1414,13 +1414,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100912` Query: ``` -sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```
-#### frontend: codeintel_gitserver_error_rate +#### frontend: gitserver_client_error_rate

Client operation error rate over 5m

@@ -1436,7 +1436,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100913` Query: ``` -sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_codeintel_gitserver_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum by (op)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ``` @@ -1620,17 +1620,17 @@ sum by (op)(increase(src_codeintel_uploadstore_errors_total{job=~"^(frontend|sou
-### Frontend: Codeintel: dependencies service stats +### Frontend: Gitserver: Gitserver Client -#### frontend: codeintel_dependencies_total +#### frontend: gitserver_client_total -

Aggregate service operations every 5m

+

Aggregate client operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101100` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1638,21 +1638,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101100` Query: ``` -sum(increase(src_codeintel_dependencies_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_99th_percentile_duration +#### frontend: gitserver_client_99th_percentile_duration -

Aggregate successful service operation duration distribution over 5m

+

Aggregate successful client operation duration distribution over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1660,21 +1660,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101101` Query: ``` -sum by (le)(rate(src_codeintel_dependencies_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_errors_total +#### frontend: gitserver_client_errors_total -

Aggregate service operation errors every 5m

+

Aggregate client operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101102` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1682,21 +1682,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101102` Query: ``` -sum(increase(src_codeintel_dependencies_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_error_rate +#### frontend: gitserver_client_error_rate -

Aggregate service operation error rate over 5m

+

Aggregate client operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101103` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1704,21 +1704,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101103` Query: ``` -sum(increase(src_codeintel_dependencies_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_codeintel_dependencies_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_codeintel_dependencies_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-#### frontend: codeintel_dependencies_total +#### frontend: gitserver_client_total -

Service operations every 5m

+

Client operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101110` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1726,21 +1726,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101110` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op,scope)(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_99th_percentile_duration +#### frontend: gitserver_client_99th_percentile_duration -

99th percentile successful service operation duration over 5m

+

99th percentile successful client operation duration over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101111` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1748,21 +1748,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101111` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_dependencies_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) ```

-#### frontend: codeintel_dependencies_errors_total +#### frontend: gitserver_client_errors_total -

Service operation errors every 5m

+

Client operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101112` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1770,21 +1770,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101112` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_error_rate +#### frontend: gitserver_client_error_rate -

Service operation error rate over 5m

+

Client operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101113` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1792,23 +1792,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101113` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_codeintel_dependencies_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_codeintel_dependencies_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-### Frontend: Codeintel: dependencies service store stats +### Frontend: Gitserver: Gitserver Repository Service Client -#### frontend: codeintel_dependencies_background_total +#### frontend: gitserver_repositoryservice_client_total -

Aggregate service operations every 5m

+

Aggregate client operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101200` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1816,21 +1816,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101200` Query: ``` -sum(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_gitserver_repositoryservice_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_99th_percentile_duration +#### frontend: gitserver_repositoryservice_client_99th_percentile_duration -

Aggregate successful service operation duration distribution over 5m

+

Aggregate successful client operation duration distribution over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101201` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1838,21 +1838,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101201` Query: ``` -sum by (le)(rate(src_codeintel_dependencies_background_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_gitserver_repositoryservice_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_errors_total +#### frontend: gitserver_repositoryservice_client_errors_total -

Aggregate service operation errors every 5m

+

Aggregate client operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101202` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1860,21 +1860,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101202` Query: ``` -sum(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_error_rate +#### frontend: gitserver_repositoryservice_client_error_rate -

Aggregate service operation error rate over 5m

+

Aggregate client operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101203` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1882,21 +1882,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101203` Query: ``` -sum(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_gitserver_repositoryservice_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-#### frontend: codeintel_dependencies_background_total +#### frontend: gitserver_repositoryservice_client_total -

Service operations every 5m

+

Client operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101210` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1904,21 +1904,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101210` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op,scope)(increase(src_gitserver_repositoryservice_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_99th_percentile_duration +#### frontend: gitserver_repositoryservice_client_99th_percentile_duration -

99th percentile successful service operation duration over 5m

+

99th percentile successful client operation duration over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101211` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1926,21 +1926,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101211` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_dependencies_background_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_repositoryservice_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) ```

-#### frontend: codeintel_dependencies_background_errors_total +#### frontend: gitserver_repositoryservice_client_errors_total -

Service operation errors every 5m

+

Client operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101212` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1948,21 +1948,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101212` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_error_rate +#### frontend: gitserver_repositoryservice_client_error_rate -

Service operation error rate over 5m

+

Client operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101213` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -1970,23 +1970,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101213` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_repositoryservice_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-### Frontend: Codeintel: dependencies service background stats +### Frontend: Batches: dbstore stats -#### frontend: codeintel_dependencies_background_total +#### frontend: batches_dbstore_total -

Aggregate service operations every 5m

+

Aggregate store operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101300` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -1994,21 +1994,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101300` Query: ``` -sum(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_99th_percentile_duration +#### frontend: batches_dbstore_99th_percentile_duration -

Aggregate successful service operation duration distribution over 5m

+

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2016,21 +2016,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101301` Query: ``` -sum by (le)(rate(src_codeintel_dependencies_background_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_errors_total +#### frontend: batches_dbstore_errors_total -

Aggregate service operation errors every 5m

+

Aggregate store operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101302` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2038,21 +2038,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101302` Query: ``` -sum(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_error_rate +#### frontend: batches_dbstore_error_rate -

Aggregate service operation error rate over 5m

+

Aggregate store operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101303` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2060,21 +2060,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101303` Query: ``` -sum(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-#### frontend: codeintel_dependencies_background_total +#### frontend: batches_dbstore_total -

Service operations every 5m

+

Store operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101310` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2082,21 +2082,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101310` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_99th_percentile_duration +#### frontend: batches_dbstore_99th_percentile_duration -

99th percentile successful service operation duration over 5m

+

99th percentile successful store operation duration over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101311` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2104,21 +2104,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101311` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_dependencies_background_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) ```

-#### frontend: codeintel_dependencies_background_errors_total +#### frontend: batches_dbstore_errors_total -

Service operation errors every 5m

+

Store operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101312` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2126,21 +2126,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101312` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_dependencies_background_error_rate +#### frontend: batches_dbstore_error_rate -

Service operation error rate over 5m

+

Store operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101313` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2148,15 +2148,15 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101313` Query: ``` -sum by (op)(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_codeintel_dependencies_background_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_codeintel_dependencies_background_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-### Frontend: Codeintel: lockfiles service stats +### Frontend: Batches: service stats -#### frontend: codeintel_lockfiles_total +#### frontend: batches_service_total

Aggregate service operations every 5m

@@ -2164,7 +2164,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101400` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2172,13 +2172,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101400` Query: ``` -sum(increase(src_codeintel_lockfiles_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_lockfiles_99th_percentile_duration +#### frontend: batches_service_99th_percentile_duration

Aggregate successful service operation duration distribution over 5m

@@ -2186,7 +2186,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101401` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2194,13 +2194,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101401` Query: ``` -sum by (le)(rate(src_codeintel_lockfiles_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_batches_service_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_lockfiles_errors_total +#### frontend: batches_service_errors_total

Aggregate service operation errors every 5m

@@ -2208,7 +2208,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101402` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2216,13 +2216,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101402` Query: ``` -sum(increase(src_codeintel_lockfiles_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_lockfiles_error_rate +#### frontend: batches_service_error_rate

Aggregate service operation error rate over 5m

@@ -2230,7 +2230,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101403` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2238,13 +2238,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101403` Query: ``` -sum(increase(src_codeintel_lockfiles_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_codeintel_lockfiles_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_codeintel_lockfiles_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-#### frontend: codeintel_lockfiles_total +#### frontend: batches_service_total

Service operations every 5m

@@ -2252,7 +2252,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101410` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2260,13 +2260,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101410` Query: ``` -sum by (op)(increase(src_codeintel_lockfiles_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_lockfiles_99th_percentile_duration +#### frontend: batches_service_99th_percentile_duration

99th percentile successful service operation duration over 5m

@@ -2274,7 +2274,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101411` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2282,13 +2282,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101411` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_lockfiles_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_batches_service_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) ```

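Although this panel ships with no alert, the query can be reused as a custom alerting rule; a sketch with a purely illustrative 2-second threshold:

```
# Illustrative only: flag any operation whose p99 duration exceeds 2s.
histogram_quantile(0.99, sum by (le,op)(rate(src_batches_service_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) > 2
```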
-#### frontend: codeintel_lockfiles_errors_total +#### frontend: batches_service_errors_total

Service operation errors every 5m

@@ -2296,7 +2296,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101412` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2304,13 +2304,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101412` Query: ``` -sum by (op)(increase(src_codeintel_lockfiles_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: codeintel_lockfiles_error_rate +#### frontend: batches_service_error_rate

Service operation error rate over 5m

@@ -2318,7 +2318,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101413` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2326,23 +2326,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101413` Query: ``` -sum by (op)(increase(src_codeintel_lockfiles_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_codeintel_lockfiles_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_codeintel_lockfiles_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum by (op)(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-### Frontend: Gitserver: Gitserver Client +### Frontend: Batches: HTTP API File Handler -#### frontend: gitserver_client_total +#### frontend: batches_httpapi_total -

Aggregate graphql operations every 5m

+

Aggregate http handler operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101500` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2350,21 +2350,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101500` Query: ``` -sum(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: gitserver_client_99th_percentile_duration +#### frontend: batches_httpapi_99th_percentile_duration -

Aggregate successful graphql operation duration distribution over 5m

+

Aggregate successful http handler operation duration distribution over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101501` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2372,21 +2372,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101501` Query: ``` -sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_batches_httpapi_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: gitserver_client_errors_total +#### frontend: batches_httpapi_errors_total -

Aggregate graphql operation errors every 5m

+

Aggregate http handler operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101502` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2394,21 +2394,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101502` Query: ``` -sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: gitserver_client_error_rate +#### frontend: batches_httpapi_error_rate -

Aggregate graphql operation error rate over 5m

+

Aggregate http handler operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101503` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2416,21 +2416,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101503` Query: ``` -sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-#### frontend: gitserver_client_total +#### frontend: batches_httpapi_total -

Graphql operations every 5m

+

Http handler operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101510` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2438,21 +2438,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101510` Query: ``` -sum by (op,scope)(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: gitserver_client_99th_percentile_duration +#### frontend: batches_httpapi_99th_percentile_duration -

99th percentile successful graphql operation duration over 5m

+

99th percentile successful http handler operation duration over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101511` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2460,21 +2460,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101511` Query: ``` -histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_batches_httpapi_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) ```

-#### frontend: gitserver_client_errors_total +#### frontend: batches_httpapi_errors_total -

Graphql operation errors every 5m

+

Http handler operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101512` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2482,21 +2482,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101512` Query: ``` -sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (op)(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: gitserver_client_error_rate +#### frontend: batches_httpapi_error_rate -

Graphql operation error rate over 5m

+

Http handler operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101513` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -2504,23 +2504,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101513` Query: ``` -sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_client_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum by (op)(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-### Frontend: Batches: dbstore stats +### Frontend: Out-of-band migrations: up migration invocation (one batch processed) -#### frontend: batches_dbstore_total +#### frontend: oobmigration_total -

Aggregate store operations every 5m

+

Migration handler operations every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101600` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2528,21 +2528,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101600` Query: ``` -sum(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_oobmigration_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

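The `op="up"` matcher restricts this section to upward migrations; the same series carries both directions, so a single hand-run query can compare them (a sketch, not a generated panel):

```
sum by (op)(increase(src_oobmigration_total{op=~"up|down",job=~"^(frontend|sourcegraph-frontend).*"}[5m]))
```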
-#### frontend: batches_dbstore_99th_percentile_duration +#### frontend: oobmigration_99th_percentile_duration -

Aggregate successful store operation duration distribution over 5m

+

Aggregate successful migration handler operation duration distribution over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101601` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2550,21 +2550,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101601` Query: ``` -sum by (le)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum by (le)(rate(src_oobmigration_duration_seconds_bucket{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: batches_dbstore_errors_total +#### frontend: oobmigration_errors_total -

Aggregate store operation errors every 5m

+

Migration handler operation errors every 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101602` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2572,21 +2572,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101602` Query: ``` -sum(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_oobmigration_errors_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: batches_dbstore_error_rate +#### frontend: oobmigration_error_rate -

Aggregate store operation error rate over 5m

+

Migration handler operation error rate over 5m

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101603` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2594,21 +2594,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101603` Query: ``` -sum(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_oobmigration_errors_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_oobmigration_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_oobmigration_errors_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-#### frontend: batches_dbstore_total +### Frontend: Out-of-band migrations: down migration invocation (one batch processed) -

Store operations every 5m

+#### frontend: oobmigration_total + +

Migration handler operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101700` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2616,21 +2618,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101610` Query: ``` -sum by (op)(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_oobmigration_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: batches_dbstore_99th_percentile_duration +#### frontend: oobmigration_99th_percentile_duration -

99th percentile successful store operation duration over 5m

+

Aggregate successful migration handler operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101701` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2638,21 +2640,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101611` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +sum by (le)(rate(src_oobmigration_duration_seconds_bucket{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: batches_dbstore_errors_total +#### frontend: oobmigration_errors_total -

Store operation errors every 5m

+

Migration handler operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101612` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101702` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2660,21 +2662,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101612` Query: ``` -sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(increase(src_oobmigration_errors_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) ```

-#### frontend: batches_dbstore_error_rate +#### frontend: oobmigration_error_rate -

Store operation error rate over 5m

+

Migration handler operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101613` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101703` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -2682,23 +2684,25 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101613` Query: ``` -sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_batches_dbstore_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +sum(increase(src_oobmigration_errors_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_oobmigration_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_oobmigration_errors_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 ```

-### Frontend: Batches: service stats +### Frontend: Zoekt Configuration GRPC server metrics -#### frontend: batches_service_total +#### frontend: zoekt_configuration_grpc_request_rate_all_methods -

Aggregate service operations every 5m

+

Request rate across all methods over 2m

+ +The number of gRPC requests received per second across all methods, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101800` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2706,21 +2710,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101700` Query: ``` -sum(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(rate(grpc_server_started_total{instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) ```

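The `${internalInstance:regex}` and `${zoekt_configuration_method:regex}` tokens in these queries are Grafana dashboard variables, substituted before the query reaches Prometheus. To run one of these queries outside the dashboard, replace the variable with a concrete regex, for example matching every instance:

```
sum(rate(grpc_server_started_total{instance=~".*",grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))
```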
-#### frontend: batches_service_99th_percentile_duration +#### frontend: zoekt_configuration_grpc_request_rate_per_method -

Aggregate successful service operation duration distribution over 5m

+

Request rate per-method over 2m

+ +The number of gRPC requests received per second broken out per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101801` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2728,21 +2734,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101701` Query: ``` -sum by (le)(rate(src_batches_service_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(rate(grpc_server_started_total{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method) ```

-#### frontend: batches_service_errors_total +#### frontend: zoekt_configuration_error_percentage_all_methods -

Aggregate service operation errors every 5m

+

Error percentage across all methods over 2m

+ +The percentage of gRPC requests that fail across all methods, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101702` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101810` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2750,21 +2758,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101702` Query: ``` -sum(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) )) ```

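When this percentage is non-zero, a useful follow-up (built from the same server-side metric, though not itself a generated panel) is to see which non-OK codes dominate:

```
sum by (grpc_method, grpc_code)(rate(grpc_server_handled_total{grpc_code!="OK",grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))
```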
-#### frontend: batches_service_error_rate +#### frontend: zoekt_configuration_grpc_error_percentage_per_method -

Aggregate service operation error rate over 5m

+

Error percentage per-method over 2m

+ +The percentage of gRPC requests that fail per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101703` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101811` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2772,21 +2782,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101703` Query: ``` -sum(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_configuration_method:regex}`,grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method)) )) ```

-#### frontend: batches_service_total +#### frontend: zoekt_configuration_p99_response_time_per_method -

Service operations every 5m

+

99th percentile response time per method over 2m

+ +The 99th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101820` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2794,21 +2806,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101710` Query: ``` -sum by (op)(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

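`histogram_quantile` interpolates within fixed bucket boundaries, so a surprising p99 is worth cross-checking against the mean. Assuming the standard `_sum`/`_count` series that accompany the bucket metric, the mean handling time per method is:

```
sum by (grpc_method)(rate(grpc_server_handling_seconds_sum{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))
/
sum by (grpc_method)(rate(grpc_server_handling_seconds_count{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))
```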
-#### frontend: batches_service_99th_percentile_duration +#### frontend: zoekt_configuration_p90_response_time_per_method -

99th percentile successful service operation duration over 5m

+

90th percentile response time per method over 2m

+ +The 90th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101711` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101821` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2816,21 +2830,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101711` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_batches_service_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-#### frontend: batches_service_errors_total +#### frontend: zoekt_configuration_p75_response_time_per_method -

Service operation errors every 5m

+

75th percentile response time per method over 2m

+ +The 75th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101712` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101822` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2838,21 +2854,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101712` Query: ``` -sum by (op)(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-#### frontend: batches_service_error_rate +#### frontend: zoekt_configuration_p99_9_response_size_per_method -

Service operation error rate over 5m

+

99.9th percentile total response size per method over 2m

+ +The 99.9th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101713` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101830` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2860,23 +2878,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101713` Query: ``` -sum by (op)(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_batches_service_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_batches_service_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-### Frontend: Batches: Workspace execution dbstore +#### frontend: zoekt_configuration_p90_response_size_per_method -#### frontend: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_total +

90th percentile total response size per method over 2m

-

Store operations every 5m

+The 90th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101831` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2884,21 +2902,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101800` Query: ``` -sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-#### frontend: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_99th_percentile_duration +#### frontend: zoekt_configuration_p75_response_size_per_method -

99th percentile successful store operation duration over 5m

+

75th percentile total response size per method over 2m

+ +The 75th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101832` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2906,21 +2926,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101801` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-#### frontend: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total +#### frontend: zoekt_configuration_p99_9_invididual_sent_message_size_per_method -

Store operation errors every 5m

+

99.9th percentile individual sent message size per method over 2m

+
+The 99.9th percentile size of every individual protocol buffer message sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101802` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101840` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2928,21 +2950,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101802` Query: ``` -sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-#### frontend: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_error_rate +#### frontend: zoekt_configuration_p90_invididual_sent_message_size_per_method -

Store operation error rate over 5m

+

90th percentile individual sent message size per method over 2m

+
+The 90th percentile size of every individual protocol buffer message sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101803` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101841` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2950,23 +2974,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101803` Query: ``` -sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-### Frontend: Batches: HTTP API File Handler +#### frontend: zoekt_configuration_p75_invididual_sent_message_size_per_method -#### frontend: batches_httpapi_total +

75th percentile individual sent message size per method over 2m

-

Aggregate http handler operations every 5m

+The 75th percentile size of every individual protocol buffer message sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101900` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101842` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2974,21 +2998,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101900` Query: ``` -sum(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) ```

-#### frontend: batches_httpapi_99th_percentile_duration +#### frontend: zoekt_configuration_grpc_response_stream_message_count_per_method -

Aggregate successful http handler operation duration distribution over 5m

+

Average streaming response message count per-method over 2m

+ +The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101901` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101850` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -2996,21 +3022,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101901` Query: ``` -sum by (le)(rate(src_batches_httpapi_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method))) ```

-#### frontend: batches_httpapi_errors_total +#### frontend: zoekt_configuration_grpc_all_codes_per_method -
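The panel value is a ratio of two counters rather than a tracked average; a worked example of how it arises:

```
# Over a 2m window for one method:
#   grpc_server_msg_sent_total  grew by 30  (stream response messages)
#   grpc_server_started_total   grew by 3   (streaming RPCs started)
# average messages per stream = 30 / 3 = 10
```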

Aggregate http handler operation errors every 5m

+

Response codes rate per-method over 2m

+ +The rate of all generated gRPC response codes per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101902` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101860` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3018,21 +3046,25 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101902` Query: ``` -sum(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method, grpc_code) ```

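The result is one time series per `(grpc_method, grpc_code)` pair. A hypothetical output shape (the method name and values below are illustrative only):

```
# Illustrative result, not real output:
{grpc_method="SearchConfiguration", grpc_code="OK"}        1.25
{grpc_method="SearchConfiguration", grpc_code="Canceled"}  0.01
```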
-#### frontend: batches_httpapi_error_rate +### Frontend: Zoekt Configuration GRPC "internal error" metrics -

Aggregate http handler operation error rate over 5m

+#### frontend: zoekt_configuration_grpc_clients_error_percentage_all_methods + +

Client baseline error percentage across all methods over 2m

+ +The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "zoekt_configuration" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101903` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101900` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3040,21 +3072,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101903` Query: ``` -sum(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +(100.0 * ((((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_code!="OK"}[2m])))) / ((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))))))) ```

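Unlike the `grpc_server_*` series used earlier on this page, `grpc_method_status` appears to be recorded on the client side (an inference from its use under these "clients" panels). Comparing the two rates can show whether failing requests ever reach the server; a sketch:

```
# Client-observed request rate:
sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))
# versus server-reported handled rate:
sum(rate(grpc_server_handled_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))
```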
-#### frontend: batches_httpapi_total +#### frontend: zoekt_configuration_grpc_clients_error_percentage_per_method -

Http handler operations every 5m

+

Client baseline error percentage per-method over 2m

+ +The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_configuration" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101910` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101901` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3062,21 +3096,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101910` Query: ``` -sum by (op)(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +(100.0 * ((((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method)))))) ```

-#### frontend: batches_httpapi_99th_percentile_duration +#### frontend: zoekt_configuration_grpc_clients_all_codes_per_method -

99th percentile successful http handler operation duration over 5m

+

Client baseline response codes rate per-method over 2m

+ +The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_configuration" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101911` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101902` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3084,43 +3120,29 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101911` Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_batches_httpapi_duration_seconds_bucket{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) +(sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```

-#### frontend: batches_httpapi_errors_total +#### frontend: zoekt_configuration_grpc_clients_internal_error_percentage_all_methods -

Http handler operation errors every 5m

+

Client-observed gRPC internal error percentage across all methods over 2m

-This panel has no related alerts.
+The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "zoekt_configuration" clients.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101912` on your Sourcegraph instance.
+**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_configuration" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph's use of gRPC.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
-
-Technical details - -Query: - -``` -sum by (op)(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) -``` -
- -
- -#### frontend: batches_httpapi_error_rate +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. -

Http handler operation error rate over 5m

+**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101913` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101910` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3128,45 +3150,29 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101913` Query: ``` -sum by (op)(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum by (op)(increase(src_batches_httpapi_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum by (op)(increase(src_batches_httpapi_errors_total{job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +(100.0 * ((((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))))))) ```

-### Frontend: Out-of-band migrations: up migration invocation (one batch processed) - -#### frontend: oobmigration_total - -

Migration handler operations every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102000` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details +#### frontend: zoekt_configuration_grpc_clients_internal_error_percentage_per_method -Query: +

Client-observed gRPC internal error percentage per-method over 2m

-``` -sum(increase(src_oobmigration_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) -``` -
+The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "zoekt_configuration" clients. -
+**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_configuration" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. -#### frontend: oobmigration_99th_percentile_duration +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. -

Aggregate successful migration handler operation duration distribution over 5m

+**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101911` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3174,43 +3180,29 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102001` Query: ``` -sum by (le)(rate(src_oobmigration_duration_seconds_bucket{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +(100.0 * ((((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method)))))) ```

-#### frontend: oobmigration_errors_total - -

Migration handler operation errors every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102002` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details +#### frontend: zoekt_configuration_grpc_clients_internal_error_all_codes_per_method -Query: +

Client-observed gRPC internal error response code rate per-method over 2m

-``` -sum(increase(src_oobmigration_errors_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) -``` -
+The rate of gRPC internal-error response codes per method, aggregated across all "zoekt_configuration" clients. -
+**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_configuration" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph's use of gRPC. -#### frontend: oobmigration_error_rate +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. -

Migration handler operation error rate over 5m

+**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102003` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=101912` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3218,45 +3210,25 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102003` Query: ``` -sum(increase(src_oobmigration_errors_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_oobmigration_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_oobmigration_errors_total{op="up",job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +(sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",is_internal_error="true",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```
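When the per-method breakdowns in this section show scattered internal errors, `topk` over the same series can surface the worst offenders directly. A minimal sketch, assuming the `grpc_method_status` metric used by the panels above (the choice of 3 is arbitrary):

```
# Illustrative only: the three methods currently producing the most
# internal-error responses for this service.
topk(3, sum by (grpc_method)(
  rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",is_internal_error="true"}[2m])
))
```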

-### Frontend: Out-of-band migrations: down migration invocation (one batch processed) - -#### frontend: oobmigration_total - -

Migration handler operations every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102100` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_oobmigration_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) -``` -
+### Frontend: Zoekt Configuration GRPC retry metrics -
+#### frontend: zoekt_configuration_grpc_clients_retry_percentage_across_all_methods -#### frontend: oobmigration_99th_percentile_duration +

Client retry percentage across all methods over 2m

-

Aggregate successful migration handler operation duration distribution over 5m

+The percentage of gRPC requests that were retried across all methods, aggregated across all "zoekt_configuration" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3264,21 +3236,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102101` Query: ``` -sum by (le)(rate(src_oobmigration_duration_seconds_bucket{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))))))) ```

-#### frontend: oobmigration_errors_total +#### frontend: zoekt_configuration_grpc_clients_retry_percentage_per_method -

Migration handler operation errors every 5m

+

Client retry percentage per-method over 2m

+The percentage of gRPC requests that were retried, aggregated across all "zoekt_configuration" clients, broken out per method. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102001` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3286,21 +3260,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102102` Query: ``` -sum(increase(src_oobmigration_errors_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",is_retried="true",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method)))))) ```

-#### frontend: oobmigration_error_rate +#### frontend: zoekt_configuration_grpc_clients_retry_count_per_method -

Migration handler operation error rate over 5m

+

Client retry count per-method over 2m

+The count of gRPC requests that were retried, aggregated across all "zoekt_configuration" clients, broken out per method. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102103` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102002` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -3308,15 +3284,15 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102103` Query: ``` -sum(increase(src_oobmigration_errors_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) / (sum(increase(src_oobmigration_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m])) + sum(increase(src_oobmigration_errors_total{op="down",job=~"^(frontend|sourcegraph-frontend).*"}[5m]))) * 100 +(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}",is_retried="true"}[2m])) by (grpc_method)) ```
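The retry panels in this section are all rates and percentages; an absolute count over a longer window can help judge whether retries are frequent enough to matter. A sketch, assuming the same `src_grpc_client_retry_attempts_total` counter and an arbitrary one-hour window:

```
# Illustrative only: approximate number of retried requests over the last
# hour, summed across all methods of this service.
sum(increase(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",is_retried="true"}[1h]))
```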

-### Frontend: Zoekt Configuration GRPC server metrics +### Frontend: Internal Api GRPC server metrics -#### frontend: zoekt_configuration_grpc_request_rate_all_methods +#### frontend: internal_api_grpc_request_rate_all_methods

Request rate across all methods over 2m

@@ -3324,7 +3300,7 @@ The number of gRPC requests received per second across all methods, aggregated a This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102100` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3334,13 +3310,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102200` Query: ``` -sum(rate(grpc_server_started_total{instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) +sum(rate(grpc_server_started_total{instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) ```
-#### frontend: zoekt_configuration_grpc_request_rate_per_method +#### frontend: internal_api_grpc_request_rate_per_method

Request rate per-method over 2m

@@ -3348,7 +3324,7 @@ The number of gRPC requests received per second broken out per method, aggregate This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102101` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3358,13 +3334,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102201` Query: ``` -sum(rate(grpc_server_started_total{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method) +sum(rate(grpc_server_started_total{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method) ```
-#### frontend: zoekt_configuration_error_percentage_all_methods +#### frontend: internal_api_error_percentage_all_methods

Error percentage across all methods over 2m

@@ -3372,7 +3348,7 @@ The percentage of gRPC requests that fail across all methods, aggregated across This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102110` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3382,13 +3358,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102210` Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) )) +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) )) ```
-#### frontend: zoekt_configuration_grpc_error_percentage_per_method +#### frontend: internal_api_grpc_error_percentage_per_method

Error percentage per-method over 2m

@@ -3396,7 +3372,7 @@ The percentage of gRPC requests that fail per method, aggregated across all inst This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102111` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3406,13 +3382,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102211` Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_configuration_method:regex}`,grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method)) )) +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${internal_api_method:regex}`,grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method)) )) ```
-#### frontend: zoekt_configuration_p99_response_time_per_method +#### frontend: internal_api_p99_response_time_per_method

99th percentile response time per method over 2m

@@ -3420,7 +3396,7 @@ The 99th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102220` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102120` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3430,13 +3406,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102220` Query: ``` -histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p90_response_time_per_method +#### frontend: internal_api_p90_response_time_per_method

90th percentile response time per method over 2m

@@ -3444,7 +3420,7 @@ The 90th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102221` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102121` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3454,13 +3430,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102221` Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p75_response_time_per_method +#### frontend: internal_api_p75_response_time_per_method

75th percentile response time per method over 2m

@@ -3468,7 +3444,7 @@ The 75th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102222` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102122` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3478,13 +3454,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102222` Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p99_9_response_size_per_method +#### frontend: internal_api_p99_9_response_size_per_method

99.9th percentile total response size per method over 2m

@@ -3492,7 +3468,7 @@ The 99.9th percentile total per-RPC response size per method, aggregated across This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102230` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102130` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3502,13 +3478,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102230` Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p90_response_size_per_method +#### frontend: internal_api_p90_response_size_per_method

90th percentile total response size per method over 2m

@@ -3516,7 +3492,7 @@ The 90th percentile total per-RPC response size per method, aggregated across al This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102231` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102131` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3526,13 +3502,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102231` Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p75_response_size_per_method +#### frontend: internal_api_p75_response_size_per_method

75th percentile total response size per method over 2m

@@ -3540,7 +3516,7 @@ The 75th percentile total per-RPC response size per method, aggregated across al This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102232` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102132` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3550,13 +3526,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102232` Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p99_9_invididual_sent_message_size_per_method +#### frontend: internal_api_p99_9_invididual_sent_message_size_per_method

99.9th percentile individual sent message size per method over 2m

@@ -3564,7 +3540,7 @@ The 99.9th percentile size of every individual protocol buffer size sent by the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102240` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102140` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3574,13 +3550,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102240` Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p90_invididual_sent_message_size_per_method +#### frontend: internal_api_p90_invididual_sent_message_size_per_method

90th percentile individual sent message size per method over 2m

@@ -3588,7 +3564,7 @@ The 90th percentile size of every individual protocol buffer size sent by the se This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102241` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102141` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3598,13 +3574,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102241` Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_p75_invididual_sent_message_size_per_method +#### frontend: internal_api_p75_invididual_sent_message_size_per_method

75th percentile individual sent message size per method over 2m

@@ -3612,7 +3588,7 @@ The 75th percentile size of every individual protocol buffer size sent by the se This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102242` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102142` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3622,13 +3598,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102242` Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) ```
-#### frontend: zoekt_configuration_grpc_response_stream_message_count_per_method +#### frontend: internal_api_grpc_response_stream_message_count_per_method

Average streaming response message count per-method over 2m

@@ -3636,7 +3612,7 @@ The average number of response messages sent during a streaming RPC method, brok This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102250` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102150` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3646,13 +3622,13 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102250` Query: ``` -((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method))) +((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method))) ```
-#### frontend: zoekt_configuration_grpc_all_codes_per_method +#### frontend: internal_api_grpc_all_codes_per_method

Response codes rate per-method over 2m

@@ -3660,7 +3636,7 @@ The rate of all generated gRPC response codes per method, aggregated across all This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102260` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102160` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3670,23 +3646,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102260` Query: ``` -sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_configuration_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m])) by (grpc_method, grpc_code) +sum(rate(grpc_server_handled_total{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method, grpc_code) ```
-### Frontend: Zoekt Configuration GRPC "internal error" metrics +### Frontend: Internal Api GRPC "internal error" metrics -#### frontend: zoekt_configuration_grpc_clients_error_percentage_all_methods +#### frontend: internal_api_grpc_clients_error_percentage_all_methods

Client baseline error percentage across all methods over 2m

-The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "zoekt_configuration" clients. +The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "internal_api" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102200` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3696,21 +3672,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102300` Query: ``` -(100.0 * ((((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_code!="OK"}[2m])))) / ((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))))))) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))))))) ```
-#### frontend: zoekt_configuration_grpc_clients_error_percentage_per_method +#### frontend: internal_api_grpc_clients_error_percentage_per_method

Client baseline error percentage per-method over 2m

-The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_configuration" clients. +The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "internal_api" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102201` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3720,21 +3696,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102301` Query: ``` -(100.0 * ((((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method)))))) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method)))))) ```
-#### frontend: zoekt_configuration_grpc_clients_all_codes_per_method +#### frontend: internal_api_grpc_clients_all_codes_per_method

Client baseline response codes rate per-method over 2m

-The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_configuration" clients. +The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "internal_api" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102202` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3744,19 +3720,19 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102302` Query: ``` -(sum(rate(grpc_method_status{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",grpc_method=~"${zoekt_configuration_method:regex}"}[2m])) by (grpc_method, grpc_code)) +(sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```
-#### frontend: zoekt_configuration_grpc_clients_internal_error_percentage_all_methods +#### frontend: internal_api_grpc_clients_internal_error_percentage_all_methods

Client-observed gRPC internal error percentage across all methods over 2m

-The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "zoekt_configuration" clients. +The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "internal_api" clients. -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_configuration" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "internal_api" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph's use of gRPC. When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. **Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. @@ -3764,7 +3740,7 @@ When debugging, knowing that a particular error comes from the grpc-go library i This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102210` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
-#### frontend: zoekt_configuration_grpc_clients_internal_error_percentage_per_method +#### frontend: internal_api_grpc_clients_internal_error_percentage_per_method

Client-observed gRPC internal error percentage per-method over 2m

-The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "zoekt_configuration" clients. +The percentage of gRPC requests that appear to fail due to gRPC internal errors per method, aggregated across all "internal_api" clients. -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_configuration" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "internal_api" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph's use of gRPC. When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. **Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. @@ -3794,7 +3770,7 @@ When debugging, knowing that a particular error comes from the grpc-go library i This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102211` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
-#### frontend: zoekt_configuration_grpc_clients_internal_error_all_codes_per_method +#### frontend: internal_api_grpc_clients_internal_error_all_codes_per_method

Client-observed gRPC internal error response code rate per-method over 2m

-The rate of gRPC internal-error response codes per method, aggregated across all "zoekt_configuration" clients. +The rate of gRPC internal-error response codes per method, aggregated across all "internal_api" clients. -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_configuration" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "internal_api" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph's use of gRPC. When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. **Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. @@ -3824,7 +3800,7 @@ When debugging, knowing that a particular error comes from the grpc-go library i This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102212` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
-### Frontend: Zoekt Configuration GRPC retry metrics +### Frontend: Internal Api GRPC retry metrics -#### frontend: zoekt_configuration_grpc_clients_retry_percentage_across_all_methods +#### frontend: internal_api_grpc_clients_retry_percentage_across_all_methods

Client retry percentage across all methods over 2m

-The percentage of gRPC requests that were retried across all methods, aggregated across all "zoekt_configuration" clients. +The percentage of gRPC requests that were retried across all methods, aggregated across all "internal_api" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102300` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -3860,21 +3836,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102400` Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"}[2m]))))))) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"api.internalapi.v1.ConfigService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))))))) ```
-#### frontend: zoekt_configuration_grpc_clients_retry_percentage_per_method +#### frontend: internal_api_grpc_clients_retry_percentage_per_method

Client retry percentage per-method over 2m

-The percentage of gRPC requests that were retried aggregated across all "zoekt_configuration" clients, broken out per method. +The percentage of gRPC requests that were retried, aggregated across all "internal_api" clients, broken out per method. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102301` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
-#### frontend: zoekt_configuration_grpc_clients_retry_count_per_method +#### frontend: internal_api_grpc_clients_retry_count_per_method

Client retry count per-method over 2m

-The count of gRPC requests that were retried aggregated across all "zoekt_configuration" clients, broken out per method +The count of gRPC requests that were retried, aggregated across all "internal_api" clients, broken out per method. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102302` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
-### Frontend: Internal Api GRPC server metrics - -#### frontend: internal_api_grpc_request_rate_all_methods +### Frontend: Internal service requests -

Request rate across all methods over 2m

+#### frontend: internal_indexed_search_error_responses -The number of gRPC requests received per second across all methods, aggregated across all instances. +

Internal indexed search error responses every 5m

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-internal-indexed-search-error-responses) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102400` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -3934,23 +3908,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102500` Query: ``` -sum(rate(grpc_server_started_total{instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) +sum by(code) (increase(src_zoekt_request_duration_seconds_count{code!~"2.."}[5m])) / ignoring(code) group_left sum(increase(src_zoekt_request_duration_seconds_count[5m])) * 100 ```
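Because this panel reports a percentage, low request volume can make it look alarming (one failure out of two requests is 50%). To sanity-check volume, the raw numerator of the ratio above can be inspected on its own; a minimal sketch using the same metric:

```
# Illustrative only: absolute count of non-2xx indexed-search responses
# over the last 5 minutes, broken out by status code.
sum by (code)(increase(src_zoekt_request_duration_seconds_count{code!~"2.."}[5m]))
```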

-#### frontend: internal_api_grpc_request_rate_per_method - -

Request rate per-method over 2m

+#### frontend: internal_unindexed_search_error_responses -The number of gRPC requests received per second broken out per method, aggregated across all instances. +

Internal unindexed search error responses every 5m

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-internal-unindexed-search-error-responses) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102401` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -3958,23 +3930,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102501` Query: ``` -sum(rate(grpc_server_started_total{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method) +sum by(code) (increase(searcher_service_request_total{code!~"2.."}[5m])) / ignoring(code) group_left sum(increase(searcher_service_request_total[5m])) * 100 ```

-#### frontend: internal_api_error_percentage_all_methods - -

Error percentage across all methods over 2m

+#### frontend: 99th_percentile_gitserver_duration -The percentage of gRPC requests that fail across all methods, aggregated across all instances. +

99th percentile successful gitserver query duration over 5m

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-99th-percentile-gitserver-duration) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102410` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -3982,23 +3952,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102510` Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) )) +histogram_quantile(0.99, sum by (le,category)(rate(src_gitserver_request_duration_seconds_bucket{job=~"(sourcegraph-)?frontend"}[5m]))) ```
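Percentiles hide the mean, which is often the easier number to compare against a latency budget. A sketch of the corresponding average, assuming the standard Prometheus histogram `_sum`/`_count` series exist alongside the `_bucket` series used above:

```
# Illustrative only: mean gitserver request duration over 5m, per category.
sum by (category)(rate(src_gitserver_request_duration_seconds_sum{job=~"(sourcegraph-)?frontend"}[5m]))
/
sum by (category)(rate(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend"}[5m]))
```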

-#### frontend: internal_api_grpc_error_percentage_per_method - -

Error percentage per-method over 2m

+#### frontend: gitserver_error_responses -The percentage of gRPC requests that fail per method, aggregated across all instances. +

Gitserver error responses every 5m

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-gitserver-error-responses) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102411` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4006,23 +3974,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102511` Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${internal_api_method:regex}`,grpc_code!="OK",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method)) )) +sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend",code!~"2.."}[5m])) / ignoring(code) group_left sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend"}[5m])) * 100 ```

-#### frontend: internal_api_p99_response_time_per_method - -

99th percentile response time per method over 2m

+#### frontend: observability_test_alert_warning -The 99th percentile response time per method, aggregated across all instances. +

Warning test alert metric

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-observability-test-alert-warning) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102520` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102420` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4030,23 +3996,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102520` Query: ``` -histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +max by(owner) (observability_test_metric_warning) ```

-#### frontend: internal_api_p90_response_time_per_method - -

90th percentile response time per method over 2m

+#### frontend: observability_test_alert_critical -The 90th percentile response time per method, aggregated across all instances. +

Critical test alert metric

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-observability-test-alert-critical) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102521` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102421` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4054,23 +4018,25 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102521` Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +max by(owner) (observability_test_metric_critical) ```

-#### frontend: internal_api_p75_response_time_per_method +### Frontend: Authentication API requests -

75th percentile response time per method over 2m

+#### frontend: sign_in_rate -The 75th percentile response time per method, aggregated across all instances. +

Rate of API requests to sign-in

+ +Rate (QPS) of requests to sign-in This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102522` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102500` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4078,23 +4044,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102522` Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m])) ```

-#### frontend: internal_api_p99_9_response_size_per_method +#### frontend: sign_in_latency_p99 -

99.9th percentile total response size per method over 2m

+

99 percentile of sign-in latency

-The 99.9th percentile total per-RPC response size per method, aggregated across all instances. +99% percentile of sign-in latency This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102530` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102501` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4102,23 +4068,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102530` Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-in",method="post"}[5m])) by (le)) ```

-#### frontend: internal_api_p90_response_size_per_method +#### frontend: sign_in_error_rate -

90th percentile total response size per method over 2m

+

Percentage of sign-in requests by http code

-The 90th percentile total per-RPC response size per method, aggregated across all instances. +Percentage of sign-in requests grouped by http code This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102531` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102502` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4126,23 +4092,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102531` Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))*100 ```
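
Since the panel splits by every status code, a server-error-only view is often easier to triage. A minimal sketch reusing the same counter:

```
# Hedged sketch: share of sign-in requests returning 5xx, as one series.
  sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post",code=~"5.."}[5m]))
/
  sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))
* 100
```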

-#### frontend: internal_api_p75_response_size_per_method +#### frontend: sign_up_rate -

75th percentile total response size per method over 2m

+

Rate of API requests to sign-up

-The 75th percentile total per-RPC response size per method, aggregated across all instances. +Rate (QPS) of requests to sign-up This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102532` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102510` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4150,23 +4116,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102532` Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +sum(irate(src_http_request_duration_seconds_count{route="sign-up",method="post"}[5m])) ```

-#### frontend: internal_api_p99_9_invididual_sent_message_size_per_method +#### frontend: sign_up_latency_p99 -

99.9th percentile individual sent message size per method over 2m

+

99 percentile of sign-up latency

-The 99.9th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +99% percentile of sign-up latency This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102540` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102511` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4174,23 +4140,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102540` Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-up",method="post"}[5m])) by (le)) ```
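
Comparing the tail against the median shows whether latency is uniformly slow or dominated by outliers. A hedged companion query built from the same histogram:

```
# Hedged sketch: median (p50) sign-up latency.
histogram_quantile(0.50, sum by (le) (rate(src_http_request_duration_seconds_bucket{route="sign-up",method="post"}[5m])))
```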

-#### frontend: internal_api_p90_invididual_sent_message_size_per_method +#### frontend: sign_up_code_percentage -

90th percentile individual sent message size per method over 2m

+

Percentage of sign-up requests by http code

-The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +Percentage of sign-up requests grouped by http code This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102541` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102512` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -4198,23 +4164,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102541`

Query:

```
-histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])))
+sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-up",method="post"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-up",method="post"}[5m]))*100
```
</details>

-#### frontend: internal_api_p75_invididual_sent_message_size_per_method +#### frontend: sign_out_rate -

75th percentile individual sent message size per method over 2m

+

Rate of API requests to sign-out

-The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +Rate (QPS) of requests to sign-out This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102542` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102520` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4222,23 +4188,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102542` Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))) +sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m])) ```

-#### frontend: internal_api_grpc_response_stream_message_count_per_method +#### frontend: sign_out_latency_p99 -

Average streaming response message count per-method over 2m

+

99 percentile of sign-out latency

-The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. +99% percentile of sign-out latency This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102550` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102521` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4246,23 +4212,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102550` Query: ``` -((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method))) +histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-out"}[5m])) by (le)) ```

-#### frontend: internal_api_grpc_all_codes_per_method +#### frontend: sign_out_error_rate -

Response codes rate per-method over 2m

+

Percentage of sign-out requests that return non-303 http code

-The rate of all generated gRPC response codes per method, aggregated across all instances. +Percentage of sign-out requests grouped by http code This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102560` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102522` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4270,25 +4236,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102560` Query: ``` -sum(rate(grpc_server_handled_total{grpc_method=~`${internal_api_method:regex}`,instance=~`${internalInstance:regex}`,grpc_service=~"api.internalapi.v1.ConfigService"}[2m])) by (grpc_method, grpc_code) + sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))*100 ```

-### Frontend: Internal Api GRPC "internal error" metrics - -#### frontend: internal_api_grpc_clients_error_percentage_all_methods +#### frontend: account_failed_sign_in_attempts -

Client baseline error percentage across all methods over 2m

+

Rate of failed sign-in attempts

-The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "internal_api" clients. +Failed sign-in attempts per minute This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102530` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4296,23 +4260,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102600` Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))))))) +sum(rate(src_frontend_account_failed_sign_in_attempts_total[1m])) ```
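
When investigating a suspected brute-force attempt, an absolute count over a longer window can be easier to reason about than a per-minute rate. A hedged sketch:

```
# Hedged sketch: total failed sign-in attempts over the last hour.
sum(increase(src_frontend_account_failed_sign_in_attempts_total[1h]))
```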

-#### frontend: internal_api_grpc_clients_error_percentage_per_method +#### frontend: account_lockouts -

Client baseline error percentage per-method over 2m

+

Rate of account lockouts

-The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "internal_api" clients. +Account lockouts per minute This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102531` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4320,23 +4284,25 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102601` Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method)))))) +sum(rate(src_frontend_account_lockouts_total[1m])) ```

-#### frontend: internal_api_grpc_clients_all_codes_per_method +### Frontend: External HTTP Request Rate -

Client baseline response codes rate per-method over 2m

+#### frontend: external_http_request_rate_by_host -The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "internal_api" clients. +

Rate of external HTTP requests by host over 1m

+ +Shows the rate of external HTTP requests made by Sourcegraph to other services, broken down by host. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102602` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102600` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4344,89 +4310,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102602` Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method, grpc_code)) +sum by (host) (rate(src_http_client_external_request_count{host=~`${httpRequestHost:regex}`}[1m])) ```
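
On instances that talk to many hosts this panel can get noisy. A hedged sketch that keeps only the busiest hosts and drops the `${httpRequestHost:regex}` dashboard variable, which only exists inside Grafana:

```
# Hedged sketch: five busiest external hosts by request rate over 5m.
topk(5, sum by (host) (rate(src_http_client_external_request_count[5m])))
```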

-#### frontend: internal_api_grpc_clients_internal_error_percentage_all_methods +#### frontend: external_http_request_rate_by_host_by_code -

Client-observed gRPC internal error percentage across all methods over 2m

- -The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "internal_api" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "internal_api" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +

Rate of external HTTP requests by host and response code over 1m

-**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +Shows the rate of external HTTP requests made by Sourcegraph to other services, broken down by host and response code. This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102610` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))))))) -``` -
- -
- -#### frontend: internal_api_grpc_clients_internal_error_percentage_per_method - -

Client-observed gRPC internal error percentage per-method over 2m

- -The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "internal_api" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "internal_api" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. - -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102611` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method)))))) -``` -
- -
- -#### frontend: internal_api_grpc_clients_internal_error_all_codes_per_method - -

Client-observed gRPC internal error response code rate per-method over 2m

- -The rate of gRPC internal-error response codes per method, aggregated across all "internal_api" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "internal_api" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. - -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102612` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4434,25 +4334,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102612` Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"api.internalapi.v1.ConfigService",is_internal_error="true",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method, grpc_code)) +sum by (host, status_code) (rate(src_http_client_external_request_count{host=~`${httpRequestHost:regex}`}[1m])) ```

-### Frontend: Internal Api GRPC retry metrics +### Frontend: Cody API requests -#### frontend: internal_api_grpc_clients_retry_percentage_across_all_methods +#### frontend: cody_api_rate -

Client retry percentage across all methods over 2m

+

Rate of API requests to cody endpoints (excluding GraphQL)

-The percentage of gRPC requests that were retried across all methods, aggregated across all "internal_api" clients. +Rate (QPS) of requests to cody related endpoints. completions.stream is for the conversational endpoints. completions.code is for the code auto-complete endpoints. This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102700` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -4460,23 +4359,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102700` Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"api.internalapi.v1.ConfigService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"api.internalapi.v1.ConfigService"}[2m]))))))) +sum by (route, code)(irate(src_http_request_duration_seconds_count{route=~"^completions.*"}[5m])) ```
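
To separate load from failures, the same counter can be restricted to non-2xx responses. A minimal sketch, assuming the `code` label shown in the documented query above:

```
# Hedged sketch: non-2xx responses on the Cody completions endpoints.
sum by (route) (irate(src_http_request_duration_seconds_count{route=~"^completions.*",code!~"2.."}[5m]))
```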

-#### frontend: internal_api_grpc_clients_retry_percentage_per_method +### Frontend: Cloud KMS and cache -

Client retry percentage per-method over 2m

+#### frontend: cloudkms_cryptographic_requests -The percentage of gRPC requests that were retried aggregated across all "internal_api" clients, broken out per method. +

Cryptographic requests to Cloud KMS every 1m

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-cloudkms-cryptographic-requests) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102800` on your Sourcegraph instance. -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -4484,90 +4383,22 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102701` Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"api.internalapi.v1.ConfigService",is_retried="true",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}"}[2m])) by (grpc_method)))))) +sum(increase(src_cloudkms_cryptographic_total[1m])) ```
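
When tuning the two alerts on this panel, it can help to look at request volume over a longer window first. A hedged sketch, e.g. for rough cost estimates:

```
# Hedged sketch: approximate Cloud KMS request volume per day.
sum(increase(src_cloudkms_cryptographic_total[1d]))
```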

-#### frontend: internal_api_grpc_clients_retry_count_per_method +#### frontend: encryption_cache_hit_ratio -

Client retry count per-method over 2m

+

Average encryption cache hit ratio per workload

-The count of gRPC requests that were retried aggregated across all "internal_api" clients, broken out per method +- Encryption cache hit ratio (hits/(hits+misses)) - minimum across all instances of a workload. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102702` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"api.internalapi.v1.ConfigService",grpc_method=~"${internal_api_method:regex}",is_retried="true"}[2m])) by (grpc_method)) -``` -
- -
- -### Frontend: Internal service requests - -#### frontend: internal_indexed_search_error_responses - -

Internal indexed search error responses every 5m

- -Refer to the [alerts reference](alerts#frontend-internal-indexed-search-error-responses) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102800` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* - -
-Technical details - -Query: - -``` -sum by(code) (increase(src_zoekt_request_duration_seconds_count{code!~"2.."}[5m])) / ignoring(code) group_left sum(increase(src_zoekt_request_duration_seconds_count[5m])) * 100 -``` -
- -
- -#### frontend: internal_unindexed_search_error_responses - -

Internal unindexed search error responses every 5m

- -Refer to the [alerts reference](alerts#frontend-internal-unindexed-search-error-responses) for 1 alert related to this panel. - To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102801` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* - -
-Technical details - -Query: - -``` -sum by(code) (increase(searcher_service_request_total{code!~"2.."}[5m])) / ignoring(code) group_left sum(increase(searcher_service_request_total[5m])) * 100 -``` -
- -
- -#### frontend: 99th_percentile_gitserver_duration - -

99th percentile successful gitserver query duration over 5m

- -Refer to the [alerts reference](alerts#frontend-99th-percentile-gitserver-duration) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102810` on your Sourcegraph instance. - *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
@@ -4576,19 +4407,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102810` Query: ``` -histogram_quantile(0.99, sum by (le,category)(rate(src_gitserver_request_duration_seconds_bucket{job=~"(sourcegraph-)?frontend"}[5m]))) +min by (kubernetes_name) (src_encryption_cache_hit_total/(src_encryption_cache_hit_total+src_encryption_cache_miss_total)) ```

-#### frontend: gitserver_error_responses +#### frontend: encryption_cache_evictions -

Gitserver error responses every 5m

+

Rate of encryption cache evictions - sum across all instances of a given workload

-Refer to the [alerts reference](alerts#frontend-gitserver-error-responses) for 1 alert related to this panel. +- Rate of encryption cache evictions (caused by cache exceeding its maximum size) - sum across all instances of a workload + +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102802` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -4598,41 +4431,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102811` Query: ``` -sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend",code!~"2.."}[5m])) / ignoring(code) group_left sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend"}[5m])) * 100 +sum by (kubernetes_name) (irate(src_encryption_cache_eviction_total[5m])) ```
-#### frontend: observability_test_alert_warning - -

Warning test alert metric

- -Refer to the [alerts reference](alerts#frontend-observability-test-alert-warning) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102820` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* - -
-Technical details - -Query: - -``` -max by(owner) (observability_test_metric_warning) -``` -
+### Frontend: Periodic Goroutines -
+#### frontend: running_goroutines -#### frontend: observability_test_alert_critical +

Number of currently running periodic goroutines

-

Critical test alert metric

+

+The number of currently running periodic goroutines by name and job.
+A value of 0 indicates the routine isn't running currently; it awaits its next schedule.

-Refer to the [alerts reference](alerts#frontend-observability-test-alert-critical) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102821` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102900` on your Sourcegraph instance.

*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*

<details>
Technical details

@@ -4642,25 +4458,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102821`

Query:

```
-max by(owner) (observability_test_metric_critical)
+sum by (name, job_name) (src_periodic_goroutine_running{job=~".*frontend.*"})
```
</details>
-### Frontend: Authentication API requests +#### frontend: goroutine_success_rate -#### frontend: sign_in_rate +

Success rate for periodic goroutine executions

-

Rate of API requests to sign-in

- -Rate (QPS) of requests to sign-in +The rate of successful executions of each periodic goroutine. +A low or zero value could indicate that a routine is stalled or encountering errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102901` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4668,23 +4483,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102900` Query: ``` -sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m])) +sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*frontend.*"}[5m])) ```
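
A routine that stops executing entirely will not show up as an error. A hedged sketch that surfaces such routines from the same counter:

```
# Hedged sketch: periodic goroutines with zero executions over the last 5m;
# sustained zeros may indicate a stalled routine rather than a quiet schedule.
sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*frontend.*"}[5m])) == 0
```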

-#### frontend: sign_in_latency_p99 +#### frontend: goroutine_error_rate -

99 percentile of sign-in latency

+

Error rate for periodic goroutine executions

-99% percentile of sign-in latency +The rate of errors encountered by each periodic goroutine. +A sustained high error rate may indicate a problem with the routine`s configuration or dependencies. -This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-goroutine-error-rate) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102901` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102910` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4692,23 +4508,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102901` Query: ``` -histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-in",method="post"}[5m])) by (le)) +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*frontend.*"}[5m])) ```

-#### frontend: sign_in_error_rate +#### frontend: goroutine_error_percentage -

Percentage of sign-in requests by http code

+

Percentage of periodic goroutine executions that result in errors

-Percentage of sign-in requests grouped by http code +The percentage of executions that result in errors for each periodic goroutine. +A value above 5% indicates that a significant portion of routine executions are failing. -This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-goroutine-error-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102902` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102911` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4716,23 +4533,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102902` Query: ``` -sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))*100 +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*frontend.*"}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*frontend.*"}[5m]) > 0) * 100 ```
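
Rather than scanning every series for the 5% threshold, the worst offenders can be listed directly. A hedged sketch using the same error counter:

```
# Hedged sketch: top 10 periodic goroutines by error rate over 5m.
topk(10, sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*frontend.*"}[5m])))
```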

-#### frontend: sign_up_rate +#### frontend: goroutine_handler_duration -

Rate of API requests to sign-up

+

95th percentile handler execution time

-Rate (QPS) of requests to sign-up +The 95th percentile execution time for each periodic goroutine handler. +Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102910` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102920` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4740,23 +4558,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102910` Query: ``` -sum(irate(src_http_request_duration_seconds_count{route="sign-up",method="post"}[5m])) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_duration_seconds_bucket{job=~".*frontend.*"}[5m]))) ```
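
The p95 hides the typical case. Assuming the histogram also exposes the conventional `_sum` and `_count` series, the mean execution time can be sketched as:

```
# Hedged sketch: mean handler execution time per routine (assumes the
# standard _sum/_count series of this histogram exist).
  sum by (name, job_name) (rate(src_periodic_goroutine_duration_seconds_sum{job=~".*frontend.*"}[5m]))
/
  sum by (name, job_name) (rate(src_periodic_goroutine_duration_seconds_count{job=~".*frontend.*"}[5m]))
```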

-#### frontend: sign_up_latency_p99 +#### frontend: goroutine_loop_duration -

99 percentile of sign-up latency

+

95th percentile loop cycle time

-99% percentile of sign-up latency +The 95th percentile loop cycle time for each periodic goroutine (excluding sleep time). +This represents how long a complete loop iteration takes before sleeping for the next interval. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102911` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102921` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4764,23 +4583,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102911` Query: ``` -histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-up",method="post"}[5m])) by (le)) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_loop_duration_seconds_bucket{job=~".*frontend.*"}[5m]))) ```

-#### frontend: sign_up_code_percentage +#### frontend: tenant_processing_duration -

Percentage of sign-up requests by http code

+

95th percentile tenant processing time

-Percentage of sign-up requests grouped by http code +The 95th percentile processing time for individual tenants within periodic goroutines. +Higher values indicate that tenant processing is taking longer and may affect overall performance. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102912` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102930` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4788,23 +4608,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102912` Query: ``` -sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-up",method="post"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))*100 +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_tenant_duration_seconds_bucket{job=~".*frontend.*"}[5m]))) ```

-#### frontend: sign_out_rate +#### frontend: tenant_processing_max -

Rate of API requests to sign-out

+

Maximum tenant processing time

-Rate (QPS) of requests to sign-out +The maximum processing time for individual tenants within periodic goroutines. +Consistently high values might indicate problematic tenants or inefficient processing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102920` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102931` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4812,23 +4633,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102920` Query: ``` -sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m])) +max by (name, job_name) (rate(src_periodic_goroutine_tenant_duration_seconds_sum{job=~".*frontend.*"}[5m]) / rate(src_periodic_goroutine_tenant_duration_seconds_count{job=~".*frontend.*"}[5m])) ```

-#### frontend: sign_out_latency_p99 +#### frontend: tenant_count -

99 percentile of sign-out latency

+

Number of tenants processed per routine

-99% percentile of sign-out latency +The number of tenants processed by each periodic goroutine. +Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102921` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102940` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4836,23 +4658,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102921` Query: ``` -histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-out"}[5m])) by (le)) +max by (name, job_name) (src_periodic_goroutine_tenant_count{job=~".*frontend.*"}) ```

-#### frontend: sign_out_error_rate +#### frontend: tenant_success_rate -

Percentage of sign-out requests that return non-303 http code

+

Rate of successful tenant processing operations

-Percentage of sign-out requests grouped by http code +The rate of successful tenant processing operations. +A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102922` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102941` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4860,23 +4683,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102922` Query: ``` - sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))*100 +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*frontend.*"}[5m])) ```

-#### frontend: account_failed_sign_in_attempts +#### frontend: tenant_error_rate -

Rate of failed sign-in attempts

+

Rate of tenant processing errors

-Failed sign-in attempts per minute +The rate of tenant processing operations that result in errors. +Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102930` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102950` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4884,23 +4708,24 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102930` Query: ``` -sum(rate(src_frontend_account_failed_sign_in_attempts_total[1m])) +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*frontend.*"}[5m])) ```

-#### frontend: account_lockouts +#### frontend: tenant_error_percentage -

Rate of account lockouts

+

Percentage of tenant operations resulting in errors

-Account lockouts per minute +The percentage of tenant operations that result in errors. +Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102931` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102951` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4908,25 +4733,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102931` Query: ``` -sum(rate(src_frontend_account_lockouts_total[1m])) +(sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*frontend.*"}[5m])) / (sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*frontend.*"}[5m])) + sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*frontend.*"}[5m])))) * 100 ```

-### Frontend: External HTTP Request Rate - -#### frontend: external_http_request_rate_by_host +### Frontend: Database connections -

Rate of external HTTP requests by host over 1m

+#### frontend: max_open_conns -Shows the rate of external HTTP requests made by Sourcegraph to other services, broken down by host. +

Maximum open

This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4934,23 +4757,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103000` Query: ``` -sum by (host) (rate(src_http_client_external_request_count{host=~`${httpRequestHost:regex}`}[1m])) +sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="frontend"}) ```
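
The maximum is most meaningful relative to what is actually open. A hedged sketch combining this gauge with the `Established` panel that follows:

```
# Hedged sketch: connection pool utilization as a percentage.
  sum by (app_name, db_name) (src_pgsql_conns_open{app_name="frontend"})
/
  sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="frontend"})
* 100
```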

-#### frontend: external_http_request_rate_by_host_by_code - -

Rate of external HTTP requests by host and response code over 1m

+#### frontend: open_conns -Shows the rate of external HTTP requests made by Sourcegraph to other services, broken down by host and response code. +

Established

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103001` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4958,24 +4779,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103010` Query: ``` -sum by (host, status_code) (rate(src_http_client_external_request_count{host=~`${httpRequestHost:regex}`}[1m])) +sum by (app_name, db_name) (src_pgsql_conns_open{app_name="frontend"}) ```

-### Frontend: Cody API requests - -#### frontend: cody_api_rate - -

Rate of API requests to cody endpoints (excluding GraphQL)

+#### frontend: in_use -Rate (QPS) of requests to cody related endpoints. completions.stream is for the conversational endpoints. completions.code is for the code auto-complete endpoints. +

Used

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103010` on your Sourcegraph instance. +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -4983,23 +4801,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103100` Query: ``` -sum by (route, code)(irate(src_http_request_duration_seconds_count{route=~"^completions.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="frontend"}) ```

-### Frontend: Cloud KMS and cache - -#### frontend: cloudkms_cryptographic_requests +#### frontend: idle -

Cryptographic requests to Cloud KMS every 1m

+

Idle

-Refer to the [alerts reference](alerts#frontend-cloudkms-cryptographic-requests) for 2 alerts related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103011` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -5007,23 +4823,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103200` Query: ``` -sum(increase(src_cloudkms_cryptographic_total[1m])) +sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="frontend"}) ```

-#### frontend: encryption_cache_hit_ratio - -

Average encryption cache hit ratio per workload

+#### frontend: mean_blocked_seconds_per_conn_request -- Encryption cache hit ratio (hits/(hits+misses)) - minimum across all instances of a workload. +

Mean blocked seconds per conn request

-This panel has no related alerts. +Refer to the [alerts reference](alerts#frontend-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103020` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -5031,23 +4845,21 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103201` Query: ``` -min by (kubernetes_name) (src_encryption_cache_hit_total/(src_encryption_cache_hit_total+src_encryption_cache_miss_total)) +sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="frontend"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="frontend"}[5m])) ```
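
A high mean can come from a few long waits or from many short ones. A hedged sketch that surfaces the raw wait count for the same window:

```
# Hedged sketch: number of connection requests that had to wait, per 5m.
sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="frontend"}[5m]))
```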

-#### frontend: encryption_cache_evictions - -

Rate of encryption cache evictions - sum across all instances of a given workload

+#### frontend: closed_max_idle -- Rate of encryption cache evictions (caused by cache exceeding its maximum size) - sum across all instances of a workload +

Closed by SetMaxIdleConns

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103030` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -5055,21 +4867,19 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103202` Query: ``` -sum by (kubernetes_name) (irate(src_encryption_cache_eviction_total[5m])) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="frontend"}[5m])) ```

-### Frontend: Database connections - -#### frontend: max_open_conns +#### frontend: closed_max_lifetime -

Maximum open

+

Closed by SetConnMaxLifetime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103031` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5079,19 +4889,19 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103300` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="frontend"}) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="frontend"}[5m])) ```
-#### frontend: open_conns +#### frontend: closed_max_idle_time -

Established

+

Closed by SetConnMaxIdleTime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103032` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5101,21 +4911,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103301` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_open{app_name="frontend"}) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="frontend"}[5m])) ```
-#### frontend: in_use +### Frontend: (frontend|sourcegraph-frontend) (CPU, Memory) -

Used

+#### frontend: cpu_usage_percentage -This panel has no related alerts. +

CPU usage

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103310` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#frontend-cpu-usage-percentage) for 1 alert related to this panel. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103100` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -5123,21 +4935,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103310` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="frontend"}) +cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend).*"} ```

-#### frontend: idle +#### frontend: memory_usage_percentage -

Idle

+

Memory usage percentage (total)

+ +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -5145,21 +4959,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103311` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="frontend"}) +cadvisor_container_memory_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend).*"} ```

-#### frontend: mean_blocked_seconds_per_conn_request +#### frontend: memory_working_set_bytes -

Mean blocked seconds per conn request

+

Memory usage bytes (total)

-Refer to the [alerts reference](alerts#frontend-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103320` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103102` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -5167,21 +4983,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103320` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="frontend"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="frontend"}[5m])) +max by (name) (container_memory_working_set_bytes{name=~"^(frontend|sourcegraph-frontend).*"}) ```

-#### frontend: closed_max_idle +#### frontend: memory_rss -

Closed by SetMaxIdleConns

+

Memory (RSS)

-This panel has no related alerts. +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103330` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#frontend-memory-rss) for 1 alert related to this panel. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103110` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -5189,21 +5007,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103330` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="frontend"}[5m])) +max(container_memory_rss{name=~"^(frontend|sourcegraph-frontend).*"} / container_spec_memory_limit_bytes{name=~"^(frontend|sourcegraph-frontend).*"}) by (name) * 100.0 ```
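The query above reports RSS as a percentage of the container limit. When correlating with OOM kill events, an absolute-bytes variant of the same metric can be easier to read; a sketch:

```
max by (name) (container_memory_rss{name=~"^(frontend|sourcegraph-frontend).*"})
```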

-#### frontend: closed_max_lifetime +#### frontend: memory_total_active_file -

Closed by SetConnMaxLifetime

+

Memory usage (active file)

+ +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103331` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103111` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -5211,21 +5031,23 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103331` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="frontend"}[5m])) +max(container_memory_total_active_file_bytes{name=~"^(frontend|sourcegraph-frontend).*"} / container_spec_memory_limit_bytes{name=~"^(frontend|sourcegraph-frontend).*"}) by (name) * 100.0 ```

-#### frontend: closed_max_idle_time +#### frontend: memory_kernel_usage -

Closed by SetConnMaxIdleTime

+

Memory usage (kernel)

+ +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103332` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103112` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -5233,7 +5055,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103332` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="frontend"}[5m])) +max(container_memory_kernel_usage{name=~"^(frontend|sourcegraph-frontend).*"} / container_spec_memory_limit_bytes{name=~"^(frontend|sourcegraph-frontend).*"}) by (name) * 100.0 ```
@@ -5257,7 +5079,7 @@ value change independent of deployment events (such as an upgrade), it could ind This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103200` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5279,7 +5101,7 @@ count by(name) ((time() - container_last_seen{name=~"^(frontend|sourcegraph-fron Refer to the [alerts reference](alerts#frontend-container-cpu-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103201` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5301,7 +5123,7 @@ cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend|sourcegraph-fron Refer to the [alerts reference](alerts#frontend-container-memory-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103202` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5326,7 +5148,7 @@ When extremely high, this can indicate a resource usage problem, or can cause pr This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103403` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103203` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5350,7 +5172,7 @@ sum by(name) (rate(container_fs_reads_total{name=~"^(frontend|sourcegraph-fronte Refer to the [alerts reference](alerts#frontend-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103300` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5372,7 +5194,7 @@ quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^(f Refer to the [alerts reference](alerts#frontend-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103301` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5394,7 +5216,7 @@ max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^(frontend Refer to the [alerts reference](alerts#frontend-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. 
-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103310` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5416,7 +5238,7 @@ max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend|so Refer to the [alerts reference](alerts#frontend-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103311` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5441,7 +5263,7 @@ When it occurs frequently, it is an indicator of underprovisioning. Refer to the [alerts reference](alerts#frontend-container-oomkill-events-total) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103312` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5467,7 +5289,7 @@ A high value here indicates a possible goroutine leak. Refer to the [alerts reference](alerts#frontend-go-goroutines) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103400` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5489,7 +5311,7 @@ max by(instance) (go_goroutines{job=~".*(frontend|sourcegraph-frontend)"}) Refer to the [alerts reference](alerts#frontend-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103401` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5513,7 +5335,7 @@ max by(instance) (go_gc_duration_seconds{job=~".*(frontend|sourcegraph-frontend) Refer to the [alerts reference](alerts#frontend-pods-available-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103500` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5539,7 +5361,7 @@ The total number of search clicks across all search types over a 6 hour window. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103600` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -5563,7 +5385,7 @@ The percent of clicks that were on the top search result, excluding searches wit This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103601` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -5587,7 +5409,7 @@ The percent of clicks that were on the first 3 search results, excluding searche This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103802` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103602` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -5611,7 +5433,7 @@ The distribution of clicked search results by result type. At every point in tim This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103810` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103610` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -5635,7 +5457,7 @@ The percent of Zoekt searches that hit the flush time limit. These searches don` This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103611` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -5659,7 +5481,7 @@ sum(increase(zoekt_final_aggregate_size_count{reason="timer_expired"}[1d])) / su Refer to the [alerts reference](alerts#frontend-email-delivery-failures) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103700` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5683,7 +5505,7 @@ Total emails successfully delivered. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103910` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103710` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5707,7 +5529,7 @@ Emails successfully delivered by source, i.e. product feature. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103911` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103711` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -5733,7 +5555,7 @@ Mean search duration for all successful sentinel queries Refer to the [alerts reference](alerts#frontend-mean-successful-sentinel-duration-over-2h) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103800` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5757,7 +5579,7 @@ Mean time to first result for all successful streaming sentinel queries Refer to the [alerts reference](alerts#frontend-mean-sentinel-stream-latency-over-2h) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103801` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5781,7 +5603,7 @@ sum(rate(src_search_streaming_latency_seconds_sum{source=~"searchblitz.*"}[2h])) Refer to the [alerts reference](alerts#frontend-90th-percentile-successful-sentinel-duration-over-2h) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103810` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5805,7 +5627,7 @@ histogram_quantile(0.90, sum by (le)(label_replace(rate(src_search_response_late Refer to the [alerts reference](alerts#frontend-90th-percentile-sentinel-stream-latency-over-2h) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103811` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5829,7 +5651,7 @@ Mean search duration for successful sentinel queries, broken down by query. Usef This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104020` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103820` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5853,7 +5675,7 @@ Mean time to first result for successful streaming sentinel queries, broken down This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104021` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103821` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5877,7 +5699,7 @@ sum(rate(src_search_streaming_latency_seconds_sum{source=~"searchblitz.*"}[$sent This panel has no related alerts. 
-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104030` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103830` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5901,7 +5723,7 @@ histogram_quantile(0.90, sum(rate(src_search_response_latency_seconds_bucket{sou This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104031` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103831` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5925,7 +5747,7 @@ histogram_quantile(0.90, sum(rate(src_search_streaming_latency_seconds_bucket{so This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104040` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103840` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5949,7 +5771,7 @@ histogram_quantile(0.90, sum(rate(src_search_response_latency_seconds_bucket{sou This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104050` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103850` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5973,7 +5795,7 @@ histogram_quantile(0.75, sum(rate(src_search_response_latency_seconds_bucket{sou This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104051` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103851` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -5997,7 +5819,7 @@ histogram_quantile(0.75, sum(rate(src_search_streaming_latency_seconds_bucket{so This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104060` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103860` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6021,7 +5843,7 @@ The rate of unsuccessful sentinel queries, broken down by failure type. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104070` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103870` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6051,7 +5873,7 @@ p95 response time to incoming webhook requests from code hosts. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104100` on your Sourcegraph instance. 
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=103900` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6075,7 +5897,7 @@ histogram_quantile(0.95, sum (rate(src_http_request_duration_seconds_bucket{rou This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6097,7 +5919,7 @@ sum(increase(src_insights_aggregations_total{job=~"^(frontend|sourcegraph-fronte This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104001` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6119,7 +5941,7 @@ sum by (le)(rate(src_insights_aggregations_duration_seconds_bucket{job=~"^(fron This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104002` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6141,7 +5963,7 @@ sum(increase(src_insights_aggregations_errors_total{job=~"^(frontend|sourcegraph This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104203` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104003` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6163,7 +5985,7 @@ sum(increase(src_insights_aggregations_errors_total{job=~"^(frontend|sourcegraph This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104010` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6185,7 +6007,7 @@ sum by (op,extended_mode)(increase(src_insights_aggregations_total{job=~"^(front This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104011` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6207,7 +6029,7 @@ histogram_quantile(0.99, sum by (le,op,extended_mode)(rate(src_insights_aggrega This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104012` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6229,7 +6051,7 @@ sum by (op,extended_mode)(increase(src_insights_aggregations_errors_total{job=~" This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104213` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=104013` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -6275,6 +6097,30 @@ go_goroutines{app="gitserver", instance=~`${shard:regex}`}
+#### gitserver: disk_space_remaining + +

Disk space remaining

+ +Indicates disk space remaining for each gitserver instance. When disk space is low, gitserver may experience slowdowns or fail to fetch repositories. + +Refer to the [alerts reference](alerts#gitserver-disk-space-remaining) for 2 alerts related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100001` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +
+Technical details + +Query: + +``` +(src_gitserver_disk_space_available{instance=~`${shard:regex}`} / src_gitserver_disk_space_total{instance=~`${shard:regex}`}) * 100 +``` +
+ +
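For an ad-hoc check, the same query can be turned into a threshold expression. The 10% cutoff below is illustrative only (the authoritative thresholds live in the alerts reference); `${shard:regex}` is the dashboard's shard variable, so substitute a concrete regex such as `.*` when querying Prometheus directly:

```
(src_gitserver_disk_space_available{instance=~`${shard:regex}`} / src_gitserver_disk_space_total{instance=~`${shard:regex}`}) * 100 < 10
```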
+ #### gitserver: cpu_throttling_time

Container CPU throttling time %

@@ -6326,13 +6172,13 @@ sum by (container_label_io_kubernetes_pod_name) (rate(container_cpu_usage_second
-#### gitserver: disk_space_remaining +#### gitserver: memory_major_page_faults -

Disk space remaining

+

Gitserver page faults

-Indicates disk space remaining for each gitserver instance, which is used to determine when to start evicting least-used repository clones from disk (default 10%, configured by `SRC_REPOS_DESIRED_PERCENT_FREE`). +The number of major page faults in a 5 minute window for gitserver. If this number increases significantly, it indicates that more git API calls need to load data from disk. There may not be enough memory to efficiently support the number of API requests served concurrently. -Refer to the [alerts reference](alerts#gitserver-disk-space-remaining) for 2 alerts related to this panel. +This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100020` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+ +#### gitserver: high_memory_git_commands + +

Number of git commands that exceeded the threshold for high memory usage

+ +This graph tracks the number of git subcommands that gitserver ran that exceeded the threshold for high memory usage. +This graph is not itself an alert, but it is used to learn about the memory usage of gitserver. + +If gitserver frequently serves requests where the status code is KILLED, this graph might help correlate that +with the high memory usage. + +Spikes in this graph are not necessarily a problem. But when subcommands or the whole gitserver service are getting +OOM killed and this graph shows spikes, increasing the memory might help. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100021` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +
+Technical details + +Query: + +``` +sort_desc(sum(sum_over_time(src_gitserver_exec_high_memory_usage_count{instance=~`${shard:regex}`}[2m])) by (cmd)) ```
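Since the query groups by `cmd`, it can be narrowed to a single git subcommand when chasing a specific offender. The `pack-objects` value below is only a hypothetical example; actual label values depend on your workload:

```
sum(sum_over_time(src_gitserver_exec_high_memory_usage_count{instance=~`${shard:regex}`, cmd="pack-objects"}[2m]))
```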
@@ -6404,7 +6281,7 @@ sum by (cmd) (rate(src_gitserver_exec_duration_seconds_count{instance=~`${shard: -Refer to the [alerts reference](alerts#gitserver-echo-command-duration-test) for 2 alerts related to this panel. +Refer to the [alerts reference](alerts#gitserver-echo-command-duration-test) for 1 alert related to this panel. To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100040` on your Sourcegraph instance. @@ -6470,13 +6347,13 @@ sum(src_gitserver_clone_queue)
-#### gitserver: src_gitserver_repo_count +#### gitserver: src_gitserver_client_concurrent_requests -

Number of repositories on gitserver

+

Number of concurrent requests running against gitserver client

-This metric is only for informational purposes. It indicates the total number of repositories on gitserver. +This metric is only for informational purposes. It indicates the current number of concurrently running requests by process against gitserver gRPC. -It does not indicate any problems with the instance. +It does not indicate any problems with the instance, but can give a good indication of load spikes or request throttling. This panel has no related alerts. @@ -6490,7 +6367,7 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10005 Query: ``` -src_gitserver_repo_count +sum by (job, instance) (src_gitserver_client_concurrent_requests) ``` @@ -6498,9 +6375,9 @@ src_gitserver_repo_count ### Git Server: Gitservice for internal cloning -#### gitserver: aggregate_gitservice_request_duration +#### gitserver: gitservice_request_duration -

95th percentile gitservice request duration aggregate

+

95th percentile gitservice request duration per shard

A high value means any internal service trying to clone a repo from gitserver is slowed down. @@ -6516,17 +6393,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10010 Query: ``` -histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`false`}[5m])) by (le)) +histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{instance=~`${shard:regex}`}[5m])) by (le, gitservice)) ```
-#### gitserver: gitservice_request_duration +#### gitserver: gitservice_request_rate -

95th percentile gitservice request duration per shard

+

Gitservice request rate per shard

-A high value means any internal service trying to clone a repo from gitserver is slowed down. +Per shard gitservice request rate This panel has no related alerts. @@ -6540,21 +6417,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10010 Query: ``` -histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`false`, instance=~`${shard:regex}`}[5m])) by (le, instance)) +sum(rate(src_gitserver_gitservice_duration_seconds_count{instance=~`${shard:regex}`}[5m])) by (gitservice) ```
-#### gitserver: aggregate_gitservice_error_request_duration +#### gitserver: gitservice_requests_running -

95th percentile gitservice error request duration aggregate

+

Gitservice requests running per shard

-95th percentile gitservice error request duration aggregate +Per shard gitservice requests running This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100102` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6564,21 +6441,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10011 Query: ``` -histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`true`}[5m])) by (le)) +sum(src_gitserver_gitservice_running{instance=~`${shard:regex}`}) by (gitservice) ```
-#### gitserver: gitservice_request_duration +### Git Server: Gitserver cleanup jobs -

95th percentile gitservice error request duration per shard

+#### gitserver: janitor_tasks_total -95th percentile gitservice error request duration per shard +

Total housekeeping tasks by type and status

+ +The rate of housekeeping tasks performed in repositories, broken down by task type and success/failure status This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100200` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6588,21 +6467,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10011 Query: ``` -histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`true`, instance=~`${shard:regex}`}[5m])) by (le, instance)) +sum(rate(src_gitserver_janitor_tasks_total{instance=~`${shard:regex}`}[5m])) by (housekeeping_task, status) ```
-#### gitserver: aggregate_gitservice_request_rate +#### gitserver: p90_janitor_tasks_latency_success_over_5m -

Aggregate gitservice request rate

+

90th percentile latency of successful tasks by type over 5m

-Aggregate gitservice request rate +The 90th percentile latency of successful housekeeping tasks, broken down by task type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100120` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100210` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6612,21 +6491,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10012 Query: ``` -sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`false`}[5m])) +histogram_quantile(0.90, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="success"}[5m])) by (le, housekeeping_task)) ```
-#### gitserver: gitservice_request_rate +#### gitserver: p95_janitor_tasks_latency_success_over_5m -

Gitservice request rate per shard

+

95th percentile latency of successful tasks by type over 5m

-Per shard gitservice request rate +The 95th percentile latency of successful housekeeping tasks, broken down by task type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100121` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100211` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6636,21 +6515,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10012 Query: ``` -sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`false`, instance=~`${shard:regex}`}[5m])) +histogram_quantile(0.95, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="success"}[5m])) by (le, housekeeping_task)) ```
-#### gitserver: aggregate_gitservice_request_error_rate +#### gitserver: p99_janitor_tasks_latency_success_over_5m -

Aggregate gitservice request error rate

+

99th percentile latency of successful tasks by type over 5m

-Aggregate gitservice request error rate +The 99th percentile latency of successful housekeeping tasks, broken down by task type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100130` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100212` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6660,21 +6539,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10013 Query: ``` -sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`true`}[5m])) +histogram_quantile(0.99, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="success"}[5m])) by (le, housekeeping_task)) ```
-#### gitserver: gitservice_request_error_rate +#### gitserver: p90_janitor_tasks_latency_failure_over_5m -

Gitservice request error rate per shard

+

90th percentile latency of failed tasks by type over 5m

-Per shard gitservice request error rate +The 90th percentile latency of failed housekeeping tasks, broken down by task type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100131` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100220` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6684,21 +6563,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10013 Query: ``` -sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`true`, instance=~`${shard:regex}`}[5m])) +histogram_quantile(0.90, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="failure"}[5m])) by (le, housekeeping_task)) ```
-#### gitserver: aggregate_gitservice_requests_running +#### gitserver: p95_janitor_tasks_latency_failure_over_5m -

Aggregate gitservice requests running

+

95th percentile latency of failed tasks by type over 5m

-Aggregate gitservice requests running +The 95th percentile latency of failed housekeeping tasks, broken down by task type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100140` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100221` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6708,21 +6587,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10014 Query: ``` -sum(src_gitserver_gitservice_running{type=`gitserver`}) +histogram_quantile(0.95, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="failure"}[5m])) by (le, housekeeping_task)) ```
-#### gitserver: gitservice_requests_running +#### gitserver: p99_janitor_tasks_latency_failure_over_5m -

Gitservice requests running per shard

+

99th percentile latency of failed tasks by type over 5m

-Per shard gitservice requests running +The 99th percentile latency of failed housekeeping tasks, broken down by task type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100141` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100222` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6732,23 +6611,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10014 Query: ``` -sum(src_gitserver_gitservice_running{type=`gitserver`, instance=~`${shard:regex}`}) by (instance) +histogram_quantile(0.99, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="failure"}[5m])) by (le, housekeeping_task)) ```
-### Git Server: Gitserver cleanup jobs - -#### gitserver: janitor_running +#### gitserver: pruned_files_total_over_5m -

Janitor process is running

+

Files pruned by type over 5m

-1, if the janitor process is currently running +The rate of files pruned during cleanup, broken down by file type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100230` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6758,21 +6635,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10020 Query: ``` -max by (instance) (src_gitserver_janitor_running{instance=~`${shard:regex}`}) +sum(rate(src_gitserver_janitor_pruned_files_total{instance=~`${shard:regex}`}[5m])) by (filetype) ```
-#### gitserver: janitor_job_duration +#### gitserver: data_structure_count_over_5m -

95th percentile job run duration

+

Data structure counts over 5m

-95th percentile job run duration +The count distribution of various Git data structures in repositories This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100240` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6782,21 +6659,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10021 Query: ``` -histogram_quantile(0.95, sum(rate(src_gitserver_janitor_job_duration_seconds_bucket{instance=~`${shard:regex}`}[5m])) by (le, job_name)) +histogram_quantile(0.95, sum(rate(src_gitserver_janitor_data_structure_count_bucket{instance=~`${shard:regex}`}[5m])) by (le, data_structure)) ```
-#### gitserver: janitor_job_failures +#### gitserver: janitor_data_structure_size -

Failures over 5m (by job)

+

Data structure sizes

-the rate of failures over 5m (by job) +The size distribution of various Git data structures in repositories This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100220` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100250` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6806,21 +6683,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10022 Query: ``` -sum by (job_name) (rate(src_gitserver_janitor_job_duration_seconds_count{instance=~`${shard:regex}`,success="false"}[5m])) +histogram_quantile(0.95, sum(rate(src_gitserver_janitor_data_structure_size_bucket{instance=~`${shard:regex}`}[5m])) by (le, data_structure)) ```
-#### gitserver: repos_removed +#### gitserver: janitor_time_since_optimization -

Repositories removed due to disk pressure

+

Time since last optimization

-Repositories removed due to disk pressure +The time elapsed since last optimization of various Git data structures This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100230` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100260` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6830,21 +6707,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10023 Query: ``` -sum by (instance) (rate(src_gitserver_repos_removed_disk_pressure{instance=~`${shard:regex}`}[5m])) +histogram_quantile(0.95, sum(rate(src_gitserver_janitor_time_since_last_optimization_seconds_bucket{instance=~`${shard:regex}`}[5m])) by (le, data_structure)) ```
-#### gitserver: non_existent_repos_removed +#### gitserver: janitor_data_structure_existence -

Repositories removed because they are not defined in the DB

+

Data structure existence

-Repositoriess removed because they are not defined in the DB +The rate at which data structures are reported to exist in repositories This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100240` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100270` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6854,21 +6731,25 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10024 Query: ``` -sum by (instance) (increase(src_gitserver_non_existing_repos_removed[5m])) +sum(rate(src_gitserver_janitor_data_structure_existence_total{instance=~`${shard:regex}`, exists="true"}[5m])) by (data_structure) ```
-#### gitserver: sg_maintenance_reason +### Git Server: Git Command Corruption Retries -

Successful sg maintenance jobs over 1h (by reason)

+#### gitserver: git_command_retry_attempts_rate -the rate of successful sg maintenance jobs and the reason why they were triggered +

Rate of git command corruption retry attempts over 5m

-This panel has no related alerts. +The rate of git command retry attempts due to corruption detection. +A non-zero value indicates that gitserver is detecting potential corruption and attempting retries. +This metric helps track how often the retry mechanism is triggered. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100250` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#gitserver-git-command-retry-attempts-rate) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100300` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6878,21 +6759,22 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10025 Query: ``` -sum by (reason) (rate(src_gitserver_maintenance_status{success="true"}[1h])) +sum(rate(src_gitserver_retry_attempts_total{instance=~`${shard:regex}`}[5m])) ```
-#### gitserver: git_prune_skipped +#### gitserver: git_command_retry_success_rate -

Successful git prune jobs over 1h

+

Rate of successful git command corruption retries over 5m

-the rate of successful git prune jobs over 1h and whether they were skipped +The rate of git commands that succeeded after retry attempts. +This indicates how effective the retry mechanism is at resolving transient corruption issues. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100260` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100301` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -6902,25 +6784,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10026 Query: ``` -sum by (skipped) (rate(src_gitserver_prune_status{success="true"}[1h])) +sum(rate(src_gitserver_retry_success_total{instance=~`${shard:regex}`}[5m])) ```
-### Git Server: Search - -#### gitserver: search_latency +#### gitserver: git_command_retry_failure_rate -

Mean time until first result is sent

+

Rate of failed git command corruption retries over 5m

-Mean latency (time to first result) of gitserver search requests +The rate of git commands that failed even after all retry attempts were exhausted. +These failures will result in repository corruption marking and potential recloning. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100310` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -6928,23 +6809,25 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10030 Query: ``` -rate(src_gitserver_search_latency_seconds_sum[5m]) / rate(src_gitserver_search_latency_seconds_count[5m]) +sum(rate(src_gitserver_retry_failure_total{instance=~`${shard:regex}`}[5m])) ```
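Relating exhausted retries back to total attempts gives the fraction of corruption retries that end in failure, the complement of the success ratio panel below. A sketch built from the two counters already shown in this section:

```
sum(rate(src_gitserver_retry_failure_total{instance=~`${shard:regex}`}[5m])) / sum(rate(src_gitserver_retry_attempts_total{instance=~`${shard:regex}`}[5m]))
```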

-#### gitserver: search_duration +#### gitserver: git_command_retry_different_error_rate -

Mean search duration

+

Rate of corruption retries that failed with non-corruption errors over 5m

-Mean duration of gitserver search requests +The rate of retry attempts that failed with errors other than corruption. +This indicates that repository state or environment changed between the original command and retry attempt. +Common causes include network issues, permission changes, or concurrent repository modifications. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100311` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -6952,23 +6835,25 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10030 Query: ``` -rate(src_gitserver_search_duration_seconds_sum[5m]) / rate(src_gitserver_search_duration_seconds_count[5m]) +sum(rate(src_gitserver_retry_different_error_total{instance=~`${shard:regex}`}[5m])) ```

-#### gitserver: search_rate +#### gitserver: git_command_retry_success_ratio -

Rate of searches run by pod

+

Ratio of successful corruption retries to total corruption retry attempts over 5m

-The rate of searches executed on gitserver by pod +The percentage of retry attempts that ultimately succeeded. +A high ratio indicates that most corruption errors are transient and resolved by retries. +A low ratio may indicate persistent corruption issues requiring investigation. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100312` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -6976,23 +6861,26 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10031 Query: ``` -rate(src_gitserver_search_latency_seconds_count{instance=~`${shard:regex}`}[5m]) +sum(rate(src_gitserver_retry_success_total{instance=~`${shard:regex}`}[5m])) / sum(rate(src_gitserver_retry_attempts_total{instance=~`${shard:regex}`}[5m])) ```

-#### gitserver: running_searches +### Git Server: Periodic Goroutines + +#### gitserver: running_goroutines -

Number of searches currently running by pod

+

Number of currently running periodic goroutines

-The number of searches currently executing on gitserver by pod +The number of currently running periodic goroutines by name and job. +A value of 0 indicates the routine isn`t currently running; it awaits its next schedule. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100400` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7000,25 +6888,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10031 Query: ``` -sum by (instance) (src_gitserver_search_running{instance=~`${shard:regex}`}) +sum by (name, job_name) (src_periodic_goroutine_running{job=~".*gitserver.*"}) ```
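To watch a single routine rather than the whole set, add a `name` matcher. The routine name below is hypothetical; real values can be read off the panel legend:

```
sum by (job_name) (src_periodic_goroutine_running{job=~".*gitserver.*", name="repo-cleanup"})
```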

-### Git Server: Gitserver: Gitserver Backend - -#### gitserver: concurrent_backend_operations +#### gitserver: goroutine_success_rate -

Number of concurrently running backend operations

+

Success rate for periodic goroutine executions

-The number of requests that are currently being handled by gitserver backend layer, at the point in time of scraping. +The rate of successful executions of each periodic goroutine. +A low or zero value could indicate that a routine is stalled or encountering errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100401` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7026,21 +6913,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10040 Query: ``` -src_gitserver_backend_concurrent_operations +sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*gitserver.*"}[5m])) ```

-#### gitserver: gitserver_backend_total +#### gitserver: goroutine_error_rate -

Aggregate operations every 5m

+

Error rate for periodic goroutine executions

-This panel has no related alerts. +The rate of errors encountered by each periodic goroutine. +A sustained high error rate may indicate a problem with the routine`s configuration or dependencies. + +Refer to the [alerts reference](alerts#gitserver-goroutine-error-rate) for 1 alert related to this panel. To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100410` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7048,21 +6938,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10041 Query: ``` -sum(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m])) +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*gitserver.*"}[5m])) ```

-#### gitserver: gitserver_backend_99th_percentile_duration +#### gitserver: goroutine_error_percentage -

Aggregate successful operation duration distribution over 5m

+

Percentage of periodic goroutine executions that result in errors

-This panel has no related alerts. +The percentage of executions that result in errors for each periodic goroutine. +A value above 5% indicates that a significant portion of routine executions are failing. + +Refer to the [alerts reference](alerts#gitserver-goroutine-error-percentage) for 1 alert related to this panel. To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100411` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7070,21 +6963,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10041 Query: ``` -sum by (le)(rate(src_gitserver_backend_duration_seconds_bucket{job=~"^gitserver.*"}[5m])) +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*gitserver.*"}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*gitserver.*"}[5m]) > 0) * 100 ```
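Applied as a threshold, the same expression surfaces only the routines above the 5% level mentioned above; an illustrative sketch:

```
(sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*gitserver.*"}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*gitserver.*"}[5m]) > 0) * 100) > 5
```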

-#### gitserver: gitserver_backend_errors_total +#### gitserver: goroutine_handler_duration -

Aggregate operation errors every 5m

+

95th percentile handler execution time

+ +The 95th percentile execution time for each periodic goroutine handler. +Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100412` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100420` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7092,21 +6988,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10041 Query: ``` -sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m])) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_duration_seconds_bucket{job=~".*gitserver.*"}[5m]))) ```

-#### gitserver: gitserver_backend_error_rate +#### gitserver: goroutine_loop_duration -

Aggregate operation error rate over 5m

+

95th percentile loop cycle time

+ +The 95th percentile loop cycle time for each periodic goroutine (excluding sleep time). +This represents how long a complete loop iteration takes before sleeping for the next interval. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100413` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100421` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7114,21 +7013,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10041 Query: ``` -sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m])) / (sum(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m])) + sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m]))) * 100 +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_loop_duration_seconds_bucket{job=~".*gitserver.*"}[5m]))) ```

-#### gitserver: gitserver_backend_total +#### gitserver: tenant_processing_duration -

operations every 5m

+

95th percentile tenant processing time

+ +The 95th percentile processing time for individual tenants within periodic goroutines. +Higher values indicate that tenant processing is taking longer and may affect overall performance. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100420` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100430` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7136,21 +7038,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10042 Query: ``` -sum by (op)(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m])) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_tenant_duration_seconds_bucket{job=~".*gitserver.*"}[5m]))) ```

-#### gitserver: gitserver_backend_99th_percentile_duration +#### gitserver: tenant_processing_max -

99th percentile successful operation duration over 5m

+

Maximum tenant processing time

+ +The maximum processing time for individual tenants within periodic goroutines. +Consistently high values might indicate problematic tenants or inefficient processing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100421` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100431` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7158,21 +7063,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10042 Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_gitserver_backend_duration_seconds_bucket{job=~"^gitserver.*"}[5m]))) +max by (name, job_name) (rate(src_periodic_goroutine_tenant_duration_seconds_sum{job=~".*gitserver.*"}[5m]) / rate(src_periodic_goroutine_tenant_duration_seconds_count{job=~".*gitserver.*"}[5m])) ```

-#### gitserver: gitserver_backend_errors_total +#### gitserver: tenant_count -

operation errors every 5m

+

Number of tenants processed per routine

+ +The number of tenants processed by each periodic goroutine. +Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100422` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100440` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7180,21 +7088,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10042 Query: ``` -sum by (op)(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m])) +max by (name, job_name) (src_periodic_goroutine_tenant_count{job=~".*gitserver.*"}) ```

-#### gitserver: gitserver_backend_error_rate +#### gitserver: tenant_success_rate -

operation error rate over 5m

+

Rate of successful tenant processing operations

+ +The rate of successful tenant processing operations. +A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100423` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100441` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7202,23 +7113,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10042 Query: ``` -sum by (op)(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m])) / (sum by (op)(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m])) + sum by (op)(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m]))) * 100 +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*gitserver.*"}[5m])) ```
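
To tell whether a change in this rate reflects lower overall load or a shift toward failures, the success and error counters can be added together; a sketch:

```
# Sketch: total tenant processing throughput (successes plus errors) per routine.
  sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*gitserver.*"}[5m]))
+ sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*gitserver.*"}[5m]))
```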

-### Git Server: Gitserver: Gitserver Client +#### gitserver: tenant_error_rate -#### gitserver: gitserver_client_total +

Rate of tenant processing errors

-

Aggregate graphql operations every 5m

+The rate of tenant processing operations that result in errors. +Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100450` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7226,21 +7138,24 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10050 Query: ``` -sum(increase(src_gitserver_client_total{job=~"^*.*"}[5m])) +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*gitserver.*"}[5m])) ```

-#### gitserver: gitserver_client_99th_percentile_duration +#### gitserver: tenant_error_percentage -

Aggregate successful graphql operation duration distribution over 5m

+

Percentage of tenant operations resulting in errors

+ +The percentage of tenant operations that result in errors. +Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100451` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -7248,19 +7163,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10050 Query: ``` -sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^*.*"}[5m])) +(sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*gitserver.*"}[5m])) / (sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*gitserver.*"}[5m])) + sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*gitserver.*"}[5m])))) * 100 ```
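
The 5% guideline can likewise be expressed as a comparison on the panel query; a sketch, not the bundled alert definition:

```
# Sketch: routines whose tenant-operation error percentage exceeds 5%.
(
  sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*gitserver.*"}[5m]))
  /
  (
    sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*gitserver.*"}[5m]))
    +
    sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*gitserver.*"}[5m]))
  )
) * 100 > 5
```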

-#### gitserver: gitserver_client_errors_total +### Git Server: Gitserver (CPU, Memory) -

Aggregate graphql operation errors every 5m

+#### gitserver: cpu_usage_percentage -This panel has no related alerts. +

CPU usage

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100502` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#gitserver-cpu-usage-percentage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100500` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7270,19 +7187,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10050 Query: ``` -sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])) +cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"} ```
-#### gitserver: gitserver_client_error_rate +#### gitserver: memory_usage_percentage -

Aggregate graphql operation error rate over 5m

+

Memory usage percentage (total)

+ +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100503` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100501` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7292,19 +7211,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10050 Query: ``` -sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^*.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m]))) * 100 +cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"} ```
-#### gitserver: gitserver_client_total +#### gitserver: memory_working_set_bytes -

Graphql operations every 5m

+

Memory usage bytes (total)

+ +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100502` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7314,19 +7235,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10051 Query: ``` -sum by (op,scope)(increase(src_gitserver_client_total{job=~"^*.*"}[5m])) +max by (name) (container_memory_working_set_bytes{name=~"^gitserver.*"}) ```
-#### gitserver: gitserver_client_99th_percentile_duration +#### gitserver: memory_rss -

99th percentile successful graphql operation duration over 5m

+

Memory (RSS)

-This panel has no related alerts. +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100511` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#gitserver-memory-rss) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100510` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7336,19 +7259,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10051 Query: ``` -histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^*.*"}[5m]))) +max(container_memory_rss{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0 ```
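
Because the panel reports a percentage of the limit, the absolute numbers are often worth checking too. A sketch using the same cadvisor series (these companion queries are an assumption, not part of the generated dashboard):

```
# Sketch: absolute anonymous memory per gitserver container, in bytes.
max by (name) (container_memory_rss{name=~"^gitserver.*"})
# Sketch: the configured memory limit that the percentage is computed against.
max by (name) (container_spec_memory_limit_bytes{name=~"^gitserver.*"})
```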
-#### gitserver: gitserver_client_errors_total +#### gitserver: memory_total_active_file -

Graphql operation errors every 5m

+

Memory usage (active file)

+ +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100511` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7358,19 +7283,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10051 Query: ``` -sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])) +max(container_memory_total_active_file_bytes{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0 ```
-#### gitserver: gitserver_client_error_rate +#### gitserver: memory_kernel_usage -

Graphql operation error rate over 5m

+

Memory usage (kernel)

+ +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100513` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100512` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7380,21 +7307,19 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10051 Query: ``` -sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_client_total{job=~"^*.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m]))) * 100 +max(container_memory_kernel_usage{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0 ```
-### Git Server: Repos disk I/O metrics - -#### gitserver: repos_disk_reads_sec +### Git Server: Network I/O pod metrics (only available on Kubernetes) -

Read request rate over 1m (per instance)

+#### gitserver: network_sent_bytes_aggregate -The number of read requests that were issued to the device per second. +

Transmission rate over 5m (aggregate)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +The rate of bytes sent over the network across all pods This panel has no related alerts. @@ -7408,19 +7333,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10060 Query: ``` -(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))) +sum(rate(container_network_transmit_bytes_total{container_label_io_kubernetes_pod_name=~`.*gitserver.*`}[5m])) ```
-#### gitserver: repos_disk_writes_sec - -

Write request rate over 1m (per instance)

+#### gitserver: network_received_packets_per_instance -The number of write requests that were issued to the device per second. +

Transmission rate over 5m (per instance)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +The amount of bytes sent over the network by individual pods This panel has no related alerts. @@ -7434,19 +7357,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10060 Query: ``` -(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))) +sum by (container_label_io_kubernetes_pod_name) (rate(container_network_transmit_bytes_total{container_label_io_kubernetes_pod_name=~`${instance:regex}`}[5m])) ```
-#### gitserver: repos_disk_read_throughput - -

Read throughput over 1m (per instance)

+#### gitserver: network_received_bytes_aggregate -The amount of data that was read from the device per second. +

Receive rate over 5m (aggregate)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +The amount of bytes received from the network across pods This panel has no related alerts. @@ -7460,19 +7381,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10061 Query: ``` -(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m]))))) +sum(rate(container_network_receive_bytes_total{container_label_io_kubernetes_pod_name=~`.*gitserver.*`}[5m])) ```
-#### gitserver: repos_disk_write_throughput - -

Write throughput over 1m (per instance)

+#### gitserver: network_received_bytes_per_instance -The amount of data that was written to the device per second. +

Receive rate over 5m (per instance)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +The amount of bytes received from the network by individual pods This panel has no related alerts. @@ -7486,19 +7405,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10061 Query: ``` -(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m]))))) +sum by (container_label_io_kubernetes_pod_name) (rate(container_network_receive_bytes_total{container_label_io_kubernetes_pod_name=~`${instance:regex}`}[5m])) ```
-#### gitserver: repos_disk_read_duration - -

Average read duration over 1m (per instance)

+#### gitserver: network_transmitted_packets_dropped_by_instance -The average time for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. +

Transmit packet drop rate over 5m (by instance)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +An increase in dropped packets could be a leading indicator of network saturation. This panel has no related alerts. @@ -7512,19 +7429,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10062 Query: ``` -(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) +sum by (container_label_io_kubernetes_pod_name) (rate(container_network_transmit_packets_dropped_total{container_label_io_kubernetes_pod_name=~`${instance:regex}`}[5m])) ```
-#### gitserver: repos_disk_write_duration - -

Average write duration over 1m (per instance)

+#### gitserver: network_transmitted_packets_errors_per_instance -The average time for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. +

Errors encountered while transmitting over 5m (per instance)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +An increase in transmission errors could indicate a networking issue This panel has no related alerts. @@ -7538,23 +7453,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10062 Query: ``` -(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_write_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) +sum by (container_label_io_kubernetes_pod_name) (rate(container_network_transmit_errors_total{container_label_io_kubernetes_pod_name=~`${instance:regex}`}[5m])) ```
-#### gitserver: repos_disk_read_request_size - -

Average read request size over 1m (per instance)

+#### gitserver: network_received_packets_dropped_by_instance -The average size of read requests that were issued to the device. +

Receive packet drop rate over 5m (by instance)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +An increase in dropped packets could be a leading indicator of network saturation. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100630` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100622` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7564,23 +7477,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10063 Query: ``` -(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) +sum by (container_label_io_kubernetes_pod_name) (rate(container_network_receive_packets_dropped_total{container_label_io_kubernetes_pod_name=~`${instance:regex}`}[5m])) ```
-#### gitserver: repos_disk_write_request_size) - -

Average write request size over 1m (per instance)

+#### gitserver: network_transmitted_packets_errors_by_instance -The average size of write requests that were issued to the device. +

Errors encountered while receiving over 5m (per instance)

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +An increase in errors while receiving could indicate a networking issue. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100631` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100623` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7590,23 +7501,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10063 Query: ``` -(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) +sum by (container_label_io_kubernetes_pod_name) (rate(container_network_receive_errors_total{container_label_io_kubernetes_pod_name=~`${instance:regex}`}[5m])) ```
-#### gitserver: repos_disk_reads_merged_sec +### Git Server: VCS Clone metrics -

Merged read request rate over 1m (per instance)

+#### gitserver: vcs_syncer_999_successful_clone_duration -The number of read requests merged per second that were queued to the device. +

99.9th percentile successful Clone duration over 1m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +The 99.9th percentile duration for successful `Clone` VCS operations. This is the time taken to clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100640` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100700` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7616,23 +7527,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10064 Query: ``` -(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_merged_total{instance=~`node-exporter.*`}[1m]))))) +histogram_quantile(0.999, sum by (type, le) (rate(vcssyncer_clone_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-#### gitserver: repos_disk_writes_merged_sec - -

Merged writes request rate over 1m (per instance)

+#### gitserver: vcs_syncer_99_successful_clone_duration -The number of write requests merged per second that were queued to the device. +

99th percentile successful Clone duration over 1m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +The 99th percentile duration for successful `Clone` VCS operations. This is the time taken to clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100641` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100701` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7642,23 +7551,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10064 Query: ``` -(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_merged_total{instance=~`node-exporter.*`}[1m]))))) +histogram_quantile(0.99, sum by (type, le) (rate(vcssyncer_clone_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-#### gitserver: repos_disk_average_queue_size - -

Average queue size over 1m (per instance)

+#### gitserver: vcs_syncer_95_successful_clone_duration -The number of I/O operations that were being queued or being serviced. See https://blog.actorsfit.com/a?ID=00200-428fa2ac-e338-4540-848c-af9a3eb1ebd2 for background (avgqu-sz). +

95th percentile successful Clone duration over 1m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing. +The 95th percentile duration for successful `Clone` VCS operations. This is the time taken to clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100650` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100702` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7668,23 +7575,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10065 Query: ``` -(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_io_time_weighted_seconds_total{instance=~`node-exporter.*`}[1m]))))) +histogram_quantile(0.95, sum by (type, le) (rate(vcssyncer_clone_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-### Git Server: Gitserver GRPC server metrics - -#### gitserver: gitserver_grpc_request_rate_all_methods +#### gitserver: vcs_syncer_successful_clone_rate -

Request rate across all methods over 2m

+

Rate of successful Clone VCS operations over 1m

-The number of gRPC requests received per second across all methods, aggregated across all instances. +The rate of successful `Clone` VCS operations. These operations clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100710` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7694,21 +7599,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10070 Query: ``` -sum(rate(grpc_server_started_total{instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) +sum by (type) (rate(vcssyncer_clone_duration_seconds_count{type=~`${vcsSyncerType:regex}`, success="true"}[1m])) ```
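
The success and failure counters behind these panels can also be combined into a single success ratio; a sketch (the `type` filter variable from the dashboard is dropped for brevity):

```
# Sketch: fraction of Clone operations that succeed, per VCS syncer type.
sum by (type) (rate(vcssyncer_clone_duration_seconds_count{success="true"}[5m]))
  / sum by (type) (rate(vcssyncer_clone_duration_seconds_count[5m]))
```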
-#### gitserver: gitserver_grpc_request_rate_per_method +#### gitserver: vcs_syncer_999_failed_clone_duration -

Request rate per-method over 2m

+

99.9th percentile failed Clone duration over 1m

-The number of gRPC requests received per second broken out per method, aggregated across all instances. +The 99.9th percentile duration for failed `Clone` VCS operations. This is the time taken to clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100720` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7718,21 +7623,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10070 Query: ``` -sum(rate(grpc_server_started_total{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method) +histogram_quantile(0.999, sum by (type, le) (rate(vcssyncer_clone_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))) ```
-#### gitserver: gitserver_error_percentage_all_methods +#### gitserver: vcs_syncer_99_failed_clone_duration -

Error percentage across all methods over 2m

+

99th percentile failed Clone duration over 1m

-The percentage of gRPC requests that fail across all methods, aggregated across all instances. +The 99th percentile duration for failed `Clone` VCS operations. This is the time taken to clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100721` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7742,21 +7647,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10071 Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) )) +histogram_quantile(0.99, sum by (type, le) (rate(vcssyncer_clone_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))) ```
-#### gitserver: gitserver_grpc_error_percentage_per_method +#### gitserver: vcs_syncer_95_failed_clone_duration -

Error percentage per-method over 2m

+

95th percentile failed Clone duration over 1m

-The percentage of gRPC requests that fail per method, aggregated across all instances. +The 95th percentile duration for failed `Clone` VCS operations. This is the time taken to clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100711` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100722` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7766,21 +7671,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10071 Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${gitserver_method:regex}`,grpc_code!="OK",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method)) )) +histogram_quantile(0.95, sum by (type, le) (rate(vcssyncer_clone_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))) ```
-#### gitserver: gitserver_p99_response_time_per_method +#### gitserver: vcs_syncer_failed_clone_rate -

99th percentile response time per method over 2m

+

Rate of failed Clone VCS operations over 1m

-The 99th percentile response time per method, aggregated across all instances. +The rate of failed `Clone` VCS operations. These operations clone a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100720` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100730` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7790,21 +7695,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10072 Query: ``` -histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +sum by (type) (rate(vcssyncer_clone_duration_seconds_count{type=~`${vcsSyncerType:regex}`, success="false"}[1m])) ```
-#### gitserver: gitserver_p90_response_time_per_method +### Git Server: VCS Fetch metrics -

90th percentile response time per method over 2m

+#### gitserver: vcs_syncer_999_successful_fetch_duration -The 90th percentile response time per method, aggregated across all instances. +

99.9th percentile successful Fetch duration over 1m

+ +The 99.9th percentile duration for successful `Fetch` VCS operations. This is the time taken to fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100721` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100800` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7814,21 +7721,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10072 Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +histogram_quantile(0.999, sum by (type, le) (rate(vcssyncer_fetch_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-#### gitserver: gitserver_p75_response_time_per_method +#### gitserver: vcs_syncer_99_successful_fetch_duration -

75th percentile response time per method over 2m

+

99th percentile successful Fetch duration over 1m

-The 75th percentile response time per method, aggregated across all instances. +The 99th percentile duration for successful `Fetch` VCS operations. This is the time taken to fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100722` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100801` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7838,21 +7745,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10072 Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +histogram_quantile(0.99, sum by (type, le) (rate(vcssyncer_fetch_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-#### gitserver: gitserver_p99_9_response_size_per_method +#### gitserver: vcs_syncer_95_successful_fetch_duration -

99.9th percentile total response size per method over 2m

+

95th percentile successful Fetch duration over 1m

-The 99.9th percentile total per-RPC response size per method, aggregated across all instances. +The 95th percentile duration for successful `Fetch` VCS operations. This is the time taken to fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100730` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100802` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7862,21 +7769,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10073 Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +histogram_quantile(0.95, sum by (type, le) (rate(vcssyncer_fetch_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-#### gitserver: gitserver_p90_response_size_per_method +#### gitserver: vcs_syncer_successful_fetch_rate -

90th percentile total response size per method over 2m

+

Rate of successful Fetch VCS operations over 1m

-The 90th percentile total per-RPC response size per method, aggregated across all instances. +The rate of successful `Fetch` VCS operations. These operations fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100731` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100810` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7886,21 +7793,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10073 Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +sum by (type) (rate(vcssyncer_fetch_duration_seconds_count{type=~`${vcsSyncerType:regex}`, success="true"}[1m])) ```
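
Because these metrics are Prometheus histograms, the matching `_sum` series can be divided by `_count` to chart a mean; a sketch relying on that histogram convention:

```
# Sketch: mean successful Fetch duration per syncer type, derived from the
# histogram's _sum and _count series.
sum by (type) (rate(vcssyncer_fetch_duration_seconds_sum{success="true"}[5m]))
  / sum by (type) (rate(vcssyncer_fetch_duration_seconds_count{success="true"}[5m]))
```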
-#### gitserver: gitserver_p75_response_size_per_method +#### gitserver: vcs_syncer_999_failed_fetch_duration -

75th percentile total response size per method over 2m

+

99.9th percentile failed Fetch duration over 1m

-The 75th percentile total per-RPC response size per method, aggregated across all instances. +The 99.9th percentile duration for failed `Fetch` VCS operations. This is the time taken to fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100732` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100820` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7910,21 +7817,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10073 Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +histogram_quantile(0.999, sum by (type, le) (rate(vcssyncer_fetch_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))) ```
-#### gitserver: gitserver_p99_9_invididual_sent_message_size_per_method +#### gitserver: vcs_syncer_99_failed_fetch_duration -

99.9th percentile individual sent message size per method over 2m

+

99th percentile failed Fetch duration over 1m

-The 99.9th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The 99th percentile duration for failed `Fetch` VCS operations. This is the time taken to fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100740` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100821` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7934,21 +7841,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10074 Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +histogram_quantile(0.99, sum by (type, le) (rate(vcssyncer_fetch_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))) ```
-#### gitserver: gitserver_p90_invididual_sent_message_size_per_method +#### gitserver: vcs_syncer_95_failed_fetch_duration -

90th percentile individual sent message size per method over 2m

+

95th percentile failed Fetch duration over 1m

-The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The 95th percentile duration for failed `Fetch` VCS operations. This is the time taken to fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100741` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100822` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7958,21 +7865,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10074 Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +histogram_quantile(0.95, sum by (type, le) (rate(vcssyncer_fetch_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))) ```
-#### gitserver: gitserver_p75_invididual_sent_message_size_per_method +#### gitserver: vcs_syncer_failed_fetch_rate -

75th percentile individual sent message size per method over 2m

+

Rate of failed Fetch VCS operations over 1m

-The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The rate of failed `Fetch` VCS operations. These operations fetch a repository from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100742` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100830` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -7982,21 +7889,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10074 Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) +sum by (type) (rate(vcssyncer_fetch_duration_seconds_count{type=~`${vcsSyncerType:regex}`, success="false"}[1m])) ```
-#### gitserver: gitserver_grpc_response_stream_message_count_per_method +### Git Server: VCS Is_cloneable metrics -

Average streaming response message count per-method over 2m

+#### gitserver: vcs_syncer_999_successful_is_cloneable_duration -The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. +

99.9th percentile successful Is_cloneable duration over 1m

+ +The 99.9th percentile duration for successful `Is_cloneable` VCS operations. This is the time taken to check to see if a repository is cloneable from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100750` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100900` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -8006,21 +7915,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10075 Query: ``` -((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method))) +histogram_quantile(0.999, sum by (type, le) (rate(vcssyncer_is_cloneable_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-#### gitserver: gitserver_grpc_all_codes_per_method +#### gitserver: vcs_syncer_99_successful_is_cloneable_duration -

Response codes rate per-method over 2m

+

99th percentile successful Is_cloneable duration over 1m

-The rate of all generated gRPC response codes per method, aggregated across all instances. +The 99th percentile duration for successful `Is_cloneable` VCS operations. This is the time taken to check to see if a repository is cloneable from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100760` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100901` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -8030,23 +7939,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10076 Query: ``` -sum(rate(grpc_server_handled_total{grpc_method=~`${gitserver_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method, grpc_code) +histogram_quantile(0.99, sum by (type, le) (rate(vcssyncer_is_cloneable_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-### Git Server: Gitserver GRPC "internal error" metrics - -#### gitserver: gitserver_grpc_clients_error_percentage_all_methods +#### gitserver: vcs_syncer_95_successful_is_cloneable_duration -

Client baseline error percentage across all methods over 2m

+

95th percentile successful Is_cloneable duration over 1m

-The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "gitserver" clients. +The 95th percentile duration for successful `Is_cloneable` VCS operations. This is the time taken to check to see if a repository is cloneable from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100902` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -8056,21 +7963,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10080 Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService"}[2m]))))))) +histogram_quantile(0.95, sum by (type, le) (rate(vcssyncer_is_cloneable_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="true"}[1m]))) ```
-#### gitserver: gitserver_grpc_clients_error_percentage_per_method +#### gitserver: vcs_syncer_successful_is_cloneable_rate -

Client baseline error percentage per-method over 2m

+

Rate of successful Is_cloneable VCS operations over 1m

-The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "gitserver" clients. +The rate of successful `Is_cloneable` VCS operations. These operations check whether a repository is cloneable from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100910` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -8080,21 +7987,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10080 Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${gitserver_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${gitserver_method:regex}"}[2m])) by (grpc_method)))))) +sum by (type) (rate(vcssyncer_is_cloneable_duration_seconds_count{type=~`${vcsSyncerType:regex}`, success="true"}[1m])) ```
-#### gitserver: gitserver_grpc_clients_all_codes_per_method +#### gitserver: vcs_syncer_999_failed_is_cloneable_duration -

Client baseline response codes rate per-method over 2m

+

99.9th percentile failed Is_cloneable duration over 1m

-The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "gitserver" clients. +The 99.9th percentile duration for failed `Is_cloneable` VCS operations. This is the time taken to check to see if a repository is cloneable from the upstream source. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100802` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100920` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -8104,27 +8011,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10080 Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${gitserver_method:regex}"}[2m])) by (grpc_method, grpc_code)) +histogram_quantile(0.999, sum by (type, le) (rate(vcssyncer_is_cloneable_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))) ```
-#### gitserver: gitserver_grpc_clients_internal_error_percentage_all_methods
+#### gitserver: vcs_syncer_99_failed_is_cloneable_duration

-Client-observed gRPC internal error percentage across all methods over 2m
+99th percentile failed Is_cloneable duration over 1m

-The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "gitserver" clients.
-
-**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "gitserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC.
-
-When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.
-
-**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors.
+The 99th percentile duration for failed `Is_cloneable` VCS operations. This is the time taken to check whether a repository is cloneable from the upstream source.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100810` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100921` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8134,27 +8035,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10081

Query:

```
-(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService"}[2m])))))))
+histogram_quantile(0.99, sum by (type, le) (rate(vcssyncer_is_cloneable_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m])))
```

-#### gitserver: gitserver_grpc_clients_internal_error_percentage_per_method
+#### gitserver: vcs_syncer_95_failed_is_cloneable_duration

-Client-observed gRPC internal error percentage per-method over 2m
+95th percentile failed Is_cloneable duration over 1m

-The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "gitserver" clients.
-
-**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "gitserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC.
-
-When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.
-
-**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors.
+The 95th percentile duration for failed `Is_cloneable` VCS operations. This is the time taken to check whether a repository is cloneable from the upstream source.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100811` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100922` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8164,27 +8059,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10081

Query:

```
-(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${gitserver_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${gitserver_method:regex}"}[2m])) by (grpc_method))))))
+histogram_quantile(0.95, sum by (type, le) (rate(vcssyncer_is_cloneable_duration_seconds_bucket{type=~`${vcsSyncerType:regex}`, success="false"}[1m])))
```

-#### gitserver: gitserver_grpc_clients_internal_error_all_codes_per_method
+#### gitserver: vcs_syncer_failed_is_cloneable_rate

-Client-observed gRPC internal error response code rate per-method over 2m
+Rate of failed Is_cloneable VCS operations over 1m

-The rate of gRPC internal-error response codes per method, aggregated across all "gitserver" clients.
-
-**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "gitserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC.
-
-When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.
-
-**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors.
+The rate of failed `Is_cloneable` VCS operations. These operations check whether a repository is cloneable from the upstream source.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100812` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100930` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8194,23 +8083,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10081

Query:

```
-(sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",is_internal_error="true",grpc_method=~"${gitserver_method:regex}"}[2m])) by (grpc_method, grpc_code))
+sum by (type) (rate(vcssyncer_is_cloneable_duration_seconds_count{type=~`${vcsSyncerType:regex}`, success="false"}[1m]))
```

-### Git Server: Gitserver GRPC retry metrics
+### Git Server: Gitserver: Gitserver Backend

-#### gitserver: gitserver_grpc_clients_retry_percentage_across_all_methods
+#### gitserver: concurrent_backend_operations

-Client retry percentage across all methods over 2m
+Number of concurrently running backend operations

-The percentage of gRPC requests that were retried across all methods, aggregated across all "gitserver" clients.
+The number of requests that are currently being handled by the gitserver backend layer, at the point in time of scraping.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100900` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101000` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8220,21 +8109,19 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10090

Query:

```
-(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService"}[2m])))))))
+src_gitserver_backend_concurrent_operations
```

-#### gitserver: gitserver_grpc_clients_retry_percentage_per_method
+#### gitserver: gitserver_backend_total

-Client retry percentage per-method over 2m
+Aggregate operations every 5m

-The percentage of gRPC requests that were retried aggregated across all "gitserver" clients, broken out per method.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100901` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101010` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8244,21 +8131,19 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10090

Query:

```
-(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",is_retried="true",grpc_method=~"${gitserver_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${gitserver_method:regex}"}[2m])) by (grpc_method))))))
+sum(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m]))
```

-#### gitserver: gitserver_grpc_clients_retry_count_per_method
+#### gitserver: gitserver_backend_99th_percentile_duration

-Client retry count per-method over 2m
+Aggregate successful operation duration distribution over 5m

-The count of gRPC requests that were retried aggregated across all "gitserver" clients, broken out per method

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100902` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101011` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8268,25 +8153,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10090

Query:

```
-(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${gitserver_method:regex}",is_retried="true"}[2m])) by (grpc_method))
+sum by (le)(rate(src_gitserver_backend_duration_seconds_bucket{job=~"^gitserver.*"}[5m]))
```

-### Git Server: Site configuration client update latency
-
-#### gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance
+#### gitserver: gitserver_backend_errors_total

-Duration since last successful site configuration update (by instance)
+Aggregate operation errors every 5m

-The duration since the configuration client used by the "gitserver" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101000` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101012` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8294,21 +8175,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10100

Query:

```
-src_conf_client_time_since_last_successful_update_seconds{job=~`.*gitserver`,instance=~`${shard:regex}`}
+sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m]))
```

-#### gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance
+#### gitserver: gitserver_backend_error_rate

-Maximum duration since last successful site configuration update (all "gitserver" instances)
+Aggregate operation error rate over 5m

-Refer to the [alerts reference](alerts#gitserver-gitserver-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101001` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101013` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8316,23 +8197,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10100

Query:

```
-max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*gitserver`,instance=~`${shard:regex}`}[1m]))
+sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m])) / (sum(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m])) + sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m]))) * 100
```

-### Git Server: Codeintel: Coursier invocation stats
-
-#### gitserver: codeintel_coursier_total
+#### gitserver: gitserver_backend_total

-Aggregate invocations operations every 5m
+operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101100` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101020` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8340,21 +8219,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10110

Query:

```
-sum(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum by (op)(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m]))
```

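`increase(...[5m])` reports a five-minute count; the equivalent per-second throughput uses `rate` over the same window. A variant of the query above:

```
sum by (op)(rate(src_gitserver_backend_total{job=~"^gitserver.*"}[5m]))
```
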
-#### gitserver: codeintel_coursier_99th_percentile_duration
+#### gitserver: gitserver_backend_99th_percentile_duration

-Aggregate successful invocations operation duration distribution over 5m
+99th percentile successful operation duration over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101101` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101021` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8362,21 +8241,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10110

Query:

```
-sum by (le)(rate(src_codeintel_coursier_duration_seconds_bucket{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+histogram_quantile(0.99, sum by (le,op)(rate(src_gitserver_backend_duration_seconds_bucket{job=~"^gitserver.*"}[5m])))
```

-#### gitserver: codeintel_coursier_errors_total
+#### gitserver: gitserver_backend_errors_total

-Aggregate invocations operation errors every 5m
+operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101102` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101022` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8384,21 +8263,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10110

Query:

```
-sum(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum by (op)(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m]))
```

-#### gitserver: codeintel_coursier_error_rate
+#### gitserver: gitserver_backend_error_rate

-Aggregate invocations operation error rate over 5m
+operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101103` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101023` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8406,21 +8285,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10110

Query:

```
-sum(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) / (sum(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) + sum(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))) * 100
+sum by (op)(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m])) / (sum by (op)(increase(src_gitserver_backend_total{job=~"^gitserver.*"}[5m])) + sum by (op)(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*"}[5m]))) * 100
```

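The error rate is derived as `errors / (operations + errors) * 100`; every selector in the expression can also be narrowed to a single operation with an `op` matcher. A sketch (the label value `Exec` below is illustrative only, not a confirmed label value):

```
sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*",op="Exec"}[5m])) / (sum(increase(src_gitserver_backend_total{job=~"^gitserver.*",op="Exec"}[5m])) + sum(increase(src_gitserver_backend_errors_total{job=~"^gitserver.*",op="Exec"}[5m]))) * 100
```
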
-#### gitserver: codeintel_coursier_total
+### Git Server: Gitserver: Gitserver Client
+
+#### gitserver: gitserver_client_total

-Invocations operations every 5m
+Aggregate client operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101110` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101100` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8428,21 +8309,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10111

Query:

```
-sum by (op)(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum(increase(src_gitserver_client_total{job=~"^*.*"}[5m]))
```

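Since the per-method panels below break this counter out by `op` and `scope`, the aggregate can likewise be split by scope alone; a sketch using the same metric:

```
sum by (scope)(increase(src_gitserver_client_total{job=~"^*.*"}[5m]))
```
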
-#### gitserver: codeintel_coursier_99th_percentile_duration
+#### gitserver: gitserver_client_99th_percentile_duration

-99th percentile successful invocations operation duration over 5m
+Aggregate successful client operation duration distribution over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101111` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101101` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8450,21 +8331,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10111

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_coursier_duration_seconds_bucket{op!="RunCommand",job=~"^gitserver.*"}[5m])))
+sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^*.*"}[5m]))
```

-#### gitserver: codeintel_coursier_errors_total
+#### gitserver: gitserver_client_errors_total

-Invocations operation errors every 5m
+Aggregate client operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101112` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101102` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8472,21 +8353,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10111

Query:

```
-sum by (op)(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m]))
```

-#### gitserver: codeintel_coursier_error_rate
+#### gitserver: gitserver_client_error_rate

-Invocations operation error rate over 5m
+Aggregate client operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101113` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101103` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8494,23 +8375,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10111

Query:

```
-sum by (op)(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) / (sum by (op)(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) + sum by (op)(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))) * 100
+sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^*.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m]))) * 100
```

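For alerting-style checks, the same expression can be turned into a boolean by comparing against a threshold (the 5% value below is illustrative, not a shipped alert):

```
sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^*.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m]))) * 100 > 5
```
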
-### Git Server: Codeintel: npm invocation stats
-
-#### gitserver: codeintel_npm_total
+#### gitserver: gitserver_client_total

-Aggregate invocations operations every 5m
+Client operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101200` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101110` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8518,21 +8397,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10120

Query:

```
-sum(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum by (op,scope)(increase(src_gitserver_client_total{job=~"^*.*"}[5m]))
```

-#### gitserver: codeintel_npm_99th_percentile_duration
+#### gitserver: gitserver_client_99th_percentile_duration

-Aggregate successful invocations operation duration distribution over 5m
+99th percentile successful client operation duration over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101201` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101111` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8540,21 +8419,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10120

Query:

```
-sum by (le)(rate(src_codeintel_npm_duration_seconds_bucket{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^*.*"}[5m])))
```

-#### gitserver: codeintel_npm_errors_total
+#### gitserver: gitserver_client_errors_total

-Aggregate invocations operation errors every 5m
+Client operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101202` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101112` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8562,21 +8441,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10120

Query:

```
-sum(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m]))
```

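To surface only the noisiest operations, the same breakdown can be wrapped in `topk`; a sketch showing the five series with the most errors:

```
topk(5, sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])))
```
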
-#### gitserver: codeintel_npm_error_rate
+#### gitserver: gitserver_client_error_rate

-Aggregate invocations operation error rate over 5m
+Client operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101203` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101113` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8584,21 +8463,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10120

Query:

```
-sum(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) / (sum(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) + sum(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))) * 100
+sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_client_total{job=~"^*.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^*.*"}[5m]))) * 100
```

-#### gitserver: codeintel_npm_total
+### Git Server: Gitserver: Gitserver Repository Service Client
+
+#### gitserver: gitserver_repositoryservice_client_total

-Invocations operations every 5m
+Aggregate client operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101210` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101200` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8606,21 +8487,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10121

Query:

```
-sum by (op)(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum(increase(src_gitserver_repositoryservice_client_total{job=~"^*.*"}[5m]))
```

-#### gitserver: codeintel_npm_99th_percentile_duration
+#### gitserver: gitserver_repositoryservice_client_99th_percentile_duration

-99th percentile successful invocations operation duration over 5m
+Aggregate successful client operation duration distribution over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101211` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101201` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8628,21 +8509,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10121

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_npm_duration_seconds_bucket{op!="RunCommand",job=~"^gitserver.*"}[5m])))
+sum by (le)(rate(src_gitserver_repositoryservice_client_duration_seconds_bucket{job=~"^*.*"}[5m]))
```

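A specific percentile can be extracted from this bucket distribution with `histogram_quantile`; for example, a p90 sketch over the same series:

```
histogram_quantile(0.90, sum by (le)(rate(src_gitserver_repositoryservice_client_duration_seconds_bucket{job=~"^*.*"}[5m])))
```
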
-#### gitserver: codeintel_npm_errors_total
+#### gitserver: gitserver_repositoryservice_client_errors_total

-Invocations operation errors every 5m
+Aggregate client operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101212` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101202` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8650,21 +8531,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10121

Query:

```
-sum by (op)(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))
+sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^*.*"}[5m]))
```

-#### gitserver: codeintel_npm_error_rate
+#### gitserver: gitserver_repositoryservice_client_error_rate

-Invocations operation error rate over 5m
+Aggregate client operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101213` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101203` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8672,23 +8553,19 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10121

Query:

```
-sum by (op)(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) / (sum by (op)(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^gitserver.*"}[5m])) + sum by (op)(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^gitserver.*"}[5m]))) * 100
+sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^*.*"}[5m])) / (sum(increase(src_gitserver_repositoryservice_client_total{job=~"^*.*"}[5m])) + sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^*.*"}[5m]))) * 100
```

-### Git Server: HTTP handlers
-
-#### gitserver: healthy_request_rate
+#### gitserver: gitserver_repositoryservice_client_total

-Requests per second, by route, when status code is 200
+Client operations every 5m

-The number of healthy HTTP requests per second to internal HTTP api

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101300` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101210` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8698,21 +8575,19 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10130

Query:

```
-sum by (route) (rate(src_http_request_duration_seconds_count{app="gitserver",code=~"2.."}[5m]))
+sum by (op,scope)(increase(src_gitserver_repositoryservice_client_total{job=~"^*.*"}[5m]))
```

-#### gitserver: unhealthy_request_rate
+#### gitserver: gitserver_repositoryservice_client_99th_percentile_duration

-Requests per second, by route, when status code is not 200
+99th percentile successful client operation duration over 5m

-The number of unhealthy HTTP requests per second to internal HTTP api

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101301` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101211` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8722,21 +8597,19 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10130

Query:

```
-sum by (route) (rate(src_http_request_duration_seconds_count{app="gitserver",code!~"2.."}[5m]))
+histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_repositoryservice_client_duration_seconds_bucket{job=~"^*.*"}[5m])))
```

-#### gitserver: request_rate_by_code
+#### gitserver: gitserver_repositoryservice_client_errors_total

-Requests per second, by status code
+Client operation errors every 5m

-The number of HTTP requests per second by code

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101302` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101212` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8746,21 +8619,19 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10130

Query:

```
-sum by (code) (rate(src_http_request_duration_seconds_count{app="gitserver"}[5m]))
+sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^*.*"}[5m]))
```

-#### gitserver: 95th_percentile_healthy_requests
+#### gitserver: gitserver_repositoryservice_client_error_rate

-95th percentile duration by route, when status code is 200
+Client operation error rate over 5m

-The 95th percentile duration by route when the status code is 200

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101310` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101213` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8770,21 +8641,25 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10131

Query:

```
-histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app="gitserver",code=~"2.."}[5m])) by (le, route))
+sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^*.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_repositoryservice_client_total{job=~"^*.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^*.*"}[5m]))) * 100
```

-#### gitserver: 95th_percentile_unhealthy_requests
+### Git Server: Repos disk I/O metrics
+
+#### gitserver: repos_disk_reads_sec

-95th percentile duration by route, when status code is not 200
+Read request rate over 1m (per instance)

-The 95th percentile duration by route when the status code is not 200
+The number of read requests that were issued to the device per second.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101311` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101300` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8794,21 +8669,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10131

Query:

```
-histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app="gitserver",code!~"2.."}[5m])) by (le, route))
+(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m])))))
```

-### Git Server: Database connections
-
-#### gitserver: max_open_conns
+#### gitserver: repos_disk_writes_sec

-Maximum open
+Write request rate over 1m (per instance)

+The number of write requests that were issued to the device per second.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101400` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101301` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8818,19 +8695,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10140

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="gitserver"})
+(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m])))))
```

-#### gitserver: open_conns
+#### gitserver: repos_disk_read_throughput

-Established
+Read throughput over 1m (per instance)

+The amount of data that was read from the device per second.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101401` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101310` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8840,19 +8721,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10140

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_open{app_name="gitserver"})
+(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))
```

-#### gitserver: in_use
+#### gitserver: repos_disk_write_throughput

-Used
+Write throughput over 1m (per instance)

+The amount of data that was written to the device per second.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101410` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101311` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8862,19 +8747,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10141

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="gitserver"})
+(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m])))))
```

-#### gitserver: idle
+#### gitserver: repos_disk_read_duration

-Idle
+Average read duration over 1m (per instance)

+The average time for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101411` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101320` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8884,19 +8773,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10141

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="gitserver"})
+(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m})))))))
```

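Stripped of the mount-point join that scopes it to the repos disk, the core of this calculation is a ratio of two counters, seconds spent serving reads divided by reads completed; a simplified per-device sketch against the node-exporter metrics directly:

```
rate(node_disk_read_time_seconds_total{instance=~`node-exporter.*`}[1m]) / rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m])
```
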
-#### gitserver: mean_blocked_seconds_per_conn_request
+#### gitserver: repos_disk_write_duration

-Mean blocked seconds per conn request
+Average write duration over 1m (per instance)

+The average time for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

-Refer to the [alerts reference](alerts#gitserver-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101420` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101321` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8906,19 +8799,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10142

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="gitserver"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="gitserver"}[5m]))
+(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_write_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m])))))))
```

-#### gitserver: closed_max_idle
+#### gitserver: repos_disk_read_request_size

-Closed by SetMaxIdleConns
+Average read request size over 1m (per instance)

+The average size of read requests that were issued to the device.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101430` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101330` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8928,19 +8825,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10143

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="gitserver"}[5m]))
+(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m])))))))
```

-#### gitserver: closed_max_lifetime
+#### gitserver: repos_disk_write_request_size

-Closed by SetConnMaxLifetime
+Average write request size over 1m (per instance)

+The average size of write requests that were issued to the device.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101431` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101331` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8950,19 +8851,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10143

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="gitserver"}[5m]))
+(((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m])))))))
```

-#### gitserver: closed_max_idle_time
+#### gitserver: repos_disk_reads_merged_sec

-Closed by SetConnMaxIdleTime
+Merged read request rate over 1m (per instance)

+The number of read requests merged per second that were queued to the device.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101432` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101340` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -8972,31 +8877,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10143

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="gitserver"}[5m]))
+(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_merged_total{instance=~`node-exporter.*`}[1m])))))
```

-### Git Server: Container monitoring (not available on server)
-
-#### gitserver: container_missing
+#### gitserver: repos_disk_writes_merged_sec

-Container missing
+Merged write request rate over 1m (per instance)

-This value is the number of times a container has not been seen for more than one minute. If you observe this
-value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons.
-
-- **Kubernetes:**
-	- Determine if the pod was OOM killed using `kubectl describe pod gitserver` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
-	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p gitserver`.
-- **Docker Compose:**
-	- Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' gitserver` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the gitserver container in `docker-compose.yml`.
-	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs gitserver` (note this will include logs from the previous and currently running container).
+The number of write requests merged per second that were queued to the device.
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101500` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101341` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -9006,19 +8903,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10150

Query:

```
-count by(name) ((time() - container_last_seen{name=~"^gitserver.*"}) > 60)
+(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_merged_total{instance=~`node-exporter.*`}[1m])))))
```

-#### gitserver: container_cpu_usage
+#### gitserver: repos_disk_average_queue_size

-Container cpu usage total (1m average) across all cores by instance
+Average queue size over 1m (per instance)

+The number of I/O operations that were being queued or being serviced. See https://blog.actorsfit.com/a?ID=00200-428fa2ac-e338-4540-848c-af9a3eb1ebd2 for background (avgqu-sz).
+
+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), gitserver could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device gitserver is using, not the load gitserver is solely responsible for causing.

-Refer to the [alerts reference](alerts#gitserver-container-cpu-usage) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101501` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101350` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -9028,19 +8929,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10150

Query:

```
-cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}
+(max by (instance) (gitserver_mount_point_info{mount_name="reposDir",instance=~`${shard:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_io_time_weighted_seconds_total{instance=~`node-exporter.*`}[1m])))))
```

-#### gitserver: container_memory_usage
+### Git Server: Git Service GRPC server metrics
+
+#### gitserver: git_service_grpc_request_rate_all_methods

-Container memory usage by instance
+Request rate across all methods over 2m

+The number of gRPC requests received per second across all methods, aggregated across all instances.

-Refer to the [alerts reference](alerts#gitserver-container-memory-usage) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101502` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101400` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -9050,22 +8955,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10150

Query:

```
-cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}
+sum(rate(grpc_server_started_total{instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))
```

-#### gitserver: fs_io_operations
+#### gitserver: git_service_grpc_request_rate_per_method

-Filesystem reads and writes rate by instance over 1h
+Request rate per-method over 2m

-This value indicates the number of filesystem read and write operations by containers of this service.
-When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues.
+The number of gRPC requests received per second broken out per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101503` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101401` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -9075,21 +8979,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10150

Query:

```
-sum by(name) (rate(container_fs_reads_total{name=~"^gitserver.*"}[1h]) + rate(container_fs_writes_total{name=~"^gitserver.*"}[1h]))
+sum(rate(grpc_server_started_total{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method)
```

-### Git Server: Provisioning indicators (not available on server)
-
-#### gitserver: provisioning_container_cpu_usage_long_term
+#### gitserver: git_service_error_percentage_all_methods

-Container cpu usage total (90th percentile over 1d) across all cores by instance
+Error percentage across all methods over 2m

+The percentage of gRPC requests that fail across all methods, aggregated across all instances.

-Refer to the [alerts reference](alerts#gitserver-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101600` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101410` on your Sourcegraph instance.

*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*

Technical details

@@ -9099,21 +9003,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10160

Query:

```
-quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[1d])
+(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ))
```

-#### gitserver: provisioning_container_memory_usage_long_term
+#### gitserver: git_service_grpc_error_percentage_per_method

-Container memory usage (1d maximum) by instance
+Error percentage per-method over 2m
-Git Server is expected to use up all the memory it is provided. +The percentage of gRPC requests that fail per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101411` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -9123,19 +9027,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10160 Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}[1d]) +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${git_service_method:regex}`,grpc_code!="OK",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method)) )) ```
-#### gitserver: provisioning_container_cpu_usage_short_term
+#### gitserver: git_service_p99_response_time_per_method

-Container cpu usage total (5m maximum) across all cores by instance
+99th percentile response time per method over 2m
-Refer to the [alerts reference](alerts#gitserver-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +The 99th percentile response time per method, aggregated across all instances. + +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101420` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -9145,21 +9051,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10161 Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[5m]) +histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```
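The generated query sums histogram buckets across instances before taking the quantile, so a single slow shard can be masked. A hedged variant for comparing shards, keeping the `instance` label (illustrative, not a generated panel):

```
histogram_quantile(0.99, sum by (le, instance)(rate(grpc_server_handling_seconds_bucket{grpc_service="gitserver.v1.GitserverService"}[2m])))
```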
-#### gitserver: provisioning_container_memory_usage_short_term
+#### gitserver: git_service_p90_response_time_per_method

-Container memory usage (5m maximum) by instance
+90th percentile response time per method over 2m
-Git Server is expected to use up all the memory it is provided. +The 90th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101421` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -9169,22 +9075,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10161 Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}[5m]) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```
-#### gitserver: container_oomkill_events_total
+#### gitserver: git_service_p75_response_time_per_method

-Container OOMKILL events total by instance
+75th percentile response time per method over 2m
-This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +The 75th percentile response time per method, aggregated across all instances. -Refer to the [alerts reference](alerts#gitserver-container-oomkill-events-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101612` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101422` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -9194,23 +9099,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10161 Query: ``` -max by (name) (container_oom_events_total{name=~"^gitserver.*"}) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```
-### Git Server: Golang runtime monitoring

-#### gitserver: go_goroutines
+#### gitserver: git_service_p99_9_response_size_per_method

-Maximum active goroutines
+99.9th percentile total response size per method over 2m
-A high value here indicates a possible goroutine leak. +The 99.9th percentile total per-RPC response size per method, aggregated across all instances. -Refer to the [alerts reference](alerts#gitserver-go-goroutines) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101430` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -9220,19 +9123,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10170 Query: ``` -max by(instance) (go_goroutines{job=~".*gitserver"}) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```
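Percentiles here come from the `src_grpc_server_sent_bytes_per_rpc` histogram. Assuming the metric exposes the standard Prometheus `_sum` and `_count` series, a rough mean response size per RPC can be sketched as:

```
sum(rate(src_grpc_server_sent_bytes_per_rpc_sum{grpc_service="gitserver.v1.GitserverService"}[2m]))
/
sum(rate(src_grpc_server_sent_bytes_per_rpc_count{grpc_service="gitserver.v1.GitserverService"}[2m]))
```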
-#### gitserver: go_gc_duration_seconds
+#### gitserver: git_service_p90_response_size_per_method

-Maximum go garbage collection duration
+90th percentile total response size per method over 2m
-Refer to the [alerts reference](alerts#gitserver-go-gc-duration-seconds) for 1 alert related to this panel. +The 90th percentile total per-RPC response size per method, aggregated across all instances. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101701` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101431` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -9242,21 +9147,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10170 Query: ``` -max by(instance) (go_gc_duration_seconds{job=~".*gitserver"}) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```
-### Git Server: Kubernetes monitoring (only available on Kubernetes)

-#### gitserver: pods_available_percentage
+#### gitserver: git_service_p75_response_size_per_method

-Percentage pods available
+75th percentile total response size per method over 2m
+The 75th percentile total per-RPC response size per method, aggregated across all instances. -Refer to the [alerts reference](alerts#gitserver-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101432` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -9266,27 +9171,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10180 Query: ``` -sum by(app) (up{app=~".*gitserver"}) / count by (app) (up{app=~".*gitserver"}) * 100 +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```
-## Postgres

-Postgres metrics, exported from postgres_exporter (not available on server).

-To see this dashboard, visit `/-/debug/grafana/d/postgres/postgres` on your Sourcegraph instance.

-#### postgres: connections
+#### gitserver: git_service_p99_9_invididual_sent_message_size_per_method

-Active connections
+99.9th percentile individual sent message size per method over 2m
+The 99.9th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. -Refer to the [alerts reference](alerts#postgres-connections) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101440` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9294,21 +9195,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100000` Query: ``` -sum by (job) (pg_stat_activity_count{datname!~"template.*|postgres|cloudsqladmin"}) OR sum by (job) (pg_stat_activity_count{job="codeinsights-db", datname!~"template.*|cloudsqladmin"}) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```

-#### postgres: usage_connections_percentage
+#### gitserver: git_service_p90_invididual_sent_message_size_per_method

-Connection in use
+90th percentile individual sent message size per method over 2m
-Refer to the [alerts reference](alerts#postgres-usage-connections-percentage) for 2 alerts related to this panel. +The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100001` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101441` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9316,21 +9219,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100001` Query: ``` -sum(pg_stat_activity_count) by (job) / (sum(pg_settings_max_connections) by (job) - sum(pg_settings_superuser_reserved_connections) by (job)) * 100 +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```

-#### postgres: transaction_durations
+#### gitserver: git_service_p75_invididual_sent_message_size_per_method

-Maximum transaction durations
+75th percentile individual sent message size per method over 2m
-Refer to the [alerts reference](alerts#postgres-transaction-durations) for 1 alert related to this panel. +The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100002` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101442` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9338,25 +9243,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100002` Query: ``` -sum by (job) (pg_stat_activity_max_tx_duration{datname!~"template.*|postgres|cloudsqladmin",job!="codeintel-db"}) OR sum by (job) (pg_stat_activity_max_tx_duration{job="codeinsights-db", datname!~"template.*|cloudsqladmin"}) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m]))) ```

-### Postgres: Database and collector status

-#### postgres: postgres_up
+#### gitserver: git_service_grpc_response_stream_message_count_per_method

-Database availability
+Average streaming response message count per-method over 2m
-A non-zero value indicates the database is online. +The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. -Refer to the [alerts reference](alerts#postgres-postgres-up) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101450` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9364,23 +9267,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100100` Query: ``` -pg_up +((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method))) ```

-#### postgres: invalid_indexes
+#### gitserver: git_service_grpc_all_codes_per_method

-Invalid indexes (unusable by the query planner)
+Response codes rate per-method over 2m
-A non-zero value indicates the that Postgres failed to build an index. Expect degraded performance until the index is manually rebuilt. +The rate of all generated gRPC response codes per method, aggregated across all instances. -Refer to the [alerts reference](alerts#postgres-invalid-indexes) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101460` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9388,23 +9291,25 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100101` Query: ``` -max by (relname)(pg_invalid_index_count) +sum(rate(grpc_server_handled_total{grpc_method=~`${git_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverService"}[2m])) by (grpc_method, grpc_code) ```

-#### postgres: pg_exporter_err
+### Git Server: Git Service GRPC "internal error" metrics
+
+#### gitserver: git_service_grpc_clients_error_percentage_all_methods

-Errors scraping postgres exporter
+Client baseline error percentage across all methods over 2m
-Refer to the [alerts reference](alerts#postgres-pg-exporter-err) for 1 alert related to this panel. +The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "git_service" clients. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100110` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101500` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9412,23 +9317,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100110` Query: ``` -pg_exporter_last_scrape_error +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService"}[2m]))))))) ```

-#### postgres: migration_in_progress
+#### gitserver: git_service_grpc_clients_error_percentage_per_method

-Active schema migration
+Client baseline error percentage per-method over 2m
-A 0 value indicates that no migration is in progress. +The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "git_service" clients. -Refer to the [alerts reference](alerts#postgres-migration-in-progress) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101501` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9436,25 +9341,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100111` Query: ``` -pg_sg_migration_status +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${git_service_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${git_service_method:regex}"}[2m])) by (grpc_method)))))) ```

-### Postgres: Object size and bloat

-#### postgres: pg_table_size
+#### gitserver: git_service_grpc_clients_all_codes_per_method

-Table size
+Client baseline response codes rate per-method over 2m
-Total size of this table +The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "git_service" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101502` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9462,23 +9365,29 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100200` Query: ``` -max by (relname)(pg_table_bloat_size) +(sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${git_service_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```

-#### postgres: pg_table_bloat_ratio
+#### gitserver: git_service_grpc_clients_internal_error_percentage_all_methods

-Table bloat ratio
+Client-observed gRPC internal error percentage across all methods over 2m
-Estimated bloat ratio of this table (high bloat = high overhead)
+The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "git_service" clients.

**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "git_service" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC.

When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.

**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100201` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101510` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9486,23 +9395,29 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100201` Query: ``` -max by (relname)(pg_table_bloat_ratio) * 100 +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService"}[2m]))))))) ```
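When triaging, it can help to look at only the internal-error traffic rather than the percentage. An illustrative query built from the same `src_grpc_method_status` metric (not a generated panel):

```
sum by (grpc_method, grpc_code)(rate(src_grpc_method_status{grpc_service="gitserver.v1.GitserverService",is_internal_error="true"}[2m]))
```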

-#### postgres: pg_index_size
+#### gitserver: git_service_grpc_clients_internal_error_percentage_per_method

-Index size
+Client-observed gRPC internal error percentage per-method over 2m
-Total size of this index
+The percentage of gRPC requests that appear to fail due to gRPC internal errors per method, aggregated across all "git_service" clients.

**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "git_service" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC.

When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.

**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100210` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101511` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9510,23 +9425,29 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100210` Query: ``` -max by (relname)(pg_index_bloat_size) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${git_service_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${git_service_method:regex}"}[2m])) by (grpc_method)))))) ```

-#### postgres: pg_index_bloat_ratio
+#### gitserver: git_service_grpc_clients_internal_error_all_codes_per_method

-Index bloat ratio
+Client-observed gRPC internal error response code rate per-method over 2m
-Estimated bloat ratio of this index (high bloat = high overhead)
+The rate of gRPC internal-error response codes per method, aggregated across all "git_service" clients.

**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "git_service" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC.

When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.

**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100211` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101512` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9534,23 +9455,25 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100211` Query: ``` -max by (relname)(pg_index_bloat_ratio) * 100 +(sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverService",is_internal_error="true",grpc_method=~"${git_service_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```

-### Postgres: Provisioning indicators (not available on server)
+### Git Server: Git Service GRPC retry metrics

-#### postgres: provisioning_container_cpu_usage_long_term
+#### gitserver: git_service_grpc_clients_retry_percentage_across_all_methods

-Container cpu usage total (90th percentile over 1d) across all cores by instance
+Client retry percentage across all methods over 2m
-Refer to the [alerts reference](alerts#postgres-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +The percentage of gRPC requests that were retried across all methods, aggregated across all "git_service" clients. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100300` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101600` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9558,21 +9481,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100300` Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[1d]) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService"}[2m]))))))) ```
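A high retry percentage on a low-traffic method can be noise; checking the absolute retried-request rate alongside it gives context. An illustrative companion query derived from the same counter (not a generated panel):

```
sum by (grpc_method)(rate(src_grpc_client_retry_attempts_total{grpc_service="gitserver.v1.GitserverService",is_retried="true"}[2m]))
```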

-#### postgres: provisioning_container_memory_usage_long_term
+#### gitserver: git_service_grpc_clients_retry_percentage_per_method

-Container memory usage (1d maximum) by instance
+Client retry percentage per-method over 2m
-Refer to the [alerts reference](alerts#postgres-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +The percentage of gRPC requests that were retried aggregated across all "git_service" clients, broken out per method. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100301` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101601` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9580,21 +9505,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100301` Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[1d]) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",is_retried="true",grpc_method=~"${git_service_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${git_service_method:regex}"}[2m])) by (grpc_method)))))) ```

-#### postgres: provisioning_container_cpu_usage_short_term
+#### gitserver: git_service_grpc_clients_retry_count_per_method

-Container cpu usage total (5m maximum) across all cores by instance
+Client retry count per-method over 2m
-Refer to the [alerts reference](alerts#postgres-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel.
+The count of gRPC requests that were retried, aggregated across all "git_service" clients, broken out per method.

-To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100310` on your Sourcegraph instance.
+This panel has no related alerts.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101602` on your Sourcegraph instance.

+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9602,21 +9529,25 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100310` Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[5m]) +(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverService",grpc_method=~"${git_service_method:regex}",is_retried="true"}[2m])) by (grpc_method)) ```

-#### postgres: provisioning_container_memory_usage_short_term

-Container memory usage (5m maximum) by instance

-Refer to the [alerts reference](alerts#postgres-provisioning-container-memory-usage-short-term) for 1 alert related to this panel.

+### Git Server: Repository Service GRPC server metrics

+#### gitserver: repository_service_grpc_request_rate_all_methods

+Request rate across all methods over 2m
-To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100311` on your Sourcegraph instance. +The number of gRPC requests received per second across all methods, aggregated across all instances. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101700` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9624,24 +9555,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100311` Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[5m]) +sum(rate(grpc_server_started_total{instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m])) ```

-#### postgres: container_oomkill_events_total
+#### gitserver: repository_service_grpc_request_rate_per_method

-Container OOMKILL events total by instance
+Request rate per-method over 2m
-This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +The number of gRPC requests received per second broken out per method, aggregated across all instances. -Refer to the [alerts reference](alerts#postgres-container-oomkill-events-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101701` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9649,23 +9579,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100312` Query: ``` -max by (name) (container_oom_events_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}) +sum(rate(grpc_server_started_total{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m])) by (grpc_method) ```

-### Postgres: Kubernetes monitoring (only available on Kubernetes)

-#### postgres: pods_available_percentage
+#### gitserver: repository_service_error_percentage_all_methods

-Percentage pods available
+Error percentage across all methods over 2m
+The percentage of gRPC requests that fail across all methods, aggregated across all instances. -Refer to the [alerts reference](alerts#postgres-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101710` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9673,29 +9603,23 @@ To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100400` Query: ``` -sum by(app) (up{app=~".*(pgsql|codeintel-db|codeinsights)"}) / count by (app) (up{app=~".*(pgsql|codeintel-db|codeinsights)"}) * 100 +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) )) ```

-## Precise Code Intel Worker

-Handles conversion of uploaded precise code intelligence bundles.

-To see this dashboard, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker` on your Sourcegraph instance.

-### Precise Code Intel Worker: Codeintel: LSIF uploads

-#### precise-code-intel-worker: codeintel_upload_queue_size
+#### gitserver: repository_service_grpc_error_percentage_per_method

-Unprocessed upload record queue size
+Error percentage per-method over 2m
+The percentage of gRPC requests that fail per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101711` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9703,27 +9627,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max(src_codeintel_upload_total{job=~"^precise-code-intel-worker.*"}) +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${repository_service_method:regex}`,grpc_code!="OK",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m])) by (grpc_method)) )) ```

-#### precise-code-intel-worker: codeintel_upload_queue_growth_rate
+#### gitserver: repository_service_p99_response_time_per_method

-Unprocessed upload record queue growth rate over 30m
+99th percentile response time per method over 2m

-This value compares the rate of enqueues against the rate of finished jobs.
-
- - A value < than 1 indicates that process rate > enqueue rate
- - A value = than 1 indicates that process rate = enqueue rate
- - A value > than 1 indicates that process rate < enqueue rate
+The 99th percentile response time per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100001` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101720` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9731,21 +9651,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_upload_total{job=~"^precise-code-intel-worker.*"}[30m])) / sum(increase(src_codeintel_upload_processor_total{job=~"^precise-code-intel-worker.*"}[30m])) +histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-#### precise-code-intel-worker: codeintel_upload_queued_max_age
+#### gitserver: repository_service_p90_response_time_per_method

-Unprocessed upload record queue longest time in queue
+90th percentile response time per method over 2m
-Refer to the [alerts reference](alerts#precise-code-intel-worker-codeintel-upload-queued-max-age) for 1 alert related to this panel. +The 90th percentile response time per method, aggregated across all instances. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100002` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101721` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9753,23 +9675,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max(src_codeintel_upload_queued_duration_seconds_total{job=~"^precise-code-intel-worker.*"}) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-### Precise Code Intel Worker: Codeintel: LSIF uploads

-#### precise-code-intel-worker: codeintel_upload_handlers
+#### gitserver: repository_service_p75_response_time_per_method

-Handler active handlers
+75th percentile response time per method over 2m
+The 75th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101722` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9777,21 +9699,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(src_codeintel_upload_processor_handlers{job=~"^precise-code-intel-worker.*"}) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-#### precise-code-intel-worker: codeintel_upload_processor_upload_size
+#### gitserver: repository_service_p99_9_response_size_per_method

-Sum of upload sizes in bytes being processed by each precise code-intel worker instance
+99.9th percentile total response size per method over 2m
+ +The 99.9th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101730` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9799,21 +9723,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by(instance) (src_codeintel_upload_processor_upload_size{job="precise-code-intel-worker"}) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-#### precise-code-intel-worker: codeintel_upload_processor_total
+#### gitserver: repository_service_p90_response_size_per_method

-Handler operations every 5m
+90th percentile total response size per method over 2m
+ +The 90th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101731` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9821,21 +9747,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_upload_processor_total{job=~"^precise-code-intel-worker.*"}[5m])) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-#### precise-code-intel-worker: codeintel_upload_processor_99th_percentile_duration
+#### gitserver: repository_service_p75_response_size_per_method

-Aggregate successful handler operation duration distribution over 5m
+75th percentile total response size per method over 2m
+ +The 75th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101732` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9843,21 +9771,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (le)(rate(src_codeintel_upload_processor_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-#### precise-code-intel-worker: codeintel_upload_processor_errors_total
+#### gitserver: repository_service_p99_9_invididual_sent_message_size_per_method

-Handler operation errors every 5m
+99.9th percentile individual sent message size per method over 2m
+ +The 99.9th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101740` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9865,21 +9795,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_upload_processor_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-#### precise-code-intel-worker: codeintel_upload_processor_error_rate
+#### gitserver: repository_service_p90_invididual_sent_message_size_per_method

-Handler operation error rate over 5m
+90th percentile individual sent message size per method over 2m
+ +The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100113` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101741` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9887,23 +9819,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_upload_processor_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_upload_processor_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_upload_processor_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-### Precise Code Intel Worker: Codeintel: dbstore stats

-#### precise-code-intel-worker: codeintel_uploads_store_total
+#### gitserver: repository_service_p75_invididual_sent_message_size_per_method

-Aggregate store operations every 5m
+75th percentile individual sent message size per method over 2m
+The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101742` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9911,21 +9843,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))) ```

-#### precise-code-intel-worker: codeintel_uploads_store_99th_percentile_duration
+#### gitserver: repository_service_grpc_response_stream_message_count_per_method

-Aggregate successful store operation duration distribution over 5m
+Average streaming response message count per-method over 2m
+ +The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101750` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9933,21 +9867,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (le)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) +((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m])) by (grpc_method))) ```

-#### precise-code-intel-worker: codeintel_uploads_store_errors_total
+#### gitserver: repository_service_grpc_all_codes_per_method

-Aggregate store operation errors every 5m
+Response codes rate per-method over 2m
+ +The rate of all generated gRPC response codes per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101760` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9955,21 +9891,25 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum(rate(grpc_server_handled_total{grpc_method=~`${repository_service_method:regex}`,instance=~`${shard:regex}`,grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m])) by (grpc_method, grpc_code) ```

-#### precise-code-intel-worker: codeintel_uploads_store_error_rate
+### Git Server: Repository Service GRPC "internal error" metrics
+
+#### gitserver: repository_service_grpc_clients_error_percentage_all_methods

-Aggregate store operation error rate over 5m
+Client baseline error percentage across all methods over 2m
+ +The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "repository_service" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100203` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101800` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9977,21 +9917,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))))))) ```

-#### precise-code-intel-worker: codeintel_uploads_store_total
+#### gitserver: repository_service_grpc_clients_error_percentage_per_method

-Store operations every 5m
+Client baseline error percentage per-method over 2m
+ +The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "repository_service" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101801` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -9999,21 +9941,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_method=~"${repository_service_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_method=~"${repository_service_method:regex}"}[2m])) by (grpc_method)))))) ```

-#### precise-code-intel-worker: codeintel_uploads_store_99th_percentile_duration +#### gitserver: repository_service_grpc_clients_all_codes_per_method -

99th percentile successful store operation duration over 5m

+

Client baseline response codes rate per-method over 2m

+ +The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "repository_service" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101802` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10021,21 +9965,29 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) +(sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_method=~"${repository_service_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```

-#### precise-code-intel-worker: codeintel_uploads_store_errors_total +#### gitserver: repository_service_grpc_clients_internal_error_percentage_all_methods -

Store operation errors every 5m

+

Client-observed gRPC internal error percentage across all methods over 2m

+ +The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "repository_service" clients. + +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "repository_service" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC. + +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. + +**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101810` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10043,21 +9995,29 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))))))) ```

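When this percentage is non-zero, breaking the underlying metric out by response code often points at the culprit. A minimal sketch using only the labels shown in this panel's query:

```
sum by (grpc_code) (
  rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",is_internal_error="true"}[2m])
)
```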
-#### precise-code-intel-worker: codeintel_uploads_store_error_rate +#### gitserver: repository_service_grpc_clients_internal_error_percentage_per_method -

Store operation error rate over 5m

+

Client-observed gRPC internal error percentage per-method over 2m

+ +The percentage of gRPC requests that appear to fail due to gRPC internal errors per method, aggregated across all "repository_service" clients. + +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "repository_service" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC. + +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. + +**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100213` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101811` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10065,23 +10025,29 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_method=~"${repository_service_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_method=~"${repository_service_method:regex}"}[2m])) by (grpc_method)))))) ```

-### Precise Code Intel Worker: Codeintel: lsifstore stats +#### gitserver: repository_service_grpc_clients_internal_error_all_codes_per_method -#### precise-code-intel-worker: codeintel_uploads_lsifstore_total +

Client-observed gRPC internal error response code rate per-method over 2m

-

Aggregate store operations every 5m

+The rate of gRPC internal-error response codes per method, aggregated across all "repository_service" clients. + +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "repository_service" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC. + +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. + +**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101812` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10089,21 +10055,25 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) +(sum(rate(src_grpc_method_status{grpc_service=~"gitserver.v1.GitserverRepositoryService",is_internal_error="true",grpc_method=~"${repository_service_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```

-#### precise-code-intel-worker: codeintel_uploads_lsifstore_99th_percentile_duration +### Git Server: Repository Service GRPC retry metrics -

Aggregate successful store operation duration distribution over 5m

+#### gitserver: repository_service_grpc_clients_retry_percentage_across_all_methods + +

Client retry percentage across all methods over 2m

+ +The percentage of gRPC requests that were retried across all methods, aggregated across all "repository_service" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101900` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10111,21 +10081,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (le)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverRepositoryService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverRepositoryService"}[2m]))))))) ```

-#### precise-code-intel-worker: codeintel_uploads_lsifstore_errors_total +#### gitserver: repository_service_grpc_clients_retry_percentage_per_method -

Aggregate store operation errors every 5m

+

Client retry percentage per-method over 2m

+ +The percentage of gRPC requests that were retried, aggregated across all "repository_service" clients, broken out per method. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101901` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10133,21 +10105,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverRepositoryService",is_retried="true",grpc_method=~"${repository_service_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_method=~"${repository_service_method:regex}"}[2m])) by (grpc_method)))))) ```

-#### precise-code-intel-worker: codeintel_uploads_lsifstore_error_rate +#### gitserver: repository_service_grpc_clients_retry_count_per_method -

Aggregate store operation error rate over 5m

+

Client retry count per-method over 2m

+ +The count of gRPC requests that were retried, aggregated across all "repository_service" clients, broken out per method. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100303` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=101902` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10155,21 +10129,25 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverRepositoryService",grpc_method=~"${repository_service_method:regex}",is_retried="true"}[2m])) by (grpc_method)) ```

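To surface the methods that retry most, `topk` over the same counter is a quick drill-down. A sketch; the cut-off of 5 is arbitrary:

```
topk(5, sum by (grpc_method) (
  rate(src_grpc_client_retry_attempts_total{grpc_service=~"gitserver.v1.GitserverRepositoryService",is_retried="true"}[2m])
))
```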
-#### precise-code-intel-worker: codeintel_uploads_lsifstore_total +### Git Server: Site configuration client update latency -

Store operations every 5m

+#### gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance + +

Duration since last successful site configuration update (by instance)

+ +The duration since the configuration client used by the "gitserver" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -10177,21 +10155,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) +src_conf_client_time_since_last_successful_update_seconds{job=~`.*gitserver`,instance=~`${shard:regex}`} ```

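If you watch this from an external alerting stack, a staleness check over the same gauge is a reasonable sketch; the 300-second threshold here is illustrative only (the threshold Sourcegraph actually ships is documented in the alerts reference):

```
max(src_conf_client_time_since_last_successful_update_seconds{job=~".*gitserver"}) > 300
```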
-#### precise-code-intel-worker: codeintel_uploads_lsifstore_99th_percentile_duration +#### gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance -

99th percentile successful store operation duration over 5m

+

Maximum duration since last successful site configuration update (all "gitserver" instances)

-This panel has no related alerts. +Refer to the [alerts reference](alerts#gitserver-gitserver-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102001` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -10199,21 +10177,25 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) +max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*gitserver`,instance=~`${shard:regex}`}[1m])) ```

-#### precise-code-intel-worker: codeintel_uploads_lsifstore_errors_total +### Git Server: HTTP handlers -

Store operation errors every 5m

+#### gitserver: healthy_request_rate + +

Requests per second, by route, when status code is 200

+ +The number of healthy HTTP requests per second to the internal HTTP API. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102100` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10221,21 +10203,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum by (route) (rate(src_http_request_duration_seconds_count{app="gitserver",code=~"2.."}[5m])) ```

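The healthy and unhealthy panels in this section can be folded into one error-percentage view. A minimal sketch derived directly from the two panel queries:

```
100 * sum(rate(src_http_request_duration_seconds_count{app="gitserver",code!~"2.."}[5m]))
    / sum(rate(src_http_request_duration_seconds_count{app="gitserver"}[5m]))
```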
-#### precise-code-intel-worker: codeintel_uploads_lsifstore_error_rate +#### gitserver: unhealthy_request_rate -

Store operation error rate over 5m

+

Requests per second, by route, when status code is not 200

+ +The number of unhealthy HTTP requests per second to the internal HTTP API. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100313` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10243,23 +10227,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +sum by (route) (rate(src_http_request_duration_seconds_count{app="gitserver",code!~"2.."}[5m])) ```

-### Precise Code Intel Worker: Workerutil: lsif_uploads dbworker/store stats +#### gitserver: request_rate_by_code -#### precise-code-intel-worker: workerutil_dbworker_store_codeintel_upload_total +

Requests per second, by status code

-

Store operations every 5m

+The number of HTTP requests per second, by status code. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102102` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10267,21 +10251,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_upload_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum by (code) (rate(src_http_request_duration_seconds_count{app="gitserver"}[5m])) ```

-#### precise-code-intel-worker: workerutil_dbworker_store_codeintel_upload_99th_percentile_duration +#### gitserver: 95th_percentile_healthy_requests -

Aggregate successful store operation duration distribution over 5m

+

95th percentile duration by route, when status code is 200

+ +The 95th percentile duration by route when the status code is 200. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102110` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10289,21 +10275,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (le)(rate(src_workerutil_dbworker_store_codeintel_upload_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) +histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app="gitserver",code=~"2.."}[5m])) by (le, route)) ```

-#### precise-code-intel-worker: workerutil_dbworker_store_codeintel_upload_errors_total +#### gitserver: 95th_percentile_unhealthy_requests + +

95th percentile duration by route, when status code is not 200

-

Store operation errors every 5m

+The 95th percentile duration by route when the status code is not 200. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102111` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10311,21 +10299,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_upload_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app="gitserver",code!~"2.."}[5m])) by (le, route)) ```

-#### precise-code-intel-worker: workerutil_dbworker_store_codeintel_upload_error_rate +### Git Server: Database connections -

Store operation error rate over 5m

+#### gitserver: max_open_conns + +

Maximum open

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100403` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102200` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10333,23 +10323,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_upload_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_codeintel_upload_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_codeintel_upload_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="gitserver"}) ```

-### Precise Code Intel Worker: Codeintel: gitserver client - -#### precise-code-intel-worker: codeintel_gitserver_total +#### gitserver: open_conns -

Aggregate client operations every 5m

+

Established

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102201` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10357,21 +10345,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_gitserver_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_open{app_name="gitserver"}) ```

-#### precise-code-intel-worker: codeintel_gitserver_99th_percentile_duration +#### gitserver: in_use -

Aggregate successful client operation duration distribution over 5m

+

Used

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102210` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10379,21 +10367,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (le)(rate(src_codeintel_gitserver_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="gitserver"}) ```

-#### precise-code-intel-worker: codeintel_gitserver_errors_total +#### gitserver: idle -

Aggregate client operation errors every 5m

+

Idle

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100502` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102211` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10401,21 +10389,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_gitserver_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="gitserver"}) ```

-#### precise-code-intel-worker: codeintel_gitserver_error_rate +#### gitserver: mean_blocked_seconds_per_conn_request -

Aggregate client operation error rate over 5m

+

Mean blocked seconds per conn request

-This panel has no related alerts. +Refer to the [alerts reference](alerts#gitserver-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100503` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102220` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10423,21 +10411,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_gitserver_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_gitserver_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_gitserver_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="gitserver"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="gitserver"}[5m])) ```

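Blocked time typically climbs as the pool saturates, so it can help to chart utilization alongside this panel. A sketch derived from the open/max metrics shown in this section:

```
100 * sum by (app_name, db_name) (src_pgsql_conns_open{app_name="gitserver"})
    / sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="gitserver"})
```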
-#### precise-code-intel-worker: codeintel_gitserver_total +#### gitserver: closed_max_idle -

Client operations every 5m

+

Closed by SetMaxIdleConns

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102230` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10445,21 +10433,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_gitserver_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="gitserver"}[5m])) ```

-#### precise-code-intel-worker: codeintel_gitserver_99th_percentile_duration +#### gitserver: closed_max_lifetime -

99th percentile successful client operation duration over 5m

+

Closed by SetConnMaxLifetime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102231` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10467,21 +10455,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_gitserver_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="gitserver"}[5m])) ```

-#### precise-code-intel-worker: codeintel_gitserver_errors_total +#### gitserver: closed_max_idle_time -

Client operation errors every 5m

+

Closed by SetConnMaxIdleTime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102232` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10489,21 +10477,33 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="gitserver"}[5m])) ```

-#### precise-code-intel-worker: codeintel_gitserver_error_rate +### Git Server: Container monitoring (not available on server) -

Client operation error rate over 5m

+#### gitserver: container_missing + +

Container missing

+ +This value is the number of times a container has not been seen for more than one minute. If you observe this +value change independently of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reason. + +- **Kubernetes:** + - Determine if the pod was OOM killed using `kubectl describe pod gitserver` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. + - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p gitserver`. +- **Docker Compose:** + - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' gitserver` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the gitserver container in `docker-compose.yml`. + - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs gitserver` (note this will include logs from the previous and currently running container). This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102300` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10511,23 +10511,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_gitserver_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +count by(name) ((time() - container_last_seen{name=~"^gitserver.*"}) > 60) ```

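When this count is non-zero, the raw staleness per container shows how long each instance has been gone. A minimal sketch over the same `container_last_seen` metric the panel uses:

```
time() - container_last_seen{name=~"^gitserver.*"}
```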
-### Precise Code Intel Worker: Codeintel: uploadstore stats - -#### precise-code-intel-worker: codeintel_uploadstore_total +#### gitserver: container_cpu_usage -

Aggregate store operations every 5m

+

Container cpu usage total (1m average) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#gitserver-container-cpu-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10535,21 +10533,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) +cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"} ```

-#### precise-code-intel-worker: codeintel_uploadstore_99th_percentile_duration +#### gitserver: container_memory_usage -

Aggregate successful store operation duration distribution over 5m

+

Container memory usage by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#gitserver-container-memory-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102302` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10557,21 +10555,24 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (le)(rate(src_codeintel_uploadstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) +cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"} ```

-#### precise-code-intel-worker: codeintel_uploadstore_errors_total +#### gitserver: fs_io_operations -

Aggregate store operation errors every 5m

+

Filesystem reads and writes rate by instance over 1h

+ +This value indicates the number of filesystem read and write operations by containers of this service. +When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100602` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102303` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10579,21 +10580,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +sum by(name) (rate(container_fs_reads_total{name=~"^gitserver.*"}[1h]) + rate(container_fs_writes_total{name=~"^gitserver.*"}[1h])) ```

-#### precise-code-intel-worker: codeintel_uploadstore_error_rate +### Git Server: Provisioning indicators (not available on server) -

Aggregate store operation error rate over 5m

+#### gitserver: provisioning_container_cpu_usage_long_term -This panel has no related alerts. +

Container cpu usage total (90th percentile over 1d) across all cores by instance

-To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100603` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#gitserver-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102400` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10601,21 +10604,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[1d]) ```

-#### precise-code-intel-worker: codeintel_uploadstore_total +#### gitserver: provisioning_container_memory_usage_long_term -

Store operations every 5m

+

Container memory usage (1d maximum) by instance

+ +Git Server is expected to use up all the memory it is provided. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102401` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10623,21 +10628,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}[1d]) ```

-#### precise-code-intel-worker: codeintel_uploadstore_99th_percentile_duration +#### gitserver: provisioning_container_cpu_usage_short_term -

99th percentile successful store operation duration over 5m

+

Container cpu usage total (5m maximum) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#gitserver-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102410` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10645,21 +10650,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploadstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) +max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}[5m]) ```

-#### precise-code-intel-worker: codeintel_uploadstore_errors_total +#### gitserver: provisioning_container_memory_usage_short_term -

Store operation errors every 5m

+

Container memory usage (5m maximum) by instance

+ +Git Server is expected to use up all the memory it is provided. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100612` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102411` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10667,21 +10674,24 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}[5m]) ```

-#### precise-code-intel-worker: codeintel_uploadstore_error_rate +#### gitserver: container_oomkill_events_total -

Store operation error rate over 5m

+

Container OOMKILL events total by instance

-This panel has no related alerts. +This value indicates the total number of times the container main process or child processes were terminated by OOM killer. +When it occurs frequently, it is an indicator of underprovisioning. + +Refer to the [alerts reference](alerts#gitserver-container-oomkill-events-total) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100613` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102412` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10689,23 +10699,25 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (op)(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 +max by (name) (container_oom_events_total{name=~"^gitserver.*"}) ```

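Because `container_oom_events_total` is a cumulative counter, a windowed view is often easier to read than the lifetime maximum. A sketch showing OOM kills per day; the 1d window is illustrative:

```
sum by (name) (increase(container_oom_events_total{name=~"^gitserver.*"}[1d]))
```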
-### Precise Code Intel Worker: Database connections +### Git Server: Golang runtime monitoring -#### precise-code-intel-worker: max_open_conns +#### gitserver: go_goroutines -

Maximum open

+

Maximum active goroutines

-This panel has no related alerts. +A high value here indicates a possible goroutine leak. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100700` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#gitserver-go-goroutines) for 1 alert related to this panel. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102500` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10713,21 +10725,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="precise-code-intel-worker"}) +max by(instance) (go_goroutines{job=~".*gitserver"}) ```

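A steadily climbing goroutine count is usually more diagnostic of a leak than any single reading. A sketch of the per-instance trend over the same gauge; the 30-minute window is illustrative:

```
deriv(go_goroutines{job=~".*gitserver"}[30m])
```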
-#### precise-code-intel-worker: open_conns +#### gitserver: go_gc_duration_seconds -

Established

+

Maximum go garbage collection duration

-This panel has no related alerts. +Refer to the [alerts reference](alerts#gitserver-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102501` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10735,21 +10747,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_open{app_name="precise-code-intel-worker"}) +max by(instance) (go_gc_duration_seconds{job=~".*gitserver"}) ```

-#### precise-code-intel-worker: in_use +### Git Server: Kubernetes monitoring (only available on Kubernetes) -

Used

+#### gitserver: pods_available_percentage -This panel has no related alerts. +

Percentage pods available

-To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100710` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#gitserver-pods-available-percentage) for 1 alert related to this panel. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=102600` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -10757,19 +10771,25 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="precise-code-intel-worker"}) +sum by(app) (up{app=~".*gitserver"}) / count by (app) (up{app=~".*gitserver"}) * 100 ```

-#### precise-code-intel-worker: idle +## Postgres -

Idle

+

Postgres metrics, exported from postgres_exporter (not available on server).

-This panel has no related alerts. +To see this dashboard, visit `/-/debug/grafana/d/postgres/postgres` on your Sourcegraph instance. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100711` on your Sourcegraph instance. +#### postgres: connections + +

Active connections

+ +Refer to the [alerts reference](alerts#postgres-connections) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100000` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -10779,19 +10799,19 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="precise-code-intel-worker"}) +sum by (job) (pg_stat_activity_count{datname!~"template.*|postgres|cloudsqladmin"}) OR sum by (job) (pg_stat_activity_count{job="codeinsights-db", datname!~"template.*|cloudsqladmin"}) ```
-#### precise-code-intel-worker: mean_blocked_seconds_per_conn_request +#### postgres: usage_connections_percentage -

Mean blocked seconds per conn request

+

Connection in use

-Refer to the [alerts reference](alerts#precise-code-intel-worker-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. +Refer to the [alerts reference](alerts#postgres-usage-connections-percentage) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100720` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100001` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -10801,19 +10821,19 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="precise-code-intel-worker"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="precise-code-intel-worker"}[5m])) +sum(pg_stat_activity_count) by (job) / (sum(pg_settings_max_connections) by (job) - sum(pg_settings_superuser_reserved_connections) by (job)) * 100 ```
-#### precise-code-intel-worker: closed_max_idle +#### postgres: transaction_durations -

Closed by SetMaxIdleConns

+

Maximum transaction durations

-This panel has no related alerts. +Refer to the [alerts reference](alerts#postgres-transaction-durations) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100730` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100002` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -10823,19 +10843,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="precise-code-intel-worker"}[5m])) +sum by (job) (pg_stat_activity_max_tx_duration{datname!~"template.*|postgres|cloudsqladmin",job!="codeintel-db"}) OR sum by (job) (pg_stat_activity_max_tx_duration{job="codeinsights-db", datname!~"template.*|cloudsqladmin"}) ```
-#### precise-code-intel-worker: closed_max_lifetime +### Postgres: Database and collector status -

Closed by SetConnMaxLifetime

+#### postgres: postgres_up -This panel has no related alerts. +

Database availability

+ +A non-zero value indicates the database is online. + +Refer to the [alerts reference](alerts#postgres-postgres-up) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100731` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100100` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -10845,19 +10869,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="precise-code-intel-worker"}[5m])) +pg_up ```
-#### precise-code-intel-worker: closed_max_idle_time +#### postgres: invalid_indexes -

Closed by SetConnMaxIdleTime

+

Invalid indexes (unusable by the query planner)

-This panel has no related alerts. +A non-zero value indicates the that Postgres failed to build an index. Expect degraded performance until the index is manually rebuilt. + +Refer to the [alerts reference](alerts#postgres-invalid-indexes) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100732` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100101` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -10867,33 +10893,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="precise-code-intel-worker"}[5m])) +max by (relname)(pg_invalid_index_count) ```
-### Precise Code Intel Worker: Container monitoring (not available on server) - -#### precise-code-intel-worker: container_missing - -

Container missing

+#### postgres: pg_exporter_err -This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. +

Errors scraping postgres exporter

-- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod precise-code-intel-worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p precise-code-intel-worker`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' precise-code-intel-worker` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the precise-code-intel-worker container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs precise-code-intel-worker` (note this will include logs from the previous and currently running container). +This value indicates issues retrieving metrics from postgres_exporter. -This panel has no related alerts. +Refer to the [alerts reference](alerts#postgres-pg-exporter-err) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100110` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -10901,21 +10917,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -count by(name) ((time() - container_last_seen{name=~"^precise-code-intel-worker.*"}) > 60) +pg_exporter_last_scrape_error ```

-#### precise-code-intel-worker: container_cpu_usage +#### postgres: migration_in_progress -

Container cpu usage total (1m average) across all cores by instance

+

Active schema migration

-Refer to the [alerts reference](alerts#precise-code-intel-worker-container-cpu-usage) for 1 alert related to this panel. +A 0 value indicates that no migration is in progress. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100801` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#postgres-migration-in-progress) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100111` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -10923,21 +10941,25 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"} +pg_sg_migration_status ```

-#### precise-code-intel-worker: container_memory_usage +### Postgres: Object size and bloat -

Container memory usage by instance

+#### postgres: pg_table_size -Refer to the [alerts reference](alerts#precise-code-intel-worker-container-memory-usage) for 1 alert related to this panel. +

Table size

-To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100802` on your Sourcegraph instance. +Total size of this table -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100200` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -10945,24 +10967,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^precise-code-intel-worker.*"} +max by (relname)(pg_table_bloat_size) ```

-#### precise-code-intel-worker: fs_io_operations +#### postgres: pg_table_bloat_ratio -

Filesystem reads and writes rate by instance over 1h

+

Table bloat ratio

-This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +Estimated bloat ratio of this table (high bloat = high overhead) This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100803` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100201` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -10970,23 +10991,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^precise-code-intel-worker.*"}[1h]) + rate(container_fs_writes_total{name=~"^precise-code-intel-worker.*"}[1h])) +max by (relname)(pg_table_bloat_ratio) * 100 ```

-### Precise Code Intel Worker: Provisioning indicators (not available on server) +#### postgres: pg_index_size -#### precise-code-intel-worker: provisioning_container_cpu_usage_long_term +

Index size

-

Container cpu usage total (90th percentile over 1d) across all cores by instance

+Total size of this index -Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100210` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -10994,21 +11015,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[1d]) +max by (relname)(pg_index_bloat_size) ```

-#### precise-code-intel-worker: provisioning_container_memory_usage_long_term +#### postgres: pg_index_bloat_ratio -

Container memory usage (1d maximum) by instance

+

Index bloat ratio

-Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +Estimated bloat ratio of this index (high bloat = high overhead) -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100901` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100211` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -11016,21 +11039,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[1d]) +max by (relname)(pg_index_bloat_ratio) * 100 ```

-#### precise-code-intel-worker: provisioning_container_cpu_usage_short_term +### Postgres: Provisioning indicators (not available on server) -

Container cpu usage total (5m maximum) across all cores by instance

+#### postgres: provisioning_container_cpu_usage_long_term -Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +

Container cpu usage total (90th percentile over 1d) across all cores by instance

-To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100910` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#postgres-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100300` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -11038,21 +11063,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[5m]) +quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[1d]) ```

-#### precise-code-intel-worker: provisioning_container_memory_usage_short_term +#### postgres: provisioning_container_memory_usage_long_term -

Container memory usage (5m maximum) by instance

+

Container memory usage (1d maximum) by instance

-Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#postgres-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100911` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -11060,24 +11085,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[5m]) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[1d]) ```

-#### precise-code-intel-worker: container_oomkill_events_total - -

Container OOMKILL events total by instance

+#### postgres: provisioning_container_cpu_usage_short_term -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +

Container cpu usage total (5m maximum) across all cores by instance

-Refer to the [alerts reference](alerts#precise-code-intel-worker-container-oomkill-events-total) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#postgres-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100912` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100310` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -11085,25 +11107,21 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max by (name) (container_oom_events_total{name=~"^precise-code-intel-worker.*"}) +max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[5m]) ```

-### Precise Code Intel Worker: Golang runtime monitoring - -#### precise-code-intel-worker: go_goroutines - -

Maximum active goroutines

+#### postgres: provisioning_container_memory_usage_short_term -A high value here indicates a possible goroutine leak. +

Container memory usage (5m maximum) by instance

-Refer to the [alerts reference](alerts#precise-code-intel-worker-go-goroutines) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#postgres-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100311` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -11111,21 +11129,24 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max by(instance) (go_goroutines{job=~".*precise-code-intel-worker"}) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}[5m]) ```

-#### precise-code-intel-worker: go_gc_duration_seconds +#### postgres: container_oomkill_events_total -

Maximum go garbage collection duration

+

Container OOMKILL events total by instance

-Refer to the [alerts reference](alerts#precise-code-intel-worker-go-gc-duration-seconds) for 1 alert related to this panel. +This value indicates the total number of times the container main process or child processes were terminated by OOM killer. +When it occurs frequently, it is an indicator of underprovisioning. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=101001` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#postgres-container-oomkill-events-total) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100312` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -11133,23 +11154,23 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -max by(instance) (go_gc_duration_seconds{job=~".*precise-code-intel-worker"}) +max by (name) (container_oom_events_total{name=~"^(pgsql|codeintel-db|codeinsights).*"}) ```

-### Precise Code Intel Worker: Kubernetes monitoring (only available on Kubernetes) +### Postgres: Kubernetes monitoring (only available on Kubernetes) -#### precise-code-intel-worker: pods_available_percentage +#### postgres: pods_available_percentage

Percentage pods available

-Refer to the [alerts reference](alerts#precise-code-intel-worker-pods-available-percentage) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#postgres-pods-available-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=101100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/postgres/postgres?viewPanel=100400` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -11157,31 +11178,29 @@ To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-c Query: ``` -sum by(app) (up{app=~".*precise-code-intel-worker"}) / count by (app) (up{app=~".*precise-code-intel-worker"}) * 100 +sum by(app) (up{app=~".*(pgsql|codeintel-db|codeinsights)"}) / count by (app) (up{app=~".*(pgsql|codeintel-db|codeinsights)"}) * 100 ```

-## Redis - -

Metrics from both redis databases.

+## Precise Code Intel Worker -To see this dashboard, visit `/-/debug/grafana/d/redis/redis` on your Sourcegraph instance. +

Handles conversion of uploaded precise code intelligence bundles.

-### Redis: Redis Store +To see this dashboard, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker` on your Sourcegraph instance. -#### redis: redis-store_up +### Precise Code Intel Worker: Codeintel: LSIF uploads -

Redis-store availability

+#### precise-code-intel-worker: codeintel_upload_handlers -A value of 1 indicates the service is currently running +

Handler active handlers

-Refer to the [alerts reference](alerts#redis-redis-store-up) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11189,25 +11208,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100000` on yo Query: ``` -redis_up{app="redis-store"} +sum(src_codeintel_upload_processor_handlers{job=~"^precise-code-intel-worker.*"}) ```

-### Redis: Redis Cache - -#### redis: redis-cache_up - -

Redis-cache availability

+#### precise-code-intel-worker: codeintel_upload_processor_upload_size -A value of 1 indicates the service is currently running +

Sum of upload sizes in bytes being processed by each precise code-intel worker instance

-Refer to the [alerts reference](alerts#redis-redis-cache-up) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100001` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11215,23 +11230,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100100` on yo Query: ``` -redis_up{app="redis-cache"} +sum by(instance) (src_codeintel_upload_processor_upload_size{job="precise-code-intel-worker"}) ```

-### Redis: Provisioning indicators (not available on server) - -#### redis: provisioning_container_cpu_usage_long_term +#### precise-code-intel-worker: codeintel_upload_processor_total -

Container cpu usage total (90th percentile over 1d) across all cores by instance

+

Handler operations every 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100010` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11239,21 +11252,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100200` on yo Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[1d]) +sum(increase(src_codeintel_upload_processor_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### redis: provisioning_container_memory_usage_long_term +#### precise-code-intel-worker: codeintel_upload_processor_99th_percentile_duration -

Container memory usage (1d maximum) by instance

+

Aggregate successful handler operation duration distribution over 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100011` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11261,21 +11274,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100201` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[1d]) +sum by (le)(rate(src_codeintel_upload_processor_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### redis: provisioning_container_cpu_usage_short_term +#### precise-code-intel-worker: codeintel_upload_processor_errors_total -

Container cpu usage total (5m maximum) across all cores by instance

+

Handler operation errors every 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100012` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11283,21 +11296,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100210` on yo Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[5m]) +sum(increase(src_codeintel_upload_processor_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### redis: provisioning_container_memory_usage_short_term +#### precise-code-intel-worker: codeintel_upload_processor_error_rate -

Container memory usage (5m maximum) by instance

+

Handler operation error rate over 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100013` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11305,24 +11318,23 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100211` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[5m]) +sum(increase(src_codeintel_upload_processor_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_upload_processor_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_upload_processor_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```

-#### redis: container_oomkill_events_total +### Precise Code Intel Worker: Codeintel: dbstore stats -

Container OOMKILL events total by instance

+#### precise-code-intel-worker: codeintel_uploads_store_total -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +

Aggregate store operations every 5m

-Refer to the [alerts reference](alerts#redis-container-oomkill-events-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100100` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11330,23 +11342,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100212` on yo Query: ``` -max by (name) (container_oom_events_total{name=~"^redis-cache.*"}) +sum(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-### Redis: Provisioning indicators (not available on server) - -#### redis: provisioning_container_cpu_usage_long_term +#### precise-code-intel-worker: codeintel_uploads_store_99th_percentile_duration -

Container cpu usage total (90th percentile over 1d) across all cores by instance

+

Aggregate successful store operation duration distribution over 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11354,21 +11364,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100300` on yo Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[1d]) +sum by (le)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### redis: provisioning_container_memory_usage_long_term +#### precise-code-intel-worker: codeintel_uploads_store_errors_total -

Container memory usage (1d maximum) by instance

+

Aggregate store operation errors every 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100102` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11376,21 +11386,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100301` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[1d]) +sum(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### redis: provisioning_container_cpu_usage_short_term +#### precise-code-intel-worker: codeintel_uploads_store_error_rate -

Container cpu usage total (5m maximum) across all cores by instance

+

Aggregate store operation error rate over 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100103` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11398,21 +11408,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100310` on yo Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[5m]) +sum(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```

-#### redis: provisioning_container_memory_usage_short_term +#### precise-code-intel-worker: codeintel_uploads_store_total -

Container memory usage (5m maximum) by instance

+

Store operations every 5m

-Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100110` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11420,24 +11430,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100311` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[5m]) +sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### redis: container_oomkill_events_total - -

Container OOMKILL events total by instance

+#### precise-code-intel-worker: codeintel_uploads_store_99th_percentile_duration -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +

99th percentile successful store operation duration over 5m

-Refer to the [alerts reference](alerts#redis-container-oomkill-events-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100111` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11445,23 +11452,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100312` on yo Query: ``` -max by (name) (container_oom_events_total{name=~"^redis-store.*"}) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) ```

-### Redis: Kubernetes monitoring (only available on Kubernetes) - -#### redis: pods_available_percentage +#### precise-code-intel-worker: codeintel_uploads_store_errors_total -

Percentage pods available

+

Store operation errors every 5m

-Refer to the [alerts reference](alerts#redis-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100112` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11469,23 +11474,21 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100400` on yo Query: ``` -sum by(app) (up{app=~".*redis-cache"}) / count by (app) (up{app=~".*redis-cache"}) * 100 +sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-### Redis: Kubernetes monitoring (only available on Kubernetes) - -#### redis: pods_available_percentage +#### precise-code-intel-worker: codeintel_uploads_store_error_rate -

Percentage pods available

+

Store operation error rate over 5m

-Refer to the [alerts reference](alerts#redis-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100113` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11493,31 +11496,23 @@ To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100500` on yo Query: ``` -sum by(app) (up{app=~".*redis-store"}) / count by (app) (up{app=~".*redis-store"}) * 100 +sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```

-## Worker - -

Manages background processes.

- -To see this dashboard, visit `/-/debug/grafana/d/worker/worker` on your Sourcegraph instance. - -### Worker: Active jobs - -#### worker: worker_job_count +### Precise Code Intel Worker: Codeintel: lsifstore stats -

Number of worker instances running each job

+#### precise-code-intel-worker: codeintel_uploads_lsifstore_total -The number of worker instances running each job type. -It is necessary for each job type to be managed by at least one worker instance. +

Aggregate store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100200` on your Sourcegraph instance. +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11525,19 +11520,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100000` on Query: ``` -sum by (job_name) (src_worker_jobs{job=~"^worker.*"}) +sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### worker: worker_job_codeintel-upload-janitor_count +#### precise-code-intel-worker: codeintel_uploads_lsifstore_99th_percentile_duration -

Number of worker instances running the codeintel-upload-janitor job

+

Aggregate successful store operation duration distribution over 5m

-Refer to the [alerts reference](alerts#worker-worker-job-codeintel-upload-janitor-count) for 2 alerts related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100201` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11547,19 +11542,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100010` on Query: ``` -sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-upload-janitor"}) +sum by (le)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: worker_job_codeintel-commitgraph-updater_count +#### precise-code-intel-worker: codeintel_uploads_lsifstore_errors_total -

Number of worker instances running the codeintel-commitgraph-updater job

+

Aggregate store operation errors every 5m

-Refer to the [alerts reference](alerts#worker-worker-job-codeintel-commitgraph-updater-count) for 2 alerts related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100202` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11569,19 +11564,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100011` on Query: ``` -sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-commitgraph-updater"}) +sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: worker_job_codeintel-autoindexing-scheduler_count +#### precise-code-intel-worker: codeintel_uploads_lsifstore_error_rate -

Number of worker instances running the codeintel-autoindexing-scheduler job

+

Aggregate store operation error rate over 5m

-Refer to the [alerts reference](alerts#worker-worker-job-codeintel-autoindexing-scheduler-count) for 2 alerts related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100012` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100203` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11591,25 +11586,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100012` on Query: ``` -sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-autoindexing-scheduler"}) +sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```
-### Worker: Database record encrypter - -#### worker: records_encrypted_at_rest_percentage - -

Percentage of database records encrypted at rest

+#### precise-code-intel-worker: codeintel_uploads_lsifstore_total -Percentage of encrypted database records +

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100210` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11617,23 +11608,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100100` on Query: ``` -(max(src_records_encrypted_at_rest_total) by (tableName)) / ((max(src_records_encrypted_at_rest_total) by (tableName)) + (max(src_records_unencrypted_at_rest_total) by (tableName))) * 100 +sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### worker: records_encrypted_total - -

Database records encrypted every 5m

+#### precise-code-intel-worker: codeintel_uploads_lsifstore_99th_percentile_duration -Number of encrypted database records every 5m +

99th percentile successful store operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100211` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11641,23 +11630,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100101` on Query: ``` -sum by (tableName)(increase(src_records_encrypted_total{job=~"^worker.*"}[5m])) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) ```

-#### worker: records_decrypted_total - -

Database records decrypted every 5m

+#### precise-code-intel-worker: codeintel_uploads_lsifstore_errors_total -Number of encrypted database records every 5m +

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100212` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11665,23 +11652,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100102` on Query: ``` -sum by (tableName)(increase(src_records_decrypted_total{job=~"^worker.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```

-#### worker: record_encryption_errors_total - -

Encryption operation errors every 5m

+#### precise-code-intel-worker: codeintel_uploads_lsifstore_error_rate -Number of database record encryption/decryption errors every 5m +

Store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100103` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100213` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -11689,21 +11674,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100103` on Query: ``` -sum(increase(src_record_encryption_errors_total{job=~"^worker.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```

-### Worker: Codeintel: Repository with stale commit graph +### Precise Code Intel Worker: Workerutil: lsif_uploads dbworker/store stats -#### worker: codeintel_commit_graph_queue_size +#### precise-code-intel-worker: workerutil_dbworker_store_total -

Repository queue size

+

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100300` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11713,25 +11698,41 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100200` on Query: ``` -max(src_codeintel_commit_graph_total{job=~"^worker.*"}) +sum(increase(src_workerutil_dbworker_store_total{domain='codeintel_upload',job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_commit_graph_queue_growth_rate +#### precise-code-intel-worker: workerutil_dbworker_store_99th_percentile_duration -

Repository queue growth rate over 30m

+

Aggregate successful store operation duration distribution over 5m

-This value compares the rate of enqueues against the rate of finished jobs. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100301` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* + +
+Technical details + +Query: + +``` +sum by (le)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='codeintel_upload',job=~"^precise-code-intel-worker.*"}[5m])) +``` +
+ +
+ +#### precise-code-intel-worker: workerutil_dbworker_store_errors_total - - A value < than 1 indicates that process rate > enqueue rate - - A value = than 1 indicates that process rate = enqueue rate - - A value > than 1 indicates that process rate < enqueue rate +

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100302` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11741,19 +11742,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100201` on Query: ``` -sum(increase(src_codeintel_commit_graph_total{job=~"^worker.*"}[30m])) / sum(increase(src_codeintel_commit_graph_processor_total{job=~"^worker.*"}[30m])) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='codeintel_upload',job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_commit_graph_queued_max_age +#### precise-code-intel-worker: workerutil_dbworker_store_error_rate -

Repository queue longest time in queue

+

Store operation error rate over 5m

-Refer to the [alerts reference](alerts#worker-codeintel-commit-graph-queued-max-age) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100303` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11763,21 +11764,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100202` on Query: ``` -max(src_codeintel_commit_graph_queued_duration_seconds_total{job=~"^worker.*"}) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='codeintel_upload',job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_total{domain='codeintel_upload',job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_errors_total{domain='codeintel_upload',job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```
-### Worker: Codeintel: Repository commit graph updates +### Precise Code Intel Worker: Codeintel: gitserver client -#### worker: codeintel_commit_graph_processor_total +#### precise-code-intel-worker: gitserver_client_total -

Update operations every 5m

+

Aggregate client operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100400` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11787,19 +11788,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100300` on Query: ``` -sum(increase(src_codeintel_commit_graph_processor_total{job=~"^worker.*"}[5m])) +sum(increase(src_gitserver_client_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_commit_graph_processor_99th_percentile_duration +#### precise-code-intel-worker: gitserver_client_99th_percentile_duration -

Aggregate successful update operation duration distribution over 5m

+

Aggregate successful client operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100401` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11809,19 +11810,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100301` on Query: ``` -sum by (le)(rate(src_codeintel_commit_graph_processor_duration_seconds_bucket{job=~"^worker.*"}[5m])) +sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_commit_graph_processor_errors_total +#### precise-code-intel-worker: gitserver_client_errors_total -

Update operation errors every 5m

+

Aggregate client operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100402` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11831,19 +11832,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100302` on Query: ``` -sum(increase(src_codeintel_commit_graph_processor_errors_total{job=~"^worker.*"}[5m])) +sum(increase(src_gitserver_client_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_commit_graph_processor_error_rate +#### precise-code-intel-worker: gitserver_client_error_rate -

Update operation error rate over 5m

+

Aggregate client operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100303` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100403` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11853,21 +11854,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100303` on Query: ``` -sum(increase(src_codeintel_commit_graph_processor_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_commit_graph_processor_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_commit_graph_processor_errors_total{job=~"^worker.*"}[5m]))) * 100 +sum(increase(src_gitserver_client_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```
-### Worker: Codeintel: Dependency index job - -#### worker: codeintel_dependency_index_queue_size +#### precise-code-intel-worker: gitserver_client_total -

Dependency index job queue size

+

Client operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100410` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11877,25 +11876,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100400` on Query: ``` -max(src_codeintel_dependency_index_total{job=~"^worker.*"}) +sum by (op)(increase(src_gitserver_client_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_index_queue_growth_rate - -

Dependency index job queue growth rate over 30m

+#### precise-code-intel-worker: gitserver_client_99th_percentile_duration -This value compares the rate of enqueues against the rate of finished jobs. - - - A value < than 1 indicates that process rate > enqueue rate - - A value = than 1 indicates that process rate = enqueue rate - - A value > than 1 indicates that process rate < enqueue rate +

99th percentile successful client operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100411` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11905,19 +11898,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100401` on Query: ``` -sum(increase(src_codeintel_dependency_index_total{job=~"^worker.*"}[30m])) / sum(increase(src_codeintel_dependency_index_processor_total{job=~"^worker.*"}[30m])) +histogram_quantile(0.99, sum by (le,op)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) ```
-#### worker: codeintel_dependency_index_queued_max_age +#### precise-code-intel-worker: gitserver_client_errors_total -

Dependency index job queue longest time in queue

+

Client operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100412` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11927,21 +11920,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100402` on Query: ``` -max(src_codeintel_dependency_index_queued_duration_seconds_total{job=~"^worker.*"}) +sum by (op)(increase(src_gitserver_client_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-### Worker: Codeintel: Dependency index jobs - -#### worker: codeintel_dependency_index_handlers +#### precise-code-intel-worker: gitserver_client_error_rate -

Handler active handlers

+

Client operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100413` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11951,19 +11942,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100500` on Query: ``` -sum(src_codeintel_dependency_index_processor_handlers{job=~"^worker.*"}) +sum by (op)(increase(src_gitserver_client_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_gitserver_client_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_gitserver_client_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```
-#### worker: codeintel_dependency_index_processor_total +### Precise Code Intel Worker: Codeintel: uploadstore stats -

Handler operations every 5m

+#### precise-code-intel-worker: codeintel_uploadstore_total + +

Aggregate store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100500` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11973,19 +11966,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100510` on Query: ``` -sum(increase(src_codeintel_dependency_index_processor_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_index_processor_99th_percentile_duration +#### precise-code-intel-worker: codeintel_uploadstore_99th_percentile_duration -

Aggregate successful handler operation duration distribution over 5m

+

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100501` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -11995,19 +11988,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100511` on Query: ``` -sum by (le)(rate(src_codeintel_dependency_index_processor_duration_seconds_bucket{job=~"^worker.*"}[5m])) +sum by (le)(rate(src_codeintel_uploadstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_index_processor_errors_total +#### precise-code-intel-worker: codeintel_uploadstore_errors_total -

Handler operation errors every 5m

+

Aggregate store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100502` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12017,19 +12010,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100512` on Query: ``` -sum(increase(src_codeintel_dependency_index_processor_errors_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_index_processor_error_rate +#### precise-code-intel-worker: codeintel_uploadstore_error_rate -

Handler operation error rate over 5m

+

Aggregate store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100513` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100503` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12039,21 +12032,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100513` on Query: ``` -sum(increase(src_codeintel_dependency_index_processor_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_dependency_index_processor_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_dependency_index_processor_errors_total{job=~"^worker.*"}[5m]))) * 100 +sum(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```
-### Worker: Codeintel: Auto-index scheduler - -#### worker: codeintel_autoindexing_total +#### precise-code-intel-worker: codeintel_uploadstore_total -

Auto-indexing job scheduler operations every 10m

+

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100510` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12063,19 +12054,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100600` on Query: ``` -sum(increase(src_codeintel_autoindexing_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) +sum by (op)(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_autoindexing_99th_percentile_duration +#### precise-code-intel-worker: codeintel_uploadstore_99th_percentile_duration -

Aggregate successful auto-indexing job scheduler operation duration distribution over 10m

+

99th percentile successful store operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100511` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12085,19 +12076,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100601` on Query: ``` -sum by (le)(rate(src_codeintel_autoindexing_duration_seconds_bucket{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploadstore_duration_seconds_bucket{job=~"^precise-code-intel-worker.*"}[5m]))) ```
-#### worker: codeintel_autoindexing_errors_total +#### precise-code-intel-worker: codeintel_uploadstore_errors_total -

Auto-indexing job scheduler operation errors every 10m

+

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100602` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100512` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12107,19 +12098,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100602` on Query: ``` -sum(increase(src_codeintel_autoindexing_errors_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) +sum by (op)(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_autoindexing_error_rate +#### precise-code-intel-worker: codeintel_uploadstore_error_rate -

Auto-indexing job scheduler operation error rate over 10m

+

Store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100603` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100513` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12129,23 +12120,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100603` on Query: ``` -sum(increase(src_codeintel_autoindexing_errors_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) / (sum(increase(src_codeintel_autoindexing_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) + sum(increase(src_codeintel_autoindexing_errors_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m]))) * 100 +sum by (op)(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploadstore_total{job=~"^precise-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploadstore_errors_total{job=~"^precise-code-intel-worker.*"}[5m]))) * 100 ```
-### Worker: Codeintel: dbstore stats +### Precise Code Intel Worker: Database connections -#### worker: codeintel_uploads_store_total +#### precise-code-intel-worker: max_open_conns -

Aggregate store operations every 5m

+

Maximum open

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100600` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12153,21 +12144,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100700` on Query: ``` -sum(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="precise-code-intel-worker"}) ```

-#### worker: codeintel_uploads_store_99th_percentile_duration +#### precise-code-intel-worker: open_conns -

Aggregate successful store operation duration distribution over 5m

+

Established

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100601` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12175,21 +12166,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100701` on Query: ``` -sum by (le)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^worker.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_open{app_name="precise-code-intel-worker"}) ```

-#### worker: codeintel_uploads_store_errors_total +#### precise-code-intel-worker: in_use -

Aggregate store operation errors every 5m

+

Used

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100702` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100610` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12197,21 +12188,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100702` on Query: ``` -sum(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="precise-code-intel-worker"}) ```

-#### worker: codeintel_uploads_store_error_rate +#### precise-code-intel-worker: idle -

Aggregate store operation error rate over 5m

+

Idle

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100703` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100611` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12219,21 +12210,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100703` on Query: ``` -sum(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m]))) * 100 +sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="precise-code-intel-worker"}) ```

-#### worker: codeintel_uploads_store_total +#### precise-code-intel-worker: mean_blocked_seconds_per_conn_request -

Store operations every 5m

+

Mean blocked seconds per conn request

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100620` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12241,21 +12232,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100710` on Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) +sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="precise-code-intel-worker"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="precise-code-intel-worker"}[5m])) ```

-#### worker: codeintel_uploads_store_99th_percentile_duration +#### precise-code-intel-worker: closed_max_idle -

99th percentile successful store operation duration over 5m

+

Closed by SetMaxIdleConns

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100711` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100630` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12263,21 +12254,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100711` on Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^worker.*"}[5m]))) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="precise-code-intel-worker"}[5m])) ```

-#### worker: codeintel_uploads_store_errors_total +#### precise-code-intel-worker: closed_max_lifetime -

Store operation errors every 5m

+

Closed by SetConnMaxLifetime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100712` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100631` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12285,21 +12276,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100712` on Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m])) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="precise-code-intel-worker"}[5m])) ```

-#### worker: codeintel_uploads_store_error_rate +#### precise-code-intel-worker: closed_max_idle_time -

Store operation error rate over 5m

+

Closed by SetConnMaxIdleTime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100713` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100632` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -12307,23 +12298,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100713` on Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m]))) * 100 +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="precise-code-intel-worker"}[5m])) ```

-### Worker: Codeintel: lsifstore stats +### Precise Code Intel Worker: Precise-code-intel-worker (CPU, Memory) -#### worker: codeintel_uploads_lsifstore_total +#### precise-code-intel-worker: cpu_usage_percentage -

Aggregate store operations every 5m

+

CPU usage

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-cpu-usage-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100700` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -12331,21 +12322,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100800` on Query: ``` -sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m])) +cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"} ```

-#### worker: codeintel_uploads_lsifstore_99th_percentile_duration +#### precise-code-intel-worker: memory_usage_percentage -

Aggregate successful store operation duration distribution over 5m

+

Memory usage percentage (total)

+ +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100701` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -12353,21 +12346,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100801` on Query: ``` -sum by (le)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^worker.*"}[5m])) +cadvisor_container_memory_usage_percentage_total{name=~"^precise-code-intel-worker.*"} ```

-#### worker: codeintel_uploads_lsifstore_errors_total +#### precise-code-intel-worker: memory_working_set_bytes -

Aggregate store operation errors every 5m

+

Memory usage bytes (total)

+ +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100802` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100702` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -12375,21 +12370,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100802` on Query: ``` -sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m])) +max by (name) (container_memory_working_set_bytes{name=~"^precise-code-intel-worker.*"}) ```

-#### worker: codeintel_uploads_lsifstore_error_rate +#### precise-code-intel-worker: memory_rss -

Aggregate store operation error rate over 5m

+

Memory (RSS)

-This panel has no related alerts. +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100803` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#precise-code-intel-worker-memory-rss) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100710` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*&#13;
Technical details @@ -12397,21 +12394,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100803` on Query: ``` -sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m]))) * 100 +max(container_memory_rss{name=~"^precise-code-intel-worker.*"} / container_spec_memory_limit_bytes{name=~"^precise-code-intel-worker.*"}) by (name) * 100.0 ```

-#### worker: codeintel_uploads_lsifstore_total +#### precise-code-intel-worker: memory_total_active_file -

Store operations every 5m

+

Memory usage (active file)

+ +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100810` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100711` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -12419,21 +12418,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100810` on Query: ``` -sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m])) +max(container_memory_total_active_file_bytes{name=~"^precise-code-intel-worker.*"} / container_spec_memory_limit_bytes{name=~"^precise-code-intel-worker.*"}) by (name) * 100.0 ```

-#### worker: codeintel_uploads_lsifstore_99th_percentile_duration +#### precise-code-intel-worker: memory_kernel_usage -

99th percentile successful store operation duration over 5m

+

Memory usage (kernel)

+ +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100712` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -12441,19 +12442,31 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100811` on Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^worker.*"}[5m]))) +max(container_memory_kernel_usage{name=~"^precise-code-intel-worker.*"} / container_spec_memory_limit_bytes{name=~"^precise-code-intel-worker.*"}) by (name) * 100.0 ```

-#### worker: codeintel_uploads_lsifstore_errors_total +### Precise Code Intel Worker: Container monitoring (not available on server) -

Store operation errors every 5m

+#### precise-code-intel-worker: container_missing + +

Container missing

+ +This value is the number of times a container has not been seen for more than one minute. If you observe this +value change independently of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reason. + +- **Kubernetes:** + - Determine if the pod was OOM killed using `kubectl describe pod precise-code-intel-worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. + - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p precise-code-intel-worker`. +- **Docker Compose:** + - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' precise-code-intel-worker` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the precise-code-intel-worker container in `docker-compose.yml`. + - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs precise-code-intel-worker` (note this will include logs from the previous and the currently running container). This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100812` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100800` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12463,19 +12476,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100812` on Query: ``` -sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m])) +count by(name) ((time() - container_last_seen{name=~"^precise-code-intel-worker.*"}) > 60) ```&#13;
-#### worker: codeintel_uploads_lsifstore_error_rate +#### precise-code-intel-worker: container_cpu_usage -

Store operation error rate over 5m

+

Container cpu usage total (1m average) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-container-cpu-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100813` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100801` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12485,21 +12498,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100813` on Query: ``` -sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m]))) * 100 +cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"} ```
-### Worker: Workerutil: lsif_dependency_indexes dbworker/store stats - -#### worker: workerutil_dbworker_store_codeintel_dependency_index_total +#### precise-code-intel-worker: container_memory_usage -

Store operations every 5m

+

Container memory usage by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-container-memory-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100802` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12509,19 +12520,22 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100900` on Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_dependency_index_total{job=~"^worker.*"}[5m])) +cadvisor_container_memory_usage_percentage_total{name=~"^precise-code-intel-worker.*"} ```
-#### worker: workerutil_dbworker_store_codeintel_dependency_index_99th_percentile_duration +#### precise-code-intel-worker: fs_io_operations -

Aggregate successful store operation duration distribution over 5m

+

Filesystem reads and writes rate by instance over 1h

+ +This value indicates the number of filesystem read and write operations by containers of this service. +When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100901` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100803` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12531,19 +12545,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100901` on Query: ``` -sum by (le)(rate(src_workerutil_dbworker_store_codeintel_dependency_index_duration_seconds_bucket{job=~"^worker.*"}[5m])) +sum by(name) (rate(container_fs_reads_total{name=~"^precise-code-intel-worker.*"}[1h]) + rate(container_fs_writes_total{name=~"^precise-code-intel-worker.*"}[1h])) ```
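When this panel spikes, it can help to know whether reads or writes dominate. A hedged sketch that splits the panel query into its two components (the same cadvisor counters; run each expression separately, neither is part of the generated dashboard):

```
sum by(name) (rate(container_fs_reads_total{name=~"^precise-code-intel-worker.*"}[1h]))

sum by(name) (rate(container_fs_writes_total{name=~"^precise-code-intel-worker.*"}[1h]))
```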
-#### worker: workerutil_dbworker_store_codeintel_dependency_index_errors_total +### Precise Code Intel Worker: Provisioning indicators (not available on server) -

Store operation errors every 5m

+#### precise-code-intel-worker: provisioning_container_cpu_usage_long_term -This panel has no related alerts. +

Container cpu usage total (90th percentile over 1d) across all cores by instance

+ +Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100902` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100900` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12553,19 +12569,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100902` on Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_dependency_index_errors_total{job=~"^worker.*"}[5m])) +quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[1d]) ```
-#### worker: workerutil_dbworker_store_codeintel_dependency_index_error_rate +#### precise-code-intel-worker: provisioning_container_memory_usage_long_term -

Store operation error rate over 5m

+

Container memory usage (1d maximum) by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100903` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100901` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12575,21 +12591,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100903` on Query: ``` -sum(increase(src_workerutil_dbworker_store_codeintel_dependency_index_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_codeintel_dependency_index_total{job=~"^worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_codeintel_dependency_index_errors_total{job=~"^worker.*"}[5m]))) * 100 +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[1d]) ```
-### Worker: Codeintel: gitserver client - -#### worker: codeintel_gitserver_total +#### precise-code-intel-worker: provisioning_container_cpu_usage_short_term -

Aggregate client operations every 5m

+

Container cpu usage total (5m maximum) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100910` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12599,19 +12613,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101000` on Query: ``` -sum(increase(src_codeintel_gitserver_total{job=~"^worker.*"}[5m])) +max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[5m]) ```
-#### worker: codeintel_gitserver_99th_percentile_duration +#### precise-code-intel-worker: provisioning_container_memory_usage_short_term -

Aggregate successful client operation duration distribution over 5m

+

Container memory usage (5m maximum) by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100911` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12621,19 +12635,22 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101001` on Query: ``` -sum by (le)(rate(src_codeintel_gitserver_duration_seconds_bucket{job=~"^worker.*"}[5m])) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^precise-code-intel-worker.*"}[5m]) ```
-#### worker: codeintel_gitserver_errors_total +#### precise-code-intel-worker: container_oomkill_events_total -

Aggregate client operation errors every 5m

+

Container OOMKILL events total by instance

-This panel has no related alerts. +This value indicates the total number of times the container main process or child processes were terminated by OOM killer. +When it occurs frequently, it is an indicator of underprovisioning. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101002` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#precise-code-intel-worker-container-oomkill-events-total) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=100912` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12643,19 +12660,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101002` on Query: ``` -sum(increase(src_codeintel_gitserver_errors_total{job=~"^worker.*"}[5m])) +max by (name) (container_oom_events_total{name=~"^precise-code-intel-worker.*"}) ```
-#### worker: codeintel_gitserver_error_rate +### Precise Code Intel Worker: Golang runtime monitoring -

Aggregate client operation error rate over 5m

+#### precise-code-intel-worker: go_goroutines -This panel has no related alerts. +

Maximum active goroutines

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101003` on your Sourcegraph instance. +A high value here indicates a possible goroutine leak. + +Refer to the [alerts reference](alerts#precise-code-intel-worker-go-goroutines) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=101000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12665,19 +12686,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101003` on Query: ``` -sum(increase(src_codeintel_gitserver_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_gitserver_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_gitserver_errors_total{job=~"^worker.*"}[5m]))) * 100 +max by(instance) (go_goroutines{job=~".*precise-code-intel-worker"}) ```
-#### worker: codeintel_gitserver_total +#### precise-code-intel-worker: go_gc_duration_seconds -

Client operations every 5m

+

Maximum go garbage collection duration

-This panel has no related alerts. +Refer to the [alerts reference](alerts#precise-code-intel-worker-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=101001` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12687,19 +12708,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101010` on Query: ``` -sum by (op)(increase(src_codeintel_gitserver_total{job=~"^worker.*"}[5m])) +max by(instance) (go_gc_duration_seconds{job=~".*precise-code-intel-worker"}) ```
-#### worker: codeintel_gitserver_99th_percentile_duration +### Precise Code Intel Worker: Kubernetes monitoring (only available on Kubernetes) -

99th percentile successful client operation duration over 5m

+#### precise-code-intel-worker: pods_available_percentage -This panel has no related alerts. +

Percentage pods available

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101011` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#precise-code-intel-worker-pods-available-percentage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/precise-code-intel-worker/precise-code-intel-worker?viewPanel=101100` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12709,19 +12732,33 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101011` on Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_gitserver_duration_seconds_bucket{job=~"^worker.*"}[5m]))) +sum by(app) (up{app=~".*precise-code-intel-worker"}) / count by (app) (up{app=~".*precise-code-intel-worker"}) * 100 ```
-#### worker: codeintel_gitserver_errors_total +## Syntactic Indexing -

Client operation errors every 5m

+

Handles syntactic indexing of repositories.

+ +To see this dashboard, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing` on your Sourcegraph instance. + +### Syntactic Indexing: Syntactic indexing scheduling: summary + +#### syntactic-indexing: + +&#13;

Syntactic indexing jobs proposed for insertion over 5m

+ +Syntactic indexing jobs are proposed for insertion into the queue +based on round-robin scheduling across recently modified repos. + +This should be equal to the sum of inserted + updated + skipped, +but is shown separately for clarity. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101012` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12731,19 +12768,25 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101012` on Query: ``` -sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_syntactic_enqueuer_jobs_proposed[5m])) ```
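Since proposed jobs should equal the sum of inserted, updated, and skipped jobs, a quick consistency check can be built from the same enqueuer counters as the panels in this group (illustrative only, not part of the generated dashboard); a persistently non-zero result would mean the counters are drifting apart:

```
sum(increase(src_codeintel_syntactic_enqueuer_jobs_proposed[5m]))
-
(
    sum(increase(src_codeintel_syntactic_enqueuer_jobs_inserted[5m]))
  + sum(increase(src_codeintel_syntactic_enqueuer_jobs_updated[5m]))
  + sum(increase(src_codeintel_syntactic_enqueuer_jobs_skipped[5m]))
)
```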
-#### worker: codeintel_gitserver_error_rate +#### syntactic-indexing: -&#13;

Client operation error rate over 5m

+

Syntactic indexing jobs inserted over 5m

+ +Syntactic indexing jobs are inserted into the queue if there is a proposed +repo commit pair (R, X) such that there is no existing job for R in the queue. + +If this number is close to the number of proposed jobs, it may indicate that +the scheduler is not able to keep up with the rate of incoming commits. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101013` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100001` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12753,21 +12796,26 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101013` on Query: ``` -sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_gitserver_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_codeintel_gitserver_errors_total{job=~"^worker.*"}[5m]))) * 100 +sum(increase(src_codeintel_syntactic_enqueuer_jobs_inserted[5m])) ```
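To make the "keeping up" heuristic above concrete, one illustrative expression (not part of the generated dashboard) is the inserted-to-proposed ratio; values near 1 mean almost every proposed repo commit pair is brand new, which is the situation described above:

```
sum(increase(src_codeintel_syntactic_enqueuer_jobs_inserted[5m]))
/
sum(increase(src_codeintel_syntactic_enqueuer_jobs_proposed[5m]))
```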
-### Worker: Codeintel: Dependency repository insert +#### syntactic-indexing: -#### worker: codeintel_dependency_repos_total +&#13;

Syntactic indexing jobs updated in-place over 5m

-

Aggregate insert operations every 5m

+Syntactic indexing jobs are updated in-place when the scheduler attempts to +enqueue a repo commit pair (R, X) and discovers that the queue already had some +other repo commit pair (R, Y) where Y is an ancestor of X. In that case, the +job is updated in-place to point to X, to reflect the fact that users looking +at the tip of the default branch of R are more likely to benefit from newer +commits being indexed. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100002` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12777,19 +12825,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101100` on Query: ``` -sum(increase(src_codeintel_dependency_repos_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_syntactic_enqueuer_jobs_updated[5m])) ```
-#### worker: codeintel_dependency_repos_99th_percentile_duration +#### syntactic-indexing: +&#13;

Syntactic indexing jobs skipped over 5m

-

Aggregate successful insert operation duration distribution over 5m

+Syntactic indexing jobs insertion is skipped when the scheduler attempts to +enqueue a repo commit pair (R, X) and discovers that the queue already had the +same job (most likely) or another job (R, Y) where Y is not an ancestor of X. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100003` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12799,19 +12851,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101101` on Query: ``` -sum by (le)(rate(src_codeintel_dependency_repos_duration_seconds_bucket{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_syntactic_enqueuer_jobs_skipped[5m])) ```
-#### worker: codeintel_dependency_repos_errors_total +### Syntactic Indexing: Workerutil: syntactic_scip_indexing_jobs dbworker/store stats + +#### syntactic-indexing: workerutil_dbworker_store_total -

Aggregate insert operation errors every 5m

+

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100100` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12821,19 +12875,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101102` on Query: ``` -sum(increase(src_codeintel_dependency_repos_errors_total{job=~"^worker.*"}[5m])) +sum(increase(src_workerutil_dbworker_store_total{domain='syntactic_scip_indexing_jobs',job=~"^syntactic-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_repos_error_rate +#### syntactic-indexing: workerutil_dbworker_store_99th_percentile_duration -

Aggregate insert operation error rate over 5m

+

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101103` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100101` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12843,19 +12897,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101103` on Query: ``` -sum(increase(src_codeintel_dependency_repos_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_dependency_repos_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_dependency_repos_errors_total{job=~"^worker.*"}[5m]))) * 100 +sum by (le)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='syntactic_scip_indexing_jobs',job=~"^syntactic-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_repos_total +#### syntactic-indexing: workerutil_dbworker_store_errors_total -

Insert operations every 5m

+

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100102` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12865,19 +12919,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101110` on Query: ``` -sum by (scheme,new)(increase(src_codeintel_dependency_repos_total{job=~"^worker.*"}[5m])) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='syntactic_scip_indexing_jobs',job=~"^syntactic-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_repos_99th_percentile_duration +#### syntactic-indexing: workerutil_dbworker_store_error_rate -

99th percentile successful insert operation duration over 5m

+

Store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100103` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12887,19 +12941,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101111` on Query: ``` -histogram_quantile(0.99, sum by (le,scheme,new)(rate(src_codeintel_dependency_repos_duration_seconds_bucket{job=~"^worker.*"}[5m]))) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='syntactic_scip_indexing_jobs',job=~"^syntactic-code-intel-worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_total{domain='syntactic_scip_indexing_jobs',job=~"^syntactic-code-intel-worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_errors_total{domain='syntactic_scip_indexing_jobs',job=~"^syntactic-code-intel-worker.*"}[5m]))) * 100 ```
-#### worker: codeintel_dependency_repos_errors_total +### Syntactic Indexing: Codeintel: gitserver client -

Insert operation errors every 5m

+#### syntactic-indexing: gitserver_client_total + +

Aggregate client operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100200` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12909,19 +12965,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101112` on Query: ``` -sum by (scheme,new)(increase(src_codeintel_dependency_repos_errors_total{job=~"^worker.*"}[5m])) +sum(increase(src_gitserver_client_total{job=~"^syntactic-code-intel-worker.*"}[5m])) ```
-#### worker: codeintel_dependency_repos_error_rate +#### syntactic-indexing: gitserver_client_99th_percentile_duration -

Insert operation error rate over 5m

+

Aggregate successful client operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101113` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100201` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -12931,25 +12987,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101113` on Query: ``` -sum by (scheme,new)(increase(src_codeintel_dependency_repos_errors_total{job=~"^worker.*"}[5m])) / (sum by (scheme,new)(increase(src_codeintel_dependency_repos_total{job=~"^worker.*"}[5m])) + sum by (scheme,new)(increase(src_codeintel_dependency_repos_errors_total{job=~"^worker.*"}[5m]))) * 100 +sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^syntactic-code-intel-worker.*"}[5m])) ```
-### Worker: Permissions - -#### worker: user_success_syncs_total - -

Total number of user permissions syncs

+#### syntactic-indexing: gitserver_client_errors_total -Indicates the total number of user permissions sync completed. +

Aggregate client operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100202` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -12957,23 +13009,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101200` on Query: ``` -sum(src_repo_perms_syncer_success_syncs{type="user"}) +sum(increase(src_gitserver_client_errors_total{job=~"^syntactic-code-intel-worker.*"}[5m])) ```

-#### worker: user_success_syncs - -

Number of user permissions syncs [5m]

+#### syntactic-indexing: gitserver_client_error_rate -Indicates the number of users permissions syncs completed. +

Aggregate client operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100203` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -12981,23 +13031,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101201` on Query: ``` -sum(increase(src_repo_perms_syncer_success_syncs{type="user"}[5m])) +sum(increase(src_gitserver_client_errors_total{job=~"^syntactic-code-intel-worker.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^syntactic-code-intel-worker.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^syntactic-code-intel-worker.*"}[5m]))) * 100 ```

-#### worker: user_initial_syncs - -

Number of first user permissions syncs [5m]

+#### syntactic-indexing: gitserver_client_total -Indicates the number of permissions syncs done for the first time for the user. +

Client operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100210` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -13005,23 +13053,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101202` on Query: ``` -sum(increase(src_repo_perms_syncer_initial_syncs{type="user"}[5m])) +sum by (op)(increase(src_gitserver_client_total{job=~"^syntactic-code-intel-worker.*"}[5m])) ```

-#### worker: repo_success_syncs_total
-
-

Total number of repo permissions syncs

+#### syntactic-indexing: gitserver_client_99th_percentile_duration
-Indicates the total number of repo permissions sync completed.
+

99th percentile successful client operation duration over 5m

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101210` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100211` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13029,23 +13075,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101210` on
Query:
```
-sum(src_repo_perms_syncer_success_syncs{type="repo"})
+histogram_quantile(0.99, sum by (le,op)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^syntactic-code-intel-worker.*"}[5m])))
```
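Because the duration panels are driven by a Prometheus histogram, other percentiles come from the same `_bucket` series by changing only the first argument of `histogram_quantile`. A median (p50) sketch of the panel above, for illustration only:

```
# p50 variant of the p99 duration panel; only the quantile argument changes.
histogram_quantile(0.50, sum by (le,op)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^syntactic-code-intel-worker.*"}[5m])))
```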

-#### worker: repo_success_syncs
-
-

Number of repo permissions syncs over 5m

+#### syntactic-indexing: gitserver_client_errors_total
-Indicates the number of repos permissions syncs completed.
+

Client operation errors every 5m

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101211` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100212` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13053,23 +13097,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101211` on
Query:
```
-sum(increase(src_repo_perms_syncer_success_syncs{type="repo"}[5m]))
+sum by (op)(increase(src_gitserver_client_errors_total{job=~"^syntactic-code-intel-worker.*"}[5m]))
```

-#### worker: repo_initial_syncs
-
-

Number of first repo permissions syncs over 5m

+#### syntactic-indexing: gitserver_client_error_rate
-Indicates the number of permissions syncs done for the first time for the repo.
+

Client operation error rate over 5m

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101212` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100213` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13077,23 +13119,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101212` on
Query:
```
-sum(increase(src_repo_perms_syncer_initial_syncs{type="repo"}[5m]))
+sum by (op)(increase(src_gitserver_client_errors_total{job=~"^syntactic-code-intel-worker.*"}[5m])) / (sum by (op)(increase(src_gitserver_client_total{job=~"^syntactic-code-intel-worker.*"}[5m])) + sum by (op)(increase(src_gitserver_client_errors_total{job=~"^syntactic-code-intel-worker.*"}[5m]))) * 100
```

-#### worker: users_consecutive_sync_delay
+### Syntactic Indexing: Database connections
-

Max duration between two consecutive permissions sync for user

+#### syntactic-indexing: max_open_conns
-Indicates the max delay between two consecutive permissions sync for a user during the period.
+

Maximum open

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101220` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100300` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13101,23 +13143,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101220` on
Query:
```
-max(max_over_time (src_repo_perms_syncer_perms_consecutive_sync_delay{type="user"} [1m]))
+sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="syntactic-code-intel-worker"})
```

-#### worker: repos_consecutive_sync_delay
-
-

Max duration between two consecutive permissions sync for repo

+#### syntactic-indexing: open_conns
-Indicates the max delay between two consecutive permissions sync for a repo during the period.
+

Established

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101221` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100301` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13125,23 +13165,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101221` on
Query:
```
-max(max_over_time (src_repo_perms_syncer_perms_consecutive_sync_delay{type="repo"} [1m]))
+sum by (app_name, db_name) (src_pgsql_conns_open{app_name="syntactic-code-intel-worker"})
```

-#### worker: users_first_sync_delay
-
-

Max duration between user creation and first permissions sync

+#### syntactic-indexing: in_use
-Indicates the max delay between user creation and their permissions sync
+

Used

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101230` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100310` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13149,23 +13187,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101230` on
Query:
```
-max(max_over_time(src_repo_perms_syncer_perms_first_sync_delay{type="user"}[1m]))
+sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="syntactic-code-intel-worker"})
```

-#### worker: repos_first_sync_delay
-
-

Max duration between repo creation and first permissions sync over 1m

+#### syntactic-indexing: idle
-Indicates the max delay between repo creation and their permissions sync
+

Idle

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101231` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100311` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13173,23 +13209,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101231` on
Query:
```
-max(max_over_time(src_repo_perms_syncer_perms_first_sync_delay{type="repo"}[1m]))
+sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="syntactic-code-intel-worker"})
```

-#### worker: permissions_found_count
-
-

Number of permissions found during user/repo permissions sync

+#### syntactic-indexing: mean_blocked_seconds_per_conn_request
-Indicates the number permissions found during users/repos permissions sync.
+

Mean blocked seconds per conn request

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101240` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100320` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13197,23 +13231,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101240` on
Query:
```
-sum by (type) (src_repo_perms_syncer_perms_found)
+sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="syntactic-code-intel-worker"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="syntactic-code-intel-worker"}[5m]))
```
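This panel divides two counter increases, so it reads as "seconds spent blocked per connection request"; for example, if `blocked_seconds` grew by 10s while 200 requests waited, the panel shows 0.05s. For a single headline number, a sketch without the `by` grouping (illustration only, using the same metrics as the query above):

```
# Mean blocked seconds per connection request, collapsed across databases (sketch).
sum(increase(src_pgsql_conns_blocked_seconds{app_name="syntactic-code-intel-worker"}[5m]))
/
sum(increase(src_pgsql_conns_waited_for{app_name="syntactic-code-intel-worker"}[5m]))
```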

-#### worker: permissions_found_avg
-
-

Average number of permissions found during permissions sync per user/repo

+#### syntactic-indexing: closed_max_idle
-Indicates the average number permissions found during permissions sync per user/repo.
+

Closed by SetMaxIdleConns

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101241` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100330` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13221,21 +13253,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101241` on
Query:
```
-avg by (type) (src_repo_perms_syncer_perms_found)
+sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="syntactic-code-intel-worker"}[5m]))
```

-#### worker: perms_syncer_outdated_perms
+#### syntactic-indexing: closed_max_lifetime
-

Number of entities with outdated permissions

+

Closed by SetConnMaxLifetime

-Refer to the [alerts reference](alerts#worker-perms-syncer-outdated-perms) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101250` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100331` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13243,21 +13275,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101250` on
Query:
```
-max by (type) (src_repo_perms_syncer_outdated_perms)
+sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="syntactic-code-intel-worker"}[5m]))
```

-#### worker: perms_syncer_sync_duration
+#### syntactic-indexing: closed_max_idle_time
-

95th permissions sync duration

+

Closed by SetConnMaxIdleTime

-Refer to the [alerts reference](alerts#worker-perms-syncer-sync-duration) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101260` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100332` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13265,21 +13297,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101260` on
Query:
```
-histogram_quantile(0.95, max by (le, type) (rate(src_repo_perms_syncer_sync_duration_seconds_bucket[1m])))
+sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="syntactic-code-intel-worker"}[5m]))
```

-#### worker: perms_syncer_sync_errors
+### Syntactic Indexing: Syntactic-code-intel-worker (CPU, Memory)
-

Permissions sync error rate

+#### syntactic-indexing: cpu_usage_percentage
-Refer to the [alerts reference](alerts#worker-perms-syncer-sync-errors) for 1 alert related to this panel.
+

CPU usage

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101270` on your Sourcegraph instance.
+Refer to the [alerts reference](alerts#syntactic-indexing-cpu-usage-percentage) for 1 alert related to this panel.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100400` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details
@@ -13287,24 +13321,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101270` on
Query:
```
-max by (type) (ceil(rate(src_repo_perms_syncer_sync_errors_total[1m])))
+cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}
```

-#### worker: perms_syncer_scheduled_repos_total
+#### syntactic-indexing: memory_usage_percentage
-

Total number of repos scheduled for permissions sync

+

Memory usage percentage (total)

-Indicates how many repositories have been scheduled for a permissions sync.
-More about repository permissions synchronization [here](https://sourcegraph.com/docs/admin/permissions/syncing#scheduling)
+An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101271` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100401` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details
@@ -13312,23 +13345,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101271` on
Query:
```
-max(rate(src_repo_perms_syncer_schedule_repos_total[1m]))
+cadvisor_container_memory_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}
```

-### Worker: Gitserver: Gitserver Client
+#### syntactic-indexing: memory_working_set_bytes
-#### worker: gitserver_client_total
+

Memory usage bytes (total)

-

Aggregate graphql operations every 5m

+An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101300` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100402` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details
@@ -13336,21 +13369,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101300` on
Query:
```
-sum(increase(src_gitserver_client_total{job=~"^worker.*"}[5m]))
+max by (name) (container_memory_working_set_bytes{name=~"^syntactic-code-intel-worker.*"})
```

-#### worker: gitserver_client_99th_percentile_duration
+#### syntactic-indexing: memory_rss
-

Aggregate successful graphql operation duration distribution over 5m

+

Memory (RSS)

-This panel has no related alerts.
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101301` on your Sourcegraph instance.
+Refer to the [alerts reference](alerts#syntactic-indexing-memory-rss) for 1 alert related to this panel.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100410` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details
@@ -13358,21 +13393,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101301` on
Query:
```
-sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^worker.*"}[5m]))
+max(container_memory_rss{name=~"^syntactic-code-intel-worker.*"} / container_spec_memory_limit_bytes{name=~"^syntactic-code-intel-worker.*"}) by (name) * 100.0
```
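The panel above normalises RSS against the container memory limit to yield a percentage; when debugging it can also help to look at the raw byte count. A sketch (not part of the generated dashboards) using the same cadvisor metric:

```
# Absolute anonymous (RSS) memory in bytes, without dividing by the limit.
max by (name) (container_memory_rss{name=~"^syntactic-code-intel-worker.*"})
```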

-#### worker: gitserver_client_errors_total
+#### syntactic-indexing: memory_total_active_file
-

Aggregate graphql operation errors every 5m

+

Memory usage (active file)

+
+This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101302` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100411` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details
@@ -13380,21 +13417,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101302` on
Query:
```
-sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
+max(container_memory_total_active_file_bytes{name=~"^syntactic-code-intel-worker.*"} / container_spec_memory_limit_bytes{name=~"^syntactic-code-intel-worker.*"}) by (name) * 100.0
```

-#### worker: gitserver_client_error_rate
+#### syntactic-indexing: memory_kernel_usage
-

Aggregate graphql operation error rate over 5m

+

Memory usage (kernel)

+
+The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101303` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100412` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details
@@ -13402,21 +13441,33 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101303` on
Query:
```
-sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^worker.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))) * 100
+max(container_memory_kernel_usage{name=~"^syntactic-code-intel-worker.*"} / container_spec_memory_limit_bytes{name=~"^syntactic-code-intel-worker.*"}) by (name) * 100.0
```

-#### worker: gitserver_client_total
+### Syntactic Indexing: Container monitoring (not available on server)
-

Graphql operations every 5m

+#### syntactic-indexing: container_missing
+

Container missing

+
+This value is the number of times a container has not been seen for more than one minute. If you observe this
+value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reason.
+
+- **Kubernetes:**
+  - Determine if the pod was OOM killed using `kubectl describe pod syntactic-code-intel-worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
+  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p syntactic-code-intel-worker`.
+- **Docker Compose:**
+  - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' syntactic-code-intel-worker` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the syntactic-code-intel-worker container in `docker-compose.yml`.
+  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs syntactic-code-intel-worker` (note this will include logs from the previous and currently running container).
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101310` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100500` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13424,21 +13475,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101310` on
Query:
```
-sum by (op,scope)(increase(src_gitserver_client_total{job=~"^worker.*"}[5m]))
+count by(name) ((time() - container_last_seen{name=~"^syntactic-code-intel-worker.*"}) > 60)
```
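The underlying query flags containers unseen for over a minute via `time() - container_last_seen`. To cut noise from brief restarts while investigating, you could widen the window; a sketch with an illustrative five-minute cutoff (the cutoff is an assumption, not the panel's setting):

```
# Containers not seen for more than five minutes (illustrative cutoff).
count by(name) ((time() - container_last_seen{name=~"^syntactic-code-intel-worker.*"}) > 300)
```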

-#### worker: gitserver_client_99th_percentile_duration
+#### syntactic-indexing: container_cpu_usage
-

99th percentile successful graphql operation duration over 5m

+

Container cpu usage total (1m average) across all cores by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-container-cpu-usage) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101311` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100501` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13446,21 +13497,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101311` on
Query:
```
-histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}
```

-#### worker: gitserver_client_errors_total
+#### syntactic-indexing: container_memory_usage
-

Graphql operation errors every 5m

+

Container memory usage by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-container-memory-usage) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101312` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100502` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13468,21 +13519,24 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101312` on
Query:
```
-sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
+cadvisor_container_memory_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}
```

-#### worker: gitserver_client_error_rate
+#### syntactic-indexing: fs_io_operations
-

Graphql operation error rate over 5m

+

Filesystem reads and writes rate by instance over 1h

+
+This value indicates the number of filesystem read and write operations by containers of this service.
+When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101313` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100503` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13490,23 +13544,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101313` on
Query:
```
-sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_client_total{job=~"^worker.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum by(name) (rate(container_fs_reads_total{name=~"^syntactic-code-intel-worker.*"}[1h]) + rate(container_fs_writes_total{name=~"^syntactic-code-intel-worker.*"}[1h]))
```

-### Worker: Batches: dbstore stats
+### Syntactic Indexing: Provisioning indicators (not available on server)
-#### worker: batches_dbstore_total
+#### syntactic-indexing: provisioning_container_cpu_usage_long_term
-

Aggregate store operations every 5m

+

Container cpu usage total (90th percentile over 1d) across all cores by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101400` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100600` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13514,21 +13568,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101400` on
Query:
```
-sum(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m]))
+quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[1d])
```

-#### worker: batches_dbstore_99th_percentile_duration
+#### syntactic-indexing: provisioning_container_memory_usage_long_term
-

Aggregate successful store operation duration distribution over 5m

+

Container memory usage (1d maximum) by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-provisioning-container-memory-usage-long-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101401` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100601` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13536,21 +13590,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101401` on
Query:
```
-sum by (le)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^worker.*"}[5m]))
+max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[1d])
```

-#### worker: batches_dbstore_errors_total
+#### syntactic-indexing: provisioning_container_cpu_usage_short_term
-

Aggregate store operation errors every 5m

+

Container cpu usage total (5m maximum) across all cores by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101402` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100610` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13558,21 +13612,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101402` on
Query:
```
-sum(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m]))
+max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[5m])
```

-#### worker: batches_dbstore_error_rate
+#### syntactic-indexing: provisioning_container_memory_usage_short_term
-

Aggregate store operation error rate over 5m

+

Container memory usage (5m maximum) by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-provisioning-container-memory-usage-short-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101403` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100611` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13580,21 +13634,24 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101403` on
Query:
```
-sum(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m])) + sum(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m]))) * 100
+max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntactic-code-intel-worker.*"}[5m])
```

-#### worker: batches_dbstore_total
+#### syntactic-indexing: container_oomkill_events_total
-

Store operations every 5m

+

Container OOMKILL events total by instance

+
+This value indicates the total number of times the container main process or child processes were terminated by OOM killer.
+When it occurs frequently, it is an indicator of underprovisioning.
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-container-oomkill-events-total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101410` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100612` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13602,21 +13659,25 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101410` on
Query:
```
-sum by (op)(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m]))
+max by (name) (container_oom_events_total{name=~"^syntactic-code-intel-worker.*"})
```
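`container_oom_events_total` is a lifetime counter, so a flat non-zero value is historical rather than an active problem. A sketch (illustration only, not a generated panel) that isolates recent kills:

```
# OOM kills observed in the last 24h rather than the raw lifetime total.
max by (name) (increase(container_oom_events_total{name=~"^syntactic-code-intel-worker.*"}[1d]))
```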

-#### worker: batches_dbstore_99th_percentile_duration
+### Syntactic Indexing: Golang runtime monitoring
-

99th percentile successful store operation duration over 5m

+#### syntactic-indexing: go_goroutines
-This panel has no related alerts.
+

Maximum active goroutines

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101411` on your Sourcegraph instance.
+A high value here indicates a possible goroutine leak.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+Refer to the [alerts reference](alerts#syntactic-indexing-go-goroutines) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100700` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13624,21 +13685,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101411` on
Query:
```
-histogram_quantile(0.99, sum by (le,op)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+max by(instance) (go_goroutines{job=~".*syntactic-code-intel-worker"})
```
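A single goroutine count is hard to judge in isolation; sustained growth is the leak signal. One rough sketch, comparing against the value an hour earlier (the 1h offset is an illustrative choice, not part of the shipped alert):

```
# Goroutine growth over the past hour; persistently positive values
# suggest a leak (example heuristic only).
max by(instance) (go_goroutines{job=~".*syntactic-code-intel-worker"})
  - max by(instance) (go_goroutines{job=~".*syntactic-code-intel-worker"} offset 1h)
```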

-#### worker: batches_dbstore_errors_total
+#### syntactic-indexing: go_gc_duration_seconds
-

Store operation errors every 5m

+

Maximum go garbage collection duration

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#syntactic-indexing-go-gc-duration-seconds) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101412` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100701` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13646,21 +13707,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101412` on
Query:
```
-sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m]))
+max by(instance) (go_gc_duration_seconds{job=~".*syntactic-code-intel-worker"})
```

-#### worker: batches_dbstore_error_rate
+### Syntactic Indexing: Kubernetes monitoring (only available on Kubernetes)
-

Store operation error rate over 5m

+#### syntactic-indexing: pods_available_percentage
-This panel has no related alerts.
+

Percentage pods available

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101413` on your Sourcegraph instance.
+Refer to the [alerts reference](alerts#syntactic-indexing-pods-available-percentage) for 1 alert related to this panel.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+To see this panel, visit `/-/debug/grafana/d/syntactic-indexing/syntactic-indexing?viewPanel=100800` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details
@@ -13668,23 +13731,31 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101413` on
Query:
```
-sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum by(app) (up{app=~".*syntactic-code-intel-worker"}) / count by (app) (up{app=~".*syntactic-code-intel-worker"}) * 100
```
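The availability percentage is simply up targets divided by all targets. Thresholding it gives a quick ad-hoc check; the 90% figure below is an example only, the shipped thresholds live in the alerts reference:

```
# Returns a series only while fewer than 90% of pods are available (example threshold).
(sum by(app) (up{app=~".*syntactic-code-intel-worker"})
 / count by (app) (up{app=~".*syntactic-code-intel-worker"}) * 100) < 90
```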

-### Worker: Batches: service stats
+## Redis
-#### worker: batches_service_total
+

Metrics from both redis databases.

-

Aggregate service operations every 5m

+To see this dashboard, visit `/-/debug/grafana/d/redis/redis` on your Sourcegraph instance.
-This panel has no related alerts.
+### Redis: Redis Store
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101500` on your Sourcegraph instance.
+#### redis: redis-store_up
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+

Redis-store availability

+A value of 1 indicates the service is currently running.
+
+Refer to the [alerts reference](alerts#redis-redis-store-up) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100000` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13692,21 +13763,25 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101500` on
Query:
```
-sum(increase(src_batches_service_total{job=~"^worker.*"}[5m]))
+redis_up{app="redis-store"}
```
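Because `redis_up` is a simple 0/1 gauge, an ad-hoc "is anything down right now" check is a one-line sketch; an empty result means all matched instances report healthy:

```
# Instances currently reporting down; empty result means redis-store is up (sketch).
redis_up{app="redis-store"} == 0
```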

-#### worker: batches_service_99th_percentile_duration
+### Redis: Redis Cache
-

Aggregate successful service operation duration distribution over 5m

+#### redis: redis-cache_up
-This panel has no related alerts.
+

Redis-cache availability

+A value of 1 indicates the service is currently running.
+
+Refer to the [alerts reference](alerts#redis-redis-cache-up) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101501` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100100` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13714,21 +13789,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101501` on
Query:
```
-sum by (le)(rate(src_batches_service_duration_seconds_bucket{job=~"^worker.*"}[5m]))
+redis_up{app="redis-cache"}
```

-#### worker: batches_service_errors_total
+### Redis: Provisioning indicators (not available on server)
-

Aggregate service operation errors every 5m

+#### redis: provisioning_container_cpu_usage_long_term
-This panel has no related alerts.
+

Container cpu usage total (90th percentile over 1d) across all cores by instance

+Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101502` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100200` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13736,21 +13813,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101502` on
Query:
```
-sum(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m]))
+quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[1d])
```

-#### worker: batches_service_error_rate
+#### redis: provisioning_container_memory_usage_long_term
-

Aggregate service operation error rate over 5m

+

Container memory usage (1d maximum) by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-long-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101503` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100201` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13758,21 +13835,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101503` on
Query:
```
-sum(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_batches_service_total{job=~"^worker.*"}[5m])) + sum(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m]))) * 100
+max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[1d])
```

-#### worker: batches_service_total
+#### redis: provisioning_container_cpu_usage_short_term
-

Service operations every 5m

+

Container cpu usage total (5m maximum) across all cores by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101510` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100210` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13780,21 +13857,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101510` on
Query:
```
-sum by (op)(increase(src_batches_service_total{job=~"^worker.*"}[5m]))
+max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-cache.*"}[5m])
```

-#### worker: batches_service_99th_percentile_duration
+#### redis: provisioning_container_memory_usage_short_term
-

99th percentile successful service operation duration over 5m

+

Container memory usage (5m maximum) by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-short-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101511` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100211` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13802,21 +13879,24 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101511` on
Query:
```
-histogram_quantile(0.99, sum by (le,op)(rate(src_batches_service_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-cache.*"}[5m])
```

-#### worker: batches_service_errors_total
+#### redis: container_oomkill_events_total
-

Service operation errors every 5m

+

Container OOMKILL events total by instance

+
+This value indicates the total number of times the container main process or child processes were terminated by OOM killer.
+When it occurs frequently, it is an indicator of underprovisioning.
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-container-oomkill-events-total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101512` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100212` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13824,21 +13904,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101512` on
Query:
```
-sum by (op)(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m]))
+max by (name) (container_oom_events_total{name=~"^redis-cache.*"})
```

-#### worker: batches_service_error_rate
+### Redis: Provisioning indicators (not available on server)
-

Service operation error rate over 5m

+#### redis: provisioning_container_cpu_usage_long_term
-This panel has no related alerts.
+

Container cpu usage total (90th percentile over 1d) across all cores by instance

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101513` on your Sourcegraph instance.
+Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100300` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13846,23 +13928,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101513` on
Query:
```
-sum by (op)(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_batches_service_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m]))) * 100
+quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[1d])
```

-### Worker: Batches: Workspace resolver dbstore
-
-#### worker: workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_total
+#### redis: provisioning_container_memory_usage_long_term
-

Store operations every 5m

+

Container memory usage (1d maximum) by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-long-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101600` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100301` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13870,21 +13950,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101600` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_total{job=~"^worker.*"}[5m]))
+max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[1d])
```

-#### worker: workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_99th_percentile_duration
+#### redis: provisioning_container_cpu_usage_short_term
-

99th percentile successful store operation duration over 5m

+

Container cpu usage total (5m maximum) across all cores by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101601` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100310` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13892,21 +13972,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101601` on
Query:
```
-histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^redis-store.*"}[5m])
```

-#### worker: workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_errors_total
+#### redis: provisioning_container_memory_usage_short_term
-

Store operation errors every 5m

+

Container memory usage (5m maximum) by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-provisioning-container-memory-usage-short-term) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101602` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100311` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13914,21 +13994,24 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101602` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_errors_total{job=~"^worker.*"}[5m]))
+max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^redis-store.*"}[5m])
```

-#### worker: workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_error_rate
+#### redis: container_oomkill_events_total
-

Store operation error rate over 5m

+

Container OOMKILL events total by instance

+
+This value indicates the total number of times the container main process or child processes were terminated by OOM killer.
+When it occurs frequently, it is an indicator of underprovisioning.
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-container-oomkill-events-total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101603` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100312` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13936,23 +14019,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101603` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_batch_changes_batch_spec_resolution_worker_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+max by (name) (container_oom_events_total{name=~"^redis-store.*"})
```

-### Worker: Batches: Bulk operation processor dbstore
+### Redis: Kubernetes monitoring (only available on Kubernetes)
-#### worker: workerutil_dbworker_store_batches_bulk_worker_store_total
+#### redis: pods_available_percentage
-

Store operations every 5m

+

Percentage pods available

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#redis-pods-available-percentage) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101700` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100400` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13960,21 +14043,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101700` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_batches_bulk_worker_store_total{job=~"^worker.*"}[5m]))
+sum by(app) (up{app=~".*redis-cache"}) / count by (app) (up{app=~".*redis-cache"}) * 100
```

-#### worker: workerutil_dbworker_store_batches_bulk_worker_store_99th_percentile_duration
+### Redis: Kubernetes monitoring (only available on Kubernetes)
-

99th percentile successful store operation duration over 5m

+#### redis: pods_available_percentage
-This panel has no related alerts.
+

Percentage pods available

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101701` on your Sourcegraph instance.
+Refer to the [alerts reference](alerts#redis-pods-available-percentage) for 1 alert related to this panel.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+To see this panel, visit `/-/debug/grafana/d/redis/redis?viewPanel=100500` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details
@@ -13982,21 +14067,31 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101701` on
Query:
```
-histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_batches_bulk_worker_store_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+sum by(app) (up{app=~".*redis-store"}) / count by (app) (up{app=~".*redis-store"}) * 100
```

-#### worker: workerutil_dbworker_store_batches_bulk_worker_store_errors_total
+## Worker
-

Store operation errors every 5m

+

Manages background processes.

+To see this dashboard, visit `/-/debug/grafana/d/worker/worker` on your Sourcegraph instance.
+
+### Worker: Active jobs
+
+#### worker: worker_job_count
+

Number of worker instances running each job

+
+The number of worker instances running each job type.
+It is necessary for each job type to be managed by at least one worker instance.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101702` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100000` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details
@@ -14004,21 +14099,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101702` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_batches_bulk_worker_store_errors_total{job=~"^worker.*"}[5m]))
+sum by (job_name) (src_worker_jobs{job=~"^worker.*"})
```
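Since every job type should be managed by at least one worker instance, a quick sketch for spotting orphaned job types (series whose instance count is currently zero); this is an ad-hoc check, not a generated panel:

```
# Job types reporting zero running worker instances; each job type
# should be managed by at least one instance.
sum by (job_name) (src_worker_jobs{job=~"^worker.*"}) == 0
```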

-#### worker: workerutil_dbworker_store_batches_bulk_worker_store_error_rate +#### worker: worker_job_codeintel-upload-janitor_count -

Store operation error rate over 5m

+

Number of worker instances running the codeintel-upload-janitor job

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-worker-job-codeintel-upload-janitor-count) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101703` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100010` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14026,23 +14121,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101703` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_batches_bulk_worker_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_batches_bulk_worker_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_batches_bulk_worker_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-upload-janitor"})
```

-### Worker: Batches: Changeset reconciler dbstore
+#### worker: worker_job_codeintel-commitgraph-updater_count

-#### worker: workerutil_dbworker_store_batches_reconciler_worker_store_total

Store operations every 5m

Number of worker instances running the codeintel-commitgraph-updater job

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-worker-job-codeintel-commitgraph-updater-count) for 2 alerts related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101800` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100011` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14050,21 +14143,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101800` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_batches_reconciler_worker_store_total{job=~"^worker.*"}[5m]))
+sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-commitgraph-updater"})
```

-#### worker: workerutil_dbworker_store_batches_reconciler_worker_store_99th_percentile_duration
+#### worker: worker_job_codeintel-autoindexing-scheduler_count

99th percentile successful store operation duration over 5m

Number of worker instances running the codeintel-autoindexing-scheduler job

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-worker-job-codeintel-autoindexing-scheduler-count) for 2 alerts related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101801` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100012` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14072,21 +14165,25 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101801` on

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_batches_reconciler_worker_store_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-autoindexing-scheduler"})
```

-#### worker: workerutil_dbworker_store_batches_reconciler_worker_store_errors_total
+### Worker: Database record encrypter

Store operation errors every 5m

+#### worker: records_encrypted_at_rest_percentage

Percentage of database records encrypted at rest

+Percentage of encrypted database records

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101802` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100100` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14094,21 +14191,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101802` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_batches_reconciler_worker_store_errors_total{job=~"^worker.*"}[5m]))
+(max(src_records_encrypted_at_rest_total) by (tableName)) / ((max(src_records_encrypted_at_rest_total) by (tableName)) + (max(src_records_unencrypted_at_rest_total) by (tableName))) * 100
```

-#### worker: workerutil_dbworker_store_batches_reconciler_worker_store_error_rate
+#### worker: records_encrypted_total

Store operation error rate over 5m

Database records encrypted every 5m

+Number of encrypted database records every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101803` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100101` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14116,23 +14215,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101803` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_batches_reconciler_worker_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_batches_reconciler_worker_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_batches_reconciler_worker_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum by (tableName)(increase(src_records_encrypted_total{job=~"^worker.*"}[5m]))
```

-### Worker: Batches: Workspace execution dbstore
+#### worker: records_decrypted_total

Database records decrypted every 5m

-#### worker: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_total

Store operations every 5m

+Number of decrypted database records every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101900` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100102` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14140,21 +14239,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101900` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_total{job=~"^worker.*"}[5m]))
+sum by (tableName)(increase(src_records_decrypted_total{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_99th_percentile_duration
+#### worker: record_encryption_errors_total

Encryption operation errors every 5m

99th percentile successful store operation duration over 5m

+Number of database record encryption/decryption errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101901` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100103` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14162,21 +14263,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101901` on

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+sum(increase(src_record_encryption_errors_total{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total
+### Worker: Codeintel: Repository commit graph updates

Store operation errors every 5m

+#### worker: codeintel_commit_graph_processor_total

Update operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101902` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100200` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14184,21 +14287,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101902` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total{job=~"^worker.*"}[5m]))
+sum(increase(src_codeintel_commit_graph_processor_total{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_error_rate
+#### worker: codeintel_commit_graph_processor_99th_percentile_duration

Store operation error rate over 5m

Aggregate successful update operation duration distribution over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101903` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100201` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14206,21 +14309,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101903` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_batch_spec_workspace_execution_worker_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum by (le)(rate(src_codeintel_commit_graph_processor_duration_seconds_bucket{job=~"^worker.*"}[5m]))
```

-### Worker: Batches: Executor jobs
+#### worker: codeintel_commit_graph_processor_errors_total

-#### worker: executor_queue_size

Unprocessed executor job queue size

Update operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102000` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100202` on your Sourcegraph instance.

*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
-#### worker: executor_queue_growth_rate - -

Unprocessed executor job queue growth rate over 30m

- -This value compares the rate of enqueues against the rate of finished jobs for the selected queue. +#### worker: codeintel_commit_graph_processor_error_rate - - A value < than 1 indicates that process rate > enqueue rate - - A value = than 1 indicates that process rate = enqueue rate - - A value > than 1 indicates that process rate < enqueue rate +

Update operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100203` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14258,19 +14353,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102001` on Query: ``` -sum by (queue)(increase(src_executor_total{queue=~"batches",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}[30m])) / sum by (queue)(increase(src_executor_processor_total{queue=~"batches",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}[30m])) +sum(increase(src_codeintel_commit_graph_processor_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_commit_graph_processor_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_commit_graph_processor_errors_total{job=~"^worker.*"}[5m]))) * 100 ```
-#### worker: executor_queued_max_age +### Worker: Codeintel: Auto-index scheduler + +#### worker: codeintel_autoindexing_total -

Unprocessed executor job queue longest time in queue

+

Auto-indexing job scheduler operations every 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102002` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100300` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14280,21 +14377,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102002` on Query: ``` -max by (queue)(src_executor_queued_duration_seconds_total{queue=~"batches",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}) +sum(increase(src_codeintel_autoindexing_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) ```
-### Worker: Codeintel: lsif_upload record resetter - -#### worker: codeintel_background_upload_record_resets_total +#### worker: codeintel_autoindexing_99th_percentile_duration -

Lsif upload records reset to queued state every 5m

+

Aggregate successful auto-indexing job scheduler operation duration distribution over 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100301` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14304,19 +14399,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102100` on Query: ``` -sum(increase(src_codeintel_background_upload_record_resets_total{job=~"^worker.*"}[5m])) +sum by (le)(rate(src_codeintel_autoindexing_duration_seconds_bucket{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) ```
-#### worker: codeintel_background_upload_record_reset_failures_total +#### worker: codeintel_autoindexing_errors_total -

Lsif upload records reset to errored state every 5m

+

Auto-indexing job scheduler operation errors every 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100302` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14326,19 +14421,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102101` on Query: ``` -sum(increase(src_codeintel_background_upload_record_reset_failures_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_autoindexing_errors_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) ```
-#### worker: codeintel_background_upload_record_reset_errors_total +#### worker: codeintel_autoindexing_error_rate -

Lsif upload operation errors every 5m

+

Auto-indexing job scheduler operation error rate over 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100303` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14348,21 +14443,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102102` on Query: ``` -sum(increase(src_codeintel_background_upload_record_reset_errors_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_autoindexing_errors_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) / (sum(increase(src_codeintel_autoindexing_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m])) + sum(increase(src_codeintel_autoindexing_errors_total{op='HandleIndexSchedule',job=~"^worker.*"}[10m]))) * 100 ```
-### Worker: Codeintel: lsif_index record resetter +### Worker: Codeintel: dbstore stats -#### worker: codeintel_background_index_record_resets_total +#### worker: codeintel_uploads_store_total -

Lsif index records reset to queued state every 5m

+

Aggregate store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100400` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14372,19 +14467,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102200` on Query: ``` -sum(increase(src_codeintel_background_index_record_resets_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) ```
-#### worker: codeintel_background_index_record_reset_failures_total +#### worker: codeintel_uploads_store_99th_percentile_duration -

Lsif index records reset to errored state every 5m

+

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100401` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14394,19 +14489,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102201` on Query: ``` -sum(increase(src_codeintel_background_index_record_reset_failures_total{job=~"^worker.*"}[5m])) +sum by (le)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^worker.*"}[5m])) ```
-#### worker: codeintel_background_index_record_reset_errors_total +#### worker: codeintel_uploads_store_errors_total -

Lsif index operation errors every 5m

+

Aggregate store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100402` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14416,21 +14511,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102202` on Query: ``` -sum(increase(src_codeintel_background_index_record_reset_errors_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m])) ```
-### Worker: Codeintel: lsif_dependency_index record resetter - -#### worker: codeintel_background_dependency_index_record_resets_total +#### worker: codeintel_uploads_store_error_rate -

Lsif dependency index records reset to queued state every 5m

+

Aggregate store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100403` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14440,19 +14533,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102300` on Query: ``` -sum(increase(src_codeintel_background_dependency_index_record_resets_total{job=~"^worker.*"}[5m])) +sum(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m]))) * 100 ```
-#### worker: codeintel_background_dependency_index_record_reset_failures_total +#### worker: codeintel_uploads_store_total -

Lsif dependency index records reset to errored state every 5m

+

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100410` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14462,19 +14555,19 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102301` on Query: ``` -sum(increase(src_codeintel_background_dependency_index_record_reset_failures_total{job=~"^worker.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) ```
-#### worker: codeintel_background_dependency_index_record_reset_errors_total +#### worker: codeintel_uploads_store_99th_percentile_duration -

Lsif dependency index operation errors every 5m

+

99th percentile successful store operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100411` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -14484,23 +14577,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102302` on Query: ``` -sum(increase(src_codeintel_background_dependency_index_record_reset_errors_total{job=~"^worker.*"}[5m])) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^worker.*"}[5m]))) ```
-### Worker: Codeinsights: Query Runner Queue - -#### worker: query_runner_worker_queue_size +#### worker: codeintel_uploads_store_errors_total -

Code insights query runner queue queue size

+

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100412` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14508,27 +14599,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102400` on

Query:

```
-max(src_query_runner_worker_total{job=~"^worker.*"})
+sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m]))
```

-#### worker: query_runner_worker_queue_growth_rate
+#### worker: codeintel_uploads_store_error_rate

Code insights query runner queue queue growth rate over 30m

-This value compares the rate of enqueues against the rate of finished jobs.
-
- - A value < 1 indicates that process rate > enqueue rate
- - A value = 1 indicates that process rate = enqueue rate
- - A value > 1 indicates that process rate < enqueue rate

Store operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102401` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100413` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14536,23 +14621,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102401` on

Query:

```
-sum(increase(src_query_runner_worker_total{job=~"^worker.*"}[30m])) / sum(increase(src_query_runner_worker_processor_total{job=~"^worker.*"}[30m]))
+sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^worker.*"}[5m]))) * 100
```

-### Worker: Codeinsights: insights queue processor
+### Worker: Codeintel: lsifstore stats

-#### worker: query_runner_worker_handlers
+#### worker: codeintel_uploads_lsifstore_total

Handler active handlers

Aggregate store operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102500` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100500` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14560,21 +14645,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102500` on

Query:

```
-sum(src_query_runner_worker_processor_handlers{job=~"^worker.*"})
+sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m]))
```

-#### worker: query_runner_worker_processor_total
+#### worker: codeintel_uploads_lsifstore_99th_percentile_duration

Handler operations every 5m

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102510` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100501` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14582,21 +14667,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102510` on

Query:

```
-sum(increase(src_query_runner_worker_processor_total{job=~"^worker.*"}[5m]))
+sum by (le)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^worker.*"}[5m]))
```

-#### worker: query_runner_worker_processor_99th_percentile_duration
+#### worker: codeintel_uploads_lsifstore_errors_total

Aggregate successful handler operation duration distribution over 5m

Aggregate store operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102511` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100502` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14604,21 +14689,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102511` on

Query:

```
-sum by (le)(rate(src_query_runner_worker_processor_duration_seconds_bucket{job=~"^worker.*"}[5m]))
+sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m]))
```

-#### worker: query_runner_worker_processor_errors_total
+#### worker: codeintel_uploads_lsifstore_error_rate

Handler operation errors every 5m

Aggregate store operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102512` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100503` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14626,21 +14711,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102512` on

Query:

```
-sum(increase(src_query_runner_worker_processor_errors_total{job=~"^worker.*"}[5m]))
+sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m])) + sum(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m]))) * 100
```

-#### worker: query_runner_worker_processor_error_rate
+#### worker: codeintel_uploads_lsifstore_total

Handler operation error rate over 5m

Store operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102513` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100510` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14648,23 +14733,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102513` on

Query:

```
-sum(increase(src_query_runner_worker_processor_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_query_runner_worker_processor_total{job=~"^worker.*"}[5m])) + sum(increase(src_query_runner_worker_processor_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m]))
```

-### Worker: Codeinsights: code insights query runner queue record resetter
+#### worker: codeintel_uploads_lsifstore_99th_percentile_duration

-#### worker: query_runner_worker_record_resets_total

Insights query runner queue records reset to queued state every 5m

99th percentile successful store operation duration over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102600` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100511` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14672,21 +14755,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102600` on

Query:

```
-sum(increase(src_query_runner_worker_record_resets_total{job=~"^worker.*"}[5m]))
+histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_lsifstore_duration_seconds_bucket{job=~"^worker.*"}[5m])))
```

-#### worker: query_runner_worker_record_reset_failures_total
+#### worker: codeintel_uploads_lsifstore_errors_total

Insights query runner queue records reset to errored state every 5m

Store operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102601` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100512` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14694,21 +14777,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102601` on

Query:

```
-sum(increase(src_query_runner_worker_record_reset_failures_total{job=~"^worker.*"}[5m]))
+sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m]))
```

-#### worker: query_runner_worker_record_reset_errors_total
+#### worker: codeintel_uploads_lsifstore_error_rate

Insights query runner queue operation errors every 5m

Store operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102602` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100513` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14716,23 +14799,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102602` on

Query:

```
-sum(increase(src_query_runner_worker_record_reset_errors_total{job=~"^worker.*"}[5m]))
+sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_lsifstore_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_lsifstore_errors_total{job=~"^worker.*"}[5m]))) * 100
```

-### Worker: Codeinsights: dbstore stats
+### Worker: Codeintel: gitserver client

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_total
+#### worker: gitserver_client_total

Aggregate store operations every 5m

Aggregate client operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102700` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100600` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14740,21 +14823,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102700` on

Query:

```
-sum(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total{job=~"^worker.*"}[5m]))
+sum(increase(src_gitserver_client_total{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_99th_percentile_duration
+#### worker: gitserver_client_99th_percentile_duration

Aggregate successful store operation duration distribution over 5m

Aggregate successful client operation duration distribution over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102701` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100601` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14762,21 +14845,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102701` on

Query:

```
-sum by (le)(rate(src_workerutil_dbworker_store_insights_query_runner_jobs_store_duration_seconds_bucket{job=~"^worker.*"}[5m]))
+sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total
+#### worker: gitserver_client_errors_total

Aggregate store operation errors every 5m

Aggregate client operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102702` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100602` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14784,21 +14867,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102702` on

Query:

```
-sum(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total{job=~"^worker.*"}[5m]))
+sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_error_rate
+#### worker: gitserver_client_error_rate

Aggregate store operation error rate over 5m

Aggregate client operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102703` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100603` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14806,21 +14889,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102703` on

Query:

```
-sum(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total{job=~"^worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^worker.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))) * 100
```

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_total
+#### worker: gitserver_client_total

Store operations every 5m

Client operations every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102710` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100610` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14828,21 +14911,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102710` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total{job=~"^worker.*"}[5m]))
+sum by (op)(increase(src_gitserver_client_total{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_99th_percentile_duration
+#### worker: gitserver_client_99th_percentile_duration

99th percentile successful store operation duration over 5m

99th percentile successful client operation duration over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102711` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100611` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14850,21 +14933,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102711` on

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_insights_query_runner_jobs_store_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+histogram_quantile(0.99, sum by (le,op)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^worker.*"}[5m])))
```

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total
+#### worker: gitserver_client_errors_total

Store operation errors every 5m

Client operation errors every 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102712` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100612` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14872,21 +14955,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102712` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total{job=~"^worker.*"}[5m]))
+sum by (op)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
```

-#### worker: workerutil_dbworker_store_insights_query_runner_jobs_store_error_rate
+#### worker: gitserver_client_error_rate

Store operation error rate over 5m

Client operation error rate over 5m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102713` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100613` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -14894,25 +14977,26 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102713` on

Query:

```
-sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum by (op)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_gitserver_client_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))) * 100
```

-### Worker: Code Insights queue utilization
+### Worker: Repositories

-#### worker: insights_queue_unutilized_size
+#### worker: syncer_sync_last_time

Insights queue size that is not utilized (not processing)

Time since last sync

-Any value on this panel indicates code insights is not processing queries from its queue. This observable and alert only fire if there are records in the queue and there have been no dequeue attempts for 30 minutes.
+A high value here indicates issues synchronizing repo metadata.
+If the value is persistently high, make sure all external services have valid tokens.

-Refer to the [alerts reference](alerts#worker-insights-queue-unutilized-size) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102800` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100700` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14920,23 +15004,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102800` on

Query:

```
-max(src_query_runner_worker_total{job=~"^worker.*"}) > 0 and on(job) sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total{job=~"^worker.*",op="Dequeue"}[5m])) < 1
+max(timestamp(vector(time()))) - max(src_repoupdater_syncer_sync_last_time)
```

-### Worker: Database connections
+#### worker: src_repoupdater_max_sync_backoff

-#### worker: max_open_conns

Maximum open

Time since oldest sync

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-src-repoupdater-max-sync-backoff) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102900` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100701` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14944,21 +15026,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102900` on

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="worker"})
+max(src_repoupdater_max_sync_backoff)
```

-#### worker: open_conns
+#### worker: src_repoupdater_syncer_sync_errors_total

Established

Site level external service sync error rate

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-src-repoupdater-syncer-sync-errors-total) for 2 alerts related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102901` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100702` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14966,21 +15048,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102901` on

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_open{app_name="worker"})
+max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="invalid_npm_path",reason!="internal_rate_limit"}[5m]))
```

-#### worker: in_use
+#### worker: syncer_sync_start

Used

Repo metadata sync was started

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-syncer-sync-start) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102910` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100710` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -14988,21 +15070,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102910` on

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="worker"})
+max by (family) (rate(src_repoupdater_syncer_start_sync{family="Syncer.SyncExternalService"}[9h0m0s]))
```

-#### worker: idle
+#### worker: syncer_sync_duration

Idle

95th repositories sync duration

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-syncer-sync-duration) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102911` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100711` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -15010,21 +15092,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102911` on

Query:

```
-sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="worker"})
+histogram_quantile(0.95, max by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))
```

-#### worker: mean_blocked_seconds_per_conn_request
+#### worker: source_duration

Mean blocked seconds per conn request

95th repositories source duration

-Refer to the [alerts reference](alerts#worker-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel.
+Refer to the [alerts reference](alerts#worker-source-duration) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102920` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100712` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -15032,21 +15114,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102920` on

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="worker"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="worker"}[5m]))
+histogram_quantile(0.95, max by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))
```

-#### worker: closed_max_idle
+#### worker: syncer_synced_repos

Closed by SetMaxIdleConns

Repositories synced

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-syncer-synced-repos) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102930` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100720` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -15054,21 +15136,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102930` on

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="worker"}[5m]))
+max(rate(src_repoupdater_syncer_synced_repos_total[1m]))
```

-#### worker: closed_max_lifetime
+#### worker: sourced_repos

Closed by SetConnMaxLifetime

Repositories sourced

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-sourced-repos) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102931` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100721` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -15076,21 +15158,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102931` on

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="worker"}[5m]))
+max(rate(src_repoupdater_source_repos_total[1m]))
```

-#### worker: closed_max_idle_time
+#### worker: sched_auto_fetch

Closed by SetConnMaxIdleTime

Repositories scheduled due to hitting a deadline

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-sched-auto-fetch) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102932` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100730` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details

@@ -15098,33 +15180,24 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102932` on

Query:

```
-sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="worker"}[5m]))
+max(rate(src_repoupdater_sched_auto_fetch[1m]))
```

-### Worker: Container monitoring (not available on server)
+#### worker: sched_manual_fetch

-#### worker: container_missing

Container missing

Repositories scheduled due to user traffic

-This value is the number of times a container has not been seen for more than one minute. If you observe this
-value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons.
-
-- **Kubernetes:**
-  - Determine if the pod was OOM killed using `kubectl describe pod worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
-  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p worker`.
-- **Docker Compose:**
-  - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' worker` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the worker container in `docker-compose.yml`.
-  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs worker` (note this will include logs from the previous and currently running container).

+Check worker logs if this value is persistently high.
+This does not indicate anything if there are no user added code hosts.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103000` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100731` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -15132,21 +15205,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103000` on Query: ``` -count by(name) ((time() - container_last_seen{name=~"^worker.*"}) > 60) +max(rate(src_repoupdater_sched_manual_fetch[1m])) ```

-#### worker: container_cpu_usage +#### worker: sched_loops -

Container cpu usage total (1m average) across all cores by instance

+

Scheduler loops

-Refer to the [alerts reference](alerts#worker-container-cpu-usage) for 1 alert related to this panel.
+Refer to the [alerts reference](alerts#worker-sched-loops) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103001` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100740` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15154,21 +15227,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103001` on
Query:
```
-cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}
+max(rate(src_repoupdater_sched_loops[1m]))
```

-#### worker: container_memory_usage
+#### worker: src_repoupdater_stale_repos
-

Container memory usage by instance

+

Repos that haven't been fetched in more than 8 hours

-Refer to the [alerts reference](alerts#worker-container-memory-usage) for 1 alert related to this panel.
+Refer to the [alerts reference](alerts#worker-src-repoupdater-stale-repos) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103002` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100741` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15176,24 +15249,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103002` on
Query:
```
-cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}
+max(src_repoupdater_stale_repos)
```
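
A gauge like `src_repoupdater_stale_repos` is usually watched via a threshold alert once it stays above a tolerable level. A hedged sketch of such an expression (the `100` floor is an illustrative assumption, not the product's configured threshold):

```
# fire when many repos have gone more than 8 hours without a fetch
max(src_repoupdater_stale_repos) > 100
```
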

-#### worker: fs_io_operations
-
-

Filesystem reads and writes rate by instance over 1h

+#### worker: sched_error -This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +

Repositories schedule error rate

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-sched-error) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103003` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100742` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15201,23 +15271,25 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103003` on
Query:
```
-sum by(name) (rate(container_fs_reads_total{name=~"^worker.*"}[1h]) + rate(container_fs_writes_total{name=~"^worker.*"}[1h]))
+max(rate(src_repoupdater_sched_error[1m]))
```

-### Worker: Provisioning indicators (not available on server)
+### Worker: Repo state syncer
-#### worker: provisioning_container_cpu_usage_long_term
+#### worker: state_syncer_running

Container cpu usage total (90th percentile over 1d) across all cores by instance

+

State syncer is running

-Refer to the [alerts reference](alerts#worker-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel.
+1 if the state syncer is currently running
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103100` on your Sourcegraph instance.
+This panel has no related alerts.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100800` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15225,21 +15297,27 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103100` on
Query:
```
-quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[1d])
+max (src_repo_statesyncer_running)
```

-#### worker: provisioning_container_memory_usage_long_term
+#### worker: repos_deleted_total
-

Container memory usage (1d maximum) by instance

+

Total number of repos deleted

-Refer to the [alerts reference](alerts#worker-provisioning-container-memory-usage-long-term) for 1 alert related to this panel.
+The total number of repos deleted across all gitservers by
+the state syncer.
+A high number here is not necessarily an issue; dig deeper into
+the other charts in this section to determine whether those
+deletions were correct.
+
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103101` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100801` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15247,21 +15325,24 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103101` on
Query:
```
-max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[1d])
+sum(src_repo_statesyncer_repos_deleted)
```

-#### worker: provisioning_container_cpu_usage_short_term
+#### worker: repos_deleted_from_primary_total
-

Container cpu usage total (5m maximum) across all cores by instance

+

Total number of repos deleted from primary

-Refer to the [alerts reference](alerts#worker-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel.
+The total number of repos deleted from the primary shard.
+Check the reasons why they were deleted.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103110` on your Sourcegraph instance.
+This panel has no related alerts.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100802` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15269,21 +15350,24 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103110` on
Query:
```
-max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[5m])
+sum by (reason) (src_repo_statesyncer_repos_deleted{is_primary="true"})
```

-#### worker: provisioning_container_memory_usage_short_term
+#### worker: repos_deleted_from_secondary_total
-

Container memory usage (5m maximum) by instance

+

Total number of repos deleted from secondary

-Refer to the [alerts reference](alerts#worker-provisioning-container-memory-usage-short-term) for 1 alert related to this panel.
+The total number of repos deleted from secondary shards.
+Check the reasons why they were deleted.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103111` on your Sourcegraph instance.
+This panel has no related alerts.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100803` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15291,24 +15375,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103111` on
Query:
```
-max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[5m])
+sum by (reason) (src_repo_statesyncer_repos_deleted{is_primary="false"})
```
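
Both deletion panels break the counter down by `reason`. When triaging a spike it can help to rank reasons across primary and secondary shards together; a minimal sketch using only the metric shown above:

```
# top deletion reasons across all shards
topk(5, sum by (reason) (src_repo_statesyncer_repos_deleted))
```
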

-#### worker: container_oomkill_events_total
+### Worker: External services
-

Container OOMKILL events total by instance

+#### worker: src_repoupdater_external_services_total -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +

The total number of external services

-Refer to the [alerts reference](alerts#worker-container-oomkill-events-total) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#worker-src-repoupdater-external-services-total) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100900` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15316,25 +15399,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103112` on
Query:
```
-max by (name) (container_oom_events_total{name=~"^worker.*"})
+max(src_repoupdater_external_services_total)
```
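
The retired OOM-kill panel read `container_oom_events_total`, a monotonically increasing counter, so recent kills stand out better as a windowed increase than as a raw value. A sketch under the assumption that this cadvisor-derived counter is still scraped:

```
# any OOM kills of worker containers in the last hour
increase(container_oom_events_total{name=~"^worker.*"}[1h]) > 0
```
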

-### Worker: Golang runtime monitoring
-
-#### worker: go_goroutines
-
-

Maximum active goroutines

+#### worker: repoupdater_queued_sync_jobs_total -A high value here indicates a possible goroutine leak. +

The total number of queued sync jobs

-Refer to the [alerts reference](alerts#worker-go-goroutines) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#worker-repoupdater-queued-sync-jobs-total) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100910` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15342,21 +15421,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103200` on
Query:
```
-max by(instance) (go_goroutines{job=~".*worker"})
+max(src_repoupdater_queued_sync_jobs_total)
```

-#### worker: go_gc_duration_seconds
+#### worker: repoupdater_completed_sync_jobs_total
-

Maximum go garbage collection duration

+

The total number of completed sync jobs

-Refer to the [alerts reference](alerts#worker-go-gc-duration-seconds) for 1 alert related to this panel.
+Refer to the [alerts reference](alerts#worker-repoupdater-completed-sync-jobs-total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103201` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100911` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15364,23 +15443,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103201` on
Query:
```
-max by(instance) (go_gc_duration_seconds{job=~".*worker"})
+max(src_repoupdater_completed_sync_jobs_total)
```

-### Worker: Kubernetes monitoring (only available on Kubernetes)
-
-#### worker: pods_available_percentage
+#### worker: repoupdater_errored_sync_jobs_percentage
-

Percentage pods available

+

The percentage of external services that have failed their most recent sync

-Refer to the [alerts reference](alerts#worker-pods-available-percentage) for 1 alert related to this panel.
+Refer to the [alerts reference](alerts#worker-repoupdater-errored-sync-jobs-percentage) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103300` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100912` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15388,23 +15465,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103300` on
Query:
```
-sum by(app) (up{app=~".*worker"}) / count by (app) (up{app=~".*worker"}) * 100
+max(src_repoupdater_errored_sync_jobs_percentage)
```

-### Worker: Own: repo indexer dbstore
-
-#### worker: workerutil_dbworker_store_own_background_worker_store_total
+#### worker: github_graphql_rate_limit_remaining
-

Aggregate store operations every 5m

+

Remaining calls to GitHub graphql API before hitting the rate limit

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-github-graphql-rate-limit-remaining) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103400` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100920` on your Sourcegraph instance.
-*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15412,21 +15487,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103400` on
Query:
```
-sum(increase(src_workerutil_dbworker_store_own_background_worker_store_total{job=~"^worker.*"}[5m]))
+max by (name) (src_github_rate_limit_remaining_v2{resource="graphql"})
```

-#### worker: workerutil_dbworker_store_own_background_worker_store_99th_percentile_duration
+#### worker: github_rest_rate_limit_remaining
-

Aggregate successful store operation duration distribution over 5m

+

Remaining calls to GitHub rest API before hitting the rate limit

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-github-rest-rate-limit-remaining) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103401` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100921` on your Sourcegraph instance.
-*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15434,21 +15509,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103401` on
Query:
```
-sum by (le)(rate(src_workerutil_dbworker_store_own_background_worker_store_duration_seconds_bucket{job=~"^worker.*"}[5m]))
+max by (name) (src_github_rate_limit_remaining_v2{resource="rest"})
```

-#### worker: workerutil_dbworker_store_own_background_worker_store_errors_total
+#### worker: github_search_rate_limit_remaining
-

Aggregate store operation errors every 5m

+

Remaining calls to GitHub search API before hitting the rate limit

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-github-search-rate-limit-remaining) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103402` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100922` on your Sourcegraph instance.
-*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15456,21 +15531,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103402` on
Query:
```
-sum(increase(src_workerutil_dbworker_store_own_background_worker_store_errors_total{job=~"^worker.*"}[5m]))
+max by (name) (src_github_rate_limit_remaining_v2{resource="search"})
```
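
The three `*_rate_limit_remaining` panels read the same gauge with different `resource` labels, so one expression can watch all of them at once. A hedged alerting sketch (the 500-call floor is an illustrative assumption):

```
# lowest remaining GitHub quota per code host connection and resource
min by (name, resource) (src_github_rate_limit_remaining_v2) < 500
```
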

-#### worker: workerutil_dbworker_store_own_background_worker_store_error_rate
+#### worker: github_graphql_rate_limit_wait_duration
-

Aggregate store operation error rate over 5m

+

Time spent waiting for the GitHub graphql API rate limiter

+
+Indicates how long we're waiting on the rate limit once it has been exceeded.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103403` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100930` on your Sourcegraph instance.
-*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15478,21 +15555,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103403` on
Query:
```
-sum(increase(src_workerutil_dbworker_store_own_background_worker_store_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_own_background_worker_store_total{job=~"^worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_own_background_worker_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="graphql"}[5m]))
```
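
`src_github_rate_limit_wait_duration_seconds` is a counter of seconds spent blocked, so `rate()` over it gives the fraction of each second spent waiting. To compare all three GitHub resources side by side, one might aggregate over the `resource` label instead; a sketch:

```
# seconds-per-second spent waiting on GitHub rate limits, by resource
sum by (resource) (rate(src_github_rate_limit_wait_duration_seconds[5m]))
```
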

-#### worker: workerutil_dbworker_store_own_background_worker_store_total
+#### worker: github_rest_rate_limit_wait_duration
-

Store operations every 5m

+

Time spent waiting for the GitHub rest API rate limiter

+
+Indicates how long we're waiting on the rate limit once it has been exceeded.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103410` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100931` on your Sourcegraph instance.
-*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15500,21 +15579,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103410` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_own_background_worker_store_total{job=~"^worker.*"}[5m]))
+max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="rest"}[5m]))
```

-#### worker: workerutil_dbworker_store_own_background_worker_store_99th_percentile_duration
+#### worker: github_search_rate_limit_wait_duration
-

99th percentile successful store operation duration over 5m

+

Time spent waiting for the GitHub search API rate limiter

+
+Indicates how long we're waiting on the rate limit once it has been exceeded.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103411` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100932` on your Sourcegraph instance.
-*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15522,21 +15603,21 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103411` on
Query:
```
-histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_own_background_worker_store_duration_seconds_bucket{job=~"^worker.*"}[5m])))
+max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="search"}[5m]))
```

-#### worker: workerutil_dbworker_store_own_background_worker_store_errors_total
+#### worker: gitlab_rest_rate_limit_remaining
-

Store operation errors every 5m

+

Remaining calls to GitLab rest API before hitting the rate limit

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-gitlab-rest-rate-limit-remaining) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103412` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100940` on your Sourcegraph instance.
-*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15544,21 +15625,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103412` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_own_background_worker_store_errors_total{job=~"^worker.*"}[5m]))
+max by (name) (src_gitlab_rate_limit_remaining{resource="rest"})
```

-#### worker: workerutil_dbworker_store_own_background_worker_store_error_rate
+#### worker: gitlab_rest_rate_limit_wait_duration
-

Store operation error rate over 5m

+

Time spent waiting for the GitLab rest API rate limiter

+
+Indicates how long we're waiting on the rate limit once it has been exceeded.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103413` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100941` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15566,23 +15649,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103413` on
Query:
```
-sum by (op)(increase(src_workerutil_dbworker_store_own_background_worker_store_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_own_background_worker_store_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_own_background_worker_store_errors_total{job=~"^worker.*"}[5m]))) * 100
+max by (name) (rate(src_gitlab_rate_limit_wait_duration_seconds{resource="rest"}[5m]))
```

-### Worker: Own: repo indexer worker queue
+#### worker: src_internal_rate_limit_wait_duration_bucket
-#### worker: own_background_worker_handlers
+

95th percentile time spent successfully waiting on our internal rate limiter

-

Handler active handlers

+Indicates how long we're waiting on our internal rate limiter when communicating with a code host.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103500` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100950` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15590,21 +15673,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103500` on
Query:
```
-sum(src_own_background_worker_processor_handlers{job=~"^worker.*"})
+histogram_quantile(0.95, sum(rate(src_internal_rate_limit_wait_duration_bucket{failed="false"}[5m])) by (le, urn))
```
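
The query above is the standard Prometheus histogram-quantile pattern: per-bucket rates aggregated by `le` (and here `urn`, one per code host), then interpolated. Swapping the quantile argument reads a different percentile from the same buckets; for example, the tail:

```
# p99 successful wait on the internal rate limiter, per code host
histogram_quantile(0.99, sum by (le, urn) (rate(src_internal_rate_limit_wait_duration_bucket{failed="false"}[5m])))
```
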

-#### worker: own_background_worker_processor_total
+#### worker: src_internal_rate_limit_wait_error_count
-

Handler operations every 5m

+

Rate of failures waiting on our internal rate limiter

+
+The rate at which waits on our internal rate limiter fail.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103510` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100951` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15612,21 +15697,25 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103510` on
Query:
```
-sum(increase(src_own_background_worker_processor_total{job=~"^worker.*"}[5m]))
+sum by (urn) (rate(src_internal_rate_limit_wait_duration_count{failed="true"}[5m]))
```

-#### worker: own_background_worker_processor_99th_percentile_duration
+### Worker: Permissions
-

Aggregate successful handler operation duration distribution over 5m

+#### worker: user_success_syncs_total
+
+

Total number of user permissions syncs

+
+Indicates the total number of user permissions syncs completed.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103511` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101000` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15634,21 +15723,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103511` on
Query:
```
-sum by (le)(rate(src_own_background_worker_processor_duration_seconds_bucket{job=~"^worker.*"}[5m]))
+sum(src_repo_perms_syncer_success_syncs{type="user"})
```

-#### worker: own_background_worker_processor_errors_total
+#### worker: user_success_syncs
-

Handler operation errors every 5m

+

Number of user permissions syncs [5m]

+
+Indicates the number of user permissions syncs completed.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103512` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101001` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15656,21 +15747,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103512` on
Query:
```
-sum(increase(src_own_background_worker_processor_errors_total{job=~"^worker.*"}[5m]))
+sum(increase(src_repo_perms_syncer_success_syncs{type="user"}[5m]))
```

-#### worker: own_background_worker_processor_error_rate
+#### worker: user_initial_syncs
-

Handler operation error rate over 5m

+

Number of first user permissions syncs [5m]

+
+Indicates the number of permissions syncs done for the first time for the user.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103513` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101002` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15678,23 +15771,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103513` on
Query:
```
-sum(increase(src_own_background_worker_processor_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_own_background_worker_processor_total{job=~"^worker.*"}[5m])) + sum(increase(src_own_background_worker_processor_errors_total{job=~"^worker.*"}[5m]))) * 100
+sum(increase(src_repo_perms_syncer_initial_syncs{type="user"}[5m]))
```
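
Taken together, the success and initial-sync counters let you estimate how much permissions-sync traffic comes from brand-new users versus re-syncs. A hedged sketch dividing the two series shown in this section:

```
# fraction of user permissions syncs that were first-time syncs
sum(increase(src_repo_perms_syncer_initial_syncs{type="user"}[5m]))
/
sum(increase(src_repo_perms_syncer_success_syncs{type="user"}[5m]))
```
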

-### Worker: Own: own repo indexer record resetter
+#### worker: repo_success_syncs_total
-#### worker: own_background_worker_record_resets_total
+

Total number of repo permissions syncs

-

Own repo indexer queue records reset to queued state every 5m

+Indicates the total number of repo permissions syncs completed.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103600` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101010` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15702,21 +15795,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103600` on
Query:
```
-sum(increase(src_own_background_worker_record_resets_total{job=~"^worker.*"}[5m]))
+sum(src_repo_perms_syncer_success_syncs{type="repo"})
```

-#### worker: own_background_worker_record_reset_failures_total
+#### worker: repo_success_syncs
+
+

Number of repo permissions syncs over 5m

-

Own repo indexer queue records reset to errored state every 5m

+Indicates the number of repo permissions syncs completed.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103601` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101011` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15724,21 +15819,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103601` on
Query:
```
-sum(increase(src_own_background_worker_record_reset_failures_total{job=~"^worker.*"}[5m]))
+sum(increase(src_repo_perms_syncer_success_syncs{type="repo"}[5m]))
```

-#### worker: own_background_worker_record_reset_errors_total
+#### worker: repo_initial_syncs
+
+

Number of first repo permissions syncs over 5m

-

Own repo indexer queue operation errors every 5m

+Indicates the number of permissions syncs done for the first time for the repo.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103602` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101012` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15746,23 +15843,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103602` on
Query:
```
-sum(increase(src_own_background_worker_record_reset_errors_total{job=~"^worker.*"}[5m]))
+sum(increase(src_repo_perms_syncer_initial_syncs{type="repo"}[5m]))
```

-### Worker: Own: index job scheduler
+#### worker: users_consecutive_sync_delay
-#### worker: own_background_index_scheduler_total
+

Max duration between two consecutive permissions sync for user

-

Own index job scheduler operations every 10m

+Indicates the max delay between two consecutive permissions syncs for a user during the period.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103700` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101020` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15770,21 +15867,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103700` on
Query:
```
-sum by (op)(increase(src_own_background_index_scheduler_total{job=~"^worker.*"}[10m]))
+max(max_over_time (src_repo_perms_syncer_perms_consecutive_sync_delay{type="user"} [1m]))
```

-#### worker: own_background_index_scheduler_99th_percentile_duration
+#### worker: repos_consecutive_sync_delay
-

99th percentile successful own index job scheduler operation duration over 10m

+

Max duration between two consecutive permissions sync for repo

+
+Indicates the max delay between two consecutive permissions syncs for a repo during the period.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103701` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101021` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15792,21 +15891,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103701` on
Query:
```
-histogram_quantile(0.99, sum by (le,op)(rate(src_own_background_index_scheduler_duration_seconds_bucket{job=~"^worker.*"}[10m])))
+max(max_over_time (src_repo_perms_syncer_perms_consecutive_sync_delay{type="repo"} [1m]))
```
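
In the query above, `max_over_time(...[1m])` only smooths over scrape gaps inside the last minute; the outer `max` then takes the worst entity. For a longer retrospective, the same gauge can be re-windowed, assuming your Prometheus retention covers it:

```
# worst gap between consecutive repo permissions syncs over the past day
max(max_over_time(src_repo_perms_syncer_perms_consecutive_sync_delay{type="repo"}[1d]))
```
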

-#### worker: own_background_index_scheduler_errors_total
+#### worker: users_first_sync_delay
-

Own index job scheduler operation errors every 10m

+

Max duration between user creation and first permissions sync

+
+Indicates the max delay between user creation and their first permissions sync.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103702` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101030` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15814,21 +15915,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103702` on
Query:
```
-sum by (op)(increase(src_own_background_index_scheduler_errors_total{job=~"^worker.*"}[10m]))
+max(max_over_time(src_repo_perms_syncer_perms_first_sync_delay{type="user"}[1m]))
```

-#### worker: own_background_index_scheduler_error_rate
+#### worker: repos_first_sync_delay
-

Own index job scheduler operation error rate over 10m

+

Max duration between repo creation and first permissions sync over 1m

+
+Indicates the max delay between repo creation and its first permissions sync.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103703` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101031` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15836,25 +15939,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103703` on
Query:
```
-sum by (op)(increase(src_own_background_index_scheduler_errors_total{job=~"^worker.*"}[10m])) / (sum by (op)(increase(src_own_background_index_scheduler_total{job=~"^worker.*"}[10m])) + sum by (op)(increase(src_own_background_index_scheduler_errors_total{job=~"^worker.*"}[10m]))) * 100
+max(max_over_time(src_repo_perms_syncer_perms_first_sync_delay{type="repo"}[1m]))
```

-### Worker: Site configuration client update latency
-
-#### worker: worker_site_configuration_duration_since_last_successful_update_by_instance
+#### worker: permissions_found_count

Duration since last successful site configuration update (by instance)

+

Number of permissions found during user/repo permissions sync

-The duration since the configuration client used by the "worker" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration.
+Indicates the number of permissions found during user/repo permissions syncs.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103800` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101040` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15862,21 +15963,23 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103800` on
Query:
```
-src_conf_client_time_since_last_successful_update_seconds{job=~`^worker.*`,instance=~`${instance:regex}`}
+sum by (type) (src_repo_perms_syncer_perms_found)
```

-#### worker: worker_site_configuration_duration_since_last_successful_update_by_instance
+#### worker: permissions_found_avg
-

Maximum duration since last successful site configuration update (all "worker" instances)

+

Average number of permissions found during permissions sync per user/repo

-Refer to the [alerts reference](alerts#worker-worker-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel. +Indicates the average number permissions found during permissions sync per user/repo. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103801` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101041` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15884,30 +15987,41 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103801` on
Query:
```
-max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`^worker.*`,instance=~`${instance:regex}`}[1m]))
+avg by (type) (src_repo_perms_syncer_perms_found)
```

-## Repo Updater
+#### worker: perms_syncer_outdated_perms
-

Manages interaction with code hosts, instructs Gitserver to update repositories.

+

Number of entities with outdated permissions

-To see this dashboard, visit `/-/debug/grafana/d/repo-updater/repo-updater` on your Sourcegraph instance.
+Refer to the [alerts reference](alerts#worker-perms-syncer-outdated-perms) for 1 alert related to this panel.
-### Repo Updater: Repositories
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101050` on your Sourcegraph instance.
-#### repo-updater: syncer_sync_last_time
+*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
-

Time since last sync

+
+Technical details
-A high value here indicates issues synchronizing repo metadata.
-If the value is persistently high, make sure all external services have valid tokens.
+Query:
-This panel has no related alerts.
+```
+max by (type) (src_repo_perms_syncer_outdated_perms)
+```
+
+
+
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100000` on your Sourcegraph instance.
+#### worker: perms_syncer_sync_duration
+
+

95th permissions sync duration

+
+Refer to the [alerts reference](alerts#worker-perms-syncer-sync-duration) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101060` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15917,19 +16031,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(timestamp(vector(time()))) - max(src_repoupdater_syncer_sync_last_time)
+histogram_quantile(0.95, max by (le, type) (rate(src_repo_perms_syncer_sync_duration_seconds_bucket[1m])))
```
-#### repo-updater: src_repoupdater_max_sync_backoff
+#### worker: perms_syncer_sync_errors
-

Time since oldest sync

+

Permissions sync error rate

-Refer to the [alerts reference](alerts#repo-updater-src-repoupdater-max-sync-backoff) for 1 alert related to this panel.
+Refer to the [alerts reference](alerts#worker-perms-syncer-sync-errors) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100001` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101070` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15939,19 +16053,22 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(src_repoupdater_max_sync_backoff)
+max by (type) (ceil(rate(src_repo_perms_syncer_sync_errors_total[1m])))
```
-#### repo-updater: src_repoupdater_syncer_sync_errors_total
+#### worker: perms_syncer_scheduled_repos_total
-

Site level external service sync error rate

+

Total number of repos scheduled for permissions sync

-Refer to the [alerts reference](alerts#repo-updater-src-repoupdater-syncer-sync-errors-total) for 2 alerts related to this panel.
+Indicates how many repositories have been scheduled for a permissions sync.
+More about repository permissions synchronization [here](https://sourcegraph.com/docs/admin/permissions/syncing#scheduling).
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100002` on your Sourcegraph instance.
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101071` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15961,19 +16078,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="invalid_npm_path",reason!="internal_rate_limit"}[5m]))
+max(rate(src_repo_perms_syncer_schedule_repos_total[1m]))
```
-#### repo-updater: syncer_sync_start
+### Worker: Gitserver: Gitserver Client
-

Repo metadata sync was started

+#### worker: gitserver_client_total
+
+

Aggregate client operations every 5m

-Refer to the [alerts reference](alerts#repo-updater-syncer-sync-start) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100010` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101100` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -15983,19 +16102,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max by (family) (rate(src_repoupdater_syncer_start_sync{family="Syncer.SyncExternalService"}[9h0m0s]))
+sum(increase(src_gitserver_client_total{job=~"^worker.*"}[5m]))
```
-#### repo-updater: syncer_sync_duration
+#### worker: gitserver_client_99th_percentile_duration
-

95th repositories sync duration

+

Aggregate successful client operation duration distribution over 5m

-Refer to the [alerts reference](alerts#repo-updater-syncer-sync-duration) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100011` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101101` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16005,19 +16124,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-histogram_quantile(0.95, max by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))
+sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^worker.*"}[5m]))
```
-#### repo-updater: source_duration
+#### worker: gitserver_client_errors_total
-

95th repositories source duration

+

Aggregate client operation errors every 5m

-Refer to the [alerts reference](alerts#repo-updater-source-duration) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100012` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101102` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16027,19 +16146,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-histogram_quantile(0.95, max by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))
+sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
```
-#### repo-updater: syncer_synced_repos
+#### worker: gitserver_client_error_rate
-

Repositories synced

+

Aggregate client operation error rate over 5m

-Refer to the [alerts reference](alerts#repo-updater-syncer-synced-repos) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100020` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101103` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16049,19 +16168,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(rate(src_repoupdater_syncer_synced_repos_total[1m]))
+sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^worker.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))) * 100
```
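
All of the `*_error_rate` panels in this dashboard share one shape: errors divided by all attempts, scaled to a percentage. Given this formula, the `_total` series evidently counts successful operations, so attempts are `_total` plus `_errors_total`. Spelled out with the gitserver client metrics used above:

```
# error percentage = errors / (successes + errors) * 100
sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
/
(
  sum(increase(src_gitserver_client_total{job=~"^worker.*"}[5m]))
  +
  sum(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
) * 100
```
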
-#### repo-updater: sourced_repos
+#### worker: gitserver_client_total
-

Repositories sourced

+

Client operations every 5m

-Refer to the [alerts reference](alerts#repo-updater-sourced-repos) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100021` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101110` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16071,19 +16190,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(rate(src_repoupdater_source_repos_total[1m]))
+sum by (op,scope)(increase(src_gitserver_client_total{job=~"^worker.*"}[5m]))
```
-#### repo-updater: purge_failed
+#### worker: gitserver_client_99th_percentile_duration
-

Repositories purge failed

+

99th percentile successful client operation duration over 5m

-Refer to the [alerts reference](alerts#repo-updater-purge-failed) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100030` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101111` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16093,19 +16212,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(rate(src_repoupdater_purge_failed[1m]))
+histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^worker.*"}[5m])))
```
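
The p99 query above reads the tail of the latency distribution. For a complementary view, the mean can be derived from the histogram's companion series; this sketch assumes the conventional `_sum`/`_count` series that Prometheus client libraries emit alongside `_bucket`:

```
# average successful client operation duration, per op and scope
sum by (op,scope) (rate(src_gitserver_client_duration_seconds_sum{job=~"^worker.*"}[5m]))
/
sum by (op,scope) (rate(src_gitserver_client_duration_seconds_count{job=~"^worker.*"}[5m]))
```
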
-#### repo-updater: sched_auto_fetch
+#### worker: gitserver_client_errors_total
-

Repositories scheduled due to hitting a deadline

+

Client operation errors every 5m

-Refer to the [alerts reference](alerts#repo-updater-sched-auto-fetch) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100040` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101112` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16115,22 +16234,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(rate(src_repoupdater_sched_auto_fetch[1m]))
+sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))
```
-#### repo-updater: sched_manual_fetch
-
-

Repositories scheduled due to user traffic

+#### worker: gitserver_client_error_rate -Check repo-updater logs if this value is persistently high. -This does not indicate anything if there are no user added code hosts. +

Client operation error rate over 5m

This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100041` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101113` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16140,19 +16256,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(rate(src_repoupdater_sched_manual_fetch[1m]))
+sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_client_total{job=~"^worker.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^worker.*"}[5m]))) * 100
```
-#### repo-updater: sched_known_repos
+### Worker: Gitserver: Gitserver Repository Service Client
-

Repositories managed by the scheduler

+#### worker: gitserver_repositoryservice_client_total
+
+

Aggregate client operations every 5m

+ +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100050` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101200` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -16162,19 +16280,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max(src_repoupdater_sched_known_repos) +sum(increase(src_gitserver_repositoryservice_client_total{job=~"^worker.*"}[5m])) ```
-#### repo-updater: sched_update_queue_length
+#### worker: gitserver_repositoryservice_client_99th_percentile_duration
-

Rate of growth of update queue length over 5 minutes

+

Aggregate successful client operation duration distribution over 5m

-Refer to the [alerts reference](alerts#repo-updater-sched-update-queue-length) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100051` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101201` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16184,19 +16302,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(deriv(src_repoupdater_sched_update_queue_length[5m]))
+sum by (le)(rate(src_gitserver_repositoryservice_client_duration_seconds_bucket{job=~"^worker.*"}[5m]))
```
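
The removed queue-length query used `deriv()`, the appropriate slope estimator for a gauge like a queue length (`rate()` is reserved for monotonic counters). A sketch of the same idea with a smoother window, assuming the old gauge is still scraped:

```
# per-second growth of the update queue, estimated over 15m
max(deriv(src_repoupdater_sched_update_queue_length[15m]))
```
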
-#### repo-updater: sched_loops
+#### worker: gitserver_repositoryservice_client_errors_total
-

Scheduler loops

+

Aggregate client operation errors every 5m

-Refer to the [alerts reference](alerts#repo-updater-sched-loops) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100052` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101202` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16206,19 +16324,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(rate(src_repoupdater_sched_loops[1m]))
+sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m]))
```
-#### repo-updater: src_repoupdater_stale_repos
+#### worker: gitserver_repositoryservice_client_error_rate
-

Repos that haven't been fetched in more than 8 hours

+

Aggregate client operation error rate over 5m

-Refer to the [alerts reference](alerts#repo-updater-src-repoupdater-stale-repos) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100060` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101203` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16228,19 +16346,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(src_repoupdater_stale_repos)
+sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_gitserver_repositoryservice_client_total{job=~"^worker.*"}[5m])) + sum(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m]))) * 100
```
-#### repo-updater: sched_error
+#### worker: gitserver_repositoryservice_client_total
-

Repositories schedule error rate

+

Client operations every 5m

-Refer to the [alerts reference](alerts#repo-updater-sched-error) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100061` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101210` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16250,21 +16368,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(rate(src_repoupdater_sched_error[1m]))
+sum by (op,scope)(increase(src_gitserver_repositoryservice_client_total{job=~"^worker.*"}[5m]))
```
-### Repo Updater: External services
-
-#### repo-updater: src_repoupdater_external_services_total
+#### worker: gitserver_repositoryservice_client_99th_percentile_duration
-

The total number of external services

+

99th percentile successful client operation duration over 5m

-Refer to the [alerts reference](alerts#repo-updater-src-repoupdater-external-services-total) for 1 alert related to this panel.
+This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100100` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101211` on your Sourcegraph instance.
*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details
@@ -16274,19 +16390,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
Query:
```
-max(src_repoupdater_external_services_total)
+histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_repositoryservice_client_duration_seconds_bucket{job=~"^worker.*"}[5m])))
```
-#### repo-updater: repoupdater_queued_sync_jobs_total +#### worker: gitserver_repositoryservice_client_errors_total -

The total number of queued sync jobs

+

Client operation errors every 5m

-Refer to the [alerts reference](alerts#repo-updater-repoupdater-queued-sync-jobs-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101212` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -16296,19 +16412,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max(src_repoupdater_queued_sync_jobs_total) +sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m])) ```
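When this panel spikes, `topk` over the same series can surface the operations contributing the most errors; a minimal sketch, with the series count of 5 chosen arbitrarily:

```
topk(5, sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m])))
```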
-#### repo-updater: repoupdater_completed_sync_jobs_total +#### worker: gitserver_repositoryservice_client_error_rate -

The total number of completed sync jobs

+

Client operation error rate over 5m

-Refer to the [alerts reference](alerts#repo-updater-repoupdater-completed-sync-jobs-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101213` on your Sourcegraph instance. *Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* @@ -16318,21 +16434,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max(src_repoupdater_completed_sync_jobs_total) +sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_repositoryservice_client_total{job=~"^worker.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m]))) * 100 ```
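Because PromQL comparison operators filter rather than return booleans by default, appending a threshold to the panel query keeps only the operations whose error rate currently exceeds it. A sketch with an arbitrary 10% cutoff:

```
sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_repositoryservice_client_total{job=~"^worker.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_repositoryservice_client_errors_total{job=~"^worker.*"}[5m]))) * 100 > 10
```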
-#### repo-updater: repoupdater_errored_sync_jobs_percentage +### Worker: Batches: dbstore stats -

The percentage of external services that have failed their most recent sync

+#### worker: batches_dbstore_total -Refer to the [alerts reference](alerts#repo-updater-repoupdater-errored-sync-jobs-percentage) for 1 alert related to this panel. +

Aggregate store operations every 5m

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100112` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101300` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16340,21 +16458,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max(src_repoupdater_errored_sync_jobs_percentage) +sum(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: github_graphql_rate_limit_remaining +#### worker: batches_dbstore_99th_percentile_duration -

Remaining calls to GitHub graphql API before hitting the rate limit

+

Aggregate successful store operation duration distribution over 5m

-Refer to the [alerts reference](alerts#repo-updater-github-graphql-rate-limit-remaining) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100120` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16362,21 +16480,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by (name) (src_github_rate_limit_remaining_v2{resource="graphql"}) +sum by (le)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^worker.*"}[5m])) ```

-#### repo-updater: github_rest_rate_limit_remaining +#### worker: batches_dbstore_errors_total -

Remaining calls to GitHub rest API before hitting the rate limit

+

Aggregate store operation errors every 5m

-Refer to the [alerts reference](alerts#repo-updater-github-rest-rate-limit-remaining) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100121` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101302` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16384,21 +16502,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by (name) (src_github_rate_limit_remaining_v2{resource="rest"}) +sum(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: github_search_rate_limit_remaining +#### worker: batches_dbstore_error_rate -

Remaining calls to GitHub search API before hitting the rate limit

+

Aggregate store operation error rate over 5m

-Refer to the [alerts reference](alerts#repo-updater-github-search-rate-limit-remaining) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100122` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101303` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16406,23 +16524,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by (name) (src_github_rate_limit_remaining_v2{resource="search"}) +sum(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m])) + sum(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m]))) * 100 ```

-#### repo-updater: github_graphql_rate_limit_wait_duration - -

Time spent waiting for the GitHub graphql API rate limiter

+#### worker: batches_dbstore_total -Indicates how long we`re waiting on the rate limit once it has been exceeded +

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100130` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101310` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16430,23 +16546,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="graphql"}[5m])) +sum by (op)(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: github_rest_rate_limit_wait_duration - -

Time spent waiting for the GitHub rest API rate limiter

+#### worker: batches_dbstore_99th_percentile_duration -Indicates how long we`re waiting on the rate limit once it has been exceeded +

99th percentile successful store operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100131` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101311` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16454,23 +16568,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="rest"}[5m])) +histogram_quantile(0.99, sum by (le,op)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^worker.*"}[5m]))) ```

-#### repo-updater: github_search_rate_limit_wait_duration - -

Time spent waiting for the GitHub search API rate limiter

+#### worker: batches_dbstore_errors_total -Indicates how long we`re waiting on the rate limit once it has been exceeded +

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100132` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101312` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16478,21 +16590,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="search"}[5m])) +sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: gitlab_rest_rate_limit_remaining +#### worker: batches_dbstore_error_rate -

Remaining calls to GitLab rest API before hitting the rate limit

+

Store operation error rate over 5m

-Refer to the [alerts reference](alerts#repo-updater-gitlab-rest-rate-limit-remaining) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100140` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101313` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16500,23 +16612,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by (name) (src_gitlab_rate_limit_remaining{resource="rest"}) +sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_batches_dbstore_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^worker.*"}[5m]))) * 100 ```

-#### repo-updater: gitlab_rest_rate_limit_wait_duration +### Worker: Batches: service stats -

Time spent waiting for the GitLab rest API rate limiter

+#### worker: batches_service_total -Indicates how long we`re waiting on the rate limit once it has been exceeded +

Aggregate service operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100141` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101400` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16524,23 +16636,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max by (name) (rate(src_gitlab_rate_limit_wait_duration_seconds{resource="rest"}[5m])) +sum(increase(src_batches_service_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: src_internal_rate_limit_wait_duration_bucket - -

95th percentile time spent successfully waiting on our internal rate limiter

+#### worker: batches_service_99th_percentile_duration -Indicates how long we`re waiting on our internal rate limiter when communicating with a code host +

Aggregate successful service operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100150` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101401` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16548,23 +16658,43 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.95, sum(rate(src_internal_rate_limit_wait_duration_bucket{failed="false"}[5m])) by (le, urn)) +sum by (le)(rate(src_batches_service_duration_seconds_bucket{job=~"^worker.*"}[5m])) ```

-#### repo-updater: src_internal_rate_limit_wait_error_count +#### worker: batches_service_errors_total -

Rate of failures waiting on our internal rate limiter

+

Aggregate service operation errors every 5m

-The rate at which we fail our internal rate limiter. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101402` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* + +
+Technical details + +Query: + +``` +sum(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m])) +``` +
+ +
+ +#### worker: batches_service_error_rate + +

Aggregate service operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100151` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101403` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16572,23 +16702,43 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (urn) (rate(src_internal_rate_limit_wait_duration_count{failed="true"}[5m])) +sum(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_batches_service_total{job=~"^worker.*"}[5m])) + sum(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m]))) * 100 ```

-### Repo Updater: Gitserver: Gitserver Client +#### worker: batches_service_total -#### repo-updater: gitserver_client_total +

Service operations every 5m

-

Aggregate graphql operations every 5m

+This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101410` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* + +
+Technical details + +Query: + +``` +sum by (op)(increase(src_batches_service_total{job=~"^worker.*"}[5m])) +``` +
+ +
+ +#### worker: batches_service_99th_percentile_duration + +

99th percentile successful service operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101411` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16596,21 +16746,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_gitserver_client_total{job=~"^repo-updater.*"}[5m])) +histogram_quantile(0.99, sum by (le,op)(rate(src_batches_service_duration_seconds_bucket{job=~"^worker.*"}[5m]))) ```

-#### repo-updater: gitserver_client_99th_percentile_duration +#### worker: batches_service_errors_total -

Aggregate successful graphql operation duration distribution over 5m

+

Service operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101412` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16618,21 +16768,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (le)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^repo-updater.*"}[5m])) +sum by (op)(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: gitserver_client_errors_total +#### worker: batches_service_error_rate -

Aggregate graphql operation errors every 5m

+

Service operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101413` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16640,21 +16790,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_gitserver_client_errors_total{job=~"^repo-updater.*"}[5m])) +sum by (op)(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_batches_service_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_batches_service_errors_total{job=~"^worker.*"}[5m]))) * 100 ```

-#### repo-updater: gitserver_client_error_rate +### Worker: Codeinsights: insights queue processor + +#### worker: query_runner_worker_handlers -

Aggregate graphql operation error rate over 5m

+

Handler active handlers

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100203` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101500` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16662,21 +16814,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_gitserver_client_errors_total{job=~"^repo-updater.*"}[5m])) / (sum(increase(src_gitserver_client_total{job=~"^repo-updater.*"}[5m])) + sum(increase(src_gitserver_client_errors_total{job=~"^repo-updater.*"}[5m]))) * 100 +sum(src_query_runner_worker_processor_handlers{job=~"^worker.*"}) ```

-#### repo-updater: gitserver_client_total +#### worker: query_runner_worker_processor_total -

Graphql operations every 5m

+

Handler operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101510` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16684,21 +16836,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op,scope)(increase(src_gitserver_client_total{job=~"^repo-updater.*"}[5m])) +sum(increase(src_query_runner_worker_processor_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: gitserver_client_99th_percentile_duration +#### worker: query_runner_worker_processor_99th_percentile_duration -

99th percentile successful graphql operation duration over 5m

+

Aggregate successful handler operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101511` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16706,21 +16858,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.99, sum by (le,op,scope)(rate(src_gitserver_client_duration_seconds_bucket{job=~"^repo-updater.*"}[5m]))) +sum by (le)(rate(src_query_runner_worker_processor_duration_seconds_bucket{job=~"^worker.*"}[5m])) ```

-#### repo-updater: gitserver_client_errors_total +#### worker: query_runner_worker_processor_errors_total -

Graphql operation errors every 5m

+

Handler operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101512` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16728,21 +16880,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^repo-updater.*"}[5m])) +sum(increase(src_query_runner_worker_processor_errors_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: gitserver_client_error_rate +#### worker: query_runner_worker_processor_error_rate -

Graphql operation error rate over 5m

+

Handler operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100213` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101513` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -16750,21 +16902,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^repo-updater.*"}[5m])) / (sum by (op,scope)(increase(src_gitserver_client_total{job=~"^repo-updater.*"}[5m])) + sum by (op,scope)(increase(src_gitserver_client_errors_total{job=~"^repo-updater.*"}[5m]))) * 100 +sum(increase(src_query_runner_worker_processor_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_query_runner_worker_processor_total{job=~"^worker.*"}[5m])) + sum(increase(src_query_runner_worker_processor_errors_total{job=~"^worker.*"}[5m]))) * 100 ```

-### Repo Updater: Batches: dbstore stats +### Worker: Codeinsights: dbstore stats -#### repo-updater: batches_dbstore_total +#### worker: workerutil_dbworker_store_total

Aggregate store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101600` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16774,19 +16926,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_batches_dbstore_total{job=~"^repo-updater.*"}[5m])) +sum(increase(src_workerutil_dbworker_store_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) ```
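The dbworker store queries in this section are all scoped to one queue via `domain='insights_query_runner_jobs'`. Dropping that filter and grouping by `domain` compares the store traffic of all dbworker queues side by side; a sketch, assuming other `domain` values are exported on the same metric:

```
sum by (domain)(increase(src_workerutil_dbworker_store_total{job=~"^worker.*"}[5m]))
```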
-#### repo-updater: batches_dbstore_99th_percentile_duration +#### worker: workerutil_dbworker_store_99th_percentile_duration

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101601` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16796,19 +16948,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (le)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^repo-updater.*"}[5m])) +sum by (le)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) ```
-#### repo-updater: batches_dbstore_errors_total +#### worker: workerutil_dbworker_store_errors_total

Aggregate store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101602` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16818,19 +16970,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_batches_dbstore_errors_total{job=~"^repo-updater.*"}[5m])) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) ```
-#### repo-updater: batches_dbstore_error_rate +#### worker: workerutil_dbworker_store_error_rate

Aggregate store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100303` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101603` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16840,19 +16992,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_batches_dbstore_errors_total{job=~"^repo-updater.*"}[5m])) / (sum(increase(src_batches_dbstore_total{job=~"^repo-updater.*"}[5m])) + sum(increase(src_batches_dbstore_errors_total{job=~"^repo-updater.*"}[5m]))) * 100 +sum(increase(src_workerutil_dbworker_store_errors_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_errors_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m]))) * 100 ```
-#### repo-updater: batches_dbstore_total +#### worker: workerutil_dbworker_store_total

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101610` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16862,19 +17014,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_batches_dbstore_total{job=~"^repo-updater.*"}[5m])) +sum by (op)(increase(src_workerutil_dbworker_store_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) ```
-#### repo-updater: batches_dbstore_99th_percentile_duration +#### worker: workerutil_dbworker_store_99th_percentile_duration

99th percentile successful store operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101611` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16884,19 +17036,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_batches_dbstore_duration_seconds_bucket{job=~"^repo-updater.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m]))) ```
-#### repo-updater: batches_dbstore_errors_total +#### worker: workerutil_dbworker_store_errors_total

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101612` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16906,19 +17058,19 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^repo-updater.*"}[5m])) +sum by (op)(increase(src_workerutil_dbworker_store_errors_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) ```
-#### repo-updater: batches_dbstore_error_rate +#### worker: workerutil_dbworker_store_error_rate

Store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100313` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101613` on your Sourcegraph instance. *Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* @@ -16928,23 +17080,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^repo-updater.*"}[5m])) / (sum by (op)(increase(src_batches_dbstore_total{job=~"^repo-updater.*"}[5m])) + sum by (op)(increase(src_batches_dbstore_errors_total{job=~"^repo-updater.*"}[5m]))) * 100 +sum by (op)(increase(src_workerutil_dbworker_store_errors_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_errors_total{domain='insights_query_runner_jobs',job=~"^worker.*"}[5m]))) * 100 ```
-### Repo Updater: Batches: service stats +### Worker: Completion Credits Entitlement Usage Aggregator: Completion credits entitlement usage aggregations -#### repo-updater: batches_service_total +#### worker: completioncredits_aggregator_total -

Aggregate service operations every 5m

+

Completion credits entitlement usage aggregator operations every 30m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101700` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details @@ -16952,21 +17104,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_batches_service_total{job=~"^repo-updater.*"}[5m])) +sum(increase(src_completioncredits_aggregator_total{job=~"^worker.*"}[30m])) ```

-#### repo-updater: batches_service_99th_percentile_duration +#### worker: completioncredits_aggregator_99th_percentile_duration -

Aggregate successful service operation duration distribution over 5m

+

Aggregate successful completion credits entitlement usage aggregator operation duration distribution over 30m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101701` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details @@ -16974,21 +17126,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (le)(rate(src_batches_service_duration_seconds_bucket{job=~"^repo-updater.*"}[5m])) +sum by (le)(rate(src_completioncredits_aggregator_duration_seconds_bucket{job=~"^worker.*"}[30m])) ```

-#### repo-updater: batches_service_errors_total +#### worker: completioncredits_aggregator_errors_total -

Aggregate service operation errors every 5m

+

Completion credits entitlement usage aggregator operation errors every 30m

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-completioncredits-aggregator-errors-total) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101702` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details @@ -16996,21 +17148,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_batches_service_errors_total{job=~"^repo-updater.*"}[5m])) +sum(increase(src_completioncredits_aggregator_errors_total{job=~"^worker.*"}[30m])) ```

-#### repo-updater: batches_service_error_rate +#### worker: completioncredits_aggregator_error_rate -

Aggregate service operation error rate over 5m

+

Completion credits entitlement usage aggregator operation error rate over 30m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100403` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101703` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details @@ -17018,21 +17170,26 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_batches_service_errors_total{job=~"^repo-updater.*"}[5m])) / (sum(increase(src_batches_service_total{job=~"^repo-updater.*"}[5m])) + sum(increase(src_batches_service_errors_total{job=~"^repo-updater.*"}[5m]))) * 100 +sum(increase(src_completioncredits_aggregator_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_completioncredits_aggregator_total{job=~"^worker.*"}[30m])) + sum(increase(src_completioncredits_aggregator_errors_total{job=~"^worker.*"}[30m]))) * 100 ```

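The 30m windows in this section are wider than the 5m windows used elsewhere, which suits a job that appears to run on a long interval: a short window would read as zero between runs. A sketch that returns a value only when at least one aggregator error occurred in the last 30 minutes (the authoritative alert condition is the one linked in the alerts reference above):

```
sum(increase(src_completioncredits_aggregator_errors_total{job=~"^worker.*"}[30m])) > 0
```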
-#### repo-updater: batches_service_total +### Worker: Periodic Goroutines -

Service operations every 5m

+#### worker: running_goroutines + +

Number of currently running periodic goroutines



+The number of currently running periodic goroutines by name and job.
+A value of 0 indicates the routine isn't currently running; it awaits its next schedule.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100410` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101800` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17040,21 +17197,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_batches_service_total{job=~"^repo-updater.*"}[5m])) +sum by (name, job_name) (src_periodic_goroutine_running{job=~".*worker.*"}) ```

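Since `src_periodic_goroutine_running` is a gauge and 0 means a routine is between runs, a comparison filter lists only the routines that are idle at the moment of the query, which may be normal scheduling or a symptom of a stall; a sketch:

```
sum by (name, job_name) (src_periodic_goroutine_running{job=~".*worker.*"}) == 0
```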
-#### repo-updater: batches_service_99th_percentile_duration +#### worker: goroutine_success_rate -

99th percentile successful service operation duration over 5m

+

Success rate for periodic goroutine executions

+ +The rate of successful executions of each periodic goroutine. +A low or zero value could indicate that a routine is stalled or encountering errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100411` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101801` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17062,21 +17222,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_batches_service_duration_seconds_bucket{job=~"^repo-updater.*"}[5m]))) +sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*worker.*"}[5m])) ```

-#### repo-updater: batches_service_errors_total +#### worker: goroutine_error_rate -

Service operation errors every 5m

+

Error rate for periodic goroutine executions



+The rate of errors encountered by each periodic goroutine.
+A sustained high error rate may indicate a problem with the routine's configuration or dependencies.

-This panel has no related alerts.

+Refer to the [alerts reference](alerts#worker-goroutine-error-rate) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100412` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*

+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101810` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17084,21 +17247,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_batches_service_errors_total{job=~"^repo-updater.*"}[5m])) +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*worker.*"}[5m])) ```

-#### repo-updater: batches_service_error_rate +#### worker: goroutine_error_percentage -

Service operation error rate over 5m

+

Percentage of periodic goroutine executions that result in errors

-This panel has no related alerts. +The percentage of executions that result in errors for each periodic goroutine. +A value above 5% indicates that a significant portion of routine executions are failing. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100413` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#worker-goroutine-error-percentage) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).* +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101811` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17106,23 +17272,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_batches_service_errors_total{job=~"^repo-updater.*"}[5m])) / (sum by (op)(increase(src_batches_service_total{job=~"^repo-updater.*"}[5m])) + sum by (op)(increase(src_batches_service_errors_total{job=~"^repo-updater.*"}[5m]))) * 100 +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*worker.*"}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*worker.*"}[5m]) > 0) * 100 ```

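The 5% figure mentioned above can be turned into a filtering expression by appending a comparison to the panel query, so only failing routines remain visible; a sketch (the generated alert linked above remains the authoritative threshold):

```
sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*worker.*"}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*worker.*"}[5m]) > 0) * 100 > 5
```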
-### Repo Updater: Codeintel: Coursier invocation stats +#### worker: goroutine_handler_duration -#### repo-updater: codeintel_coursier_total +

95th percentile handler execution time

-

Aggregate invocations operations every 5m

+The 95th percentile execution time for each periodic goroutine handler. +Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101820` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17130,21 +17297,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_duration_seconds_bucket{job=~".*worker.*"}[5m]))) ```

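To inspect the latency of a single routine, the same `histogram_quantile` can be restricted with the `name` label; `name="record-encrypter"` below is a hypothetical routine name used only for illustration:

```
histogram_quantile(0.95, sum by (le, job_name) (rate(src_periodic_goroutine_duration_seconds_bucket{job=~".*worker.*",name="record-encrypter"}[5m])))
```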
-#### repo-updater: codeintel_coursier_99th_percentile_duration +#### worker: goroutine_loop_duration -

Aggregate successful invocations operation duration distribution over 5m

+

95th percentile loop cycle time

+ +The 95th percentile loop cycle time for each periodic goroutine (excluding sleep time). +This represents how long a complete loop iteration takes before sleeping for the next interval. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101821` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17152,21 +17322,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (le)(rate(src_codeintel_coursier_duration_seconds_bucket{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_loop_duration_seconds_bucket{job=~".*worker.*"}[5m]))) ```

-#### repo-updater: codeintel_coursier_errors_total +#### worker: tenant_processing_duration + +

95th percentile tenant processing time

-

Aggregate invocations operation errors every 5m

+The 95th percentile processing time for individual tenants within periodic goroutines. +Higher values indicate that tenant processing is taking longer and may affect overall performance. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100502` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101830` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17174,21 +17347,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_tenant_duration_seconds_bucket{job=~".*worker.*"}[5m]))) ```

-#### repo-updater: codeintel_coursier_error_rate +#### worker: tenant_processing_max -

Aggregate invocations operation error rate over 5m

+

Maximum tenant processing time

+ +The maximum processing time for individual tenants within periodic goroutines. +Consistently high values might indicate problematic tenants or inefficient processing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100503` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101831` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17196,21 +17372,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) / (sum(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) + sum(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m]))) * 100 +max by (name, job_name) (rate(src_periodic_goroutine_tenant_duration_seconds_sum{job=~".*worker.*"}[5m]) / rate(src_periodic_goroutine_tenant_duration_seconds_count{job=~".*worker.*"}[5m])) ```

-#### repo-updater: codeintel_coursier_total +#### worker: tenant_count + +

Number of tenants processed per routine

-

Invocations operations every 5m

+The number of tenants processed by each periodic goroutine. +Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101840` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17218,21 +17397,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +max by (name, job_name) (src_periodic_goroutine_tenant_count{job=~".*worker.*"}) ```

-#### repo-updater: codeintel_coursier_99th_percentile_duration +#### worker: tenant_success_rate -

99th percentile successful invocations operation duration over 5m

+

Rate of successful tenant processing operations

+ +The rate of successful tenant processing operations. +A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101841` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17240,21 +17422,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_coursier_duration_seconds_bucket{op!="RunCommand",job=~"^repo-updater.*"}[5m]))) +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*worker.*"}[5m])) ```

-#### repo-updater: codeintel_coursier_errors_total +#### worker: tenant_error_rate + +

Rate of tenant processing errors

-

Invocations operation errors every 5m

+The rate of tenant processing operations that result in errors. +Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101850` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17262,21 +17447,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*worker.*"}[5m])) ```

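Adding the success and error rates of the two tenant counters gives total tenant processing throughput per routine, useful context when judging whether an error rate is large relative to overall volume; a minimal sketch:

```
sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*worker.*"}[5m])) + sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*worker.*"}[5m]))
```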
-#### repo-updater: codeintel_coursier_error_rate +#### worker: tenant_error_percentage -

Invocations operation error rate over 5m

+

Percentage of tenant operations resulting in errors

+ +The percentage of tenant operations that result in errors. +Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100513` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101851` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17284,23 +17472,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) / (sum by (op)(increase(src_codeintel_coursier_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) + sum by (op)(increase(src_codeintel_coursier_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m]))) * 100 +(sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*worker.*"}[5m])) / (sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*worker.*"}[5m])) + sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*worker.*"}[5m])))) * 100 ```

-### Repo Updater: Codeintel: npm invocation stats +### Worker: Database connections -#### repo-updater: codeintel_npm_total +#### worker: max_open_conns -

Aggregate invocations operations every 5m

+

Maximum open

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101900` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17308,21 +17496,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="worker"}) ```

-#### repo-updater: codeintel_npm_99th_percentile_duration +#### worker: open_conns -

Aggregate successful invocations operation duration distribution over 5m

+

Established

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101901` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17330,21 +17518,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (le)(rate(src_codeintel_npm_duration_seconds_bucket{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_open{app_name="worker"}) ```

-#### repo-updater: codeintel_npm_errors_total +#### worker: in_use -

Aggregate invocations operation errors every 5m

+

Used

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100602` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101910` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17352,21 +17540,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="worker"}) ```

-#### repo-updater: codeintel_npm_error_rate +#### worker: idle -

Aggregate invocations operation error rate over 5m

+

Idle

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100603` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101911` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17374,21 +17562,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) / (sum(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) + sum(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m]))) * 100 +sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="worker"}) ```

-#### repo-updater: codeintel_npm_total +#### worker: mean_blocked_seconds_per_conn_request -

Invocations operations every 5m

+

Mean blocked seconds per conn request

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101920` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17396,21 +17584,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="worker"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="worker"}[5m])) ```

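The panel query above is a ratio: total seconds spent blocked waiting for a connection, divided by the number of waits, giving the mean wait per request. To inspect the raw numerator on its own (an illustrative variation, not a generated panel):

```
sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="worker"}[5m]))
```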
-#### repo-updater: codeintel_npm_99th_percentile_duration +#### worker: closed_max_idle -

99th percentile successful invocations operation duration over 5m

+

Closed by SetMaxIdleConns

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101930` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17418,21 +17606,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_npm_duration_seconds_bucket{op!="RunCommand",job=~"^repo-updater.*"}[5m]))) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="worker"}[5m])) ```

-#### repo-updater: codeintel_npm_errors_total +#### worker: closed_max_lifetime -

Invocations operation errors every 5m

+

Closed by SetConnMaxLifetime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100612` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101931` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17440,21 +17628,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="worker"}[5m])) ```

-#### repo-updater: codeintel_npm_error_rate +#### worker: closed_max_idle_time -

Invocations operation error rate over 5m

+

Closed by SetConnMaxIdleTime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100613` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101932` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -17462,25 +17650,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (op)(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) / (sum by (op)(increase(src_codeintel_npm_total{op!="RunCommand",job=~"^repo-updater.*"}[5m])) + sum by (op)(increase(src_codeintel_npm_errors_total{op!="RunCommand",job=~"^repo-updater.*"}[5m]))) * 100 +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="worker"}[5m])) ```

-### Repo Updater: Repo Updater GRPC server metrics +### Worker: Worker (CPU, Memory) -#### repo-updater: repo_updater_grpc_request_rate_all_methods - -

Request rate across all methods over 2m

+#### worker: cpu_usage_percentage -The number of gRPC requests received per second across all methods, aggregated across all instances. +

CPU usage

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-cpu-usage-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -17488,23 +17674,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(rate(grpc_server_started_total{instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m])) +cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"} ```

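`cadvisor_container_cpu_usage_percentage_total` is an instantaneous gauge, so short spikes can dominate the panel. A smoothed view can be obtained with a range function; this is an illustrative sketch rather than a shipped panel:

```
avg_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[1h])
```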
-#### repo-updater: repo_updater_grpc_request_rate_per_method +#### worker: memory_usage_percentage -

Request rate per-method over 2m

+

Memory usage percentage (total)

-The number of gRPC requests received per second broken out per method, aggregated across all instances. +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102001` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -17512,23 +17698,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(rate(grpc_server_started_total{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m])) by (grpc_method) +cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"} ```

-#### repo-updater: repo_updater_error_percentage_all_methods +#### worker: memory_working_set_bytes -

Error percentage across all methods over 2m

+

Memory usage bytes (total)

-The percentage of gRPC requests that fail across all methods, aggregated across all instances. +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102002` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -17536,23 +17722,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) )) +max by (name) (container_memory_working_set_bytes{name=~"^worker.*"}) ```

-#### repo-updater: repo_updater_grpc_error_percentage_per_method +#### worker: memory_rss -

Error percentage per-method over 2m

+

Memory (RSS)



-The percentage of gRPC requests that fail per method, aggregated across all instances.
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate.

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-memory-rss) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100711` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102010` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -17560,23 +17746,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${repo_updater_method:regex}`,grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m])) by (grpc_method)) )) +max(container_memory_rss{name=~"^worker.*"} / container_spec_memory_limit_bytes{name=~"^worker.*"}) by (name) * 100.0 ```

-#### repo-updater: repo_updater_p99_response_time_per_method +#### worker: memory_total_active_file -

99th percentile response time per method over 2m

+

Memory usage (active file)

-The 99th percentile response time per method, aggregated across all instances. +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100720` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102011` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -17584,23 +17770,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +max(container_memory_total_active_file_bytes{name=~"^worker.*"} / container_spec_memory_limit_bytes{name=~"^worker.*"}) by (name) * 100.0 ```

-#### repo-updater: repo_updater_p90_response_time_per_method +#### worker: memory_kernel_usage -

90th percentile response time per method over 2m

+

Memory usage (kernel)

-The 90th percentile response time per method, aggregated across all instances. +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100721` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102012` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -17608,23 +17794,33 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +max(container_memory_kernel_usage{name=~"^worker.*"} / container_spec_memory_limit_bytes{name=~"^worker.*"}) by (name) * 100.0 ```

-#### repo-updater: repo_updater_p75_response_time_per_method +### Worker: Container monitoring (not available on server) -

75th percentile response time per method over 2m

+#### worker: container_missing -The 75th percentile response time per method, aggregated across all instances. +

Container missing



+
+This value is the number of times a container has not been seen for more than one minute. If you observe this
+value change independently of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reason.
+
+- **Kubernetes:**
+  - Determine if the pod was OOM killed using `kubectl describe pod worker` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
+  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p worker`.
+- **Docker Compose:**
+  - Determine if the container was OOM killed using `docker inspect -f '\{\{json .State\}\}' worker` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the worker container in `docker-compose.yml`.
+  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs worker` (note this will include logs from the previous and currently running container).

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100722` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102100` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17632,23 +17828,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +count by(name) ((time() - container_last_seen{name=~"^worker.*"}) > 60) ```

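The panel query counts containers that have not reported for more than 60 seconds. When triaging, it can also help to look at the raw staleness per container rather than the count; an illustrative variation on the same metric:

```
time() - container_last_seen{name=~"^worker.*"}
```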
-#### repo-updater: repo_updater_p99_9_response_size_per_method - -

99.9th percentile total response size per method over 2m

+#### worker: container_cpu_usage -The 99.9th percentile total per-RPC response size per method, aggregated across all instances. +

Container cpu usage total (1m average) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-container-cpu-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100730` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17656,23 +17850,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"} ```

-#### repo-updater: repo_updater_p90_response_size_per_method - -

90th percentile total response size per method over 2m

+#### worker: container_memory_usage -The 90th percentile total per-RPC response size per method, aggregated across all instances. +

Container memory usage by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-container-memory-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100731` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102102` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17680,23 +17872,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"} ```

-#### repo-updater: repo_updater_p75_response_size_per_method +#### worker: fs_io_operations -

75th percentile total response size per method over 2m

+

Filesystem reads and writes rate by instance over 1h



-The 75th percentile total per-RPC response size per method, aggregated across all instances.
+This value indicates the number of filesystem read and write operations by containers of this service.
+When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with worker issues.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100732` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102103` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17704,23 +17897,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +sum by(name) (rate(container_fs_reads_total{name=~"^worker.*"}[1h]) + rate(container_fs_writes_total{name=~"^worker.*"}[1h])) ```

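The panel above sums reads and writes together. If it spikes, separating the two sides shows which kind of I/O is responsible; an illustrative reads-only variant (swap in `container_fs_writes_total` for writes):

```
sum by (name) (rate(container_fs_reads_total{name=~"^worker.*"}[1h]))
```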
-#### repo-updater: repo_updater_p99_9_invididual_sent_message_size_per_method +### Worker: Provisioning indicators (not available on server) -

99.9th percentile individual sent message size per method over 2m

+#### worker: provisioning_container_cpu_usage_long_term -The 99.9th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +

Container cpu usage total (90th percentile over 1d) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100740` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102200` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17728,23 +17921,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[1d]) ```

-#### repo-updater: repo_updater_p90_invididual_sent_message_size_per_method - -

90th percentile individual sent message size per method over 2m

+#### worker: provisioning_container_memory_usage_long_term -The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +

Container memory usage (1d maximum) by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100741` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102201` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17752,23 +17943,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[1d]) ```

-#### repo-updater: repo_updater_p75_invididual_sent_message_size_per_method - -

75th percentile individual sent message size per method over 2m

+#### worker: provisioning_container_cpu_usage_short_term -The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +

Container cpu usage total (5m maximum) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100742` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102210` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17776,23 +17965,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))) +max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^worker.*"}[5m]) ```

-#### repo-updater: repo_updater_grpc_response_stream_message_count_per_method - -

Average streaming response message count per-method over 2m

+#### worker: provisioning_container_memory_usage_short_term -The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. +

Container memory usage (5m maximum) by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100750` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102211` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17800,23 +17987,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m])) by (grpc_method))) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^worker.*"}[5m]) ```

-#### repo-updater: repo_updater_grpc_all_codes_per_method +#### worker: container_oomkill_events_total -

Response codes rate per-method over 2m

+

Container OOMKILL events total by instance



-The rate of all generated gRPC response codes per method, aggregated across all instances.
+This value indicates the total number of times the container main process or child processes were terminated by the OOM killer.
+When these events occur frequently, it is an indicator of underprovisioning.

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#worker-container-oomkill-events-total) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100760` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102212` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17824,25 +18012,25 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum(rate(grpc_server_handled_total{grpc_method=~`${repo_updater_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m])) by (grpc_method, grpc_code) +max by (name) (container_oom_events_total{name=~"^worker.*"}) ```

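Because `container_oom_events_total` is a cumulative counter, the panel's raw `max by (name)` value only ever grows. For a sense of how often kills are happening right now, a windowed variant can be used (illustrative, not a generated panel):

```
sum by (name) (increase(container_oom_events_total{name=~"^worker.*"}[1d]))
```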
-### Repo Updater: Repo Updater GRPC "internal error" metrics +### Worker: Golang runtime monitoring -#### repo-updater: repo_updater_grpc_clients_error_percentage_all_methods +#### worker: go_goroutines -

Client baseline error percentage across all methods over 2m

+

Maximum active goroutines

-The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "repo_updater" clients. +A high value here indicates a possible goroutine leak. -This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-go-goroutines) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102300` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17850,23 +18038,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))))))) +max by(instance) (go_goroutines{job=~".*worker"}) ```

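To distinguish a genuine leak from normal load-driven variation, the trend of the gauge matters more than its absolute level. An illustrative sketch (not a generated panel): a derivative that stays positive over several hours suggests goroutines are being created faster than they exit:

```
deriv(go_goroutines{job=~".*worker"}[1h])
```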
-#### repo-updater: repo_updater_grpc_clients_error_percentage_per_method - -

Client baseline error percentage per-method over 2m

+#### worker: go_gc_duration_seconds -The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "repo_updater" clients. +

Maximum Go garbage collection duration

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17874,23 +18060,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_method=~"${repo_updater_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_method=~"${repo_updater_method:regex}"}[2m])) by (grpc_method)))))) +max by(instance) (go_gc_duration_seconds{job=~".*worker"}) ```

-#### repo-updater: repo_updater_grpc_clients_all_codes_per_method +### Worker: Kubernetes monitoring (only available on Kubernetes) -

Client baseline response codes rate per-method over 2m

+#### worker: pods_available_percentage -The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "repo_updater" clients. +

Percentage pods available

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-pods-available-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100802` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102400` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -17898,29 +18084,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_method=~"${repo_updater_method:regex}"}[2m])) by (grpc_method, grpc_code)) +sum by(app) (up{app=~".*worker"}) / count by (app) (up{app=~".*worker"}) * 100 ```

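The percentage above hides which replica is unhealthy. To list the specific down targets during an incident, an illustrative filter on the same `up` metric:

```
up{app=~".*worker"} == 0
```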
-#### repo-updater: repo_updater_grpc_clients_internal_error_percentage_all_methods - -

Client-observed gRPC internal error percentage across all methods over 2m

- -The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "repo_updater" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "repo_updater" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +### Worker: Own: repo indexer dbstore -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +#### worker: workerutil_dbworker_store_total -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +

Aggregate store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100810` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102500` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -17928,29 +18108,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))))))) +sum(increase(src_workerutil_dbworker_store_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) ```

-#### repo-updater: repo_updater_grpc_clients_internal_error_percentage_per_method - -

Client-observed gRPC internal error percentage per-method over 2m

- -The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "repo_updater" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "repo_updater" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +#### worker: workerutil_dbworker_store_99th_percentile_duration -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102501` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -17958,29 +18130,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_method=~"${repo_updater_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_method=~"${repo_updater_method:regex}"}[2m])) by (grpc_method)))))) +sum by (le)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='own_background_worker_store',job=~"^worker.*"}[5m])) ```

-#### repo-updater: repo_updater_grpc_clients_internal_error_all_codes_per_method - -

Client-observed gRPC internal error response code rate per-method over 2m

- -The rate of gRPC internal-error response codes per method, aggregated across all "repo_updater" clients. +#### worker: workerutil_dbworker_store_errors_total -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "repo_updater" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. - -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +

Aggregate store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100812` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102502` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -17988,25 +18152,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"repoupdater.v1.RepoUpdaterService",is_internal_error="true",grpc_method=~"${repo_updater_method:regex}"}[2m])) by (grpc_method, grpc_code)) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) ```

-### Repo Updater: Repo Updater GRPC retry metrics - -#### repo-updater: repo_updater_grpc_clients_retry_percentage_across_all_methods +#### worker: workerutil_dbworker_store_error_rate -

Client retry percentage across all methods over 2m

- -The percentage of gRPC requests that were retried across all methods, aggregated across all "repo_updater" clients. +

Aggregate store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102503` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -18014,23 +18174,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"repoupdater.v1.RepoUpdaterService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"repoupdater.v1.RepoUpdaterService"}[2m]))))))) +sum(increase(src_workerutil_dbworker_store_errors_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) / (sum(increase(src_workerutil_dbworker_store_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) + sum(increase(src_workerutil_dbworker_store_errors_total{domain='own_background_worker_store',job=~"^worker.*"}[5m]))) * 100 ```

-#### repo-updater: repo_updater_grpc_clients_retry_percentage_per_method - -

Client retry percentage per-method over 2m

+#### worker: workerutil_dbworker_store_total -The percentage of gRPC requests that were retried aggregated across all "repo_updater" clients, broken out per method. +

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100901` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102510` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -18038,23 +18196,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"repoupdater.v1.RepoUpdaterService",is_retried="true",grpc_method=~"${repo_updater_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_method=~"${repo_updater_method:regex}"}[2m])) by (grpc_method)))))) +sum by (op)(increase(src_workerutil_dbworker_store_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) ```

-#### repo-updater: repo_updater_grpc_clients_retry_count_per_method - -

Client retry count per-method over 2m

+#### worker: workerutil_dbworker_store_99th_percentile_duration -The count of gRPC requests that were retried aggregated across all "repo_updater" clients, broken out per method +

99th percentile successful store operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=100902` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102511` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -18062,25 +18218,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"repoupdater.v1.RepoUpdaterService",grpc_method=~"${repo_updater_method:regex}",is_retried="true"}[2m])) by (grpc_method)) +histogram_quantile(0.99, sum by (le,op)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='own_background_worker_store',job=~"^worker.*"}[5m]))) ```

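The same histogram buckets can be re-cut at other quantiles. For example, the median per-operation duration (an illustrative variation of the panel query, not a shipped panel):

```
histogram_quantile(0.50, sum by (le,op)(rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain='own_background_worker_store',job=~"^worker.*"}[5m])))
```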
-### Repo Updater: Site configuration client update latency - -#### repo-updater: repo_updater_site_configuration_duration_since_last_successful_update_by_instance - -

Duration since last successful site configuration update (by instance)

+#### worker: workerutil_dbworker_store_errors_total -The duration since the configuration client used by the "repo_updater" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration. +

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102512` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -18088,21 +18240,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -src_conf_client_time_since_last_successful_update_seconds{job=~`.*repo-updater`,instance=~`${instance:regex}`} +sum by (op)(increase(src_workerutil_dbworker_store_errors_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) ```

-#### repo-updater: repo_updater_site_configuration_duration_since_last_successful_update_by_instance +#### worker: workerutil_dbworker_store_error_rate -

Maximum duration since last successful site configuration update (all "repo_updater" instances)

+

Store operation error rate over 5m

-Refer to the [alerts reference](alerts#repo-updater-repo-updater-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102513` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph own team](https://handbook.sourcegraph.com/departments/engineering/teams/own).*
Technical details @@ -18110,25 +18262,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*repo-updater`,instance=~`${instance:regex}`}[1m])) +sum by (op)(increase(src_workerutil_dbworker_store_errors_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_workerutil_dbworker_store_total{domain='own_background_worker_store',job=~"^worker.*"}[5m])) + sum by (op)(increase(src_workerutil_dbworker_store_errors_total{domain='own_background_worker_store',job=~"^worker.*"}[5m]))) * 100 ```

-### Repo Updater: HTTP handlers - -#### repo-updater: healthy_request_rate +### Worker: Own: repo indexer worker queue -

Requests per second, by route, when status code is 200

+#### worker: own_background_worker_handlers -The number of healthy HTTP requests per second to internal HTTP api +

Handler active handlers

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102600` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -18136,23 +18286,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (route) (rate(src_http_request_duration_seconds_count{app="repo-updater",code=~"2.."}[5m])) +sum(src_own_background_worker_processor_handlers{job=~"^worker.*"}) ```

-#### repo-updater: unhealthy_request_rate - -

Requests per second, by route, when status code is not 200

+#### worker: own_background_worker_processor_total -The number of unhealthy HTTP requests per second to internal HTTP api +

Handler operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102610` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -18160,23 +18308,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (route) (rate(src_http_request_duration_seconds_count{app="repo-updater",code!~"2.."}[5m])) +sum(increase(src_own_background_worker_processor_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: request_rate_by_code - -

Requests per second, by status code

+#### worker: own_background_worker_processor_99th_percentile_duration -The number of HTTP requests per second by code +

Aggregate successful handler operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102611` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -18184,23 +18330,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (code) (rate(src_http_request_duration_seconds_count{app="repo-updater"}[5m])) +sum by (le)(rate(src_own_background_worker_processor_duration_seconds_bucket{job=~"^worker.*"}[5m])) ```

-#### repo-updater: 95th_percentile_healthy_requests - -

95th percentile duration by route, when status code is 200

+#### worker: own_background_worker_processor_errors_total -The 95th percentile duration by route when the status code is 200 +

Handler operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102612` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -18208,23 +18352,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app="repo-updater",code=~"2.."}[5m])) by (le, route)) +sum(increase(src_own_background_worker_processor_errors_total{job=~"^worker.*"}[5m])) ```

-#### repo-updater: 95th_percentile_unhealthy_requests - -

95th percentile duration by route, when status code is not 200

+#### worker: own_background_worker_processor_error_rate -The 95th percentile duration by route when the status code is not 200 +

Handler operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102613` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -18232,23 +18374,23 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app="repo-updater",code!~"2.."}[5m])) by (le, route)) +sum(increase(src_own_background_worker_processor_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_own_background_worker_processor_total{job=~"^worker.*"}[5m])) + sum(increase(src_own_background_worker_processor_errors_total{job=~"^worker.*"}[5m]))) * 100 ```

-### Repo Updater: Database connections +### Worker: Own: index job scheduler -#### repo-updater: max_open_conns +#### worker: own_background_index_scheduler_total -

Maximum open

+

Own index job scheduler operations every 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102700` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -18256,21 +18398,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="repo-updater"}) +sum by (op)(increase(src_own_background_index_scheduler_total{job=~"^worker.*"}[10m])) ```

-#### repo-updater: open_conns +#### worker: own_background_index_scheduler_99th_percentile_duration -

Established

+

99th percentile successful own index job scheduler operation duration over 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102701` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -18278,21 +18420,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_open{app_name="repo-updater"}) +histogram_quantile(0.99, sum by (le,op)(rate(src_own_background_index_scheduler_duration_seconds_bucket{job=~"^worker.*"}[10m]))) ```

-#### repo-updater: in_use +#### worker: own_background_index_scheduler_errors_total -

Used

+

Own index job scheduler operation errors every 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102702` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -18300,21 +18442,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="repo-updater"}) +sum by (op)(increase(src_own_background_index_scheduler_errors_total{job=~"^worker.*"}[10m])) ```

-#### repo-updater: idle +#### worker: own_background_index_scheduler_error_rate -

Idle

+

Own index job scheduler operation error rate over 10m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102703` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details @@ -18322,21 +18464,25 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="repo-updater"}) +sum by (op)(increase(src_own_background_index_scheduler_errors_total{job=~"^worker.*"}[10m])) / (sum by (op)(increase(src_own_background_index_scheduler_total{job=~"^worker.*"}[10m])) + sum by (op)(increase(src_own_background_index_scheduler_errors_total{job=~"^worker.*"}[10m]))) * 100 ```

-#### repo-updater: mean_blocked_seconds_per_conn_request +### Worker: Site configuration client update latency -

Mean blocked seconds per conn request

+#### worker: worker_site_configuration_duration_since_last_successful_update_by_instance + +

Duration since last successful site configuration update (by instance)

-Refer to the [alerts reference](alerts#repo-updater-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. +The duration since the configuration client used by the "worker" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101220` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102800` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -18344,21 +18490,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="repo-updater"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="repo-updater"}[5m])) +src_conf_client_time_since_last_successful_update_seconds{job=~`^worker.*`,instance=~`${instance:regex}`} ```
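+The metric is a gauge measured in seconds; for ad-hoc inspection it can be rescaled, for example to minutes (a sketch, not the panel query):
+
+```
+src_conf_client_time_since_last_successful_update_seconds{job=~"^worker.*"} / 60
+```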

-#### repo-updater: closed_max_idle +#### worker: worker_site_configuration_duration_since_last_successful_update_by_instance -

Closed by SetMaxIdleConns

+

Maximum duration since last successful site configuration update (all "worker" instances)

-This panel has no related alerts. +Refer to the [alerts reference](alerts#worker-worker-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101230` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=102801` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -18366,43 +18512,38 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="repo-updater"}[5m])) +max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`^worker.*`,instance=~`${instance:regex}`}[1m])) ```
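+Because the related alert compares this expression against a threshold, a standalone check can be sketched the same way (the 300-second threshold below is illustrative, not the configured alert value):
+
+```
+max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~"^worker.*"}[1m])) > 300
+```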

-#### repo-updater: closed_max_lifetime - -

Closed by SetConnMaxLifetime

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101231` on your Sourcegraph instance. +## Searcher -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +

Performs unindexed searches (diff and commit search, text search for unindexed branches).

-
-Technical details +To see this dashboard, visit `/-/debug/grafana/d/searcher/searcher` on your Sourcegraph instance. -Query: +#### searcher: traffic -``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="repo-updater"}[5m])) -``` -
+

Requests per second by code over 10m

-
+This graph is the average number of requests per second searcher is +experiencing over the last 10 minutes. -#### repo-updater: closed_max_idle_time +The code is the HTTP Status code. 200 is success. We have a special code +"canceled" which is common when doing a large search request and we find +enough results before searching all possible repos. -

Closed by SetConnMaxIdleTime

+Note: A search query is translated into an unindexed search query per unique +(repo, commit). This means a single user query may result in thousands of +requests to searcher. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101232` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -18410,33 +18551,32 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="repo-updater"}[5m])) +sum by (code) (rate(searcher_service_request_total{instance=~`${instance:regex}`}[10m])) ```
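+Since "canceled" is an expected outcome for large searches, a variant (an illustrative sketch) that isolates genuine failures filters out both it and successful responses:
+
+```
+sum by (code) (rate(searcher_service_request_total{code!="200",code!="canceled",instance=~`${instance:regex}`}[10m]))
+```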

-### Repo Updater: Container monitoring (not available on server) +#### searcher: replica_traffic -#### repo-updater: container_missing +

Requests per second per replica over 10m

-

Container missing



+This graph is the average number of requests per second searcher is
+experiencing over the last 10 minutes broken down per replica.

-This value is the number of times a container has not been seen for more than one minute. If you observe this
-value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons.
+The code is the HTTP status code; 200 means success. The special code
+"canceled" is common for large search requests, where we find enough
+results before searching all possible repos.

-- **Kubernetes:**
-  - Determine if the pod was OOM killed using `kubectl describe pod repo-updater` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
-  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p repo-updater`.
-- **Docker Compose:**
-  - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' repo-updater` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the repo-updater container in `docker-compose.yml`.
-  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs repo-updater` (note this will include logs from the previous and currently running container).
+Note: A search query is translated into an unindexed search query per unique
+(repo, commit). This means a single user query may result in thousands of
+requests to searcher.

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#searcher-replica-traffic) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101300` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100001` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -18444,21 +18584,24 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -count by(name) ((time() - container_last_seen{name=~"^repo-updater.*"}) > 60) +sum by (instance) (rate(searcher_service_request_total{instance=~`${instance:regex}`}[10m])) ```
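+To spot hot replicas quickly, a `topk` sketch (illustrative, not a generated panel) keeps only the three busiest instances:
+
+```
+topk(3, sum by (instance) (rate(searcher_service_request_total{instance=~`${instance:regex}`}[10m])))
+```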

-#### repo-updater: container_cpu_usage +#### searcher: concurrent_requests -

Container cpu usage total (1m average) across all cores by instance

+

Amount of in-flight unindexed search requests (per instance)



+This graph is the number of in-flight unindexed search requests per instance.
+Consistently high numbers here indicate you may need to scale out searcher.

-Refer to the [alerts reference](alerts#repo-updater-container-cpu-usage) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101301` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100010` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -18466,21 +18609,21 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^repo-updater.*"} +sum by (instance) (searcher_service_running{instance=~`${instance:regex}`}) ```
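+A fleet-wide view (an illustrative sketch) sums the in-flight gauge across all replicas:
+
+```
+sum(searcher_service_running{instance=~`${instance:regex}`})
+```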

-#### repo-updater: container_memory_usage +#### searcher: unindexed_search_request_errors -

Container memory usage by instance

+

Unindexed search request errors every 5m by code

-Refer to the [alerts reference](alerts#repo-updater-container-memory-usage) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#searcher-unindexed-search-request-errors) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100011` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -18488,24 +18631,32 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^repo-updater.*"} +sum by (code)(increase(searcher_service_request_total{code!="200",code!="canceled",instance=~`${instance:regex}`}[5m])) / ignoring(code) group_left sum(increase(searcher_service_request_total{instance=~`${instance:regex}`}[5m])) * 100 ```
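+The `ignoring(code) group_left` clause divides each per-code numerator by the overall request total, yielding a percentage per code. An aggregate form without the per-code split (a sketch for ad-hoc use):
+
+```
+sum(increase(searcher_service_request_total{code!="200",code!="canceled",instance=~`${instance:regex}`}[5m])) / sum(increase(searcher_service_request_total{instance=~`${instance:regex}`}[5m])) * 100
+```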

-#### repo-updater: fs_io_operations +### Searcher: Cache store -

Filesystem reads and writes rate by instance over 1h

+#### searcher: store_fetching -This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +

Amount of in-flight unindexed search requests fetching code from gitserver (per instance)

+ +Before we can search a commit we fetch the code from gitserver then cache it +for future search requests. This graph is the current number of search +requests which are in the state of fetching code from gitserver. + +Generally this number should remain low since fetching code is fast, but +expect bursts. In the case of instances with a monorepo you would expect this +number to stay low for the duration of fetching the code (which in some cases +can take many minutes). This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101303` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100100` on your Sourcegraph instance. -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -18513,354 +18664,15 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^repo-updater.*"}[1h]) + rate(container_fs_writes_total{name=~"^repo-updater.*"}[1h])) +sum by (instance) (searcher_store_fetching{instance=~`${instance:regex}`}) ```
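+To find the single most loaded replica rather than the per-instance breakdown, a `max` variant (illustrative only):
+
+```
+max(searcher_store_fetching{instance=~`${instance:regex}`})
+```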

-### Repo Updater: Provisioning indicators (not available on server) +#### searcher: store_fetching_waiting -#### repo-updater: provisioning_container_cpu_usage_long_term - -

Container cpu usage total (90th percentile over 1d) across all cores by instance

- -Refer to the [alerts reference](alerts#repo-updater-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101400` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^repo-updater.*"}[1d]) -``` -
- -
- -#### repo-updater: provisioning_container_memory_usage_long_term - -

Container memory usage (1d maximum) by instance

- -Refer to the [alerts reference](alerts#repo-updater-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101401` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^repo-updater.*"}[1d]) -``` -
- -
- -#### repo-updater: provisioning_container_cpu_usage_short_term - -

Container cpu usage total (5m maximum) across all cores by instance

- -Refer to the [alerts reference](alerts#repo-updater-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101410` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^repo-updater.*"}[5m]) -``` -
- -
- -#### repo-updater: provisioning_container_memory_usage_short_term - -

Container memory usage (5m maximum) by instance

- -Refer to the [alerts reference](alerts#repo-updater-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101411` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^repo-updater.*"}[5m]) -``` -
- -
- -#### repo-updater: container_oomkill_events_total - -

Container OOMKILL events total by instance

- -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. - -Refer to the [alerts reference](alerts#repo-updater-container-oomkill-events-total) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101412` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -max by (name) (container_oom_events_total{name=~"^repo-updater.*"}) -``` -
- -
- -### Repo Updater: Golang runtime monitoring - -#### repo-updater: go_goroutines - -

Maximum active goroutines

- -A high value here indicates a possible goroutine leak. - -Refer to the [alerts reference](alerts#repo-updater-go-goroutines) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101500` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -max by(instance) (go_goroutines{job=~".*repo-updater"}) -``` -
- -
- -#### repo-updater: go_gc_duration_seconds - -

Maximum go garbage collection duration

- -Refer to the [alerts reference](alerts#repo-updater-go-gc-duration-seconds) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101501` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -max by(instance) (go_gc_duration_seconds{job=~".*repo-updater"}) -``` -
- -
- -### Repo Updater: Kubernetes monitoring (only available on Kubernetes) - -#### repo-updater: pods_available_percentage - -

Percentage pods available

- -Refer to the [alerts reference](alerts#repo-updater-pods-available-percentage) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel=101600` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).* - -
-Technical details - -Query: - -``` -sum by(app) (up{app=~".*repo-updater"}) / count by (app) (up{app=~".*repo-updater"}) * 100 -``` -
- -
- -## Searcher - -

Performs unindexed searches (diff and commit search, text search for unindexed branches).

- -To see this dashboard, visit `/-/debug/grafana/d/searcher/searcher` on your Sourcegraph instance. - -#### searcher: traffic - -

Requests per second by code over 10m

- -This graph is the average number of requests per second searcher is -experiencing over the last 10 minutes. - -The code is the HTTP Status code. 200 is success. We have a special code -"canceled" which is common when doing a large search request and we find -enough results before searching all possible repos. - -Note: A search query is translated into an unindexed search query per unique -(repo, commit). This means a single user query may result in thousands of -requests to searcher. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100000` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum by (code) (rate(searcher_service_request_total{instance=~`${instance:regex}`}[10m])) -``` -
- -
- -#### searcher: replica_traffic - -

Requests per second per replica over 10m

- -This graph is the average number of requests per second searcher is -experiencing over the last 10 minutes broken down per replica. - -The code is the HTTP Status code. 200 is success. We have a special code -"canceled" which is common when doing a large search request and we find -enough results before searching all possible repos. - -Note: A search query is translated into an unindexed search query per unique -(repo, commit). This means a single user query may result in thousands of -requests to searcher. - -Refer to the [alerts reference](alerts#searcher-replica-traffic) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100001` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum by (instance) (rate(searcher_service_request_total{instance=~`${instance:regex}`}[10m])) -``` -
- -
- -#### searcher: concurrent_requests - -

Amount of in-flight unindexed search requests (per instance)

- -This graph is the amount of in-flight unindexed search requests per instance. -Consistently high numbers here indicate you may need to scale out searcher. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100010` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum by (instance) (searcher_service_running{instance=~`${instance:regex}`}) -``` -
- -
- -#### searcher: unindexed_search_request_errors - -

Unindexed search request errors every 5m by code

- -Refer to the [alerts reference](alerts#searcher-unindexed-search-request-errors) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100011` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum by (code)(increase(searcher_service_request_total{code!="200",code!="canceled",instance=~`${instance:regex}`}[5m])) / ignoring(code) group_left sum(increase(searcher_service_request_total{instance=~`${instance:regex}`}[5m])) * 100 -``` -
- -
- -### Searcher: Cache store - -#### searcher: store_fetching - -

Amount of in-flight unindexed search requests fetching code from gitserver (per instance)

- -Before we can search a commit we fetch the code from gitserver then cache it -for future search requests. This graph is the current number of search -requests which are in the state of fetching code from gitserver. - -Generally this number should remain low since fetching code is fast, but -expect bursts. In the case of instances with a monorepo you would expect this -number to stay low for the duration of fetching the code (which in some cases -can take many minutes). - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100100` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum by (instance) (searcher_store_fetching{instance=~`${instance:regex}`}) -``` -
- -
- -#### searcher: store_fetching_waiting - -

Amount of in-flight unindexed search requests waiting to fetch code from gitserver (per instance)

+

Amount of in-flight unindexed search requests waiting to fetch code from gitserver (per instance)



We limit the number of requests which can fetch code to prevent overwhelming
gitserver. This gauge is the number of requests waiting to be allowed to speak
to gitserver.
@@ -18932,7 +18744,7 @@ indexing a subset of repositories. Otherwise every other state should occur
rarely.

For a full list of possible states see
-[recordHybridFinalState](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+f:cmd/searcher+recordHybridFinalState).
+[recordHybridFinalState](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph-public-snapshot%24+f:cmd/searcher+recordHybridFinalState).

This panel has no related alerts.

@@ -18946,554 +18758,24 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100200`

Query:

```
-sum by (state)(increase(searcher_hybrid_final_state_total{instance=~`${instance:regex}`}[10m]))
```
- -#### searcher: searcher_hybrid_retry_total - -

Hybrid search retrying over 10m

- -Expectation is that this graph should mostly be 0. It will trigger if a user -manages to do a search and the underlying index changes while searching or -Zoekt goes down. So occasional bursts can be expected, but if this graph is -regularly above 0 it is a sign for further investigation. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100201` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum by (reason)(increase(searcher_hybrid_retry_total{instance=~`${instance:regex}`}[10m])) -``` -
- -
- -### Searcher: Cache disk I/O metrics - -#### searcher: cache_disk_reads_sec - -

Read request rate over 1m (per instance)

- -The number of read requests that were issued to the device per second. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100300` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -#### searcher: cache_disk_writes_sec - -

Write request rate over 1m (per instance)

- -The number of write requests that were issued to the device per second. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100301` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -#### searcher: cache_disk_read_throughput - -

Read throughput over 1m (per instance)

- -The amount of data that was read from the device per second. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100310` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -#### searcher: cache_disk_write_throughput - -

Write throughput over 1m (per instance)

- -The amount of data that was written to the device per second. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100311` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -#### searcher: cache_disk_read_duration - -

Average read duration over 1m (per instance)

- -The average time for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100320` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) -``` -
- -
- -#### searcher: cache_disk_write_duration - -

Average write duration over 1m (per instance)

- -The average time for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100321` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_write_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) -``` -
- -
- -#### searcher: cache_disk_read_request_size - -

Average read request size over 1m (per instance)

- -The average size of read requests that were issued to the device. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100330` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) -``` -
- -
- -#### searcher: cache_disk_write_request_size) - -

Average write request size over 1m (per instance)

- -The average size of write requests that were issued to the device. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100331` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) -``` -
- -
- -#### searcher: cache_disk_reads_merged_sec - -

Merged read request rate over 1m (per instance)

- -The number of read requests merged per second that were queued to the device. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100340` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_merged_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -#### searcher: cache_disk_writes_merged_sec - -

Merged writes request rate over 1m (per instance)

- -The number of write requests merged per second that were queued to the device. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100341` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_merged_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -#### searcher: cache_disk_average_queue_size - -

Average queue size over 1m (per instance)

- -The number of I/O operations that were being queued or being serviced. See https://blog.actorsfit.com/a?ID=00200-428fa2ac-e338-4540-848c-af9a3eb1ebd2 for background (avgqu-sz). - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100350` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_io_time_weighted_seconds_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -### Searcher: Searcher GRPC server metrics - -#### searcher: searcher_grpc_request_rate_all_methods - -

Request rate across all methods over 2m

- -The number of gRPC requests received per second across all methods, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100400` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum(rate(grpc_server_started_total{instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) -``` -
- -
- -#### searcher: searcher_grpc_request_rate_per_method - -

Request rate per-method over 2m

- -The number of gRPC requests received per second broken out per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100401` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum(rate(grpc_server_started_total{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method) -``` -
- -
- -#### searcher: searcher_error_percentage_all_methods - -

Error percentage across all methods over 2m

- -The percentage of gRPC requests that fail across all methods, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100410` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) )) -``` -
- -
- -#### searcher: searcher_grpc_error_percentage_per_method - -

Error percentage per-method over 2m

- -The percentage of gRPC requests that fail per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100411` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${searcher_method:regex}`,grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method)) )) -``` -
- -
- -#### searcher: searcher_p99_response_time_per_method - -

99th percentile response time per method over 2m

- -The 99th percentile response time per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100420` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) -``` -
- -
- -#### searcher: searcher_p90_response_time_per_method - -

90th percentile response time per method over 2m

- -The 90th percentile response time per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100421` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) -``` -
- -
- -#### searcher: searcher_p75_response_time_per_method - -

75th percentile response time per method over 2m

- -The 75th percentile response time per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100422` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) -``` -
- -
- -#### searcher: searcher_p99_9_response_size_per_method - -

99.9th percentile total response size per method over 2m

- -The 99.9th percentile total per-RPC response size per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100430` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) -``` -
- -
- -#### searcher: searcher_p90_response_size_per_method - -

90th percentile total response size per method over 2m

- -The 90th percentile total per-RPC response size per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100431` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) +sum by (state)(increase(searcher_hybrid_final_state_total{instance=~`${instance:regex}`}[10m])) ```

-#### searcher: searcher_p75_response_size_per_method +#### searcher: searcher_hybrid_retry_total -

75th percentile total response size per method over 2m

+

Hybrid search retrying over 10m



-The 75th percentile total per-RPC response size per method, aggregated across all instances.
+The expectation is that this graph should mostly be 0. It will trigger if a user
+manages to do a search and the underlying index changes while searching or
+Zoekt goes down. So occasional bursts can be expected, but if this graph is
+regularly above 0 it is a sign that further investigation is needed.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100432` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100201` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -19503,21 +18785,25 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100432`

Query:

```
-histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])))
+sum by (reason)(increase(searcher_hybrid_retry_total{instance=~`${instance:regex}`}[10m]))
```
-#### searcher: searcher_p99_9_invididual_sent_message_size_per_method +### Searcher: Cache disk I/O metrics -

99.9th percentile individual sent message size per method over 2m

+#### searcher: cache_disk_reads_sec -The 99.9th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +

Read request rate over 1m (per instance)

+ +The number of read requests that were issued to the device per second. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100440` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100300` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19527,21 +18813,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100440` Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) +(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### searcher: searcher_p90_invididual_sent_message_size_per_method +#### searcher: cache_disk_writes_sec -

90th percentile individual sent message size per method over 2m

+

Write request rate over 1m (per instance)

-The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The number of write requests that were issued to the device per second. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100441` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100301` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19551,21 +18839,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100441` Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) +(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### searcher: searcher_p75_invididual_sent_message_size_per_method +#### searcher: cache_disk_read_throughput -

75th percentile individual sent message size per method over 2m

+

Read throughput over 1m (per instance)

-The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The amount of data that was read from the device per second. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100442` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100310` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19575,21 +18865,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100442` Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) +(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### searcher: searcher_grpc_response_stream_message_count_per_method +#### searcher: cache_disk_write_throughput -

Average streaming response message count per-method over 2m

+

Write throughput over 1m (per instance)

-The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. +The amount of data that was written to the device per second. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100450` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100311` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19599,21 +18891,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100450` Query: ``` -((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method))) +(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### searcher: searcher_grpc_all_codes_per_method +#### searcher: cache_disk_read_duration -

Response codes rate per-method over 2m

+

Average read duration over 1m (per instance)

-The rate of all generated gRPC response codes per method, aggregated across all instances. +The average time for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100460` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100320` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19623,23 +18917,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100460` Query: ``` -sum(rate(grpc_server_handled_total{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method, grpc_code) +(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
-### Searcher: Searcher GRPC "internal error" metrics +#### searcher: cache_disk_write_duration -#### searcher: searcher_grpc_clients_error_percentage_all_methods +

Average write duration over 1m (per instance)

-

Client baseline error percentage across all methods over 2m

+The average time for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. -The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "searcher" clients. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100321` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19649,21 +18943,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100500` Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService"}[2m]))))))) +(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_write_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
-#### searcher: searcher_grpc_clients_error_percentage_per_method +#### searcher: cache_disk_read_request_size -

Client baseline error percentage per-method over 2m

+

Average read request size over 1m (per instance)

-The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "searcher" clients. +The average size of read requests that were issued to the device. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100330` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19673,21 +18969,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100501` Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method)))))) +(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
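
The request-size panels apply the same ratio pattern with byte counters in the numerator: bytes transferred divided by operations completed gives average bytes per request. A simplified sketch, again assuming a hypothetical `sda` device and a 5m window:

```
rate(node_disk_read_bytes_total{device="sda"}[5m])
/
rate(node_disk_reads_completed_total{device="sda"}[5m])
```
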
-#### searcher: searcher_grpc_clients_all_codes_per_method
+#### searcher: cache_disk_write_request_size
-

Client baseline response codes rate per-method over 2m

+

Average write request size over 1m (per instance)

-The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "searcher" clients. +The average size of write requests that were issued to the device. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100502` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100331` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19697,27 +18995,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100502` Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method, grpc_code)) +(((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
-#### searcher: searcher_grpc_clients_internal_error_percentage_all_methods - -

Client-observed gRPC internal error percentage across all methods over 2m

- -The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "searcher" clients. +#### searcher: cache_disk_reads_merged_sec -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "searcher" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +

Merged read request rate over 1m (per instance)

-When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +The number of read requests merged per second that were queued to the device. -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100340` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19727,27 +19021,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100510` Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService"}[2m]))))))) +(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_merged_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### searcher: searcher_grpc_clients_internal_error_percentage_per_method - -

Client-observed gRPC internal error percentage per-method over 2m

- -The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "searcher" clients. +#### searcher: cache_disk_writes_merged_sec -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "searcher" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +

Merged write request rate over 1m (per instance)

-When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +The number of write requests merged per second that were queued to the device. -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100341` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19757,27 +19047,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100511` Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method)))))) +(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_merged_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### searcher: searcher_grpc_clients_internal_error_all_codes_per_method - -

Client-observed gRPC internal error response code rate per-method over 2m

- -The rate of gRPC internal-error response codes per method, aggregated across all "searcher" clients. +#### searcher: cache_disk_average_queue_size -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "searcher" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +

Average queue size over 1m (per instance)

-When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +The number of I/O operations that were being queued or being serviced. See https://blog.actorsfit.com/a?ID=00200-428fa2ac-e338-4540-848c-af9a3eb1ebd2 for background (avgqu-sz). -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), searcher could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device searcher is using, not the load searcher is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100350` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19787,23 +19073,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100512` Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",is_internal_error="true",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method, grpc_code)) +(max by (instance) (searcher_mount_point_info{mount_name="cacheDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_io_time_weighted_seconds_total{instance=~`node-exporter.*`}[1m]))))) ```
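
Since `node_disk_io_time_weighted_seconds_total` accumulates queue-depth-weighted seconds, its per-second rate approximates the average number of requests queued or in service (the iostat `avgqu-sz` figure), which is what the panel query computes after the mount-point join. A standalone sketch, assuming a hypothetical `sda` device and a 5m window:

```
rate(node_disk_io_time_weighted_seconds_total{device="sda"}[5m])
```
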
-### Searcher: Searcher GRPC retry metrics +### Searcher: Searcher GRPC server metrics -#### searcher: searcher_grpc_clients_retry_percentage_across_all_methods +#### searcher: searcher_grpc_request_rate_all_methods -

Client retry percentage across all methods over 2m

+

Request rate across all methods over 2m

-The percentage of gRPC requests that were retried across all methods, aggregated across all "searcher" clients. +The number of gRPC requests received per second across all methods, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100400` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19813,21 +19099,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100600` Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService"}[2m]))))))) +sum(rate(grpc_server_started_total{instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) ```
-#### searcher: searcher_grpc_clients_retry_percentage_per_method +#### searcher: searcher_grpc_request_rate_per_method -

Client retry percentage per-method over 2m

+

Request rate per-method over 2m

-The percentage of gRPC requests that were retried aggregated across all "searcher" clients, broken out per method. +The number of gRPC requests received per second broken out per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100401` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19837,21 +19123,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100601` Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",is_retried="true",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method)))))) +sum(rate(grpc_server_started_total{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method) ```
-#### searcher: searcher_grpc_clients_retry_count_per_method +#### searcher: searcher_error_percentage_all_methods -

Client retry count per-method over 2m

+

Error percentage across all methods over 2m

-The count of gRPC requests that were retried aggregated across all "searcher" clients, broken out per method +The percentage of gRPC requests that fail across all methods, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100602` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100410` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -19861,25 +19147,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100602` Query: ``` -(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}",is_retried="true"}[2m])) by (grpc_method)) +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) )) ```
-### Searcher: Site configuration client update latency - -#### searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance +#### searcher: searcher_grpc_error_percentage_per_method -

Duration since last successful site configuration update (by instance)

+

Error percentage per-method over 2m

-The duration since the configuration client used by the "searcher" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration. +The percentage of gRPC requests that fail per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100411` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -19887,21 +19171,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100700` Query: ``` -src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`,instance=~`${instance:regex}`} +(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${searcher_method:regex}`,grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method)) )) ```
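
When this percentage climbs, it can help to see which response codes are driving it. A sketch along the lines of the dedicated response-codes panel below, with the dashboard template variables dropped and a 5m window assumed:

```
sum by (grpc_code) (
  rate(grpc_server_handled_total{grpc_service="searcher.v1.SearcherService",grpc_code!="OK"}[5m])
)
```
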

-#### searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance +#### searcher: searcher_p99_response_time_per_method -

Maximum duration since last successful site configuration update (all "searcher" instances)

+

99th percentile response time per method over 2m

-Refer to the [alerts reference](alerts#searcher-searcher-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel. +The 99th percentile response time per method, aggregated across all instances. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100701` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100420` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -19909,23 +19195,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100701` Query: ``` -max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`,instance=~`${instance:regex}`}[1m])) +histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```
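
The same histogram buckets support any quantile via `histogram_quantile`. For example, a sketch of the median across all methods rather than per method, assuming a 5m window:

```
histogram_quantile(0.50, sum by (le) (
  rate(grpc_server_handling_seconds_bucket{grpc_service="searcher.v1.SearcherService"}[5m])
))
```
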

-### Searcher: Database connections +#### searcher: searcher_p90_response_time_per_method -#### searcher: max_open_conns +

90th percentile response time per method over 2m

-

Maximum open

+The 90th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100421` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -19933,21 +19219,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100800` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="searcher"}) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-#### searcher: open_conns +#### searcher: searcher_p75_response_time_per_method -

Established

+

75th percentile response time per method over 2m

+ +The 75th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100422` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -19955,21 +19243,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100801` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_open{app_name="searcher"}) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-#### searcher: in_use +#### searcher: searcher_p99_9_response_size_per_method -

Used

+

99.9th percentile total response size per method over 2m

+ +The 99.9th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100810` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100430` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -19977,21 +19267,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100810` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="searcher"}) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-#### searcher: idle +#### searcher: searcher_p90_response_size_per_method -

Idle

+

90th percentile total response size per method over 2m

+ +The 90th percentile total per-RPC response size per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100431` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -19999,21 +19291,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100811` Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="searcher"}) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-#### searcher: mean_blocked_seconds_per_conn_request +#### searcher: searcher_p75_response_size_per_method -

Mean blocked seconds per conn request

+

75th percentile total response size per method over 2m

-Refer to the [alerts reference](alerts#searcher-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. +The 75th percentile total per-RPC response size per method, aggregated across all instances. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100820` on your Sourcegraph instance. +This panel has no related alerts. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100432` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -20021,21 +19315,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100820` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="searcher"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="searcher"}[5m])) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-#### searcher: closed_max_idle +#### searcher: searcher_p99_9_invididual_sent_message_size_per_method -

Closed by SetMaxIdleConns

+

99.9th percentile individual sent message size per method over 2m

+The 99.9th percentile size of every individual protocol buffer sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100830` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100440` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -20043,21 +19339,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100830` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="searcher"}[5m])) +histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-#### searcher: closed_max_lifetime +#### searcher: searcher_p90_invididual_sent_message_size_per_method -

Closed by SetConnMaxLifetime

+

90th percentile individual sent message size per method over 2m

+The 90th percentile size of every individual protocol buffer sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100831` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100441` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -20065,21 +19363,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100831` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="searcher"}[5m])) +histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-#### searcher: closed_max_idle_time +#### searcher: searcher_p75_invididual_sent_message_size_per_method -

Closed by SetConnMaxIdleTime

+

75th percentile individual sent message size per method over 2m

+The 75th percentile size of every individual protocol buffer sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100832` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100442` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -20087,31 +19387,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100832` Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="searcher"}[5m])) +histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m]))) ```

-### Searcher: Container monitoring (not available on server) - -#### searcher: container_missing - -

Container missing

+#### searcher: searcher_grpc_response_stream_message_count_per_method -This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. +

Average streaming response message count per-method over 2m

-- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod searcher` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p searcher`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' searcher` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the searcher container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs searcher` (note this will include logs from the previous and currently running container). +The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100450` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20121,19 +19411,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100900` Query: ``` -count by(name) ((time() - container_last_seen{name=~"^searcher.*"}) > 60) +((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method))) ```
-#### searcher: container_cpu_usage +#### searcher: searcher_grpc_all_codes_per_method -

Container cpu usage total (1m average) across all cores by instance

+

Response codes rate per-method over 2m

-Refer to the [alerts reference](alerts#searcher-container-cpu-usage) for 1 alert related to this panel. +The rate of all generated gRPC response codes per method, aggregated across all instances. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100901` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100460` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20143,19 +19435,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100901` Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"} +sum(rate(grpc_server_handled_total{grpc_method=~`${searcher_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"searcher.v1.SearcherService"}[2m])) by (grpc_method, grpc_code) ```
-#### searcher: container_memory_usage +### Searcher: Searcher GRPC "internal error" metrics -

Container memory usage by instance

+#### searcher: searcher_grpc_clients_error_percentage_all_methods -Refer to the [alerts reference](alerts#searcher-container-memory-usage) for 1 alert related to this panel. +

Client baseline error percentage across all methods over 2m

-To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100902` on your Sourcegraph instance. +The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "searcher" clients. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100500` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20165,22 +19461,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100902` Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"} +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService"}[2m]))))))) ```
-#### searcher: fs_io_operations +#### searcher: searcher_grpc_clients_error_percentage_per_method -

Filesystem reads and writes rate by instance over 1h

+

Client baseline error percentage per-method over 2m

-This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "searcher" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100903` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100501` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20190,21 +19485,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100903` Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^searcher.*"}[1h]) + rate(container_fs_writes_total{name=~"^searcher.*"}[1h])) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method)))))) ```
-### Searcher: Provisioning indicators (not available on server) +#### searcher: searcher_grpc_clients_all_codes_per_method -#### searcher: provisioning_container_cpu_usage_long_term +

Client baseline response codes rate per-method over 2m

-

Container cpu usage total (90th percentile over 1d) across all cores by instance

+The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "searcher" clients. -Refer to the [alerts reference](alerts#searcher-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100502` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20214,41 +19509,27 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101000` Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[1d]) +(sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```
-#### searcher: provisioning_container_memory_usage_long_term - -

Container memory usage (1d maximum) by instance

- -Refer to the [alerts reference](alerts#searcher-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101001` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details +#### searcher: searcher_grpc_clients_internal_error_percentage_all_methods -Query: +

Client-observed gRPC internal error percentage across all methods over 2m

-``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[1d]) -``` -
+The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "searcher" clients. -
+**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "searcher" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. -#### searcher: provisioning_container_cpu_usage_short_term +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. -

Container cpu usage total (5m maximum) across all cores by instance

+**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. -Refer to the [alerts reference](alerts#searcher-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100510` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20258,19 +19539,27 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101010` Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[5m]) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService"}[2m]))))))) ```
-#### searcher: provisioning_container_memory_usage_short_term +#### searcher: searcher_grpc_clients_internal_error_percentage_per_method -

Container memory usage (5m maximum) by instance

+

Client-observed gRPC internal error percentage per-method over 2m

-Refer to the [alerts reference](alerts#searcher-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "searcher" clients. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101011` on your Sourcegraph instance. +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "searcher" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. + +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. + +**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100511` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20280,22 +19569,27 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101011` Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[5m]) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method)))))) ```
-#### searcher: container_oomkill_events_total +#### searcher: searcher_grpc_clients_internal_error_all_codes_per_method -

Container OOMKILL events total by instance

+

Client-observed gRPC internal error response code rate per-method over 2m

-This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +The rate of gRPC internal-error response codes per method, aggregated across all "searcher" clients. -Refer to the [alerts reference](alerts#searcher-container-oomkill-events-total) for 1 alert related to this panel. +**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "searcher" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101012` on your Sourcegraph instance. +When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. + +**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100512` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20305,23 +19599,23 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101012` Query: ``` -max by (name) (container_oom_events_total{name=~"^searcher.*"}) +(sum(rate(src_grpc_method_status{grpc_service=~"searcher.v1.SearcherService",is_internal_error="true",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```
-### Searcher: Golang runtime monitoring +### Searcher: Searcher GRPC retry metrics -#### searcher: go_goroutines +#### searcher: searcher_grpc_clients_retry_percentage_across_all_methods -

Maximum active goroutines

+

Client retry percentage across all methods over 2m

-A high value here indicates a possible goroutine leak. +The percentage of gRPC requests that were retried across all methods, aggregated across all "searcher" clients. -Refer to the [alerts reference](alerts#searcher-go-goroutines) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100600` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20331,19 +19625,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101100` Query: ``` -max by(instance) (go_goroutines{job=~".*searcher"}) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService"}[2m]))))))) ```
-#### searcher: go_gc_duration_seconds +#### searcher: searcher_grpc_clients_retry_percentage_per_method -

Maximum go garbage collection duration

+

Client retry percentage per-method over 2m

-Refer to the [alerts reference](alerts#searcher-go-gc-duration-seconds) for 1 alert related to this panel. +The percentage of gRPC requests that were retried aggregated across all "searcher" clients, broken out per method. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101101` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100601` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20353,21 +19649,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101101` Query: ``` -max by(instance) (go_gc_duration_seconds{job=~".*searcher"}) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",is_retried="true",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}"}[2m])) by (grpc_method)))))) ```
-### Searcher: Kubernetes monitoring (only available on Kubernetes) +#### searcher: searcher_grpc_clients_retry_count_per_method -#### searcher: pods_available_percentage +

Client retry count per-method over 2m

-

Percentage pods available

+The count of gRPC requests that were retried aggregated across all "searcher" clients, broken out per method -Refer to the [alerts reference](alerts#searcher-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100602` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -20377,27 +19673,21 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101200` Query: ``` -sum by(app) (up{app=~".*searcher"}) / count by (app) (up{app=~".*searcher"}) * 100 +(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"searcher.v1.SearcherService",grpc_method=~"${searcher_method:regex}",is_retried="true"}[2m])) by (grpc_method)) ```
-## Symbols - -

Handles symbol searches for unindexed branches.

+### Searcher: Codeintel: Symbols API -To see this dashboard, visit `/-/debug/grafana/d/symbols/symbols` on your Sourcegraph instance. - -### Symbols: Codeintel: Symbols API - -#### symbols: codeintel_symbols_api_total +#### searcher: codeintel_symbols_api_total

Aggregate API operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100700` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20407,19 +19697,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100000` o Query: ``` -sum(increase(src_codeintel_symbols_api_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_api_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_api_99th_percentile_duration +#### searcher: codeintel_symbols_api_99th_percentile_duration

Aggregate successful API operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100701` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20429,19 +19719,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100001` o Query: ``` -sum by (le)(rate(src_codeintel_symbols_api_duration_seconds_bucket{job=~"^symbols.*"}[5m])) +sum by (le)(rate(src_codeintel_symbols_api_duration_seconds_bucket{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_api_errors_total +#### searcher: codeintel_symbols_api_errors_total

Aggregate API operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100002` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100702` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20451,19 +19741,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100002` o Query: ``` -sum(increase(src_codeintel_symbols_api_errors_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_api_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_api_error_rate +#### searcher: codeintel_symbols_api_error_rate

Aggregate API operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100003` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100703` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20473,19 +19763,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100003` o Query: ``` -sum(increase(src_codeintel_symbols_api_errors_total{job=~"^symbols.*"}[5m])) / (sum(increase(src_codeintel_symbols_api_total{job=~"^symbols.*"}[5m])) + sum(increase(src_codeintel_symbols_api_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum(increase(src_codeintel_symbols_api_errors_total{job=~"^searcher.*"}[5m])) / (sum(increase(src_codeintel_symbols_api_total{job=~"^searcher.*"}[5m])) + sum(increase(src_codeintel_symbols_api_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-#### symbols: codeintel_symbols_api_total +#### searcher: codeintel_symbols_api_total

API operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100710` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20495,19 +19785,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100010` o Query: ``` -sum by (op,parseAmount)(increase(src_codeintel_symbols_api_total{job=~"^symbols.*"}[5m])) +sum by (op,parseAmount)(increase(src_codeintel_symbols_api_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_api_99th_percentile_duration +#### searcher: codeintel_symbols_api_99th_percentile_duration

99th percentile successful API operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100711` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20517,19 +19807,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100011` o Query: ``` -histogram_quantile(0.99, sum by (le,op,parseAmount)(rate(src_codeintel_symbols_api_duration_seconds_bucket{job=~"^symbols.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op,parseAmount)(rate(src_codeintel_symbols_api_duration_seconds_bucket{job=~"^searcher.*"}[5m]))) ```
-#### symbols: codeintel_symbols_api_errors_total +#### searcher: codeintel_symbols_api_errors_total

API operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100012` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100712` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20539,19 +19829,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100012` o Query: ``` -sum by (op,parseAmount)(increase(src_codeintel_symbols_api_errors_total{job=~"^symbols.*"}[5m])) +sum by (op,parseAmount)(increase(src_codeintel_symbols_api_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_api_error_rate +#### searcher: codeintel_symbols_api_error_rate

API operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100013` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100713` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20561,21 +19851,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100013` o Query: ``` -sum by (op,parseAmount)(increase(src_codeintel_symbols_api_errors_total{job=~"^symbols.*"}[5m])) / (sum by (op,parseAmount)(increase(src_codeintel_symbols_api_total{job=~"^symbols.*"}[5m])) + sum by (op,parseAmount)(increase(src_codeintel_symbols_api_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum by (op,parseAmount)(increase(src_codeintel_symbols_api_errors_total{job=~"^searcher.*"}[5m])) / (sum by (op,parseAmount)(increase(src_codeintel_symbols_api_total{job=~"^searcher.*"}[5m])) + sum by (op,parseAmount)(increase(src_codeintel_symbols_api_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-### Symbols: Codeintel: Symbols parser +### Searcher: Codeintel: Symbols parser -#### symbols: symbols +#### searcher: searcher

In-flight parse jobs

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100800` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20585,19 +19875,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100100` o Query: ``` -max(src_codeintel_symbols_parsing{job=~"^symbols.*"}) +max(src_codeintel_symbols_parsing{job=~"^searcher.*"}) ```
-#### symbols: symbols +#### searcher: searcher

Parser queue size

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100801` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20607,19 +19897,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100101` o Query: ``` -max(src_codeintel_symbols_parse_queue_size{job=~"^symbols.*"}) +max(src_codeintel_symbols_parse_queue_size{job=~"^searcher.*"}) ```
-#### symbols: symbols +#### searcher: searcher

Parse queue timeouts

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100802` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20629,19 +19919,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100102` o Query: ``` -max(src_codeintel_symbols_parse_queue_timeouts_total{job=~"^symbols.*"}) +max(src_codeintel_symbols_parse_queue_timeouts_total{job=~"^searcher.*"}) ```
-#### symbols: symbols +#### searcher: searcher

Parse failures every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100103` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100803` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20651,19 +19941,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100103` o Query: ``` -rate(src_codeintel_symbols_parse_failed_total{job=~"^symbols.*"}[5m]) +rate(src_codeintel_symbols_parse_failed_total{job=~"^searcher.*"}[5m]) ```
-#### symbols: codeintel_symbols_parser_total +#### searcher: codeintel_symbols_parser_total

Aggregate parser operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100810` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20673,19 +19963,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100110` o Query: ``` -sum(increase(src_codeintel_symbols_parser_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_parser_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_parser_99th_percentile_duration +#### searcher: codeintel_symbols_parser_99th_percentile_duration

Aggregate successful parser operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100811` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20695,19 +19985,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100111` o Query: ``` -sum by (le)(rate(src_codeintel_symbols_parser_duration_seconds_bucket{job=~"^symbols.*"}[5m])) +sum by (le)(rate(src_codeintel_symbols_parser_duration_seconds_bucket{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_parser_errors_total +#### searcher: codeintel_symbols_parser_errors_total

Aggregate parser operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100812` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20717,19 +20007,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100112` o Query: ``` -sum(increase(src_codeintel_symbols_parser_errors_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_parser_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_parser_error_rate +#### searcher: codeintel_symbols_parser_error_rate

Aggregate parser operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100113` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100813` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20739,19 +20029,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100113` o Query: ``` -sum(increase(src_codeintel_symbols_parser_errors_total{job=~"^symbols.*"}[5m])) / (sum(increase(src_codeintel_symbols_parser_total{job=~"^symbols.*"}[5m])) + sum(increase(src_codeintel_symbols_parser_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum(increase(src_codeintel_symbols_parser_errors_total{job=~"^searcher.*"}[5m])) / (sum(increase(src_codeintel_symbols_parser_total{job=~"^searcher.*"}[5m])) + sum(increase(src_codeintel_symbols_parser_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-#### symbols: codeintel_symbols_parser_total +#### searcher: codeintel_symbols_parser_total

Parser operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100120` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100820` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20761,19 +20051,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100120` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_parser_total{job=~"^symbols.*"}[5m])) +sum by (op)(increase(src_codeintel_symbols_parser_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_parser_99th_percentile_duration +#### searcher: codeintel_symbols_parser_99th_percentile_duration

99th percentile successful parser operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100121` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100821` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20783,19 +20073,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100121` o Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_symbols_parser_duration_seconds_bucket{job=~"^symbols.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_symbols_parser_duration_seconds_bucket{job=~"^searcher.*"}[5m]))) ```
-#### symbols: codeintel_symbols_parser_errors_total +#### searcher: codeintel_symbols_parser_errors_total

Parser operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100122` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100822` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20805,19 +20095,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100122` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_parser_errors_total{job=~"^symbols.*"}[5m])) +sum by (op)(increase(src_codeintel_symbols_parser_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_parser_error_rate +#### searcher: codeintel_symbols_parser_error_rate

Parser operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100123` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100823` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20827,15 +20117,15 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100123` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_parser_errors_total{job=~"^symbols.*"}[5m])) / (sum by (op)(increase(src_codeintel_symbols_parser_total{job=~"^symbols.*"}[5m])) + sum by (op)(increase(src_codeintel_symbols_parser_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_symbols_parser_errors_total{job=~"^searcher.*"}[5m])) / (sum by (op)(increase(src_codeintel_symbols_parser_total{job=~"^searcher.*"}[5m])) + sum by (op)(increase(src_codeintel_symbols_parser_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-### Symbols: Codeintel: Symbols cache janitor +### Searcher: Codeintel: Symbols cache janitor -#### symbols: symbols +#### searcher: searcher

Size in bytes of the on-disk cache

@@ -20843,7 +20133,7 @@ no This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100900` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20853,13 +20143,13 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100200` o Query: ``` -src_codeintel_symbols_store_cache_size_bytes +src_diskcache_store_symbols_cache_size_bytes ```
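This hunk also renames the janitor metrics from the `src_codeintel_symbols_store_*` prefix to `src_diskcache_store_symbols_*`. A query that must span versions on both sides of the rename can combine the two series with `or`; this is a sketch, not part of the generated dashboard:

```
# Whichever cache-size series the running version exports.
src_diskcache_store_symbols_cache_size_bytes or src_codeintel_symbols_store_cache_size_bytes
```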
-#### symbols: symbols +#### searcher: searcher

Cache eviction operations every 5m

@@ -20867,7 +20157,7 @@ no This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100901` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20877,13 +20167,13 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100201` o Query: ``` -rate(src_codeintel_symbols_store_evictions_total[5m]) +rate(src_diskcache_store_symbols_evictions_total[5m]) ```
-#### symbols: symbols +#### searcher: searcher

Cache eviction operation errors every 5m

@@ -20891,7 +20181,7 @@ no This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100902` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20901,21 +20191,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100202` o Query: ``` -rate(src_codeintel_symbols_store_errors_total[5m]) +rate(src_diskcache_store_symbols_errors_total[5m]) ```
-### Symbols: Codeintel: Symbols repository fetcher +### Searcher: Codeintel: Symbols repository fetcher -#### symbols: symbols +#### searcher: searcher

In-flight repository fetch operations

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20931,13 +20221,13 @@ src_codeintel_symbols_fetching
-#### symbols: symbols +#### searcher: searcher

Repository fetch queue size

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101001` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20947,19 +20237,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100301` o Query: ``` -max(src_codeintel_symbols_fetch_queue_size{job=~"^symbols.*"}) +max(src_codeintel_symbols_fetch_queue_size{job=~"^searcher.*"}) ```
-#### symbols: codeintel_symbols_repository_fetcher_total +#### searcher: codeintel_symbols_repository_fetcher_total

Aggregate fetcher operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101010` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20969,19 +20259,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100310` o Query: ``` -sum(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_repository_fetcher_99th_percentile_duration +#### searcher: codeintel_symbols_repository_fetcher_99th_percentile_duration

Aggregate successful fetcher operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101011` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -20991,19 +20281,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100311` o Query: ``` -sum by (le)(rate(src_codeintel_symbols_repository_fetcher_duration_seconds_bucket{job=~"^symbols.*"}[5m])) +sum by (le)(rate(src_codeintel_symbols_repository_fetcher_duration_seconds_bucket{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_repository_fetcher_errors_total +#### searcher: codeintel_symbols_repository_fetcher_errors_total

Aggregate fetcher operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101012` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21013,19 +20303,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100312` o Query: ``` -sum(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_repository_fetcher_error_rate +#### searcher: codeintel_symbols_repository_fetcher_error_rate

Aggregate fetcher operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100313` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101013` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21035,19 +20325,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100313` o Query: ``` -sum(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^symbols.*"}[5m])) / (sum(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^symbols.*"}[5m])) + sum(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^searcher.*"}[5m])) / (sum(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^searcher.*"}[5m])) + sum(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-#### symbols: codeintel_symbols_repository_fetcher_total +#### searcher: codeintel_symbols_repository_fetcher_total

Fetcher operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100320` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101020` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21057,19 +20347,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100320` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^symbols.*"}[5m])) +sum by (op)(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_repository_fetcher_99th_percentile_duration +#### searcher: codeintel_symbols_repository_fetcher_99th_percentile_duration

99th percentile successful fetcher operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100321` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101021` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21079,19 +20369,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100321` o Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_symbols_repository_fetcher_duration_seconds_bucket{job=~"^symbols.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_symbols_repository_fetcher_duration_seconds_bucket{job=~"^searcher.*"}[5m]))) ```
-#### symbols: codeintel_symbols_repository_fetcher_errors_total +#### searcher: codeintel_symbols_repository_fetcher_errors_total

Fetcher operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100322` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101022` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21101,19 +20391,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100322` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^symbols.*"}[5m])) +sum by (op)(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_repository_fetcher_error_rate +#### searcher: codeintel_symbols_repository_fetcher_error_rate

Fetcher operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100323` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101023` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21123,21 +20413,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100323` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^symbols.*"}[5m])) / (sum by (op)(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^symbols.*"}[5m])) + sum by (op)(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^searcher.*"}[5m])) / (sum by (op)(increase(src_codeintel_symbols_repository_fetcher_total{job=~"^searcher.*"}[5m])) + sum by (op)(increase(src_codeintel_symbols_repository_fetcher_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-### Symbols: Codeintel: Symbols gitserver client +### Searcher: Codeintel: Symbols gitserver client -#### symbols: codeintel_symbols_gitserver_total +#### searcher: codeintel_symbols_gitserver_total

Aggregate gitserver client operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101100` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21147,19 +20437,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100400` o Query: ``` -sum(increase(src_codeintel_symbols_gitserver_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_gitserver_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_gitserver_99th_percentile_duration +#### searcher: codeintel_symbols_gitserver_99th_percentile_duration

Aggregate successful gitserver client operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101101` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21169,19 +20459,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100401` o Query: ``` -sum by (le)(rate(src_codeintel_symbols_gitserver_duration_seconds_bucket{job=~"^symbols.*"}[5m])) +sum by (le)(rate(src_codeintel_symbols_gitserver_duration_seconds_bucket{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_gitserver_errors_total +#### searcher: codeintel_symbols_gitserver_errors_total

Aggregate gitserver client operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101102` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21191,19 +20481,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100402` o Query: ``` -sum(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^symbols.*"}[5m])) +sum(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_gitserver_error_rate +#### searcher: codeintel_symbols_gitserver_error_rate

Aggregate gitserver client operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100403` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101103` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21213,19 +20503,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100403` o Query: ``` -sum(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^symbols.*"}[5m])) / (sum(increase(src_codeintel_symbols_gitserver_total{job=~"^symbols.*"}[5m])) + sum(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^searcher.*"}[5m])) / (sum(increase(src_codeintel_symbols_gitserver_total{job=~"^searcher.*"}[5m])) + sum(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-#### symbols: codeintel_symbols_gitserver_total +#### searcher: codeintel_symbols_gitserver_total

Gitserver client operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100410` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101110` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21235,19 +20525,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100410` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_gitserver_total{job=~"^symbols.*"}[5m])) +sum by (op)(increase(src_codeintel_symbols_gitserver_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_gitserver_99th_percentile_duration +#### searcher: codeintel_symbols_gitserver_99th_percentile_duration

99th percentile successful gitserver client operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100411` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101111` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21257,19 +20547,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100411` o Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_symbols_gitserver_duration_seconds_bucket{job=~"^symbols.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_symbols_gitserver_duration_seconds_bucket{job=~"^searcher.*"}[5m]))) ```
-#### symbols: codeintel_symbols_gitserver_errors_total +#### searcher: codeintel_symbols_gitserver_errors_total

Gitserver client operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100412` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101112` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21279,19 +20569,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100412` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^symbols.*"}[5m])) +sum by (op)(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^searcher.*"}[5m])) ```
-#### symbols: codeintel_symbols_gitserver_error_rate +#### searcher: codeintel_symbols_gitserver_error_rate

Gitserver client operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100413` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101113` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -21301,15 +20591,15 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100413` o Query: ``` -sum by (op)(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^symbols.*"}[5m])) / (sum by (op)(increase(src_codeintel_symbols_gitserver_total{job=~"^symbols.*"}[5m])) + sum by (op)(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^symbols.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^searcher.*"}[5m])) / (sum by (op)(increase(src_codeintel_symbols_gitserver_total{job=~"^searcher.*"}[5m])) + sum by (op)(increase(src_codeintel_symbols_gitserver_errors_total{job=~"^searcher.*"}[5m]))) * 100 ```
-### Symbols: Rockskip +### Searcher: Rockskip -#### symbols: p95_rockskip_search_request_duration +#### searcher: p95_rockskip_search_request_duration

95th percentile search request duration over 5m

@@ -21317,7 +20607,7 @@ The 95th percentile duration of search requests to Rockskip in seconds. Lower is This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101200` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -21333,7 +20623,7 @@ histogram_quantile(0.95, sum(rate(src_rockskip_service_search_request_duration_s
-#### symbols: rockskip_in_flight_search_requests +#### searcher: rockskip_in_flight_search_requests

Number of in-flight search requests

@@ -21343,7 +20633,7 @@ The number of search requests currently being processed by Rockskip. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101201` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -21359,7 +20649,7 @@ sum(src_rockskip_service_in_flight_search_requests)
-#### symbols: rockskip_search_request_errors +#### searcher: rockskip_search_request_errors

Search request errors every 5m

@@ -21369,7 +20659,7 @@ The number of search requests that returned an error in the last 5 minutes. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100502` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101202` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -21385,7 +20675,7 @@ sum(increase(src_rockskip_service_search_request_errors[5m]))
-#### symbols: p95_rockskip_index_job_duration +#### searcher: p95_rockskip_index_job_duration

95th percentile index job duration over 5m

@@ -21395,7 +20685,7 @@ The 95th percentile duration of index jobs in seconds. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101210` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -21411,7 +20701,7 @@ histogram_quantile(0.95, sum(rate(src_rockskip_service_index_job_duration_second
-#### symbols: rockskip_in_flight_index_jobs +#### searcher: rockskip_in_flight_index_jobs

Number of in-flight index jobs

@@ -21420,7 +20710,7 @@ The number of index jobs currently being processed by Rockskip. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101211` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -21436,7 +20726,7 @@ sum(src_rockskip_service_in_flight_index_jobs)
-#### symbols: rockskip_index_job_errors +#### searcher: rockskip_index_job_errors

Index job errors every 5m

@@ -21447,7 +20737,7 @@ The number of index jobs that returned an error in the last 5 minutes. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101212` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -21463,7 +20753,7 @@ sum(increase(src_rockskip_service_index_job_errors[5m]))
-#### symbols: rockskip_number_of_repos_indexed +#### searcher: rockskip_number_of_repos_indexed

Number of repositories indexed by Rockskip

@@ -21474,7 +20764,7 @@ The number of repositories indexed by Rockskip. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100520` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101220` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -21490,19 +20780,19 @@ max(src_rockskip_service_repos_indexed)
-### Symbols: Symbols GRPC server metrics +#### searcher: p95_rockskip_index_queue_age -#### symbols: symbols_grpc_request_rate_all_methods +

95th percentile index queue delay over 5m

-

Request rate across all methods over 2m

- -The number of gRPC requests received per second across all methods, aggregated across all instances. +The 95th percentile age of index jobs in seconds. + A high delay might indicate a resource issue. + Consider increasing indexing bandwidth by either increasing the number of queues or the number of symbol services. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101221` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -21510,23 +20800,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100600` o Query: ``` -sum(rate(grpc_server_started_total{instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m])) +histogram_quantile(0.95, sum(rate(src_rockskip_service_index_queue_age_seconds_bucket[5m])) by (le)) ```
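The panel above pins the quantile at 0.95. When triaging a delay it can help to compare the tail against the median of the same histogram, which shows whether the whole queue is slow or only outliers are; a sketch:

```
# Median index-job queue age over 5m, for comparison with the p95 panel.
histogram_quantile(0.50, sum(rate(src_rockskip_service_index_queue_age_seconds_bucket[5m])) by (le))
```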

-#### symbols: symbols_grpc_request_rate_per_method +#### searcher: rockskip_file_parsing_requests -

Request rate per-method over 2m

+

File parsing requests every 5m

-The number of gRPC requests received per second broken out per method, aggregated across all instances. +The number of search requests in the last 5 minutes that were handled by parsing a single file, as opposed to searching the Rockskip index. + This is an optimization to speed up symbol sidebar queries. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101222` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -21534,23 +20825,25 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100601` o Query: ``` -sum(rate(grpc_server_started_total{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m])) by (grpc_method) +sum(increase(src_rockskip_service_file_parsing_requests[5m])) ```

-#### symbols: symbols_error_percentage_all_methods +### Searcher: Site configuration client update latency -

Error percentage across all methods over 2m

+#### searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance -The percentage of gRPC requests that fail across all methods, aggregated across all instances. +

Duration since last successful site configuration update (by instance)

+ +The duration since the configuration client used by the "searcher" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101300` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21558,23 +20851,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100610` o Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) / (sum(rate(grpc_server_handled_total{instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) )) +src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`,instance=~`${instance:regex}`} ```
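The `${instance:regex}` fragment in the query above is a Grafana template variable. When running the query directly against Prometheus, substitute a concrete regex; the matcher below is a placeholder, not a real instance address:

```
# Staleness for a single (placeholder) searcher instance.
src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`,instance=~`searcher-0.*`}
```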

-#### symbols: symbols_grpc_error_percentage_per_method - -

Error percentage per-method over 2m

+#### searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance -The percentage of gRPC requests that fail per method, aggregated across all instances. +

Maximum duration since last successful site configuration update (all "searcher" instances)

-This panel has no related alerts. +Refer to the [alerts reference](alerts#searcher-searcher-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21582,23 +20873,26 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100611` o Query: ``` -(100.0 * ( (sum(rate(grpc_server_handled_total{grpc_method=~`${symbols_method:regex}`,grpc_code!="OK",instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m])) by (grpc_method)) / (sum(rate(grpc_server_handled_total{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m])) by (grpc_method)) )) +max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`,instance=~`${instance:regex}`}[1m])) ```
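As a sketch of the shape the related alert condition takes, a comparison can be appended to the panel query; the 300-second threshold here is an illustrative assumption, not the shipped alert definition (see the alerts reference linked above for that):

```
# True when no searcher instance has refreshed its site configuration within the assumed threshold.
max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`}[1m])) > 300
```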

-#### symbols: symbols_p99_response_time_per_method +### Searcher: Periodic Goroutines -

99th percentile response time per method over 2m

+#### searcher: running_goroutines -The 99th percentile response time per method, aggregated across all instances. +

Number of currently running periodic goroutines

+ +The number of currently running periodic goroutines by name and job. +A value of 0 indicates the routine isn't currently running; it awaits its next schedule. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100620` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101400` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21606,23 +20900,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100620` o Query: ``` -histogram_quantile(0.99, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +sum by (name, job_name) (src_periodic_goroutine_running{job=~".*searcher.*"}) ```
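When debugging a single routine, the same series can be restricted by its `name` label; the value below is a placeholder, since routine names vary by deployment:

```
# Running state of one (placeholder) periodic goroutine across searcher jobs.
sum by (name, job_name) (src_periodic_goroutine_running{job=~".*searcher.*",name="example-routine"})
```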

-#### symbols: symbols_p90_response_time_per_method +#### searcher: goroutine_success_rate -

90th percentile response time per method over 2m

+

Success rate for periodic goroutine executions

-The 90th percentile response time per method, aggregated across all instances. +The rate of successful executions of each periodic goroutine. +A low or zero value could indicate that a routine is stalled or encountering errors. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100621` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101401` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21630,23 +20925,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100621` o Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*searcher.*"}[5m])) ```

-#### symbols: symbols_p75_response_time_per_method +#### searcher: goroutine_error_rate -

75th percentile response time per method over 2m

+

Error rate for periodic goroutine executions

-The 75th percentile response time per method, aggregated across all instances. +The rate of errors encountered by each periodic goroutine. +A sustained high error rate may indicate a problem with the routine's configuration or dependencies. -This panel has no related alerts. +Refer to the [alerts reference](alerts#searcher-goroutine-error-rate) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100622` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101410` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21654,23 +20950,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100622` o Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_handling_seconds_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*searcher.*"}[5m])) ```

-#### symbols: symbols_p99_9_response_size_per_method +#### searcher: goroutine_error_percentage -

99.9th percentile total response size per method over 2m

+

Percentage of periodic goroutine executions that result in errors

-The 99.9th percentile total per-RPC response size per method, aggregated across all instances. +The percentage of executions that result in errors for each periodic goroutine. +A value above 5% indicates that a significant portion of routine executions are failing. -This panel has no related alerts. +Refer to the [alerts reference](alerts#searcher-goroutine-error-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100630` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101411` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21678,23 +20975,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100630` o Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*searcher.*"}[5m])) / sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*searcher.*"}[5m]) > 0) * 100 ```
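The 5% guideline in the description maps directly onto this query: appending a comparison turns it into the boolean condition an alert would evaluate. A sketch, with the threshold taken from the prose above rather than from the alert definition:

```
# Periodic goroutines failing more than 5% of their executions.
sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*searcher.*"}[5m]))
/
sum by (name, job_name) (rate(src_periodic_goroutine_total{job=~".*searcher.*"}[5m]) > 0) * 100 > 5
```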

-#### symbols: symbols_p90_response_size_per_method +#### searcher: goroutine_handler_duration -

90th percentile total response size per method over 2m

+

95th percentile handler execution time

-The 90th percentile total per-RPC response size per method, aggregated across all instances. +The 95th percentile execution time for each periodic goroutine handler. +Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100631` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101420` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21702,23 +21000,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100631` o Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_duration_seconds_bucket{job=~".*searcher.*"}[5m]))) ```

-#### symbols: symbols_p75_response_size_per_method +#### searcher: goroutine_loop_duration -

75th percentile total response size per method over 2m

+

95th percentile loop cycle time

-The 75th percentile total per-RPC response size per method, aggregated across all instances. +The 95th percentile loop cycle time for each periodic goroutine (excluding sleep time). +This represents how long a complete loop iteration takes before sleeping for the next interval. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100632` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101421` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21726,23 +21025,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100632` o Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_bytes_per_rpc_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_loop_duration_seconds_bucket{job=~".*searcher.*"}[5m]))) ```

-#### symbols: symbols_p99_9_invididual_sent_message_size_per_method +#### searcher: tenant_processing_duration -

99.9th percentile individual sent message size per method over 2m

+

95th percentile tenant processing time

-The 99.9th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The 95th percentile processing time for individual tenants within periodic goroutines. +Higher values indicate that tenant processing is taking longer and may affect overall performance. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100640` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101430` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21750,23 +21050,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100640` o Query: ``` -histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +histogram_quantile(0.95, sum by (name, job_name, le) (rate(src_periodic_goroutine_tenant_duration_seconds_bucket{job=~".*searcher.*"}[5m]))) ```

-#### symbols: symbols_p90_invididual_sent_message_size_per_method +#### searcher: tenant_processing_max -

90th percentile individual sent message size per method over 2m

+

Maximum tenant processing time

-The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The maximum processing time for individual tenants within periodic goroutines. +Consistently high values might indicate problematic tenants or inefficient processing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100641` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101431` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21774,23 +21075,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100641` o Query: ``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +max by (name, job_name) (rate(src_periodic_goroutine_tenant_duration_seconds_sum{job=~".*searcher.*"}[5m]) / rate(src_periodic_goroutine_tenant_duration_seconds_count{job=~".*searcher.*"}[5m])) ```

-#### symbols: symbols_p75_invididual_sent_message_size_per_method +#### searcher: tenant_count -

75th percentile individual sent message size per method over 2m

+

Number of tenants processed per routine

-The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. +The number of tenants processed by each periodic goroutine. +Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100642` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101440` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21798,23 +21100,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100642` o Query: ``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(src_grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m]))) +max by (name, job_name) (src_periodic_goroutine_tenant_count{job=~".*searcher.*"}) ```

-#### symbols: symbols_grpc_response_stream_message_count_per_method +#### searcher: tenant_success_rate -

Average streaming response message count per-method over 2m

+

Rate of successful tenant processing operations

-The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. +The rate of successful tenant processing operations. +A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100650` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101441` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21822,23 +21125,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100650` o Query: ``` -((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m])) by (grpc_method))) +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*searcher.*"}[5m])) ```

-#### symbols: symbols_grpc_all_codes_per_method +#### searcher: tenant_error_rate -

Response codes rate per-method over 2m

+

Rate of tenant processing errors

-The rate of all generated gRPC response codes per method, aggregated across all instances. +The rate of tenant processing operations that result in errors. +Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100660` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101450` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21846,25 +21150,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100660` o Query: ``` -sum(rate(grpc_server_handled_total{grpc_method=~`${symbols_method:regex}`,instance=~`${instance:regex}`,grpc_service=~"symbols.v1.SymbolsService"}[2m])) by (grpc_method, grpc_code) +sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*searcher.*"}[5m])) ```

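
To see which periodic goroutines produce most of the errors, a `topk` variant of the panel query (a sketch, reusing the same metric) can be pasted into the Grafana Explore view:

```
# Top 5 periodic goroutines by tenant error rate (sketch).
topk(5, sum by (name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*searcher.*"}[5m])))
```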
-### Symbols: Symbols GRPC "internal error" metrics - -#### symbols: symbols_grpc_clients_error_percentage_all_methods +#### searcher: tenant_error_percentage -

Client baseline error percentage across all methods over 2m

+

Percentage of tenant operations resulting in errors

-The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "symbols" clients. +The percentage of tenant operations that result in errors. +Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101451` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21872,23 +21175,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100700` o Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService"}[2m]))))))) +(sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*searcher.*"}[5m])) / (sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*searcher.*"}[5m])) + sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*searcher.*"}[5m])))) * 100 ```

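
Because values above 5% are flagged as significant, the panel expression can double as an ad-hoc threshold check; the following sketch simply appends a comparison to the query above:

```
(sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*searcher.*"}[5m])) / (sum by (name, job_name) (rate(src_periodic_goroutine_tenant_success_total{job=~".*searcher.*"}[5m])) + sum by (name, job_name) (rate(src_periodic_goroutine_tenant_errors_total{job=~".*searcher.*"}[5m])))) * 100 > 5
```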
-#### symbols: symbols_grpc_clients_error_percentage_per_method +### Searcher: Database connections -

Client baseline error percentage per-method over 2m

+#### searcher: max_open_conns -The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "symbols" clients. +

Maximum open

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101500` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21896,23 +21199,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100701` o Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",grpc_method=~"${symbols_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",grpc_method=~"${symbols_method:regex}"}[2m])) by (grpc_method)))))) +sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="searcher"}) ```

-#### symbols: symbols_grpc_clients_all_codes_per_method - -

Client baseline response codes rate per-method over 2m

+#### searcher: open_conns -The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "symbols" clients. +

Established

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100702` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101501` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21920,29 +21221,43 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100702` o Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",grpc_method=~"${symbols_method:regex}"}[2m])) by (grpc_method, grpc_code)) +sum by (app_name, db_name) (src_pgsql_conns_open{app_name="searcher"}) ```

-#### symbols: symbols_grpc_clients_internal_error_percentage_all_methods +#### searcher: in_use -

Client-observed gRPC internal error percentage across all methods over 2m

+

Used

-The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "symbols" clients. +This panel has no related alerts. -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "symbols" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101510` on your Sourcegraph instance. -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +
+Technical details + +Query: + +``` +sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="searcher"}) +``` +
+ +
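+
+As a companion to the two panels above, the ratio of in-use to maximum-open connections (a sketch, not part of the generated dashboard) shows how close the pool is to saturation; values near 1 mean new queries will start to block:
+
+```
+# Fraction of the connection pool currently in use (sketch).
+sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="searcher"}) / sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="searcher"})
+```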
+ +#### searcher: idle + +

Idle

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101511` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21950,29 +21265,43 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100710` o Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService"}[2m]))))))) +sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="searcher"}) ```

-#### symbols: symbols_grpc_clients_internal_error_percentage_per_method +#### searcher: mean_blocked_seconds_per_conn_request -

Client-observed gRPC internal error percentage per-method over 2m

+

Mean blocked seconds per conn request

-The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "symbols" clients. +Refer to the [alerts reference](alerts#searcher-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "symbols" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101520` on your Sourcegraph instance. -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +
+Technical details + +Query: + +``` +sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="searcher"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="searcher"}[5m])) +``` +
+ +
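+
+The mean above can hide how often requests wait at all; counting wait events over the same window (a sketch reusing one of the panel metrics) separates rare long stalls from frequent short ones:
+
+```
+# Number of connection requests that had to wait, per 5m window (sketch).
+sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="searcher"}[5m]))
+```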
+ +#### searcher: closed_max_idle + +

Closed by SetMaxIdleConns

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100711` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101530` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -21980,29 +21309,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100711` o Query: ``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",grpc_method=~"${symbols_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",grpc_method=~"${symbols_method:regex}"}[2m])) by (grpc_method)))))) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="searcher"}[5m])) ```

-#### symbols: symbols_grpc_clients_internal_error_all_codes_per_method - -

Client-observed gRPC internal error response code rate per-method over 2m

- -The rate of gRPC internal-error response codes per method, aggregated across all "symbols" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "symbols" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. +#### searcher: closed_max_lifetime -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. +

Closed by SetConnMaxLifetime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100712` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101531` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -22010,25 +21331,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100712` o Query: ``` -(sum(rate(src_grpc_method_status{grpc_service=~"symbols.v1.SymbolsService",is_internal_error="true",grpc_method=~"${symbols_method:regex}"}[2m])) by (grpc_method, grpc_code)) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="searcher"}[5m])) ```

-### Symbols: Symbols GRPC retry metrics - -#### symbols: symbols_grpc_clients_retry_percentage_across_all_methods - -

Client retry percentage across all methods over 2m

+#### searcher: closed_max_idle_time -The percentage of gRPC requests that were retried across all methods, aggregated across all "symbols" clients. +

Closed by SetConnMaxIdleTime

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101532` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -22036,23 +21353,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100800` o Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"symbols.v1.SymbolsService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"symbols.v1.SymbolsService"}[2m]))))))) +sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="searcher"}[5m])) ```

-#### symbols: symbols_grpc_clients_retry_percentage_per_method +### Searcher: Searcher (CPU, Memory) -

Client retry percentage per-method over 2m

+#### searcher: cpu_usage_percentage -The percentage of gRPC requests that were retried aggregated across all "symbols" clients, broken out per method. +

CPU usage

-This panel has no related alerts. +Refer to the [alerts reference](alerts#searcher-cpu-usage-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101600` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22060,23 +21377,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100801` o Query: ``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"symbols.v1.SymbolsService",is_retried="true",grpc_method=~"${symbols_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"symbols.v1.SymbolsService",grpc_method=~"${symbols_method:regex}"}[2m])) by (grpc_method)))))) +cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"} ```

-#### symbols: symbols_grpc_clients_retry_count_per_method +#### searcher: memory_usage_percentage -

Client retry count per-method over 2m

+

Memory usage percentage (total)

-The count of gRPC requests that were retried aggregated across all "symbols" clients, broken out per method +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100802` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101601` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22084,25 +21401,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100802` o Query: ``` -(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"symbols.v1.SymbolsService",grpc_method=~"${symbols_method:regex}",is_retried="true"}[2m])) by (grpc_method)) +cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"} ```

-### Symbols: Site configuration client update latency - -#### symbols: symbols_site_configuration_duration_since_last_successful_update_by_instance +#### searcher: memory_working_set_bytes -

Duration since last successful site configuration update (by instance)

+

Memory usage bytes (total)

-The duration since the configuration client used by the "symbols" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration. +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101602` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22110,21 +21425,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100900` o Query: ``` -src_conf_client_time_since_last_successful_update_seconds{job=~`.*symbols`,instance=~`${instance:regex}`} +max by (name) (container_memory_working_set_bytes{name=~"^searcher.*"}) ```

-#### symbols: symbols_site_configuration_duration_since_last_successful_update_by_instance +#### searcher: memory_rss -

Maximum duration since last successful site configuration update (all "symbols" instances)

+

Memory (RSS)



The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate.

Refer to the [alerts reference](alerts#searcher-memory-rss) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100901` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101610` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*

<details>
Technical details @@ -22132,23 +21449,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100901` o Query: ``` -max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*symbols`,instance=~`${instance:regex}`}[1m])) +max(container_memory_rss{name=~"^searcher.*"} / container_spec_memory_limit_bytes{name=~"^searcher.*"}) by (name) * 100.0 ```

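
To translate this percentage into absolute headroom before the limit is hit, the same cadvisor metrics can be subtracted instead of divided (a sketch, not part of the generated dashboard):

```
# Bytes of anonymous-memory headroom remaining under the container limit (sketch).
max by (name) (container_spec_memory_limit_bytes{name=~"^searcher.*"}) - max by (name) (container_memory_rss{name=~"^searcher.*"})
```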
-### Symbols: Database connections +#### searcher: memory_total_active_file -#### symbols: max_open_conns +

Memory usage (active file)

-

Maximum open

+This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101611` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22156,21 +21473,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101000` o Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="symbols"}) +max(container_memory_total_active_file_bytes{name=~"^searcher.*"} / container_spec_memory_limit_bytes{name=~"^searcher.*"}) by (name) * 100.0 ```

-#### symbols: open_conns +#### searcher: memory_kernel_usage -

Established

+

Memory usage (kernel)

+ +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101612` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22178,21 +21497,33 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101001` o Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_open{app_name="symbols"}) +max(container_memory_kernel_usage{name=~"^searcher.*"} / container_spec_memory_limit_bytes{name=~"^searcher.*"}) by (name) * 100.0 ```

-#### symbols: in_use +### Searcher: Container monitoring (not available on server) -

Used

+#### searcher: container_missing + +

Container missing



This value is the number of times a container has not been seen for more than one minute. If you observe this
value change independently of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reason.

- **Kubernetes:**
  - Determine if the pod was OOM killed using `kubectl describe pod searcher` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p searcher`.
- **Docker Compose:**
  - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' searcher` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the searcher container in `docker-compose.yml`.
  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs searcher` (note this will include logs from the previous and currently running container).

This panel has no related alerts.

To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101700` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

<details>
Technical details @@ -22200,21 +21531,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101010` o Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="symbols"}) +count by(name) ((time() - container_last_seen{name=~"^searcher.*"}) > 60) ```

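
Raising the one-minute threshold in the panel query is a quick way to separate brief restarts from prolonged absences, for example (a sketch):

```
# Containers unseen for more than 10 minutes (sketch).
count by(name) ((time() - container_last_seen{name=~"^searcher.*"}) > 600)
```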
-#### symbols: idle +#### searcher: container_cpu_usage -

Idle

+

Container cpu usage total (1m average) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#searcher-container-cpu-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101701` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22222,21 +21553,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101011` o Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="symbols"}) +cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"} ```

-#### symbols: mean_blocked_seconds_per_conn_request +#### searcher: container_memory_usage -

Mean blocked seconds per conn request

+

Container memory usage by instance

-Refer to the [alerts reference](alerts#symbols-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. +Refer to the [alerts reference](alerts#searcher-container-memory-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101020` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101702` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22244,21 +21575,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101020` o Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="symbols"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="symbols"}[5m])) +cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"} ```

-#### symbols: closed_max_idle +#### searcher: fs_io_operations -

Closed by SetMaxIdleConns

+

Filesystem reads and writes rate by instance over 1h



This value indicates the number of filesystem read and write operations by containers of this service.
When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with searcher issues.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101030` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101703` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

<details>
Technical details @@ -22266,21 +21600,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101030` o Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="symbols"}[5m])) +sum by(name) (rate(container_fs_reads_total{name=~"^searcher.*"}[1h]) + rate(container_fs_writes_total{name=~"^searcher.*"}[1h])) ```

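
When this panel spikes, splitting the combined rate into its read component (a sketch; the write side is analogous with `container_fs_writes_total`) shows which kind of I/O dominates:

```
# Read-only share of the filesystem operation rate (sketch).
sum by(name) (rate(container_fs_reads_total{name=~"^searcher.*"}[1h]))
```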
-#### symbols: closed_max_lifetime +### Searcher: Provisioning indicators (not available on server) -

Closed by SetConnMaxLifetime

+#### searcher: provisioning_container_cpu_usage_long_term -This panel has no related alerts. +

Container cpu usage total (90th percentile over 1d) across all cores by instance

+ +Refer to the [alerts reference](alerts#searcher-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101031` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101800` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22288,21 +21624,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101031` o Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="symbols"}[5m])) +quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[1d]) ```

-#### symbols: closed_max_idle_time +#### searcher: provisioning_container_memory_usage_long_term -

Closed by SetConnMaxIdleTime

+

Container memory usage (1d maximum) by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#searcher-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101032` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101801` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22310,33 +21646,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101032` o Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="symbols"}[5m])) +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[1d]) ```

-### Symbols: Container monitoring (not available on server) - -#### symbols: container_missing - -

Container missing

- -This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. +#### searcher: provisioning_container_cpu_usage_short_term -- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod symbols` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p symbols`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' symbols` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the symbols container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs symbols` (note this will include logs from the previous and currently running container). +

Container cpu usage total (5m maximum) across all cores by instance

-This panel has no related alerts. +Refer to the [alerts reference](alerts#searcher-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101810` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22344,21 +21668,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101100` o Query: ``` -count by(name) ((time() - container_last_seen{name=~"^symbols.*"}) > 60) +max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^searcher.*"}[5m]) ```

-#### symbols: container_cpu_usage +#### searcher: provisioning_container_memory_usage_short_term -

Container cpu usage total (1m average) across all cores by instance

+

Container memory usage (5m maximum) by instance

-Refer to the [alerts reference](alerts#symbols-container-cpu-usage) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#searcher-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101811` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22366,21 +21690,24 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101101` o Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^symbols.*"} +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^searcher.*"}[5m]) ```

-#### symbols: container_memory_usage +#### searcher: container_oomkill_events_total -

Container memory usage by instance

+

Container OOMKILL events total by instance

+ +This value indicates the total number of times the container main process or child processes were terminated by OOM killer. +When it occurs frequently, it is an indicator of underprovisioning. -Refer to the [alerts reference](alerts#symbols-container-memory-usage) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#searcher-container-oomkill-events-total) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101812` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22388,24 +21715,25 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101102` o Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^symbols.*"} +max by (name) (container_oom_events_total{name=~"^searcher.*"}) ```

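
Because the underlying metric is a counter, an `increase` over a recent window (a sketch, not part of the generated dashboard) distinguishes ongoing kills from a high-but-stale cumulative count:

```
# OOM kill events in the last 24 hours (sketch).
sum by (name) (increase(container_oom_events_total{name=~"^searcher.*"}[24h]))
```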
-#### symbols: fs_io_operations +### Searcher: Golang runtime monitoring -

Filesystem reads and writes rate by instance over 1h

+#### searcher: go_goroutines -This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +

Maximum active goroutines

-This panel has no related alerts. +A high value here indicates a possible goroutine leak. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101103` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#searcher-go-goroutines) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101900` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22413,23 +21741,21 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101103` o Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^symbols.*"}[1h]) + rate(container_fs_writes_total{name=~"^symbols.*"}[1h])) +max by(instance) (go_goroutines{job=~".*searcher"}) ```

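
One way to tell a leak from a stable plateau is to compare the gauge with itself a day earlier; consistently positive differences suggest unbounded growth (a sketch):

```
# Day-over-day growth in goroutine count per instance (sketch).
max by(instance) (go_goroutines{job=~".*searcher"}) - max by(instance) (go_goroutines{job=~".*searcher"} offset 1d)
```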
-### Symbols: Provisioning indicators (not available on server) - -#### symbols: provisioning_container_cpu_usage_long_term +#### searcher: go_gc_duration_seconds -

Container cpu usage total (90th percentile over 1d) across all cores by instance

+

Maximum go garbage collection duration

-Refer to the [alerts reference](alerts#symbols-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#searcher-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101901` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22437,21 +21763,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101200` o Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^symbols.*"}[1d]) +max by(instance) (go_gc_duration_seconds{job=~".*searcher"}) ```

-#### symbols: provisioning_container_memory_usage_long_term +### Searcher: Kubernetes monitoring (only available on Kubernetes) -

Container memory usage (1d maximum) by instance

+#### searcher: pods_available_percentage -Refer to the [alerts reference](alerts#symbols-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +

Percentage pods available

-To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101201` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#searcher-pods-available-percentage) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=102000` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
Technical details @@ -22459,19 +21787,25 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101201` o Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^symbols.*"}[1d]) +sum by(app) (up{app=~".*searcher"}) / count by (app) (up{app=~".*searcher"}) * 100 ```

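
When the percentage drops, the unaggregated `up` series identifies exactly which instances are failing their scrapes (a sketch):

```
# Instances currently reported as down (sketch).
up{app=~".*searcher"} == 0
```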
-#### symbols: provisioning_container_cpu_usage_short_term +## Syntect Server -

Container cpu usage total (5m maximum) across all cores by instance

+

Handles syntax highlighting for code files.

-Refer to the [alerts reference](alerts#symbols-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +To see this dashboard, visit `/-/debug/grafana/d/syntect-server/syntect-server` on your Sourcegraph instance. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101210` on your Sourcegraph instance. +#### syntect-server: syntax_highlighting_errors + +

Syntax highlighting errors every 5m

+ +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -22481,19 +21815,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101210` o Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^symbols.*"}[5m]) +sum(increase(src_syntax_highlighting_requests{status="error"}[5m])) / sum(increase(src_syntax_highlighting_requests[5m])) * 100 ```
-#### symbols: provisioning_container_memory_usage_short_term +#### syntect-server: syntax_highlighting_timeouts -

Container memory usage (5m maximum) by instance

+

Syntax highlighting timeouts every 5m

-Refer to the [alerts reference](alerts#symbols-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100001` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -22503,22 +21837,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101211` o Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^symbols.*"}[5m]) +sum(increase(src_syntax_highlighting_requests{status="timeout"}[5m])) / sum(increase(src_syntax_highlighting_requests[5m])) * 100 ```
-#### symbols: container_oomkill_events_total - -

Container OOMKILL events total by instance

+#### syntect-server: syntax_highlighting_panics -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +

Syntax highlighting panics every 5m

-Refer to the [alerts reference](alerts#symbols-container-oomkill-events-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100010` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -22528,23 +21859,19 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101212` o Query: ``` -max by (name) (container_oom_events_total{name=~"^symbols.*"}) +sum(increase(src_syntax_highlighting_requests{status="panic"}[5m])) ```
-### Symbols: Golang runtime monitoring - -#### symbols: go_goroutines - -

Maximum active goroutines

+#### syntect-server: syntax_highlighting_worker_deaths -A high value here indicates a possible goroutine leak. +

Syntax highlighter worker deaths every 5m

-Refer to the [alerts reference](alerts#symbols-go-goroutines) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100011` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -22554,21 +21881,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101300` o Query: ``` -max by(instance) (go_goroutines{job=~".*symbols"}) +sum(increase(src_syntax_highlighting_requests{status="hss_worker_timeout"}[5m])) ```
-#### symbols: go_gc_duration_seconds +### Syntect Server: Syntect-server (CPU, Memory) -

Maximum go garbage collection duration

+#### syntect-server: cpu_usage_percentage -Refer to the [alerts reference](alerts#symbols-go-gc-duration-seconds) for 1 alert related to this panel. +

CPU usage

-To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101301` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#syntect-server-cpu-usage-percentage) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100100` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22576,23 +21905,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101301` o Query: ``` -max by(instance) (go_gc_duration_seconds{job=~".*symbols"}) +cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"} ```

-### Symbols: Kubernetes monitoring (only available on Kubernetes) +#### syntect-server: memory_usage_percentage -#### symbols: pods_available_percentage +

Memory usage percentage (total)

-

Percentage pods available

+An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. -Refer to the [alerts reference](alerts#symbols-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22600,27 +21929,23 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=101400` o Query: ``` -sum by(app) (up{app=~".*symbols"}) / count by (app) (up{app=~".*symbols"}) * 100 +cadvisor_container_memory_usage_percentage_total{name=~"^syntect-server.*"} ```

-## Syntect Server - -

Handles syntax highlighting for code files.

+#### syntect-server: memory_working_set_bytes -To see this dashboard, visit `/-/debug/grafana/d/syntect-server/syntect-server` on your Sourcegraph instance. +

Memory usage bytes (total)

-#### syntect-server: syntax_highlighting_errors - -

Syntax highlighting errors every 5m

+An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100102` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22628,21 +21953,23 @@ To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewP Query: ``` -sum(increase(src_syntax_highlighting_requests{status="error"}[5m])) / sum(increase(src_syntax_highlighting_requests[5m])) * 100 +max by (name) (container_memory_working_set_bytes{name=~"^syntect-server.*"}) ```

-#### syntect-server: syntax_highlighting_timeouts +#### syntect-server: memory_rss -

Syntax highlighting timeouts every 5m

+

Memory (RSS)



The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate.

Refer to the [alerts reference](alerts#syntect-server-memory-rss) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100001` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100110` on your Sourcegraph instance.

*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*

<details>
Technical details @@ -22650,21 +21977,23 @@ To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewP Query: ``` -sum(increase(src_syntax_highlighting_requests{status="timeout"}[5m])) / sum(increase(src_syntax_highlighting_requests[5m])) * 100 +max(container_memory_rss{name=~"^syntect-server.*"} / container_spec_memory_limit_bytes{name=~"^syntect-server.*"}) by (name) * 100.0 ```

-#### syntect-server: syntax_highlighting_panics +#### syntect-server: memory_total_active_file -

Syntax highlighting panics every 5m

+

Memory usage (active file)

+ +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100111` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22672,21 +22001,23 @@ To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewP Query: ``` -sum(increase(src_syntax_highlighting_requests{status="panic"}[5m])) +max(container_memory_total_active_file_bytes{name=~"^syntect-server.*"} / container_spec_memory_limit_bytes{name=~"^syntect-server.*"}) by (name) * 100.0 ```

-#### syntect-server: syntax_highlighting_worker_deaths +#### syntect-server: memory_kernel_usage -

Syntax highlighter worker deaths every 5m

+

Memory usage (kernel)

+ +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100112` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -22694,7 +22025,7 @@ To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewP Query: ``` -sum(increase(src_syntax_highlighting_requests{status="hss_worker_timeout"}[5m])) +max(container_memory_kernel_usage{name=~"^syntect-server.*"} / container_spec_memory_limit_bytes{name=~"^syntect-server.*"}) by (name) * 100.0 ```
@@ -22718,7 +22049,7 @@ value change independent of deployment events (such as an upgrade), it could ind This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100200` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22740,7 +22071,7 @@ count by(name) ((time() - container_last_seen{name=~"^syntect-server.*"}) > 60) Refer to the [alerts reference](alerts#syntect-server-container-cpu-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100201` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22762,7 +22093,7 @@ cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-server.*"} Refer to the [alerts reference](alerts#syntect-server-container-memory-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100202` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22787,7 +22118,7 @@ When extremely high, this can indicate a resource usage problem, or can cause pr This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100103` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100203` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22811,7 +22142,7 @@ sum by(name) (rate(container_fs_reads_total{name=~"^syntect-server.*"}[1h]) + ra Refer to the [alerts reference](alerts#syntect-server-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100300` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22833,7 +22164,7 @@ quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^sy Refer to the [alerts reference](alerts#syntect-server-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100301` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22855,7 +22186,7 @@ max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^syntect-s Refer to the [alerts reference](alerts#syntect-server-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100310` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22877,7 +22208,7 @@ max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^syntect-serv Refer to the [alerts reference](alerts#syntect-server-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100311` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22902,7 +22233,7 @@ When it occurs frequently, it is an indicator of underprovisioning. Refer to the [alerts reference](alerts#syntect-server-container-oomkill-events-total) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100312` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -22926,7 +22257,7 @@ max by (name) (container_oom_events_total{name=~"^syntect-server.*"}) Refer to the [alerts reference](alerts#syntect-server-pods-available-percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/syntect-server/syntect-server?viewPanel=100400` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -23110,6 +22441,351 @@ sum(increase(get_index_options_error_total[5m]))
+### Zoekt: Zoekt-indexserver (CPU, Memory) + +#### zoekt: cpu_usage_percentage + +

CPU usage

+ +Refer to the [alerts reference](alerts#zoekt-cpu-usage-percentage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100100` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"} +``` +
+ +
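+For capacity planning, a minimal sketch (reusing the same cadvisor metric shown above; the `1d` window is an arbitrary example, not part of the shipped dashboard) is to look at a long-window quantile rather than the instantaneous value:
+
+```
+# p90 CPU usage per indexserver container over the past day (example window)
+quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}[1d])
+```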
+ +#### zoekt: memory_usage_percentage + +

Memory usage percentage (total)

+ +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100101` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"} +``` +
+ +
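+For ad-hoc querying outside Grafana, a minimal sketch (same metric as above; the `90` cutoff is an arbitrary example, not the shipped alert threshold) that surfaces only containers above a threshold:
+
+```
+# show only indexserver containers whose estimated memory usage exceeds 90%
+cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"} > 90
+```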
+ +#### zoekt: memory_working_set_bytes + +

Memory usage bytes (total)

+ +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100102` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +max by (name) (container_memory_working_set_bytes{name=~"^zoekt-indexserver.*"}) +``` +
+ +
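+To put the byte count in context, a hedged sketch that expresses the working set as a percentage of the configured container limit (reusing `container_spec_memory_limit_bytes`, which the RSS panel below also relies on):
+
+```
+# working set as % of the container memory limit, per container
+max by (name) (container_memory_working_set_bytes{name=~"^zoekt-indexserver.*"}
+  / container_spec_memory_limit_bytes{name=~"^zoekt-indexserver.*"}) * 100.0
+```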
+ +#### zoekt: memory_rss + +

Memory (RSS)

+
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate.
+
+Refer to the [alerts reference](alerts#zoekt-memory-rss) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100110` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+
+
+Technical details + +Query: + +``` +max(container_memory_rss{name=~"^zoekt-indexserver.*"} / container_spec_memory_limit_bytes{name=~"^zoekt-indexserver.*"}) by (name) * 100.0 +``` +
+ +
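+Since anonymous memory is what OOM kills act on, a complementary sketch (same two metrics as the panel query; it assumes a memory limit is actually set, otherwise the limit series can be 0 or absent) is the remaining headroom in bytes:
+
+```
+# bytes of headroom before RSS reaches the container memory limit
+min by (name) (container_spec_memory_limit_bytes{name=~"^zoekt-indexserver.*"}
+  - container_memory_rss{name=~"^zoekt-indexserver.*"})
+```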
+ +#### zoekt: memory_total_active_file + +

Memory usage (active file)

+ +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100111` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +max(container_memory_total_active_file_bytes{name=~"^zoekt-indexserver.*"} / container_spec_memory_limit_bytes{name=~"^zoekt-indexserver.*"}) by (name) * 100.0 +``` +
+ +
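+Because the active-file portion is at least partially reclaimable, a rough sketch for the non-reclaimable remainder is to subtract it from the working set (both metrics appear in the panels above; treat the result as an estimate only):
+
+```
+# approximate non-reclaimable memory: working set minus active file-backed pages
+max by (name) (container_memory_working_set_bytes{name=~"^zoekt-indexserver.*"}
+  - container_memory_total_active_file_bytes{name=~"^zoekt-indexserver.*"})
+```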
+ +#### zoekt: memory_kernel_usage + +

Memory usage (kernel)

+ +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100112` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +max(container_memory_kernel_usage{name=~"^zoekt-indexserver.*"} / container_spec_memory_limit_bytes{name=~"^zoekt-indexserver.*"}) by (name) * 100.0 +``` +
+ +
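+The panel above is a ratio; if you need the absolute figure, a minimal sketch using the same underlying metric:
+
+```
+# kernel memory attributed to each indexserver container, in bytes
+max by (name) (container_memory_kernel_usage{name=~"^zoekt-indexserver.*"})
+```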
+ +### Zoekt: Zoekt-webserver (CPU, Memory) + +#### zoekt: cpu_usage_percentage + +

CPU usage

+ +Refer to the [alerts reference](alerts#zoekt-cpu-usage-percentage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100200` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"} +``` +
+ +
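+When triaging, it can help to see both Zoekt services side by side; a sketch that widens the name regex to cover indexserver and webserver at once (an ad-hoc query, not a shipped panel):
+
+```
+# CPU usage for all Zoekt containers, indexserver and webserver together
+cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-(indexserver|webserver).*"}
+```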
+ +#### zoekt: memory_usage_percentage + +

Memory usage percentage (total)

+ +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100201` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-webserver.*"} +``` +
+ +
+ +#### zoekt: memory_working_set_bytes + +

Memory usage bytes (total)

+ +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100202` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +max by (name) (container_memory_working_set_bytes{name=~"^zoekt-webserver.*"}) +``` +
+ +
+ +#### zoekt: memory_rss + +

Memory (RSS)

+
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but `anonymous` is more accurate.
+
+Refer to the [alerts reference](alerts#zoekt-memory-rss) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100210` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+
+
+Technical details + +Query: + +``` +max(container_memory_rss{name=~"^zoekt-webserver.*"} / container_spec_memory_limit_bytes{name=~"^zoekt-webserver.*"}) by (name) * 100.0 +``` +
+ +
+ +#### zoekt: memory_total_active_file + +

Memory usage (active file)

+ +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100211` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +max(container_memory_total_active_file_bytes{name=~"^zoekt-webserver.*"} / container_spec_memory_limit_bytes{name=~"^zoekt-webserver.*"}) by (name) * 100.0 +``` +
+ +
+ +#### zoekt: memory_kernel_usage + +

Memory usage (kernel)

+ +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100212` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* + +
+Technical details + +Query: + +``` +max(container_memory_kernel_usage{name=~"^zoekt-webserver.*"} / container_spec_memory_limit_bytes{name=~"^zoekt-webserver.*"}) by (name) * 100.0 +``` +
+ +
+ +### Zoekt: Memory mapping metrics + +#### zoekt: memory_map_areas_percentage_used + +

Process memory map areas percentage used (per instance)

+
+Processes have a limited number of memory map areas that they can use. In Zoekt, memory map areas
+are mainly used for loading shards into memory for queries (via mmap). However, memory map areas
+are also used for loading shared libraries, etc.
+
+_See https://en.wikipedia.org/wiki/Memory-mapped_file and the related articles for more information about memory maps._
+
+Once the memory map limit is reached, the Linux kernel will prevent the process from creating any
+additional memory map areas. This could cause the process to crash.
+
+Refer to the [alerts reference](alerts#zoekt-memory-map-areas-percentage-used) for 2 alerts related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100300` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+
+
+Technical details + +Query: + +``` +(proc_metrics_memory_map_current_count{instance=~`${instance:regex}`} / proc_metrics_memory_map_max_limit{instance=~`${instance:regex}`}) * 100 +``` +
+ +
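+The panel reports a percentage; when investigating, an absolute view can be easier to reason about. A hedged sketch (same two `proc_metrics_*` series as the panel query, without the dashboard's `${instance:regex}` template filter):
+
+```
+# remaining memory map areas before the kernel limit is hit, per instance
+min by (instance) (proc_metrics_memory_map_max_limit - proc_metrics_memory_map_current_count)
+```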
+ +#### zoekt: memory_major_page_faults + +

Webserver page faults

+
+The number of major page faults in a 5-minute window for Zoekt webservers. If this number increases significantly, it indicates that more searches need to load data from disk. There may not be enough memory to efficiently support the amount of repo data being searched.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100301` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*
+
+
+Technical details + +Query: + +``` +rate(container_memory_failures_total{failure_type="pgmajfault", name=~"^zoekt-webserver.*"}[5m]) +``` +
+ +
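+To judge the trend fleet-wide rather than per container, a minimal sketch that sums the same rate across all webserver pods:
+
+```
+# total major page-fault rate across all zoekt-webserver containers
+sum(rate(container_memory_failures_total{failure_type="pgmajfault", name=~"^zoekt-webserver.*"}[5m]))
+```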
+ ### Zoekt: Search requests #### zoekt: indexed_search_request_duration_p99_aggregate @@ -23122,7 +22798,7 @@ Large duration spikes can be an indicator of saturation and / or a performance r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100400` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23148,7 +22824,7 @@ Large duration spikes can be an indicator of saturation and / or a performance r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100401` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23174,7 +22850,7 @@ Large duration spikes can be an indicator of saturation and / or a performance r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100402` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23200,7 +22876,7 @@ Large duration spikes can be an indicator of saturation and / or a performance r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100410` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23226,7 +22902,7 @@ Large duration spikes can be an indicator of saturation and / or a performance r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100411` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23252,7 +22928,7 @@ Large duration spikes can be an indicator of saturation and / or a performance r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100412` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23280,7 +22956,7 @@ The number of in-flight requests can serve as a proxy for the general load that This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100120` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100420` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23308,7 +22984,7 @@ The number of in-flight requests can serve as a proxy for the general load that This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100121` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100421` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23338,7 +23014,7 @@ can indicate that the indexed-search backend is saturated. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100130` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100430` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23368,7 +23044,7 @@ can indicate that the indexed-search backend is saturated. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100131` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100431` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23390,7 +23066,7 @@ sum by (instance) (deriv(zoekt_search_running[1m])) Refer to the [alerts reference](alerts#zoekt-indexed-search-request-errors) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100140` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100440` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23421,7 +23097,7 @@ For a full explanation of the states see https://github.com/sourcegraph/zoekt/bl This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100150` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100450` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23452,7 +23128,7 @@ For a full explanation of the states see https://github.com/sourcegraph/zoekt/bl This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100151` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100451` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23478,7 +23154,7 @@ Long git fetch times can be a leading indicator of saturation. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100500` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23502,7 +23178,7 @@ Long git fetch times can be a leading indicator of saturation. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100501` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23537,7 +23213,7 @@ Legend: This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100600` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23572,7 +23248,7 @@ Legend: This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100601` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23596,7 +23272,7 @@ Latency increases can indicate bottlenecks in the indexserver. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100610` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23620,7 +23296,7 @@ Failures happening after a long time indicates timeouts. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100611` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23646,7 +23322,7 @@ Latency increases can indicate bottlenecks in the indexserver. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100320` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100620` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23672,7 +23348,7 @@ Latency increases can indicate bottlenecks in the indexserver. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100321` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100621` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23698,7 +23374,7 @@ Latency increases can indicate bottlenecks in the indexserver. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100322` on your Sourcegraph instance. 
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100622` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23724,7 +23400,7 @@ Latency increases can indicate bottlenecks in the indexserver. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100330` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100630` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23750,7 +23426,7 @@ Latency increases can indicate bottlenecks in the indexserver. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100331` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100631` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23776,7 +23452,7 @@ Latency increases can indicate bottlenecks in the indexserver. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100332` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100632` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23802,7 +23478,7 @@ Failures happening after a long time indicates timeouts. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100340` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100640` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23828,7 +23504,7 @@ Failures happening after a long time indicates timeouts. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100341` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100641` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23854,7 +23530,7 @@ Failures happening after a long time indicates timeouts. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100342` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100642` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23880,7 +23556,7 @@ Failures happening after a long time indicates timeouts. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100350` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100650` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23906,7 +23582,7 @@ Failures happening after a long time indicates timeouts. 
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100351` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100651` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23932,7 +23608,7 @@ Failures happening after a long time indicates timeouts. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100352` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100652` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23958,7 +23634,7 @@ A queue that is constantly growing could be a leading indicator of a bottleneck This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100700` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23982,7 +23658,7 @@ A queue that is constantly growing could be a leading indicator of a bottleneck This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100701` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -23998,81 +23674,21 @@ index_queue_len{instance=~`${instance:regex}`}
-#### zoekt: indexed_queueing_delay_heatmap +#### zoekt: indexed_indexing_delay_heatmap -

Job queuing delay heatmap

+

Repo indexing delay heatmap

-The queueing delay represents the amount of time an indexing job spent in the queue before it was processed. +The indexing delay represents the amount of time between when Zoekt received a repo indexing job, to when the repo was indexed. +It includes the time the repo spent in the indexing queue, as well as the time it took to actually index the repo. This metric +only includes successfully indexed repos. -Large queueing delays can be an indicator of: +Large indexing delays can be an indicator of: - resource saturation - each Zoekt replica has too many jobs for it to be able to process all of them promptly. In this scenario, consider adding additional Zoekt replicas to distribute the work better . This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100410` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum by (le) (increase(index_queue_age_seconds_bucket[$__rate_interval])) -``` -
- -
- -#### zoekt: indexed_queueing_delay_p99_9_aggregate - -

99.9th percentile job queuing delay over 5m (aggregate)

- -This dashboard shows the p99.9 job queueing delay aggregated across all Zoekt instances. - -The queueing delay represents the amount of time an indexing job spent in the queue before it was processed. - -Large queueing delays can be an indicator of: - - resource saturation - - each Zoekt replica has too many jobs for it to be able to process all of them promptly. In this scenario, consider adding additional Zoekt replicas to distribute the work better. - -The 99.9 percentile dashboard is useful for capturing the long tail of queueing delays (on the order of 24+ hours, etc.). - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100420` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.999, sum by (le, name)(rate(index_queue_age_seconds_bucket[5m]))) -``` -
- -
- -#### zoekt: indexed_queueing_delay_p90_aggregate - -

90th percentile job queueing delay over 5m (aggregate)

- -This dashboard shows the p90 job queueing delay aggregated across all Zoekt instances. - -The queueing delay represents the amount of time an indexing job spent in the queue before it was processed. - -Large queueing delays can be an indicator of: - - resource saturation - - each Zoekt replica has too many jobs for it to be able to process all of them promptly. In this scenario, consider adding additional Zoekt replicas to distribute the work better. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100421` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100710` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24082,27 +23698,29 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100421` on yo Query: ``` -histogram_quantile(0.90, sum by (le, name)(rate(index_queue_age_seconds_bucket[5m]))) +sum by (le) (increase(index_indexing_delay_seconds_bucket{state=~"success|success_meta"}[$__rate_interval])) ```
-#### zoekt: indexed_queueing_delay_p75_aggregate +#### zoekt: indexed_indexing_delay_p90_aggregate -

75th percentile job queueing delay over 5m (aggregate)

+

90th percentile indexing delay over 5m (aggregate)

-This dashboard shows the p75 job queueing delay aggregated across all Zoekt instances. +This dashboard shows the p90 indexing delay aggregated across all Zoekt instances. -The queueing delay represents the amount of time an indexing job spent in the queue before it was processed. +The indexing delay represents the amount of time between when Zoekt received a repo indexing job, to when the repo was indexed. +It includes the time the repo spent in the indexing queue, as well as the time it took to actually index the repo. This metric +only includes successfully indexed repos. -Large queueing delays can be an indicator of: +Large indexing delays can be an indicator of: - resource saturation - each Zoekt replica has too many jobs for it to be able to process all of them promptly. In this scenario, consider adding additional Zoekt replicas to distribute the work better. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100422` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100720` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24112,29 +23730,29 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100422` on yo Query: ``` -histogram_quantile(0.75, sum by (le, name)(rate(index_queue_age_seconds_bucket[5m]))) +histogram_quantile(0.90, sum by (le, name)(rate(index_indexing_delay_seconds_bucket{state=~"success|success_meta"}[5m]))) ```
-#### zoekt: indexed_queueing_delay_p99_9_per_instance +#### zoekt: indexed_indexing_delay_p50_aggregate -

99.9th percentile job queuing delay over 5m (per instance)

+

50th percentile indexing delay over 5m (aggregate)

-This dashboard shows the p99.9 job queueing delay, broken out per Zoekt instance. +This dashboard shows the p50 indexing delay aggregated across all Zoekt instances. -The queueing delay represents the amount of time an indexing job spent in the queue before it was processed. +The indexing delay represents the amount of time between when Zoekt received a repo indexing job, to when the repo was indexed. +It includes the time the repo spent in the indexing queue, as well as the time it took to actually index the repo. This metric +only includes successfully indexed repos. -Large queueing delays can be an indicator of: +Large indexing delays can be an indicator of: - resource saturation - each Zoekt replica has too many jobs for it to be able to process all of them promptly. In this scenario, consider adding additional Zoekt replicas to distribute the work better. -The 99.9 percentile dashboard is useful for capturing the long tail of queueing delays (on the order of 24+ hours, etc.). - This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100430` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100721` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24144,27 +23762,28 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100430` on yo Query: ``` -histogram_quantile(0.999, sum by (le, instance)(rate(index_queue_age_seconds_bucket{instance=~`${instance:regex}`}[5m]))) +histogram_quantile(0.50, sum by (le, name)(rate(index_indexing_delay_seconds_bucket{state=~"success|success_meta"}[5m]))) ```
-#### zoekt: indexed_queueing_delay_p90_per_instance +#### zoekt: indexed_indexing_delay_p90_per_instance -

90th percentile job queueing delay over 5m (per instance)

+

90th percentile indexing delay over 5m (per instance)

-This dashboard shows the p90 job queueing delay, broken out per Zoekt instance. +This dashboard shows the p90 indexing delay, broken out per Zoekt instance. -The queueing delay represents the amount of time an indexing job spent in the queue before it was processed. +The indexing delay represents the amount of time between when Zoekt received a repo indexing job, to when the repo was indexed. +It includes the time the repo spent in the indexing queue, as well as the time it took to actually index the repo. -Large queueing delays can be an indicator of: +Large indexing delays can be an indicator of: - resource saturation - each Zoekt replica has too many jobs for it to be able to process all of them promptly. In this scenario, consider adding additional Zoekt replicas to distribute the work better. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100431` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100730` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24174,60 +23793,28 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100431` on yo Query: ``` -histogram_quantile(0.90, sum by (le, instance)(rate(index_queue_age_seconds_bucket{instance=~`${instance:regex}`}[5m]))) +histogram_quantile(0.90, sum by (le, instance)(rate(index_indexing_delay_seconds{instance=~`${instance:regex}`}[5m]))) ```
-#### zoekt: indexed_queueing_delay_p75_per_instance +#### zoekt: indexed_indexing_delay_p50_per_instance -

75th percentile job queueing delay over 5m (per instance)

+

50th percentile indexing delay over 5m (per instance)

-This dashboard shows the p75 job queueing delay, broken out per Zoekt instance. +This dashboard shows the p50 indexing delay, broken out per Zoekt instance. -The queueing delay represents the amount of time an indexing job spent in the queue before it was processed. +The indexing delay represents the amount of time between when Zoekt received a repo indexing job, to when the repo was indexed. +It includes the time the repo spent in the indexing queue, as well as the time it took to actually index the repo. -Large queueing delays can be an indicator of: +Large indexing delays can be an indicator of: - resource saturation - each Zoekt replica has too many jobs for it to be able to process all of them promptly. In this scenario, consider adding additional Zoekt replicas to distribute the work better. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100432` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.75, sum by (le, instance)(rate(index_queue_age_seconds_bucket{instance=~`${instance:regex}`}[5m]))) -``` -
- -
- -### Zoekt: Virtual Memory Statistics - -#### zoekt: memory_map_areas_percentage_used - -

Process memory map areas percentage used (per instance)

- -Processes have a limited about of memory map areas that they can use. In Zoekt, memory map areas -are mainly used for loading shards into memory for queries (via mmap). However, memory map areas -are also used for loading shared libraries, etc. - -_See https://en.wikipedia.org/wiki/Memory-mapped_file and the related articles for more information about memory maps._ - -Once the memory map limit is reached, the Linux kernel will prevent the process from creating any -additional memory map areas. This could cause the process to crash. - -Refer to the [alerts reference](alerts#zoekt-memory-map-areas-percentage-used) for 2 alerts related to this panel. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100731` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24237,7 +23824,7 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100500` on yo Query: ``` -(proc_metrics_memory_map_current_count{instance=~`${instance:regex}`} / proc_metrics_memory_map_max_limit{instance=~`${instance:regex}`}) * 100 +histogram_quantile(0.50, sum by (le, instance)(rate(index_indexing_delay_seconds{instance=~`${instance:regex}`}[5m]))) ``` @@ -24255,7 +23842,7 @@ This number should be consistent if the number of indexed repositories doesn`t c This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100800` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24281,7 +23868,7 @@ This number should be consistent if the number of indexed repositories doesn`t c This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100801` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24308,7 +23895,7 @@ Since the target compound shard size is set on start of zoekt-indexserver, the a This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100810` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24334,7 +23921,7 @@ This curve should be flat. Any deviation should be investigated. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100811` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24358,7 +23945,7 @@ Number of errors during shard merging aggregated over all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100620` on your Sourcegraph instance. 
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100820` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24382,7 +23969,7 @@ Number of errors during shard merging per instance. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100621` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100821` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24406,7 +23993,7 @@ Set to 1 if shard merging is running. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100630` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100830` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24430,7 +24017,7 @@ Set to 1 if vacuum is running. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100631` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100831` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24452,11 +24039,11 @@ max by (instance) (index_vacuum_running{instance=~`${instance:regex}`})

Transmission rate over 5m (aggregate)

-The rate of bytes sent over the network across all Zoekt pods +The rate of bytes sent over the network across all pods This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100900` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24476,11 +24063,11 @@ sum(rate(container_network_transmit_bytes_total{container_label_io_kubernetes_po

Transmission rate over 5m (per instance)

-The amount of bytes sent over the network by individual Zoekt pods +The amount of bytes sent over the network by individual pods This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100901` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24500,11 +24087,11 @@ sum by (container_label_io_kubernetes_pod_name) (rate(container_network_transmit

Receive rate over 5m (aggregate)

-The amount of bytes received from the network across Zoekt pods +The amount of bytes received from the network across pods This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100910` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24524,11 +24111,11 @@ sum(rate(container_network_receive_bytes_total{container_label_io_kubernetes_pod

Receive rate over 5m (per instance)

-The amount of bytes received from the network by individual Zoekt pods +The amount of bytes received from the network by individual pods This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100711` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100911` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24552,7 +24139,7 @@ An increase in dropped packets could be a leading indicator of network saturatio This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100720` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100920` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24576,7 +24163,7 @@ An increase in transmission errors could indicate a networking issue This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100721` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100921` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24600,7 +24187,7 @@ An increase in dropped packets could be a leading indicator of network saturatio This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100722` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100922` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24624,7 +24211,7 @@ An increase in errors while receiving could indicate a networking issue. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100723` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100923` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24650,7 +24237,7 @@ The number of gRPC requests received per second across all methods, aggregated a This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101000` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24674,7 +24261,7 @@ The number of gRPC requests received per second broken out per method, aggregate This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101001` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24698,7 +24285,7 @@ The percentage of gRPC requests that fail across all methods, aggregated across This panel has no related alerts. 
-To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100810` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101010` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24722,7 +24309,7 @@ The percentage of gRPC requests that fail per method, aggregated across all inst This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101011` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24746,7 +24333,7 @@ The 99th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100820` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101020` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24770,7 +24357,7 @@ The 90th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100821` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101021` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24794,7 +24381,7 @@ The 75th percentile response time per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100822` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101022` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24818,7 +24405,7 @@ The 99.9th percentile total per-RPC response size per method, aggregated across This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100830` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101030` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24842,7 +24429,7 @@ The 90th percentile total per-RPC response size per method, aggregated across al This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100831` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101031` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24866,7 +24453,7 @@ The 75th percentile total per-RPC response size per method, aggregated across al This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100832` on your Sourcegraph instance. 
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101032` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24890,7 +24477,7 @@ The 99.9th percentile size of every individual protocol buffer size sent by the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100840` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101040` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -24904,381 +24491,17 @@ histogram_quantile(0.999, sum by (le, name, grpc_method)(rate(grpc_server_sent_i ``` -
- -#### zoekt: zoekt_webserver_p90_invididual_sent_message_size_per_method - -

90th percentile individual sent message size per method over 2m

- -The 90th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100841` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_webserver_method:regex}`,instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))) -``` -
- -
- -#### zoekt: zoekt_webserver_p75_invididual_sent_message_size_per_method - -

75th percentile individual sent message size per method over 2m

- -The 75th percentile size of every individual protocol buffer size sent by the service per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100842` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_webserver_method:regex}`,instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_response_stream_message_count_per_method - -

Average streaming response message count per-method over 2m

- -The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100850` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])) by (grpc_method))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_all_codes_per_method - -

Response codes rate per-method over 2m

- -The rate of all generated gRPC response codes per method, aggregated across all instances. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100860` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_webserver_method:regex}`,instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])) by (grpc_method, grpc_code) -``` -
- -
- -### Zoekt: Zoekt Webserver GRPC "internal error" metrics - -#### zoekt: zoekt_webserver_grpc_clients_error_percentage_all_methods - -

Client baseline error percentage across all methods over 2m

- -The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "zoekt_webserver" clients. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100900` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))))))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_clients_error_percentage_per_method - -

Client baseline error percentage per-method over 2m

- -The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_webserver" clients. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100901` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method)))))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_clients_all_codes_per_method - -

Client baseline response codes rate per-method over 2m

- -The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_webserver" clients. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100902` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method, grpc_code)) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_clients_internal_error_percentage_all_methods - -

Client-observed gRPC internal error percentage across all methods over 2m

- -The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "zoekt_webserver" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_webserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. - -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100910` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))))))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_clients_internal_error_percentage_per_method - -

Client-observed gRPC internal error percentage per-method over 2m

- -The percentage of gRPC requests that appear to fail to due to gRPC internal errors per method, aggregated across all "zoekt_webserver" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_webserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. - -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100911` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method)))))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_clients_internal_error_all_codes_per_method - -

Client-observed gRPC internal error response code rate per-method over 2m

- -The rate of gRPC internal-error response codes per method, aggregated across all "zoekt_webserver" clients. - -**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_webserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug from Sourcegraph`s use of gRPC. - -When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it. - -**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it`s possible that some gRPC-specific issues might not be categorized as internal errors. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=100912` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",is_internal_error="true",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method, grpc_code)) -``` -
- -
- -### Zoekt: Zoekt Webserver GRPC retry metrics - -#### zoekt: zoekt_webserver_grpc_clients_retry_percentage_across_all_methods - -

Client retry percentage across all methods over 2m

- -The percentage of gRPC requests that were retried across all methods, aggregated across all "zoekt_webserver" clients. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101000` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))))))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_clients_retry_percentage_per_method - -

Client retry percentage per-method over 2m

- -The percentage of gRPC requests that were retried aggregated across all "zoekt_webserver" clients, broken out per method. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101001` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",is_retried="true",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method)))))) -``` -
- -
- -#### zoekt: zoekt_webserver_grpc_clients_retry_count_per_method - -

Client retry count per-method over 2m

- -The count of gRPC requests that were retried aggregated across all "zoekt_webserver" clients, broken out per method - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101002` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}",is_retried="true"}[2m])) by (grpc_method)) -``` -
- -
- -### Zoekt: Data disk I/O metrics - -#### zoekt: data_disk_reads_sec - -

Read request rate over 1m (per instance)

- -The number of read requests that were issued to the device per second. - -Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101100` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* - -
-Technical details - -Query: - -``` -(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))) -``` -
- -
- -#### zoekt: data_disk_writes_sec +
-

Write request rate over 1m (per instance)

+#### zoekt: zoekt_webserver_p90_invididual_sent_message_size_per_method -The number of write requests that were issued to the device per second. +

90th percentile individual sent message size per method over 2m

-
-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing.
+The 90th percentile size of every individual protocol buffer message sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101101` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101041` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -25288,23 +24511,21 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101101` on yo

Query:

```
-(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m])))))
+histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_webserver_method:regex}`,instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])))
```
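The `${zoekt_webserver_method:regex}` and `${webserver_instance:regex}` tokens are Grafana template variables, so the query above only evaluates as-is inside the dashboard. A minimal sketch of running it directly against Prometheus, simply substituting `.*` to match every method and instance:

```
histogram_quantile(0.90, sum by (le, name, grpc_method)(rate(grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`.*`,instance=~`.*`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])))
```

The same substitution applies to the other templated queries in this section.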
-#### zoekt: data_disk_read_throughput - -

Read throughput over 1m (per instance)

+#### zoekt: zoekt_webserver_p75_invididual_sent_message_size_per_method -The amount of data that was read from the device per second. +

75th percentile individual sent message size per method over 2m

-
-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing.
+The 75th percentile size of every individual protocol buffer message sent by the service per method, aggregated across all instances.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101110` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101042` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -25314,23 +24535,21 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101110` on yo

Query:

```
-(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))
+histogram_quantile(0.75, sum by (le, name, grpc_method)(rate(grpc_server_sent_individual_message_size_bytes_per_rpc_bucket{grpc_method=~`${zoekt_webserver_method:regex}`,instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])))
```
-#### zoekt: data_disk_write_throughput - -

Write throughput over 1m (per instance)

+#### zoekt: zoekt_webserver_grpc_response_stream_message_count_per_method -The amount of data that was written to the device per second. +

Average streaming response message count per-method over 2m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. +The average number of response messages sent during a streaming RPC method, broken out per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101050` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25340,23 +24559,21 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101111` on yo Query: ``` -(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m]))))) +((sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])) by (grpc_method))/(sum(rate(grpc_server_started_total{grpc_type="server_stream",instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])) by (grpc_method))) ```
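In other words, the panel divides the per-method rate of messages sent on server streams by the per-method rate of streams started, which yields the average number of messages per stream: 600 messages across 120 streams in the window works out to an average of 5. A minimal sketch for one method, where `StreamSearch` is a hypothetical stand-in name (pick a real one from the dashboard's method dropdown):

```
sum(rate(grpc_server_msg_sent_total{grpc_type="server_stream",grpc_method="StreamSearch",grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))
/
sum(rate(grpc_server_started_total{grpc_type="server_stream",grpc_method="StreamSearch",grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))
```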
-#### zoekt: data_disk_read_duration - -

Average read duration over 1m (per instance)

+#### zoekt: zoekt_webserver_grpc_all_codes_per_method -The average time for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. +

Response codes rate per-method over 2m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. +The rate of all generated gRPC response codes per method, aggregated across all instances. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101120` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101060` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25366,23 +24583,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101120` on yo Query: ``` -(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) +sum(rate(grpc_server_handled_total{grpc_method=~`${zoekt_webserver_method:regex}`,instance=~`${webserver_instance:regex}`,grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])) by (grpc_method, grpc_code) ```
-#### zoekt: data_disk_write_duration +### Zoekt: Zoekt Webserver GRPC "internal error" metrics -

Average write duration over 1m (per instance)

+#### zoekt: zoekt_webserver_grpc_clients_error_percentage_all_methods -The average time for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. +

Client baseline error percentage across all methods over 2m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. +The percentage of gRPC requests that fail across all methods (regardless of whether or not there was an internal error), aggregated across all "zoekt_webserver" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101121` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101100` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25392,23 +24609,21 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101121` on yo Query: ``` -(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_write_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_code!="OK"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))))))) ```
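The query's shape is `100.0 *` (rate of non-`OK` responses ÷ rate of all responses). If some non-`OK` code is expected in your environment and you want to look past it while investigating, a hedged variant of the same expression (treating `Canceled` as benign here is purely an assumption; use whatever code values your instance actually reports):

```
(100.0 * sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_code!~"OK|Canceled"}[2m])) / sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])))
```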
-#### zoekt: data_disk_read_request_size - -

Average read request size over 1m (per instance)

+#### zoekt: zoekt_webserver_grpc_clients_error_percentage_per_method -The average size of read requests that were issued to the device. +

Client baseline error percentage per-method over 2m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. +The percentage of gRPC requests that fail per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_webserver" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101130` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101101` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25418,23 +24633,21 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101130` on yo Query: ``` -(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) +(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}",grpc_code!="OK"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method)))))) ```
-#### zoekt: data_disk_write_request_size) - -

Average write request size over 1m (per instance)

+#### zoekt: zoekt_webserver_grpc_clients_all_codes_per_method -The average size of write requests that were issued to the device. +

Client baseline response codes rate per-method over 2m

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. +The rate of all generated gRPC response codes per method (regardless of whether or not there was an internal error), aggregated across all "zoekt_webserver" clients. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101131` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101102` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25444,23 +24657,27 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101131` on yo Query: ``` -(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) +(sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method, grpc_code)) ```
-#### zoekt: data_disk_reads_merged_sec +#### zoekt: zoekt_webserver_grpc_clients_internal_error_percentage_all_methods -

Merged read request rate over 1m (per instance)

+

Client-observed gRPC internal error percentage across all methods over 2m

-
-The number of read requests merged per second that were queued to the device.
+The percentage of gRPC requests that appear to fail due to gRPC internal errors across all methods, aggregated across all "zoekt_webserver" clients.

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing.
+**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_webserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC.
+
+When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.
+
+**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101140` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101110` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -25470,23 +24687,27 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101140` on yo

Query:

```
-(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_merged_total{instance=~`node-exporter.*`}[1m])))))
+(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_code!="OK",is_internal_error="true"}[2m])))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m])))))))
```
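To see which status codes the suspected internal errors actually carry, one option (a sketch that reuses only labels already present in the panel query above) is to group the same metric by `grpc_code`:

```
sum by (grpc_code) (rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",is_internal_error="true"}[2m]))
```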
-#### zoekt: data_disk_writes_merged_sec +#### zoekt: zoekt_webserver_grpc_clients_internal_error_percentage_per_method -

Merged writes request rate over 1m (per instance)

+

Client-observed gRPC internal error percentage per-method over 2m

-
-The number of write requests merged per second that were queued to the device.
+The percentage of gRPC requests that appear to fail due to gRPC internal errors per method, aggregated across all "zoekt_webserver" clients.

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing.
+**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_webserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC.
+
+When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.
+
+**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101141` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101111` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -25496,23 +24717,27 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101141` on yo

Query:

```
-(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_merged_total{instance=~`node-exporter.*`}[1m])))))
+(100.0 * ((((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}",grpc_code!="OK",is_internal_error="true"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method))))))
```
-#### zoekt: data_disk_average_queue_size +#### zoekt: zoekt_webserver_grpc_clients_internal_error_all_codes_per_method -

Average queue size over 1m (per instance)

+

Client-observed gRPC internal error response code rate per-method over 2m

-
-The number of I/O operations that were being queued or being serviced. See https://blog.actorsfit.com/a?ID=00200-428fa2ac-e338-4540-848c-af9a3eb1ebd2 for background (avgqu-sz).
+The rate of gRPC internal-error response codes per method, aggregated across all "zoekt_webserver" clients.

-Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing.
+**Note**: Internal errors are ones that appear to originate from the https://github.com/grpc/grpc-go library itself, rather than from any user-written application code. These errors can be caused by a variety of issues, and can originate from either the code-generated "zoekt_webserver" gRPC client or gRPC server. These errors might be solvable by adjusting the gRPC configuration, or they might indicate a bug in Sourcegraph's use of gRPC.
+
+When debugging, knowing that a particular error comes from the grpc-go library itself (an `internal error`) as opposed to `normal` application code can be helpful when trying to fix it.
+
+**Note**: Internal errors are detected via a very coarse heuristic (seeing if the error starts with `grpc:`, etc.). Because of this, it's possible that some gRPC-specific issues might not be categorized as internal errors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101150` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101112` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -25522,27 +24747,19 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101150` on yo

Query:

```
-(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_io_time_weighted_seconds_total{instance=~`node-exporter.*`}[1m])))))
+(sum(rate(src_grpc_method_status{grpc_service=~"zoekt.webserver.v1.WebserverService",is_internal_error="true",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method, grpc_code))
```
-### Zoekt: [zoekt-indexserver] Container monitoring (not available on server) - -#### zoekt: container_missing +### Zoekt: Zoekt Webserver GRPC retry metrics -

Container missing

+#### zoekt: zoekt_webserver_grpc_clients_retry_percentage_across_all_methods -This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. +

Client retry percentage across all methods over 2m

-- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod zoekt-indexserver` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p zoekt-indexserver`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' zoekt-indexserver` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the zoekt-indexserver container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs zoekt-indexserver` (note this will include logs from the previous and currently running container). +The percentage of gRPC requests that were retried across all methods, aggregated across all "zoekt_webserver" clients. This panel has no related alerts. @@ -25556,17 +24773,19 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101200` on yo Query: ``` -count by(name) ((time() - container_last_seen{name=~"^zoekt-indexserver.*"}) > 60) +(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",is_retried="true"}[2m])))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))))))) ```
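For a quick ad-hoc threshold check on top of this ratio, a hypothetical sketch (the `> 5` cutoff is an arbitrary illustration, not a recommended value):

```
(100.0 * sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",is_retried="true"}[2m])) / sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService"}[2m]))) > 5
```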
-#### zoekt: container_cpu_usage +#### zoekt: zoekt_webserver_grpc_clients_retry_percentage_per_method -

Container cpu usage total (1m average) across all cores by instance

+

Client retry percentage per-method over 2m

-
-Refer to the [alerts reference](alerts#zoekt-container-cpu-usage) for 1 alert related to this panel.
+The percentage of gRPC requests that were retried, aggregated across all "zoekt_webserver" clients, broken out per method.
+
+This panel has no related alerts.

To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101201` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -25578,17 +24797,19 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101201` on yo

Query:

```
-cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}
+(100.0 * ((((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",is_retried="true",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method))) / ((sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}"}[2m])) by (grpc_method))))))
```
-#### zoekt: container_memory_usage +#### zoekt: zoekt_webserver_grpc_clients_retry_count_per_method -

Container memory usage by instance

+

Client retry count per-method over 2m

-
-Refer to the [alerts reference](alerts#zoekt-container-memory-usage) for 1 alert related to this panel.
+The count of gRPC requests that were retried, aggregated across all "zoekt_webserver" clients, broken out per method.
+
+This panel has no related alerts.

To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101202` on your Sourcegraph instance.

*Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).*

@@ -25600,22 +24821,25 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101202` on yo

Query:

```
-cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"}
+(sum(rate(src_grpc_client_retry_attempts_total{grpc_service=~"zoekt.webserver.v1.WebserverService",grpc_method=~"${zoekt_webserver_method:regex}",is_retried="true"}[2m])) by (grpc_method))
```
-#### zoekt: fs_io_operations +### Zoekt: Data disk I/O metrics -

Filesystem reads and writes rate by instance over 1h

+#### zoekt: data_disk_reads_sec -This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +

Read request rate over 1m (per instance)

+ +The number of read requests that were issued to the device per second. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101203` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101300` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25625,31 +24849,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101203` on yo Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^zoekt-indexserver.*"}[1h]) + rate(container_fs_writes_total{name=~"^zoekt-indexserver.*"}[1h])) +(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))) ```
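The query works by joining `zoekt_indexserver_mount_point_info` (whose `device` and `nodename` labels identify the disk backing the `indexDir` mount) onto the per-device rates reported by node_exporter, via `on (device, nodename) group_left()`. To inspect the raw per-device rates without the mount-point join, a simplified sketch using just the inner sub-expression of the panel query:

```
max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))
```

The same join pattern underlies every panel in this data disk section.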
-### Zoekt: [zoekt-webserver] Container monitoring (not available on server) - -#### zoekt: container_missing +#### zoekt: data_disk_writes_sec -

Container missing

+

Write request rate over 1m (per instance)

-This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. +The number of write requests that were issued to the device per second. -- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod zoekt-webserver` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p zoekt-webserver`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' zoekt-webserver` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the zoekt-webserver container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs zoekt-webserver` (note this will include logs from the previous and currently running container). +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101301` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25659,19 +24875,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101300` on yo Query: ``` -count by(name) ((time() - container_last_seen{name=~"^zoekt-webserver.*"}) > 60) +(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### zoekt: container_cpu_usage +#### zoekt: data_disk_read_throughput -

Container cpu usage total (1m average) across all cores by instance

+

Read throughput over 1m (per instance)

-Refer to the [alerts reference](alerts#zoekt-container-cpu-usage) for 1 alert related to this panel. +The amount of data that was read from the device per second. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101301` on your Sourcegraph instance. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101310` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25681,19 +24901,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101301` on yo Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"} +(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### zoekt: container_memory_usage +#### zoekt: data_disk_write_throughput -

Container memory usage by instance

+

Write throughput over 1m (per instance)

-Refer to the [alerts reference](alerts#zoekt-container-memory-usage) for 1 alert related to this panel. +The amount of data that was written to the device per second. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. + +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101311` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25703,22 +24927,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101302` on yo Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-webserver.*"} +(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### zoekt: fs_io_operations +#### zoekt: data_disk_read_duration + +

Average read duration over 1m (per instance)

-

Filesystem reads and writes rate by instance over 1h

+The average time for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. -This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101303` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101320` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25728,21 +24953,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101303` on yo Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^zoekt-webserver.*"}[1h]) + rate(container_fs_writes_total{name=~"^zoekt-webserver.*"}[1h])) +(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
-### Zoekt: [zoekt-indexserver] Provisioning indicators (not available on server) +#### zoekt: data_disk_write_duration -#### zoekt: provisioning_container_cpu_usage_long_term +

Average write duration over 1m (per instance)

-

Container cpu usage total (90th percentile over 1d) across all cores by instance

+The average time for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them. -Refer to the [alerts reference](alerts#zoekt-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101400` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101321` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25752,19 +24979,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101400` on yo Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}[1d]) +(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_write_time_seconds_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
-#### zoekt: provisioning_container_memory_usage_long_term +#### zoekt: data_disk_read_request_size -

Container memory usage (1d maximum) by instance

+

Average read request size over 1m (per instance)

-Refer to the [alerts reference](alerts#zoekt-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +The average size of read requests that were issued to the device. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101401` on your Sourcegraph instance. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101330` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25774,19 +25005,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101401` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"}[1d]) +(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_read_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
-
-#### zoekt: provisioning_container_cpu_usage_short_term
+#### zoekt: data_disk_write_request_size
+
+

Average write request size over 1m (per instance)

-

Container cpu usage total (5m maximum) across all cores by instance

+The average size of write requests that were issued to the device. -Refer to the [alerts reference](alerts#zoekt-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. + +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101410` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101331` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25796,19 +25031,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101410` on yo Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-indexserver.*"}[5m]) +(((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_written_bytes_total{instance=~`node-exporter.*`}[1m])))))) / ((max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_completed_total{instance=~`node-exporter.*`}[1m]))))))) ```
-#### zoekt: provisioning_container_memory_usage_short_term +#### zoekt: data_disk_reads_merged_sec -

Container memory usage (5m maximum) by instance

+

Merged read request rate over 1m (per instance)

+ +The number of read requests merged per second that were queued to the device. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. -Refer to the [alerts reference](alerts#zoekt-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101411` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101340` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25818,22 +25057,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101411` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-indexserver.*"}[5m]) +(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_reads_merged_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### zoekt: container_oomkill_events_total +#### zoekt: data_disk_writes_merged_sec -

Container OOMKILL events total by instance

+

Merged writes request rate over 1m (per instance)

-This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +The number of write requests merged per second that were queued to the device. + +Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. -Refer to the [alerts reference](alerts#zoekt-container-oomkill-events-total) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101412` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101341` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25843,21 +25083,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101412` on yo Query: ``` -max by (name) (container_oom_events_total{name=~"^zoekt-indexserver.*"}) +(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_writes_merged_total{instance=~`node-exporter.*`}[1m]))))) ```
-### Zoekt: [zoekt-webserver] Provisioning indicators (not available on server) +#### zoekt: data_disk_average_queue_size + +

Average queue size over 1m (per instance)

-#### zoekt: provisioning_container_cpu_usage_long_term +The number of I/O operations that were being queued or being serviced. See https://blog.actorsfit.com/a?ID=00200-428fa2ac-e338-4540-848c-af9a3eb1ebd2 for background (avgqu-sz). -

Container cpu usage total (90th percentile over 1d) across all cores by instance

+Note: Disk statistics are per _device_, not per _service_. In certain environments (such as common docker-compose setups), zoekt could be one of _many services_ using this disk. These statistics are best interpreted as the load experienced by the device zoekt is using, not the load zoekt is solely responsible for causing. -Refer to the [alerts reference](alerts#zoekt-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101350` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25867,19 +25109,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101500` on yo Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"}[1d]) +(max by (instance) (zoekt_indexserver_mount_point_info{mount_name="indexDir",instance=~`${instance:regex}`} * on (device, nodename) group_left() (max by (device, nodename) (rate(node_disk_io_time_weighted_seconds_total{instance=~`node-exporter.*`}[1m]))))) ```
-#### zoekt: provisioning_container_memory_usage_long_term +### Zoekt: [indexed-search-indexer] Golang runtime monitoring -

Container memory usage (1d maximum) by instance

+#### zoekt: go_goroutines -Refer to the [alerts reference](alerts#zoekt-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +

Maximum active goroutines

-To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101501` on your Sourcegraph instance. +A high value here indicates a possible goroutine leak. + +Refer to the [alerts reference](alerts#zoekt-go-goroutines) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101400` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25889,19 +25135,19 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101501` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-webserver.*"}[1d]) +max by(instance) (go_goroutines{job=~".*indexed-search-indexer"}) ```
-#### zoekt: provisioning_container_cpu_usage_short_term +#### zoekt: go_gc_duration_seconds -

Container cpu usage total (5m maximum) across all cores by instance

+

Maximum go garbage collection duration

-Refer to the [alerts reference](alerts#zoekt-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#zoekt-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101401` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25911,19 +25157,23 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101510` on yo Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^zoekt-webserver.*"}[5m]) +max by(instance) (go_gc_duration_seconds{job=~".*indexed-search-indexer"}) ```
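+
+`go_gc_duration_seconds` is a summary metric with one series per quantile, and the `max by(instance)` above collapses them. To inspect only the worst-case pause, a sketch (assuming the standard Go client quantile labels are present):
+
+```
+# worst-case GC pause per instance: the quantile="1" series of the summary
+max by(instance) (go_gc_duration_seconds{job=~".*indexed-search-indexer",quantile="1"})
+```
+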
-#### zoekt: provisioning_container_memory_usage_short_term +### Zoekt: [indexed-search] Golang runtime monitoring -

Container memory usage (5m maximum) by instance

+#### zoekt: go_goroutines + +

Maximum active goroutines

+ +A high value here indicates a possible goroutine leak. -Refer to the [alerts reference](alerts#zoekt-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#zoekt-go-goroutines) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101500` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25933,22 +25183,19 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101511` on yo Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^zoekt-webserver.*"}[5m]) +max by(instance) (go_goroutines{job=~".*indexed-search"}) ```
-#### zoekt: container_oomkill_events_total +#### zoekt: go_gc_duration_seconds -

Container OOMKILL events total by instance

- -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +

Maximum go garbage collection duration

-Refer to the [alerts reference](alerts#zoekt-container-oomkill-events-total) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#zoekt-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101501` on your Sourcegraph instance. *Managed by the [Sourcegraph Search Platform team](https://handbook.sourcegraph.com/departments/engineering/teams/search/core).* @@ -25958,7 +25205,7 @@ To see this panel, visit `/-/debug/grafana/d/zoekt/zoekt?viewPanel=101512` on yo Query: ``` -max by (name) (container_oom_events_total{name=~"^zoekt-webserver.*"}) +max by(instance) (go_gc_duration_seconds{job=~".*indexed-search"}) ``` @@ -26507,87 +25754,13 @@ To see this dashboard, visit `/-/debug/grafana/d/executor/executor` on your Sour ### Executor: Executor: Executor jobs -#### executor: executor_queue_size - -

Unprocessed executor job queue size

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100000` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -max by (queue)(src_executor_total{queue=~"$queue",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}) -``` -
- -
- -#### executor: executor_queue_growth_rate - -

Unprocessed executor job queue growth rate over 30m

- -This value compares the rate of enqueues against the rate of finished jobs for the selected queue. - - - A value < than 1 indicates that process rate > enqueue rate - - A value = than 1 indicates that process rate = enqueue rate - - A value > than 1 indicates that process rate < enqueue rate - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100001` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (queue)(increase(src_executor_total{queue=~"$queue",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}[30m])) / sum by (queue)(increase(src_executor_processor_total{queue=~"$queue",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}[30m])) -``` -
- -
- -#### executor: executor_queued_max_age - -

Unprocessed executor job queue longest time in queue

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100002` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -max by (queue)(src_executor_queued_duration_seconds_total{queue=~"$queue",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}) -``` -
- -
- -### Executor: Executor: Executor jobs - #### executor: multiqueue_executor_dequeue_cache_size

Unprocessed executor job dequeue cache size for multiqueue executors

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26611,7 +25784,7 @@ multiqueue_executor_dequeue_cache_size{queue=~"$queue",job=~"^(executor|sourcegr Refer to the [alerts reference](alerts#executor-executor-handlers) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100100` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26633,7 +25806,7 @@ sum(src_executor_processor_handlers{queue=~"${queue:regex}",sg_job=~"^sourcegrap This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100110` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26655,7 +25828,7 @@ sum(increase(src_executor_processor_total{queue=~"${queue:regex}",sg_job=~"^sour This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100111` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26677,7 +25850,7 @@ sum by (le)(rate(src_executor_processor_duration_seconds_bucket{queue=~"${queue This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100112` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26699,7 +25872,7 @@ sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",sg_job= Refer to the [alerts reference](alerts#executor-executor-processor-error-rate) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100213` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100113` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26723,7 +25896,7 @@ sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",sg_job= This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100200` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26745,7 +25918,7 @@ sum(increase(src_apiworker_apiclient_queue_total{sg_job=~"^sourcegraph-executors This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100201` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26767,7 +25940,7 @@ sum by (le)(rate(src_apiworker_apiclient_queue_duration_seconds_bucket{sg_job=~ This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100302` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100202` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26789,7 +25962,7 @@ sum(increase(src_apiworker_apiclient_queue_errors_total{sg_job=~"^sourcegraph-ex This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100303` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100203` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26811,7 +25984,7 @@ sum(increase(src_apiworker_apiclient_queue_errors_total{sg_job=~"^sourcegraph-ex This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100210` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26833,7 +26006,7 @@ sum by (op)(increase(src_apiworker_apiclient_queue_total{sg_job=~"^sourcegraph-e This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100211` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26855,7 +26028,7 @@ histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_queue_dura This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100212` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26877,7 +26050,7 @@ sum by (op)(increase(src_apiworker_apiclient_queue_errors_total{sg_job=~"^source This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100313` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100213` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26901,7 +26074,7 @@ sum by (op)(increase(src_apiworker_apiclient_queue_errors_total{sg_job=~"^source This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100300` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26923,7 +26096,7 @@ sum(increase(src_apiworker_apiclient_files_total{sg_job=~"^sourcegraph-executors This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100301` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26945,7 +26118,7 @@ sum by (le)(rate(src_apiworker_apiclient_files_duration_seconds_bucket{sg_job=~ This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100302` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26967,7 +26140,7 @@ sum(increase(src_apiworker_apiclient_files_errors_total{sg_job=~"^sourcegraph-ex This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100403` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100303` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -26989,7 +26162,7 @@ sum(increase(src_apiworker_apiclient_files_errors_total{sg_job=~"^sourcegraph-ex This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100410` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100310` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27011,7 +26184,7 @@ sum by (op)(increase(src_apiworker_apiclient_files_total{sg_job=~"^sourcegraph-e This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100411` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100311` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27033,7 +26206,7 @@ histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_files_dura This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100412` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100312` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27055,7 +26228,7 @@ sum by (op)(increase(src_apiworker_apiclient_files_errors_total{sg_job=~"^source This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100413` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100313` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27079,7 +26252,7 @@ sum by (op)(increase(src_apiworker_apiclient_files_errors_total{sg_job=~"^source This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100400` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27101,7 +26274,7 @@ sum(increase(src_apiworker_command_total{op=~"setup.*",sg_job=~"^sourcegraph-exe This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100401` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27123,7 +26296,7 @@ sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",sg This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100502` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100402` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27145,7 +26318,7 @@ sum(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegr This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100503` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100403` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27167,7 +26340,7 @@ sum(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegr This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100410` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27189,7 +26362,7 @@ sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",sg_job=~"^sourceg This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100411` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27211,7 +26384,7 @@ histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_sec This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100412` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27233,7 +26406,7 @@ sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^ This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100513` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100413` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27257,7 +26430,7 @@ sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^ This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100500` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27279,7 +26452,7 @@ sum(increase(src_apiworker_command_total{op=~"exec.*",sg_job=~"^sourcegraph-exec This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100501` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27301,7 +26474,7 @@ sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",sg_ This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100602` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100502` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27323,7 +26496,7 @@ sum(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegra This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100603` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100503` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27345,7 +26518,7 @@ sum(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegra This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100510` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27367,7 +26540,7 @@ sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",sg_job=~"^sourcegr This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100511` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27389,7 +26562,7 @@ histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_sec This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100612` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100512` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27411,7 +26584,7 @@ sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^s This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100613` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100513` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27435,7 +26608,7 @@ sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^s This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100600` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27457,7 +26630,7 @@ sum(increase(src_apiworker_command_total{op=~"teardown.*",sg_job=~"^sourcegraph- This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100601` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27479,7 +26652,7 @@ sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*" This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100702` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100602` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27501,7 +26674,7 @@ sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourc This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100703` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100603` on your Sourcegraph instance. 
*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27523,7 +26696,7 @@ sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourc This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100610` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27545,7 +26718,7 @@ sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",sg_job=~"^sour This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100711` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100611` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27567,7 +26740,7 @@ histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_sec This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100712` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100612` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27589,7 +26762,7 @@ sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job= This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100713` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100613` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -27615,7 +26788,7 @@ Indicates the amount of CPU time excluding idle and iowait time, divided by the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100700` on your Sourcegraph instance.
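+
+For reference outside this dashboard: on a host scraped by a stock node_exporter, the "CPU time excluding idle and iowait, divided by the number of cores" idea reduces to the sketch below. Illustrative only; the executor panels are built from their own scraped metrics rather than this expression:
+
+```
+# per-CPU idle+iowait fraction, averaged across cores, inverted to give utilization
+1 - avg by (instance) (sum by (instance, cpu) (rate(node_cpu_seconds_total{mode=~"idle|iowait"}[5m])))
+```
+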
@@ -27638,7 +26811,7 @@ Indicates the average summed time a number of (but strictly not all) non-idle pr This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100701` on your Sourcegraph instance.
@@ -27661,7 +26834,7 @@ Indicates the amount of available memory (including cache and buffers) as a perc This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100810` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100710` on your Sourcegraph instance.
@@ -27684,7 +26857,7 @@ Indicates the efficiency of page reclaim, calculated as pgsteal/pgscan. Optimal This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100711` on your Sourcegraph instance.
@@ -27707,7 +26880,7 @@ Indicates the amount of time all non-idle processes were stalled waiting on memo This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100812` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100712` on your Sourcegraph instance.
@@ -27730,7 +26903,7 @@ Indicates the percentage of time a disk was busy. If this is less than 100%, the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100820` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100720` on your Sourcegraph instance.
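+
+On a stock node_exporter target, the equivalent raw signal is the I/O time counter: its rate is the fraction of wall-clock time the device was busy. An illustrative sketch, not the dashboard's own query:
+
+```
+# percentage of time each device spent doing I/O
+rate(node_disk_io_time_seconds_total[1m]) * 100
+```
+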
@@ -27753,7 +26926,7 @@ Indicates the number of outstanding/queued IO requests. High but short-lived que This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100821` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100721` on your Sourcegraph instance.
@@ -27776,7 +26949,7 @@ Indicates the averaged amount of time for which all non-idle processes were stal This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100822` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100722` on your Sourcegraph instance.
@@ -27799,7 +26972,7 @@ Indicates the average summed receiving throughput of all network interfaces. Thi This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100830` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100730` on your Sourcegraph instance.
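+
+On a stock node_exporter target, the equivalent receive-throughput expression is the sketch below; loopback is excluded so the sum reflects real interfaces only (illustrative, not the dashboard's own query):
+
+```
+# bytes received per second, summed across non-loopback interfaces
+sum by (instance) (rate(node_network_receive_bytes_total{device!="lo"}[5m]))
+```
+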
@@ -27822,7 +26995,7 @@ Number of dropped received packets. This can happen if the receive queues/buffer This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100831` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100731` on your Sourcegraph instance.
@@ -27845,7 +27018,7 @@ Number of bad/malformed packets received. https://www.kernel.org/doc/html/latest This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100832` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100732` on your Sourcegraph instance.
@@ -27868,7 +27041,7 @@ Indicates the average summed transmitted throughput of all network interfaces. T This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100840` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100740` on your Sourcegraph instance.
@@ -27891,7 +27064,7 @@ Number of dropped transmitted packets. This can happen if the receiving side`s r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100841` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100741` on your Sourcegraph instance.
@@ -27914,7 +27087,7 @@ Number of packet transmission errors. This is distinct from tx packet dropping, This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100842` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100742` on your Sourcegraph instance.
@@ -27939,7 +27112,7 @@ Indicates the amount of CPU time excluding idle and iowait time, divided by the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100800` on your Sourcegraph instance.
@@ -27962,7 +27135,7 @@ Indicates the average summed time a number of (but strictly not all) non-idle pr This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100901` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100801` on your Sourcegraph instance.
@@ -27985,7 +27158,7 @@ Indicates the amount of available memory (including cache and buffers) as a perc This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100910` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100810` on your Sourcegraph instance.
@@ -28008,7 +27181,7 @@ Indicates the efficiency of page reclaim, calculated as pgsteal/pgscan. Optimal This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100911` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100811` on your Sourcegraph instance.
@@ -28031,7 +27204,7 @@ Indicates the amount of time all non-idle processes were stalled waiting on memo This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100912` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100812` on your Sourcegraph instance.
@@ -28054,7 +27227,7 @@ Indicates the percentage of time a disk was busy. If this is less than 100%, the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100920` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100820` on your Sourcegraph instance.
@@ -28077,7 +27250,7 @@ Indicates the number of outstanding/queued IO requests. High but short-lived que This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100921` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100821` on your Sourcegraph instance.
@@ -28100,7 +27273,7 @@ Indicates the averaged amount of time for which all non-idle processes were stal This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100922` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100822` on your Sourcegraph instance.
@@ -28123,7 +27296,7 @@ Indicates the average summed receiving throughput of all network interfaces. Thi This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100930` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100830` on your Sourcegraph instance.
@@ -28146,7 +27319,7 @@ Number of dropped received packets. This can happen if the receive queues/buffer This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100931` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100831` on your Sourcegraph instance.
@@ -28169,7 +27342,7 @@ Number of bad/malformed packets received. https://www.kernel.org/doc/html/latest This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100932` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100832` on your Sourcegraph instance.
@@ -28192,7 +27365,7 @@ Indicates the average summed transmitted throughput of all network interfaces. T This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100940` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100840` on your Sourcegraph instance.
@@ -28215,7 +27388,7 @@ Number of dropped transmitted packets. This can happen if the receiving side`s r This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100941` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100841` on your Sourcegraph instance.
@@ -28238,7 +27411,7 @@ Number of packet transmission errors. This is distinct from tx packet dropping, This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100942` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100842` on your Sourcegraph instance.
@@ -28263,7 +27436,7 @@ A high value here indicates a possible goroutine leak. Refer to the [alerts reference](alerts#executor-go-goroutines) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100900` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -28285,7 +27458,7 @@ max by(sg_instance) (go_goroutines{sg_job=~".*sourcegraph-executors"}) Refer to the [alerts reference](alerts#executor-go-gc-duration-seconds) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=101001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100901` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -28327,7 +27500,7 @@ To see this panel, visit `/-/debug/grafana/d/containers/containers?viewPanel=100 Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|repo-updater|searcher|symbols|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"} +cadvisor_container_memory_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|searcher|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"} ```
@@ -28351,7 +27524,7 @@ To see this panel, visit `/-/debug/grafana/d/containers/containers?viewPanel=100 Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|repo-updater|searcher|symbols|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"} +cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|searcher|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"} ```
@@ -28377,7 +27550,7 @@ To see this panel, visit `/-/debug/grafana/d/containers/containers?viewPanel=100 Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|repo-updater|searcher|symbols|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}[5m]) >= 80 +max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|searcher|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}[5m]) >= 80 ```
@@ -28401,7 +27574,7 @@ To see this panel, visit `/-/debug/grafana/d/containers/containers?viewPanel=100 Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|repo-updater|searcher|symbols|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}[5m]) >= 80 +max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|searcher|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}[5m]) >= 80 ```
@@ -28426,7 +27599,7 @@ To see this panel, visit `/-/debug/grafana/d/containers/containers?viewPanel=100 Query: ``` -max by (name) (container_oom_events_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|repo-updater|searcher|symbols|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}) >= 1 +max by (name) (container_oom_events_total{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|searcher|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}) >= 1 ```
@@ -28451,7 +27624,7 @@ To see this panel, visit `/-/debug/grafana/d/containers/containers?viewPanel=100 Query: ``` -count by(name) ((time() - container_last_seen{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|repo-updater|searcher|symbols|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}) > 60) +count by(name) ((time() - container_last_seen{name=~"^(frontend|sourcegraph-frontend|gitserver|pgsql|codeintel-db|codeinsights|precise-code-intel-worker|prometheus|redis-cache|redis-store|redis-exporter|searcher|syntect-server|worker|zoekt-indexserver|zoekt-webserver|indexed-search|grafana|blobstore|jaeger).*"}) > 60) ```
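+
+When a single service is the suspect, the same query narrows to it by editing the `name` regex; for example, to watch only gitserver containers (any service name from the pattern above substitutes the same way):
+
+```
+# gitserver containers that cAdvisor has not seen for over a minute
+count by(name) ((time() - container_last_seen{name=~"^gitserver.*"}) > 60)
+```
+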
@@ -28509,78 +27682,6 @@ sum(increase(src_codeintel_autoindexing_errors_total{op='HandleIndexSchedule',jo
-#### codeintel-autoindexing: executor_queue_size - -

Unprocessed executor job queue size

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-autoindexing/codeintel-autoindexing?viewPanel=100010` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -max by (queue)(src_executor_total{queue=~"codeintel",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}) -``` -
- -
- -#### codeintel-autoindexing: executor_queue_growth_rate - -

Unprocessed executor job queue growth rate over 30m

- -This value compares the rate of enqueues against the rate of finished jobs for the selected queue. - - - A value < than 1 indicates that process rate > enqueue rate - - A value = than 1 indicates that process rate = enqueue rate - - A value > than 1 indicates that process rate < enqueue rate - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-autoindexing/codeintel-autoindexing?viewPanel=100011` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (queue)(increase(src_executor_total{queue=~"codeintel",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}[30m])) / sum by (queue)(increase(src_executor_processor_total{queue=~"codeintel",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}[30m])) -``` -
- -
- -#### codeintel-autoindexing: executor_queued_max_age - -

Unprocessed executor job queue longest time in queue

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-autoindexing/codeintel-autoindexing?viewPanel=100012` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -max by (queue)(src_executor_queued_duration_seconds_total{queue=~"codeintel",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors).*"}) -``` -
- -
- ### Code Intelligence > Autoindexing: Codeintel: Autoindexing > Service #### codeintel-autoindexing: codeintel_autoindexing_total @@ -31347,21 +30448,21 @@ sum(increase(src_codeintel_background_policies_updated_total_total{job=~"^${sour
-## Code Intelligence > Ranking +## Code Intelligence > Uploads -

The service at `internal/codeintel/ranking`.

+

The service at `internal/codeintel/uploads`.

-To see this dashboard, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking` on your Sourcegraph instance. +To see this dashboard, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads` on your Sourcegraph instance. -### Code Intelligence > Ranking: Codeintel: Ranking > Service +### Code Intelligence > Uploads: Codeintel: Uploads > Service -#### codeintel-ranking: codeintel_ranking_total +#### codeintel-uploads: codeintel_uploads_total

Aggregate service operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31371,19 +30472,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_99th_percentile_duration

Aggregate successful service operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100001` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31393,19 +30494,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (le)(rate(src_codeintel_ranking_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) +sum by (le)(rate(src_codeintel_uploads_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_errors_total +#### codeintel-uploads: codeintel_uploads_errors_total

Aggregate service operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100002` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100002` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31415,19 +30516,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_errors_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_error_rate +#### codeintel-uploads: codeintel_uploads_error_rate

Aggregate service operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100003` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100003` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31437,19 +30538,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_ranking_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_ranking_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```

-#### codeintel-ranking: codeintel_ranking_total +#### codeintel-uploads: codeintel_uploads_total

Service operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100010` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31459,19 +30560,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_99th_percentile_duration

99th percentile successful service operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100011` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31481,19 +30582,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```

-#### codeintel-ranking: codeintel_ranking_errors_total +#### codeintel-uploads: codeintel_uploads_errors_total

Service operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100012` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100012` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31503,197 +30604,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_error_rate +#### codeintel-uploads: codeintel_uploads_error_rate

Service operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100013` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 -``` -
- -
- -### Code Intelligence > Ranking: Codeintel: Ranking > Store - -#### codeintel-ranking: codeintel_ranking_store_total - -

Aggregate store operations every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100100` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_codeintel_ranking_store_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_store_99th_percentile_duration - -

Aggregate successful store operation duration distribution over 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100101` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (le)(rate(src_codeintel_ranking_store_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_store_errors_total - -

Aggregate store operation errors every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100102` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_codeintel_ranking_store_errors_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_store_error_rate - -

Aggregate store operation error rate over 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100103` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_codeintel_ranking_store_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_ranking_store_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_ranking_store_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_store_total - -

Store operations every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100110` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_store_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_store_99th_percentile_duration - -

99th percentile successful store operation duration over 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100111` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_store_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_store_errors_total - -

Store operation errors every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100112` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_store_errors_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_store_error_rate - -

Store operation error rate over 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100113` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100013` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31703,21 +30626,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_store_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_store_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_store_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```

-### Code Intelligence > Ranking: Codeintel: Ranking > LSIFStore +### Code Intelligence > Uploads: Codeintel: Uploads > Store (internal) -#### codeintel-ranking: codeintel_ranking_lsifstore_total +#### codeintel-uploads: codeintel_uploads_store_total

Aggregate store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100100` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31727,19 +30650,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_lsifstore_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_lsifstore_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_store_99th_percentile_duration

Aggregate successful store operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100101` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31749,19 +30672,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (le)(rate(src_codeintel_ranking_lsifstore_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) +sum by (le)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_lsifstore_errors_total +#### codeintel-uploads: codeintel_uploads_store_errors_total

Aggregate store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100102` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31771,19 +30694,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_lsifstore_errors_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_lsifstore_error_rate +#### codeintel-uploads: codeintel_uploads_store_error_rate

Aggregate store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100203` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100103` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31793,19 +30716,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_lsifstore_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_ranking_lsifstore_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_ranking_lsifstore_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
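
Every `*_error_rate` panel on this dashboard shares the query shape above: the error counter divided by the sum of the base counter and the error counter, scaled to a percentage. The addition in the denominator suggests the base `_total` counter tracks successful operations, so errors are added back in to recover total attempts. Written out as a standalone sketch (job matcher dropped and indentation added for readability; not the literal panel query):

```
# error % = errors / (successes + errors) * 100
100 *
  sum(increase(src_codeintel_uploads_store_errors_total[5m]))
/
  (
      sum(increase(src_codeintel_uploads_store_total[5m]))
    + sum(increase(src_codeintel_uploads_store_errors_total[5m]))
  )
```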

-#### codeintel-ranking: codeintel_ranking_lsifstore_total +#### codeintel-uploads: codeintel_uploads_store_total

Store operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100110` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31815,19 +30738,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_lsifstore_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_lsifstore_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_store_99th_percentile_duration

99th percentile successful store operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100111` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -31837,251 +30760,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_lsifstore_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
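
The `99th_percentile_duration` panels are all built from Prometheus histograms: `rate(..._duration_seconds_bucket[5m])` turns each cumulative bucket into a per-second rate, `sum by (le,op)` keeps only the bucket boundary (`le`) and operation labels, and `histogram_quantile` interpolates the requested quantile from the bucket counts. Only the first argument changes for other percentiles; a sketch of a median (p50) variant of the query above:

```
# p50 instead of p99; everything else is unchanged.
histogram_quantile(0.50, sum by (le,op)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
```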

-#### codeintel-ranking: codeintel_ranking_lsifstore_errors_total +#### codeintel-uploads: codeintel_uploads_store_errors_total

Store operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100212` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_lsifstore_errors_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_lsifstore_error_rate - -

Store operation error rate over 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100213` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_lsifstore_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_lsifstore_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_lsifstore_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 -``` -
- -
- -### Code Intelligence > Ranking: Codeintel: Uploads > Pipeline task > Codeintel ranking symbol exporter - -#### codeintel-ranking: codeintel_ranking_symbol_exporter_records_processed_total - -

Records processed every 5m

- -The number of candidate records considered for cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100300` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_codeintel_ranking_symbol_exporter_records_processed_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_symbol_exporter_records_altered_total - -

Records altered every 5m

- -The number of candidate records altered as part of cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100301` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_codeintel_ranking_symbol_exporter_records_altered_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_symbol_exporter_total - -

Job invocation operations every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100310` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_symbol_exporter_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_symbol_exporter_99th_percentile_duration - -

99th percentile successful job invocation operation duration over 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100311` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_symbol_exporter_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_symbol_exporter_errors_total - -

Job invocation operation errors every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100312` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_symbol_exporter_errors_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_symbol_exporter_error_rate - -

Job invocation operation error rate over 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100313` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum by (op)(increase(src_codeintel_ranking_symbol_exporter_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_symbol_exporter_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_symbol_exporter_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 -``` -
- -
- -### Code Intelligence > Ranking: Codeintel: Uploads > Pipeline task > Codeintel ranking file reference count seed mapper - -#### codeintel-ranking: codeintel_ranking_file_reference_count_seed_mapper_records_processed_total - -

Records processed every 5m

- -The number of candidate records considered for cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100400` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_codeintel_ranking_file_reference_count_seed_mapper_records_processed_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_file_reference_count_seed_mapper_records_altered_total - -

Records altered every 5m

- -The number of candidate records altered as part of cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100401` on your Sourcegraph instance. - -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* - -
-Technical details - -Query: - -``` -sum(increase(src_codeintel_ranking_file_reference_count_seed_mapper_records_altered_total{job=~"^${source:regex}.*"}[5m])) -``` -
- -
- -#### codeintel-ranking: codeintel_ranking_file_reference_count_seed_mapper_total - -

Job invocation operations every 5m

- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100410` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100112` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32091,19 +30782,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_seed_mapper_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_file_reference_count_seed_mapper_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_store_error_rate -

99th percentile successful job invocation operation duration over 5m

+

Store operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100411` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100113` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32113,19 +30804,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_file_reference_count_seed_mapper_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```

-#### codeintel-ranking: codeintel_ranking_file_reference_count_seed_mapper_errors_total +### Code Intelligence > Uploads: Codeintel: Uploads > GQL Transport + +#### codeintel-uploads: codeintel_uploads_transport_graphql_total -

Job invocation operation errors every 5m

+

Aggregate resolver operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100412` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100200` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32135,19 +30828,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_seed_mapper_errors_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_file_reference_count_seed_mapper_error_rate +#### codeintel-uploads: codeintel_uploads_transport_graphql_99th_percentile_duration -

Job invocation operation error rate over 5m

+

Aggregate successful resolver operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100413` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100201` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32157,23 +30850,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_seed_mapper_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_file_reference_count_seed_mapper_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_file_reference_count_seed_mapper_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (le)(rate(src_codeintel_uploads_transport_graphql_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) ```

-### Code Intelligence > Ranking: Codeintel: Uploads > Pipeline task > Codeintel ranking file reference count mapper - -#### codeintel-ranking: codeintel_ranking_file_reference_count_mapper_records_processed_total - -

Records processed every 5m

+#### codeintel-uploads: codeintel_uploads_transport_graphql_errors_total -The number of candidate records considered for cleanup. +

Aggregate resolver operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100202` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32183,21 +30872,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_file_reference_count_mapper_records_processed_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_file_reference_count_mapper_records_altered_total - -

Records altered every 5m

+#### codeintel-uploads: codeintel_uploads_transport_graphql_error_rate -The number of candidate records altered as part of cleanup. +

Aggregate resolver operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100203` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32207,19 +30894,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_file_reference_count_mapper_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```

-#### codeintel-ranking: codeintel_ranking_file_reference_count_mapper_total +#### codeintel-uploads: codeintel_uploads_transport_graphql_total -

Job invocation operations every 5m

+

Resolver operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100510` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100210` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32229,19 +30916,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_mapper_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) ```

-#### codeintel-ranking: codeintel_ranking_file_reference_count_mapper_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_transport_graphql_99th_percentile_duration -

99th percentile successful job invocation operation duration over 5m

+

99th percentile successful resolver operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100511` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100211` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32251,19 +30938,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_file_reference_count_mapper_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_transport_graphql_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```

-#### codeintel-ranking: codeintel_ranking_file_reference_count_mapper_errors_total +#### codeintel-uploads: codeintel_uploads_transport_graphql_errors_total -

Job invocation operation errors every 5m

+

Resolver operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100512` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100212` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32273,19 +30960,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_mapper_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_file_reference_count_mapper_error_rate +#### codeintel-uploads: codeintel_uploads_transport_graphql_error_rate -

Job invocation operation error rate over 5m

+

Resolver operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100513` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100213` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32295,23 +30982,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_mapper_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_file_reference_count_mapper_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_file_reference_count_mapper_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
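
Each operation group on this dashboard appears twice: once aggregated across operations (`sum(increase(...))`) and once split per operation (`sum by (op)(increase(...))`). When a per-operation panel has too many series to read, `topk` is a useful refinement; a derived query rather than one of the generated panels, with an illustrative limit of three:

```
# The three busiest resolver operations over the last 5m.
topk(3, sum by (op)(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])))
```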
-### Code Intelligence > Ranking: Codeintel: Uploads > Pipeline task > Codeintel ranking file reference count reducer - -#### codeintel-ranking: codeintel_ranking_file_reference_count_reducer_records_processed_total +### Code Intelligence > Uploads: Codeintel: Uploads > HTTP Transport -

Records processed every 5m

+#### codeintel-uploads: codeintel_uploads_transport_http_total -The number of candidate records considered for cleanup. +

Aggregate HTTP handler operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100300` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32321,21 +31006,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_file_reference_count_reducer_records_processed_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_file_reference_count_reducer_records_altered_total - -

Records altered every 5m

+#### codeintel-uploads: codeintel_uploads_transport_http_99th_percentile_duration -The number of candidate records altered as part of cleanup. +

Aggregate successful HTTP handler operation duration distribution over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100301` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32345,19 +31028,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_file_reference_count_reducer_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum by (le)(rate(src_codeintel_uploads_transport_http_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_file_reference_count_reducer_total +#### codeintel-uploads: codeintel_uploads_transport_http_errors_total -

Job invocation operations every 5m

+

Aggregate HTTP handler operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100610` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100302` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32367,19 +31050,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_reducer_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_file_reference_count_reducer_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_transport_http_error_rate -

99th percentile successful job invocation operation duration over 5m

+

Aggregate HTTP handler operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100611` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100303` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32389,19 +31072,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_file_reference_count_reducer_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +sum(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-#### codeintel-ranking: codeintel_ranking_file_reference_count_reducer_errors_total +#### codeintel-uploads: codeintel_uploads_transport_http_total -

Job invocation operation errors every 5m

+

HTTP handler operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100612` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100310` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32411,19 +31094,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_reducer_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_file_reference_count_reducer_error_rate +#### codeintel-uploads: codeintel_uploads_transport_http_99th_percentile_duration -

Job invocation operation error rate over 5m

+

99th percentile successful HTTP handler operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100613` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100311` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32433,23 +31116,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_file_reference_count_reducer_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_file_reference_count_reducer_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_file_reference_count_reducer_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_transport_http_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-### Code Intelligence > Ranking: Codeintel: Uploads > Janitor task > Codeintel ranking processed references janitor - -#### codeintel-ranking: codeintel_ranking_processed_references_janitor_records_scanned_total - -

Records scanned every 5m

+#### codeintel-uploads: codeintel_uploads_transport_http_errors_total -The number of candidate records considered for cleanup. +

HTTP handler operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100700` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100312` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32459,21 +31138,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_processed_references_janitor_records_scanned_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_references_janitor_records_altered_total - -

Records altered every 5m

+#### codeintel-uploads: codeintel_uploads_transport_http_error_rate -The number of candidate records altered as part of cleanup. +

HTTP handler operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100701` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100313` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32483,19 +31160,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_processed_references_janitor_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
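
All of the counter panels above rely on `increase(...[5m])`, which estimates how much a monotonically increasing counter grew over the window and handles counter resets (for example, after a pod restart) without producing negative spikes. It is approximately the per-second `rate` scaled by the window length, so either form can be used when adapting these queries:

```
# increase(m[5m]) is approximately rate(m[5m]) * 300 (5m = 300s).
sum(rate(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) * 300
```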
-#### codeintel-ranking: codeintel_ranking_processed_references_janitor_total +### Code Intelligence > Uploads: Codeintel: Uploads > Expiration task -

Job invocation operations every 5m

+#### codeintel-uploads: codeintel_background_repositories_scanned_total + +

LSIF upload repository scan: repositories scanned every 5m

+ +Number of repositories scanned for data retention This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100710` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100400` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32505,19 +31186,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_processed_references_janitor_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_background_repositories_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_references_janitor_99th_percentile_duration +#### codeintel-uploads: codeintel_background_upload_records_scanned_total + +

LSIF upload record scan: records scanned every 5m

-

99th percentile successful job invocation operation duration over 5m

+Number of codeintel upload records scanned for data retention This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100711` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100401` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32527,19 +31210,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_processed_references_janitor_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +sum(increase(src_codeintel_background_upload_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_references_janitor_errors_total +#### codeintel-uploads: codeintel_background_commits_scanned_total -

Job invocation operation errors every 5m

+

LSIF upload commit scan: commits scanned every 5m

+ +Number of commits reachable from a codeintel upload record scanned for data retention This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100712` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100402` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32549,19 +31234,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_processed_references_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_background_commits_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_references_janitor_error_rate +#### codeintel-uploads: codeintel_background_upload_records_expired_total -

Job invocation operation error rate over 5m

+

LSIF upload expiration: upload records marked expired every 5m

+ +Number of codeintel upload records marked as expired This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100713` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100403` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32571,15 +31258,15 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_processed_references_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_processed_references_janitor_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_processed_references_janitor_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum(increase(src_codeintel_background_upload_records_expired_total{job=~"^${source:regex}.*"}[5m])) ```
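
Taken together, the four expiration counters above describe a rough funnel: repositories scanned, upload records scanned, reachable commits scanned, and finally upload records marked expired. A derived ratio (not a stock panel; the 1h window is an arbitrary choice) can make the funnel's yield visible:

```
# Fraction of scanned upload records that were marked expired.
  sum(increase(src_codeintel_background_upload_records_expired_total{job=~"^${source:regex}.*"}[1h]))
/
  sum(increase(src_codeintel_background_upload_records_scanned_total{job=~"^${source:regex}.*"}[1h]))
```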
-### Code Intelligence > Ranking: Codeintel: Uploads > Janitor task > Codeintel ranking processed paths janitor +### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor unknown repository -#### codeintel-ranking: codeintel_ranking_processed_paths_janitor_records_scanned_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_records_scanned_total

Records scanned every 5m

@@ -32587,7 +31274,7 @@ The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100500` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32597,13 +31284,13 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_processed_paths_janitor_records_scanned_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_unknown_repository_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_paths_janitor_records_altered_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_records_altered_total

Records altered every 5m

@@ -32611,7 +31298,7 @@ The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100501` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32621,19 +31308,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_processed_paths_janitor_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_unknown_repository_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_paths_janitor_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_total

Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100810` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100510` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32643,19 +31330,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_processed_paths_janitor_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_paths_janitor_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_99th_percentile_duration

99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100811` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100511` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32665,19 +31352,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_processed_paths_janitor_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_unknown_repository_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-ranking: codeintel_ranking_processed_paths_janitor_errors_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_errors_total

Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100812` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100512` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32687,19 +31374,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_processed_paths_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_processed_paths_janitor_error_rate +#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_error_rate

Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100813` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100513` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32709,15 +31396,15 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_processed_paths_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_processed_paths_janitor_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_processed_paths_janitor_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-### Code Intelligence > Ranking: Codeintel: Uploads > Janitor task > Codeintel ranking exported uploads janitor +### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor unknown commit -#### codeintel-ranking: codeintel_ranking_exported_uploads_janitor_records_scanned_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_records_scanned_total

Records scanned every 5m

@@ -32725,7 +31412,7 @@ The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100900` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100600` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32735,13 +31422,13 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_exported_uploads_janitor_records_scanned_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_unknown_commit_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_exported_uploads_janitor_records_altered_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_records_altered_total

Records altered every 5m

@@ -32749,7 +31436,7 @@ The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100901` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100601` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32759,19 +31446,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_exported_uploads_janitor_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_unknown_commit_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_exported_uploads_janitor_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_total

Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100910` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100610` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32781,19 +31468,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_exported_uploads_janitor_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_exported_uploads_janitor_99th_percentile_duration +#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_99th_percentile_duration

99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100911` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100611` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32803,19 +31490,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_exported_uploads_janitor_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_unknown_commit_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-ranking: codeintel_ranking_exported_uploads_janitor_errors_total +#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_errors_total

Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100912` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100612` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32825,19 +31512,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_exported_uploads_janitor_error_rate +#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_error_rate

Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=100913` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100613` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32847,15 +31534,15 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_exported_uploads_janitor_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-### Code Intelligence > Ranking: Codeintel: Uploads > Janitor task > Codeintel ranking deleted exported uploads janitor
+### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor abandoned

-#### codeintel-ranking: codeintel_ranking_deleted_exported_uploads_janitor_records_scanned_total
+#### codeintel-uploads: codeintel_uploads_janitor_abandoned_records_scanned_total

Records scanned every 5m

@@ -32863,7 +31550,7 @@ The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100700` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32873,13 +31560,13 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_deleted_exported_uploads_janitor_records_scanned_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_abandoned_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_deleted_exported_uploads_janitor_records_altered_total
+#### codeintel-uploads: codeintel_uploads_janitor_abandoned_records_altered_total

Records altered every 5m

@@ -32887,7 +31574,7 @@ The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100701` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32897,19 +31584,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_deleted_exported_uploads_janitor_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_abandoned_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_deleted_exported_uploads_janitor_total
+#### codeintel-uploads: codeintel_uploads_janitor_abandoned_total

Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100710` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32919,19 +31606,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_deleted_exported_uploads_janitor_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_total{job=~"^${source:regex}.*"}[5m])) ```
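
These panels use `increase(...[5m])`, the absolute number of invocations per five-minute window; `increase` is simply `rate` scaled by the window length (here, 300 seconds), so the equivalent per-second view is:

```
sum by (op)(rate(src_codeintel_uploads_janitor_abandoned_total{job=~"^${source:regex}.*"}[5m]))
```
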
-#### codeintel-ranking: codeintel_ranking_deleted_exported_uploads_janitor_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_janitor_abandoned_99th_percentile_duration

99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100711` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32941,19 +31628,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_deleted_exported_uploads_janitor_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_abandoned_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-ranking: codeintel_ranking_deleted_exported_uploads_janitor_errors_total
+#### codeintel-uploads: codeintel_uploads_janitor_abandoned_errors_total

Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101012` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100712` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32963,19 +31650,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_deleted_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_deleted_exported_uploads_janitor_error_rate
+#### codeintel-uploads: codeintel_uploads_janitor_abandoned_error_rate

Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101013` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100713` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -32985,15 +31672,15 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_deleted_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_deleted_exported_uploads_janitor_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_deleted_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-### Code Intelligence > Ranking: Codeintel: Uploads > Janitor task > Codeintel ranking abandoned exported uploads janitor
+### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads expirer unreferenced

-#### codeintel-ranking: codeintel_ranking_abandoned_exported_uploads_janitor_records_scanned_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_records_scanned_total

Records scanned every 5m

@@ -33001,7 +31688,7 @@ The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100800` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33011,13 +31698,13 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_abandoned_exported_uploads_janitor_records_scanned_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_expirer_unreferenced_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_abandoned_exported_uploads_janitor_records_altered_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_records_altered_total

Records altered every 5m

@@ -33025,7 +31712,7 @@ The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100801` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33035,19 +31722,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_abandoned_exported_uploads_janitor_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_expirer_unreferenced_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_abandoned_exported_uploads_janitor_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_total

Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100810` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33057,19 +31744,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_abandoned_exported_uploads_janitor_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_abandoned_exported_uploads_janitor_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_99th_percentile_duration

99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100811` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33079,19 +31766,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_abandoned_exported_uploads_janitor_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_expirer_unreferenced_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-ranking: codeintel_ranking_abandoned_exported_uploads_janitor_errors_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_errors_total

Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100812` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33101,19 +31788,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_abandoned_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_abandoned_exported_uploads_janitor_error_rate
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_error_rate

Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101113` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100813` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33123,15 +31810,15 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_abandoned_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_abandoned_exported_uploads_janitor_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_abandoned_exported_uploads_janitor_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-### Code Intelligence > Ranking: Codeintel: Uploads > Janitor task > Codeintel ranking rank counts janitor
+### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads expirer unreferenced graph

-#### codeintel-ranking: codeintel_ranking_rank_counts_janitor_records_scanned_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_records_scanned_total

Records scanned every 5m

@@ -33139,7 +31826,7 @@ The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100900` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33149,13 +31836,13 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_rank_counts_janitor_records_scanned_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_expirer_unreferenced_graph_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_counts_janitor_records_altered_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_records_altered_total

Records altered every 5m

@@ -33163,7 +31850,7 @@ The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100901` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33173,19 +31860,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_rank_counts_janitor_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_expirer_unreferenced_graph_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_counts_janitor_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_total

Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100910` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33195,19 +31882,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_rank_counts_janitor_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_counts_janitor_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_99th_percentile_duration

99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100911` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33217,19 +31904,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_rank_counts_janitor_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_expirer_unreferenced_graph_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-ranking: codeintel_ranking_rank_counts_janitor_errors_total
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_errors_total

Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100912` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33239,19 +31926,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_rank_counts_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_counts_janitor_error_rate
+#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_error_rate

Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101213` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100913` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33261,15 +31948,15 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_rank_counts_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_rank_counts_janitor_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_rank_counts_janitor_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-### Code Intelligence > Ranking: Codeintel: Uploads > Janitor task > Codeintel ranking rank janitor
+### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads hard deleter

-#### codeintel-ranking: codeintel_ranking_rank_janitor_records_scanned_total
+#### codeintel-uploads: codeintel_uploads_hard_deleter_records_scanned_total

Records scanned every 5m

@@ -33277,7 +31964,7 @@ The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101000` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33287,13 +31974,13 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_rank_janitor_records_scanned_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_hard_deleter_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_janitor_records_altered_total
+#### codeintel-uploads: codeintel_uploads_hard_deleter_records_altered_total

Records altered every 5m

@@ -33301,7 +31988,7 @@ The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101001` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33311,19 +31998,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum(increase(src_codeintel_ranking_rank_janitor_records_altered_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_hard_deleter_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_janitor_total
+#### codeintel-uploads: codeintel_uploads_hard_deleter_total

Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101010` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33333,19 +32020,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_rank_janitor_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_hard_deleter_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_janitor_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_hard_deleter_99th_percentile_duration

99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101011` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33355,19 +32042,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_ranking_rank_janitor_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_hard_deleter_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-ranking: codeintel_ranking_rank_janitor_errors_total
+#### codeintel-uploads: codeintel_uploads_hard_deleter_errors_total

Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101012` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33377,19 +32064,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_rank_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_hard_deleter_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-ranking: codeintel_ranking_rank_janitor_error_rate
+#### codeintel-uploads: codeintel_uploads_hard_deleter_error_rate

Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking?viewPanel=101313` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101013` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33399,27 +32086,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-ranking/codeintel-ranking Query: ``` -sum by (op)(increase(src_codeintel_ranking_rank_janitor_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_ranking_rank_janitor_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_ranking_rank_janitor_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_hard_deleter_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_hard_deleter_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_hard_deleter_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-## Code Intelligence > Uploads
-
-The service at `internal/codeintel/uploads`.
-
-To see this dashboard, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads` on your Sourcegraph instance.
+### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor audit logs

-### Code Intelligence > Uploads: Codeintel: Uploads > Service
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_records_scanned_total

-#### codeintel-uploads: codeintel_uploads_total
-
-Aggregate service operations every 5m
-
+Records scanned every 5m

+The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101100` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33429,19 +32112,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_audit_logs_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_records_altered_total

-Aggregate successful service operation duration distribution over 5m
+Records altered every 5m

+ +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101101` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33451,19 +32136,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (le)(rate(src_codeintel_uploads_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_audit_logs_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_errors_total
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_total

-Aggregate service operation errors every 5m
+Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100002` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101110` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33473,19 +32158,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_error_rate
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_99th_percentile_duration

-Aggregate service operation error rate over 5m
+99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100003` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101111` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33495,19 +32180,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_audit_logs_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-uploads: codeintel_uploads_total
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_errors_total

-Service operations every 5m
+Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100010` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101112` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33517,19 +32202,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_error_rate

-99th percentile successful service operation duration over 5m
+Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100011` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101113` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33539,19 +32224,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-#### codeintel-uploads: codeintel_uploads_errors_total
+### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor scip documents

-Service operation errors every 5m
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_records_scanned_total

+Records scanned every 5m

+ +The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100012` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101200` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33561,19 +32250,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_janitor_scip_documents_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_error_rate
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_records_altered_total

-Service operation error rate over 5m
+Records altered every 5m

+ +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100013` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101201` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33583,21 +32274,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum(increase(src_codeintel_uploads_janitor_scip_documents_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-### Code Intelligence > Uploads: Codeintel: Uploads > Store (internal)
-
-#### codeintel-uploads: codeintel_uploads_store_total
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_total

-Aggregate store operations every 5m
+Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101210` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33607,19 +32296,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_store_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_99th_percentile_duration

-Aggregate successful store operation duration distribution over 5m
+99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101211` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33629,19 +32318,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (le)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_scip_documents_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-uploads: codeintel_uploads_store_errors_total
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_errors_total

-Aggregate store operation errors every 5m
+Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101212` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33651,19 +32340,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_store_error_rate
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_error_rate

-Aggregate store operation error rate over 5m
+Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100103` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101213` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33673,19 +32362,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-#### codeintel-uploads: codeintel_uploads_store_total
+### Code Intelligence > Uploads: Codeintel: Uploads > Reconciler task > Codeintel uploads reconciler scip metadata

-Store operations every 5m
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_records_scanned_total

+Records scanned every 5m

+ +The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101300` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33695,19 +32388,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_reconciler_scip_metadata_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_store_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_records_altered_total

-99th percentile successful store operation duration over 5m
+Records altered every 5m

+ +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101301` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33717,19 +32412,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_store_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +sum(increase(src_codeintel_uploads_reconciler_scip_metadata_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_store_errors_total
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_total

-Store operation errors every 5m
+Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101310` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33739,19 +32434,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_store_error_rate
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_99th_percentile_duration

-Store operation error rate over 5m
+99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100113` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101311` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33761,21 +32456,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_store_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_store_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_reconciler_scip_metadata_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-### Code Intelligence > Uploads: Codeintel: Uploads > GQL Transport
-
-#### codeintel-uploads: codeintel_uploads_transport_graphql_total
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_errors_total

-Aggregate resolver operations every 5m
+Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101312` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33785,19 +32478,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_transport_graphql_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_error_rate

-Aggregate successful resolver operation duration distribution over 5m
+Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101313` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33807,19 +32500,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (le)(rate(src_codeintel_uploads_transport_graphql_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-#### codeintel-uploads: codeintel_uploads_transport_graphql_errors_total
+### Code Intelligence > Uploads: Codeintel: Uploads > Reconciler task > Codeintel uploads reconciler scip data

-Aggregate resolver operation errors every 5m
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_records_scanned_total

+Records scanned every 5m

+ +The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101400` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33829,19 +32526,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) +sum(increase(src_codeintel_uploads_reconciler_scip_data_records_scanned_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_transport_graphql_error_rate
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_records_altered_total

-Aggregate resolver operation error rate over 5m
+Records altered every 5m

+ +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100203` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101401` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33851,19 +32550,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum(increase(src_codeintel_uploads_reconciler_scip_data_records_altered_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_transport_graphql_total
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_total

-Resolver operations every 5m
+Job invocation operations every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101410` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33873,19 +32572,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_transport_graphql_99th_percentile_duration
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_99th_percentile_duration

-99th percentile successful resolver operation duration over 5m
+99th percentile successful job invocation operation duration over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101411` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33895,19 +32594,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_transport_graphql_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) +histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_reconciler_scip_data_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))) ```
-#### codeintel-uploads: codeintel_uploads_transport_graphql_errors_total
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_errors_total

-Resolver operation errors every 5m
+Job invocation operation errors every 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101412` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33917,19 +32616,19 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) +sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_errors_total{job=~"^${source:regex}.*"}[5m])) ```
-#### codeintel-uploads: codeintel_uploads_transport_graphql_error_rate
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_error_rate

-Resolver operation error rate over 5m
+Job invocation operation error rate over 5m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100213` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101413` on your Sourcegraph instance. *Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* @@ -33939,23 +32638,31 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum by (op)(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_transport_graphql_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_transport_graphql_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 +sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100 ```
-### Code Intelligence > Uploads: Codeintel: Uploads > HTTP Transport
+## Telemetry

-#### codeintel-uploads: codeintel_uploads_transport_http_total
+Monitoring telemetry services in Sourcegraph.

-Aggregate http handler operations every 5m
+To see this dashboard, visit `/-/debug/grafana/d/telemetry/telemetry` on your Sourcegraph instance.
+
+### Telemetry: Telemetry Gateway Exporter: Events export and queue metrics
+
+#### telemetry: telemetry_gateway_exporter_queue_size
+
+Telemetry event payloads pending export

+ +The number of events queued to be exported. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -33963,21 +32670,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads Query: ``` -sum(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) +sum(src_telemetrygatewayexporter_queue_size) ```
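
Since `src_telemetrygatewayexporter_queue_size` is a gauge, it can also be smoothed over a longer horizon to separate a sustained backlog from a momentary spike. A hypothetical ad-hoc check (the six-hour window and 10000-event threshold are illustrative, not shipped defaults):

```
avg_over_time(sum(src_telemetrygatewayexporter_queue_size)[6h:5m]) > 10000
```
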

-#### codeintel-uploads: codeintel_uploads_transport_http_99th_percentile_duration
+#### telemetry: telemetry_gateway_exporter_queue_growth

-Aggregate successful http handler operation duration distribution over 5m
+Rate of growth of events export queue over 30m

-This panel has no related alerts. +A positive value indicates the queue is growing. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100301` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#telemetry-telemetry-gateway-exporter-queue-growth) for 2 alerts related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100001` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -33985,21 +32694,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (le)(rate(src_codeintel_uploads_transport_http_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m]))
+max(deriv(src_telemetrygatewayexporter_queue_size[30m]))
```
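The sketch below is an editor's illustration, not part of the generated dashboard. Assuming the same metric names as the two panels above, it estimates how many seconds of backlog the queue holds at the current export rate; the `> 0` guard makes the expression return nothing, rather than `Inf`, while no exports are happening:

```
# Seconds of queued events at the current export rate (illustrative sketch)
max(src_telemetrygatewayexporter_queue_size)
/
(sum(rate(src_telemetrygatewayexporter_exported_events[1h])) > 0)
```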

-#### codeintel-uploads: codeintel_uploads_transport_http_errors_total
+#### telemetry: src_telemetrygatewayexporter_exported_events
-

Aggregate http handler operation errors every 5m

+

Events exported from queue per hour

+
+The number of events being exported.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100302` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100010` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34007,21 +32718,24 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m]))
+max(increase(src_telemetrygatewayexporter_exported_events[1h]))
```
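A hedged companion to the panel above: `increase(...[1h])` smooths over short export stalls, so the same counter at 5-minute resolution, scaled to an hourly figure, reacts faster. Illustrative only; it assumes the same `src_telemetrygatewayexporter_exported_events` counter:

```
# Hourly-equivalent export throughput, at 5m resolution (illustrative sketch)
sum(rate(src_telemetrygatewayexporter_exported_events[5m])) * 3600
```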

-#### codeintel-uploads: codeintel_uploads_transport_http_error_rate
+#### telemetry: telemetry_gateway_exporter_batch_size
-

Aggregate http handler operation error rate over 5m

+

Number of events exported per batch over 30m

+
+The number of events exported in each batch. The largest bucket is the maximum number of events exported per batch.
+If the distribution trends to the maximum bucket, then events export throughput is at or approaching saturation - try increasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE` or decreasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_INTERVAL`.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100303` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100011` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34029,21 +32743,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) + sum(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+sum by (le) (rate(src_telemetrygatewayexporter_batch_size_bucket[30m]))
```
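As a hedged example reusing the histogram above, an approximate 90th-percentile batch size can be read off the buckets and compared with the configured `TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE` to judge how close exports are to saturation:

```
# Approximate p90 events per exported batch (illustrative sketch)
histogram_quantile(0.90, sum by (le) (rate(src_telemetrygatewayexporter_batch_size_bucket[30m])))
```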

-#### codeintel-uploads: codeintel_uploads_transport_http_total
+### Telemetry: Telemetry Gateway Exporter: Events export job operations
+
+#### telemetry: telemetrygatewayexporter_exporter_total
-

Http handler operations every 5m

+#### telemetry: telemetrygatewayexporter_exporter_total + +

Events exporter operations every 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100310` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100100` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34051,21 +32767,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m]))
+sum(increase(src_telemetrygatewayexporter_exporter_total{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_uploads_transport_http_99th_percentile_duration
+#### telemetry: telemetrygatewayexporter_exporter_99th_percentile_duration
-

99th percentile successful http handler operation duration over 5m

+

Aggregate successful events exporter operation duration distribution over 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100311` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100101` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34073,21 +32789,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_transport_http_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+sum by (le)(rate(src_telemetrygatewayexporter_exporter_duration_seconds_bucket{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_uploads_transport_http_errors_total
+#### telemetry: telemetrygatewayexporter_exporter_errors_total
-

Http handler operation errors every 5m

+

Events exporter operation errors every 30m

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter-exporter-errors-total) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100312` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100102` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34095,21 +32811,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m]))
+sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_uploads_transport_http_error_rate
+#### telemetry: telemetrygatewayexporter_exporter_error_rate
-

Http handler operation error rate over 5m

+

Events exporter operation error rate over 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100313` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100103` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34117,23 +32833,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_transport_http_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_transport_http_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_telemetrygatewayexporter_exporter_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m]))) * 100
```
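A hedged variation on the error-rate query above: gating the percentage on a minimum operation count avoids alerting on a large-looking rate computed from a handful of operations. The `5`% and `10`-operation thresholds are illustrative assumptions, not shipped defaults:

```
# Error rate above 5% AND at least 10 operations in the window (illustrative sketch)
(
  sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m]))
  /
  (sum(increase(src_telemetrygatewayexporter_exporter_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m])))
) * 100 > 5
and
sum(increase(src_telemetrygatewayexporter_exporter_total{job=~"^worker.*"}[30m])) > 10
```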

-### Code Intelligence > Uploads: Codeintel: Repository with stale commit graph
+### Telemetry: Telemetry Gateway Exporter: Events export queue cleanup job operations
-#### codeintel-uploads: codeintel_commit_graph_queue_size
+#### telemetry: telemetrygatewayexporter_queue_cleanup_total
-

Repository queue size

+

Events export queue cleanup operations every 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100400` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100200` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34141,27 +32857,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-max(src_codeintel_commit_graph_total{job=~"^${source:regex}.*"})
+sum(increase(src_telemetrygatewayexporter_queue_cleanup_total{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_commit_graph_queue_growth_rate
-
-

Repository queue growth rate over 30m

-
-This value compares the rate of enqueues against the rate of finished jobs.
-
- - A value < than 1 indicates that process rate > enqueue rate
- - A value = than 1 indicates that process rate = enqueue rate
- - A value > than 1 indicates that process rate < enqueue rate
+#### telemetry: telemetrygatewayexporter_queue_cleanup_99th_percentile_duration
+

Aggregate successful events export queue cleanup operation duration distribution over 30m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100201` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34169,21 +32879,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_commit_graph_total{job=~"^${source:regex}.*"}[30m])) / sum(increase(src_codeintel_commit_graph_processor_total{job=~"^${source:regex}.*"}[30m]))
+sum by (le)(rate(src_telemetrygatewayexporter_queue_cleanup_duration_seconds_bucket{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_commit_graph_queued_max_age
+#### telemetry: telemetrygatewayexporter_queue_cleanup_errors_total
-

Repository queue longest time in queue

+

Events export queue cleanup operation errors every 30m

-Refer to the [alerts reference](alerts#codeintel-uploads-codeintel-commit-graph-queued-max-age) for 1 alert related to this panel.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter-queue-cleanup-errors-total) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100402` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100202` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34191,25 +32901,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-max(src_codeintel_commit_graph_queued_duration_seconds_total{job=~"^${source:regex}.*"})
+sum(increase(src_telemetrygatewayexporter_queue_cleanup_errors_total{job=~"^worker.*"}[30m]))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Expiration task
-
-#### codeintel-uploads: codeintel_background_repositories_scanned_total
-
-

Lsif upload repository scan repositories scanned every 5m

+#### telemetry: telemetrygatewayexporter_queue_cleanup_error_rate -Number of repositories scanned for data retention +

Events export queue cleanup operation error rate over 30m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100203` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34217,23 +32923,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_background_repositories_scanned_total{job=~"^${source:regex}.*"}[5m]))
+sum(increase(src_telemetrygatewayexporter_queue_cleanup_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_telemetrygatewayexporter_queue_cleanup_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_queue_cleanup_errors_total{job=~"^worker.*"}[30m]))) * 100
```

-#### codeintel-uploads: codeintel_background_upload_records_scanned_total
+### Telemetry: Telemetry Gateway Exporter: Events export queue metrics reporting job operations
-

Lsif upload records scan records scanned every 5m

+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_total -Number of codeintel upload records scanned for data retention +

Events export backlog metrics reporting operations every 30m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100300` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34241,23 +32947,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_background_upload_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_total{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_background_commits_scanned_total
-
-

Lsif upload commits scanned commits scanned every 5m

+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_99th_percentile_duration -Number of commits reachable from a codeintel upload record scanned for data retention +

Aggregate successful events export backlog metrics reporting operation duration distribution over 30m

This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100502` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34265,23 +32969,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_background_commits_scanned_total{job=~"^${source:regex}.*"}[5m]))
+sum by (le)(rate(src_telemetrygatewayexporter_queue_metrics_reporter_duration_seconds_bucket{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_background_upload_records_expired_total
-
-

Lsif upload records expired uploads scanned every 5m

+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_errors_total -Number of codeintel upload records marked as expired +

Events export backlog metrics reporting operation errors every 30m

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter-queue-metrics-reporter-errors-total) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100503` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100302` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34289,25 +32991,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_background_upload_records_expired_total{job=~"^${source:regex}.*"}[5m]))
+sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_errors_total{job=~"^worker.*"}[30m]))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor unknown repository
-
-#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_records_scanned_total
-
-

Records scanned every 5m

+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_error_rate -The number of candidate records considered for cleanup. +

Events export backlog metrics reporting operation error rate over 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100600` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100303` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34315,23 +33013,25 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_janitor_unknown_repository_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_errors_total{job=~"^worker.*"}[30m]))) * 100
```

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_records_altered_total
+### Telemetry: Telemetry persistence
-

Records altered every 5m

+#### telemetry: telemetry_v2_export_queue_write_failures -The number of candidate records altered as part of cleanup. +

Failed writes to events export queue over 5m

-This panel has no related alerts. +Telemetry V2 writes send events into the `telemetry_events_export_queue` for the exporter to periodically export. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100601` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#telemetry-telemetry-v2-export-queue-write-failures) for 2 alerts related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100400` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34339,21 +33039,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_janitor_unknown_repository_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+(sum(increase(src_telemetry_export_store_queued_events{failed="true"}[5m])) / sum(increase(src_telemetry_export_store_queued_events[5m]))) * 100
```
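Because the panel above reports a percentage, a hedged companion query for the absolute number of failed writes (same metric, same `failed="true"` label) helps distinguish one failure out of two writes from a genuine outage:

```
# Absolute count of failed export-queue writes over 5m (illustrative sketch)
sum(increase(src_telemetry_export_store_queued_events{failed="true"}[5m]))
```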

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_total
+#### telemetry: telemetry_v2_event_logs_write_failures
-

Job invocation operations every 5m

+

Failed writes of V2 events to V1 'event_logs' over 5m

-This panel has no related alerts. +Telemetry V2 writes also attempt to `tee` events into the legacy V1 events format in the `event_logs` database table for long-term local persistence. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100610` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#telemetry-telemetry-v2-event-logs-write-failures) for 2 alerts related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100401` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34361,21 +33063,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_total{job=~"^${source:regex}.*"}[5m]))
+(sum(increase(src_telemetry_teestore_v1_events{failed="true"}[5m])) / sum(increase(src_telemetry_teestore_v1_events[5m]))) * 100
```

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_99th_percentile_duration
+### Telemetry: Telemetry Gateway Exporter: (off by default) User metadata export job operations
-

99th percentile successful job invocation operation duration over 5m

+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_total
+

(off by default) user metadata exporter operations every 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100611` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100500` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34383,21 +33087,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_unknown_repository_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+sum(increase(src_telemetrygatewayexporter_usermetadata_exporter_total{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_errors_total
+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_99th_percentile_duration
-

Job invocation operation errors every 5m

+

Aggregate successful (off by default) user metadata exporter operation duration distribution over 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100612` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100501` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34405,21 +33109,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_errors_total{job=~"^${source:regex}.*"}[5m]))
+sum by (le)(rate(src_telemetrygatewayexporter_usermetadata_exporter_duration_seconds_bucket{job=~"^worker.*"}[30m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_repository_error_rate
+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_errors_total
-

Job invocation operation error rate over 5m

+

(off by default) user metadata exporter operation errors every 30m

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter-usermetadata-exporter-errors-total) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100613` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100502` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34427,25 +33131,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_unknown_repository_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+sum(increase(src_telemetrygatewayexporter_usermetadata_exporter_errors_total{job=~"^worker.*"}[30m]))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor unknown commit
-
-#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_records_scanned_total
-
-

Records scanned every 5m

+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_error_rate -The number of candidate records considered for cleanup. +

(off by default) user metadata exporter operation error rate over 30m

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100700` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100503` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details

@@ -34453,23 +33153,40 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_janitor_unknown_commit_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+sum(increase(src_telemetrygatewayexporter_usermetadata_exporter_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_telemetrygatewayexporter_usermetadata_exporter_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_usermetadata_exporter_errors_total{job=~"^worker.*"}[30m]))) * 100
```

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_records_altered_total
+## OpenTelemetry Collector
-

Records altered every 5m

+

The OpenTelemetry collector ingests OpenTelemetry data from Sourcegraph and exports it to the configured backends.

-The number of candidate records altered as part of cleanup.
+To see this dashboard, visit `/-/debug/grafana/d/otel-collector/otel-collector` on your Sourcegraph instance.
+
+### OpenTelemetry Collector: Receivers
+
+#### otel-collector: otel_span_receive_rate
+
+

Spans received per receiver per minute

+
+Shows the rate of spans accepted by the configured receiver.
+
+A Trace is a collection of spans and a span represents a unit of work or operation. Spans are the building blocks of Traces.
+The spans have only been accepted by the receiver, which means they still have to move through the configured pipeline to be exported.
+For more information on tracing and configuration of an OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers.
+
+See the Exporters section to see spans that have made it through the pipeline and are exported.
+
+Depending on the configured processors, received spans might be dropped and not exported. For more information on configuring processors see
+https://opentelemetry.io/docs/collector/configuration/#processors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100701` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100000` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34477,21 +33194,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_janitor_unknown_commit_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+sum by (receiver) (rate(otelcol_receiver_accepted_spans[1m]))
```
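A hedged sketch combining the two receiver counters used on this dashboard into a refusal percentage per receiver; the metric names come from the panels here, but the query itself is illustrative:

```
# Percentage of spans refused per receiver (illustrative sketch)
sum by (receiver) (rate(otelcol_receiver_refused_spans[5m]))
/
(sum by (receiver) (rate(otelcol_receiver_accepted_spans[5m])) + sum by (receiver) (rate(otelcol_receiver_refused_spans[5m])))
* 100
```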

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_total
+#### otel-collector: otel_span_refused
-

Job invocation operations every 5m

+

Spans refused per receiver

-This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100710` on your Sourcegraph instance.
-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+Refer to the [alerts reference](alerts#otel-collector-otel-span-refused) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100001` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34499,21 +33218,30 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_total{job=~"^${source:regex}.*"}[5m]))
+sum by (receiver) (rate(otelcol_receiver_refused_spans[1m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_99th_percentile_duration
+### OpenTelemetry Collector: Exporters
-

99th percentile successful job invocation operation duration over 5m

+#### otel-collector: otel_span_export_rate
+

Spans exported per exporter per minute

+
+Shows the rate of spans being sent by the exporter.
+
+A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces.
+The rate of spans here indicates spans that have made it through the configured pipeline and have been sent to the configured export destination.
+
+For more information on configuring an exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100711` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100100` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34521,21 +33249,25 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_unknown_commit_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+sum by (exporter) (rate(otelcol_exporter_sent_spans[1m]))
```
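As a rough, hedged cross-check of the pipeline as a whole, the receive rate can be compared against the export rate. Treat it only as a coarse signal: with several receivers or exporters, or fan-out pipelines, one accepted span may legitimately be exported more than once:

```
# Approximate spans/sec entering but not leaving the pipeline (illustrative sketch)
sum(rate(otelcol_receiver_accepted_spans[5m])) - sum(rate(otelcol_exporter_sent_spans[5m]))
```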

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_errors_total
+#### otel-collector: otel_span_export_failures
-

Job invocation operation errors every 5m

+

Span export failures by exporter

-This panel has no related alerts. +Shows the rate of spans failed to be sent by the configured reveiver. A number higher than 0 for a long period can indicate a problem with the exporter configuration or with the service that is being exported too -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100712` on your Sourcegraph instance. +For more information on configuring a exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +Refer to the [alerts reference](alerts#otel-collector-otel-span-export-failures) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100101` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34543,21 +33275,25 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_errors_total{job=~"^${source:regex}.*"}[5m]))
+sum by (exporter) (rate(otelcol_exporter_send_failed_spans[1m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_unknown_commit_error_rate
+### OpenTelemetry Collector: Queue Length
-

Job invocation operation error rate over 5m

+#### otel-collector: otelcol_exporter_queue_capacity
+

Exporter queue capacity

+
+Shows the capacity of the retry queue (in batches).

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100713` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100200` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34565,25 +33301,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_unknown_commit_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+sum by (exporter) (rate(otelcol_exporter_queue_capacity{job=~"^.*"}[1m]))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor abandoned
-
-#### codeintel-uploads: codeintel_uploads_janitor_abandoned_records_scanned_total
+#### otel-collector: otelcol_exporter_queue_size
-

Records scanned every 5m

+

Exporter queue size

-The number of candidate records considered for cleanup. +Shows the current size of retry queue This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100800` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100201` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34591,23 +33325,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_janitor_abandoned_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+sum by (exporter) (rate(otelcol_exporter_queue_size{job=~"^.*"}[1m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_abandoned_records_altered_total
+#### otel-collector: otelcol_exporter_enqueue_failed_spans
-

Records altered every 5m

+

Exporter enqueue failed spans

-The number of candidate records altered as part of cleanup. +Shows the rate of spans failed to be enqueued by the configured exporter. A number higher than 0 for a long period can indicate a problem with the exporter configuration -This panel has no related alerts. +Refer to the [alerts reference](alerts#otel-collector-otelcol-exporter-enqueue-failed-spans) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100801` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100202` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34615,21 +33349,25 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_janitor_abandoned_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+sum by (exporter) (rate(otelcol_exporter_enqueue_failed_spans{job=~"^.*"}[1m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_abandoned_total
+### OpenTelemetry Collector: Processors
+
+#### otel-collector: otelcol_processor_dropped_spans
-

Job invocation operations every 5m

+#### otel-collector: otelcol_processor_dropped_spans -This panel has no related alerts. +

Spans dropped per processor per minute

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100810` on your Sourcegraph instance. +Shows the rate of spans dropped by the configured processor -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +Refer to the [alerts reference](alerts#otel-collector-otelcol-processor-dropped-spans) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100300` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34637,21 +33375,25 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_total{job=~"^${source:regex}.*"}[5m]))
+sum by (processor) (rate(otelcol_processor_dropped_spans[1m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_abandoned_99th_percentile_duration
+### OpenTelemetry Collector: Collector resource usage
-

99th percentile successful job invocation operation duration over 5m

+#### otel-collector: otel_cpu_usage
+

Cpu usage of the collector

+
+Shows CPU usage as reported by the OpenTelemetry collector.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100811` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100400` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34659,21 +33401,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_abandoned_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+sum by (job) (rate(otelcol_process_cpu_seconds{job=~"^.*"}[1m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_abandoned_errors_total
+#### otel-collector: otel_memory_resident_set_size
-

Job invocation operation errors every 5m

+

Memory allocated to the otel collector

+
+Shows the allocated memory Resident Set Size (RSS) as reported by the OpenTelemetry collector.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100812` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100401` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34681,21 +33425,29 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_errors_total{job=~"^${source:regex}.*"}[5m]))
+sum by (job) (rate(otelcol_process_memory_rss{job=~"^.*"}[1m]))
```

-#### codeintel-uploads: codeintel_uploads_janitor_abandoned_error_rate
+#### otel-collector: otel_memory_usage
-

Job invocation operation error rate over 5m

+

Memory used by the collector

+
+Shows how much memory is being used by the otel collector. High memory usage might indicate that:
+
+* The configured pipeline is keeping a lot of spans in memory for processing
+* Spans are failing to be sent and the exporter is configured to retry
+* A batch processor is configured with a high batch count
+
+For more information on configuring processors for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#processors.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100813` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100402` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34703,25 +33455,33 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_abandoned_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+sum by (job) (rate(otelcol_process_runtime_total_alloc_bytes{job=~"^.*"}[1m]))
```
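One hedged way to correlate memory growth with the retry-queue cause listed above is the fill ratio of each exporter's retry queue, assuming `otelcol_exporter_queue_size` and `otelcol_exporter_queue_capacity` are exposed as gauges by the running collector version:

```
# Retry-queue fill ratio per exporter, 1.0 = full (illustrative sketch)
sum by (exporter) (otelcol_exporter_queue_size)
/
sum by (exporter) (otelcol_exporter_queue_capacity)
```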

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads expirer unreferenced
+### OpenTelemetry Collector: Container monitoring (not available on server)
-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_records_scanned_total
+#### otel-collector: container_missing
-

Records scanned every 5m

+

Container missing

-The number of candidate records considered for cleanup.
+This value is the number of times a container has not been seen for more than one minute. If you observe this
+value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reason.
+
+- **Kubernetes:**
+	- Determine if the pod was OOM killed using `kubectl describe pod otel-collector` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
+	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p otel-collector`.
+- **Docker Compose:**
+	- Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' otel-collector` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the otel-collector container in `docker-compose.yml`.
+	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs otel-collector` (note this will include logs from the previous and currently running container).

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100900` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100500` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34729,23 +33489,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_expirer_unreferenced_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+count by(name) ((time() - container_last_seen{name=~"^otel-collector.*"}) > 60)
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_records_altered_total
+#### otel-collector: container_cpu_usage
-

Records altered every 5m

+#### otel-collector: container_cpu_usage -The number of candidate records altered as part of cleanup. +

Container cpu usage total (1m average) across all cores by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#otel-collector-container-cpu-usage) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100901` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100501` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34753,21 +33511,21 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_expirer_unreferenced_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+cadvisor_container_cpu_usage_percentage_total{name=~"^otel-collector.*"}
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_total
+#### otel-collector: container_memory_usage
-

Job invocation operations every 5m

+

Container memory usage by instance

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#otel-collector-container-memory-usage) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100910` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100502` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34775,21 +33533,24 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_total{job=~"^${source:regex}.*"}[5m]))
+cadvisor_container_memory_usage_percentage_total{name=~"^otel-collector.*"}
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_99th_percentile_duration
+#### otel-collector: fs_io_operations
-

99th percentile successful job invocation operation duration over 5m

+

Filesystem reads and writes rate by instance over 1h

+
+This value indicates the number of filesystem read and write operations by containers of this service.
+When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100911` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100503` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
+*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34797,21 +33558,23 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_expirer_unreferenced_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+sum by(name) (rate(container_fs_reads_total{name=~"^otel-collector.*"}[1h]) + rate(container_fs_writes_total{name=~"^otel-collector.*"}[1h]))
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_errors_total
+### OpenTelemetry Collector: Kubernetes monitoring (only available on Kubernetes)
-

Job invocation operation errors every 5m

+#### otel-collector: pods_available_percentage -This panel has no related alerts. +

Percentage pods available

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100912` on your Sourcegraph instance. +Refer to the [alerts reference](alerts#otel-collector-pods-available-percentage) for 1 alert related to this panel. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).* +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100600` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details

@@ -34819,21 +33582,30 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_errors_total{job=~"^${source:regex}.*"}[5m]))
+sum by(app) (up{app=~".*otel-collector"}) / count by (app) (up{app=~".*otel-collector"}) * 100
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_error_rate
+## Completions
-

Job invocation operation error rate over 5m

+

Cody chat and code completions.

+To see this dashboard, visit `/-/debug/grafana/d/completions/completions` on your Sourcegraph instance.
+
+### Completions: Completions requests
+
+#### completions: api_request_rate
+
+

Rate of completions API requests

+
+Rate (QPS) of requests to cody chat and code completion endpoints.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=100913` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100000` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -34841,25 +33613,24 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+sum by (code)(irate(src_http_request_duration_seconds_count{route=~"^cody.completions.*"}[5m]))
```
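Since the query above is grouped by status code, a hedged narrower variant can isolate failing requests only; the `code!~"2.."` matcher is an illustrative assumption about the label's format:

```
# Non-2xx completions requests per second (illustrative sketch)
sum by (code)(irate(src_http_request_duration_seconds_count{route=~"^cody.completions.*",code!~"2.."}[5m]))
```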

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads expirer unreferenced graph
+### Completions: Chat completions
-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_records_scanned_total
+#### completions: chat_completions_p99_stream_duration
-

Records scanned every 5m

+

Stream: total time (p99)

-The number of candidate records considered for cleanup. +Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100100` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -34867,23 +33638,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_expirer_unreferenced_graph_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_stream_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```
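The p95/p75/p50 panels below differ from this one only in the first argument to `histogram_quantile`, so any other quantile can be derived from the same buckets; for example, an illustrative p90:

```
# p90 stream duration per model (illustrative sketch)
histogram_quantile(0.90, sum(rate(src_completions_stream_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```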

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_records_altered_total
+#### completions: chat_completions_p95_stream_duration
-

Records altered every 5m

+

Stream: total time (p95)

-The number of candidate records altered as part of cleanup. +Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -34891,21 +33661,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum(increase(src_codeintel_uploads_expirer_unreferenced_graph_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.95, sum(rate(src_completions_stream_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_total
+#### completions: chat_completions_p75_stream_duration
-

Job invocation operations every 5m

+

Stream: total time (p75)

+
+Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101010` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100102` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*
Technical details

@@ -34913,21 +33684,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads
Query:

```
-sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_stream_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_99th_percentile_duration
+#### completions: chat_completions_p50_stream_duration

-99th percentile successful job invocation operation duration over 5m
+Stream: total time (p50)
+
+Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101011` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100103` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -34935,21 +33707,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_expirer_unreferenced_graph_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+histogram_quantile(0.50, sum(rate(src_completions_stream_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_errors_total
+#### completions: chat_completions_p99_non_stream_overhead_duration

-Job invocation operation errors every 5m
+Non-stream overhead (p99)
+
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101012` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100110` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -34957,21 +33730,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_errors_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```
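
Since the overhead metric is also a histogram, the `_sum`/`_count` convention gives a mean handler overhead to complement the percentile panels. A sketch, not part of the generated dashboard (5m window assumed):

```
# Mean non-stream handler overhead per model (assumed 5m window).
sum by (model) (rate(src_completions_handler_overhead_duration_seconds_sum{feature="chat_completions"}[5m]))
/
sum by (model) (rate(src_completions_handler_overhead_duration_seconds_count{feature="chat_completions"}[5m]))
```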

-#### codeintel-uploads: codeintel_uploads_expirer_unreferenced_graph_error_rate
+#### completions: chat_completions_p95_non_stream_overhead_duration

-Job invocation operation error rate over 5m
+Non-stream overhead (p95)
+
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101013` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100111` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -34979,25 +33753,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_expirer_unreferenced_graph_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+histogram_quantile(0.95, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads hard deleter
-
-#### codeintel-uploads: codeintel_uploads_hard_deleter_records_scanned_total
+#### completions: chat_completions_p75_non_stream_overhead_duration

-Records scanned every 5m
+Non-stream overhead (p75)

-The number of candidate records considered for cleanup.
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101100` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100112` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35005,23 +33776,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_hard_deleter_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```

-#### codeintel-uploads: codeintel_uploads_hard_deleter_records_altered_total
+#### completions: chat_completions_p50_non_stream_overhead_duration

-Records altered every 5m
+Non-stream overhead (p50)

-The number of candidate records altered as part of cleanup.
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101101` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100113` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35029,21 +33799,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_hard_deleter_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.50, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```

-#### codeintel-uploads: codeintel_uploads_hard_deleter_total
+#### completions: chat_completions_p99_stream_first_event_duration

-Job invocation operations every 5m
+Stream: time to first event (p99)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101110` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100120` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35051,21 +33822,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_hard_deleter_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```
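
These panels ship without related alerts, but time to first event is a natural candidate for a custom alert because it is what users perceive as chat latency. A sketch of a hand-rolled Prometheus alert expression over the same series; the 10-second threshold and 5m window are illustrative assumptions, not shipped defaults:

```
# Returns series (i.e. would fire) when p95 time-to-first-event exceeds an assumed 10s.
histogram_quantile(0.95, sum by (le, model) (rate(src_completions_stream_first_event_duration_seconds_bucket{feature="chat_completions"}[5m]))) > 10
```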

-#### codeintel-uploads: codeintel_uploads_hard_deleter_99th_percentile_duration
+#### completions: chat_completions_p95_stream_first_event_duration

-99th percentile successful job invocation operation duration over 5m
+Stream: time to first event (p95)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101111` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100121` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35073,21 +33845,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_hard_deleter_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+histogram_quantile(0.95, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### codeintel-uploads: codeintel_uploads_hard_deleter_errors_total
+#### completions: chat_completions_p75_stream_first_event_duration

-Job invocation operation errors every 5m
+Stream: time to first event (p75)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101112` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100122` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35095,21 +33868,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_hard_deleter_errors_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### codeintel-uploads: codeintel_uploads_hard_deleter_error_rate
+#### completions: chat_completions_p50_stream_first_event_duration

-Job invocation operation error rate over 5m
+Stream: time to first event (p50)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101113` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100123` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35117,25 +33891,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_hard_deleter_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_hard_deleter_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_hard_deleter_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+histogram_quantile(0.50, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor audit logs
-
-#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_records_scanned_total
+#### completions: chat_completions_p99_upstream_roundtrip_duration

-Records scanned every 5m
+Stream: first byte sent -> received (p99)

-The number of candidate records considered for cleanup.
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101200` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100130` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35143,23 +33914,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_janitor_audit_logs_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```
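
The roundtrip queries aggregate by the `provider` label, so upstream providers can be compared directly. As a non-generated sketch, the `_sum`/`_count` series of the same histogram give a mean roundtrip per provider (5m window assumed):

```
# Mean upstream first-byte roundtrip per provider (assumed 5m window).
sum by (provider) (rate(src_completions_upstream_roundtrip_duration_seconds_sum{feature="chat_completions"}[5m]))
/
sum by (provider) (rate(src_completions_upstream_roundtrip_duration_seconds_count{feature="chat_completions"}[5m]))
```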

-#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_records_altered_total
+#### completions: chat_completions_p95_upstream_roundtrip_duration

-Records altered every 5m
+Stream: first byte sent -> received (p95)

-The number of candidate records altered as part of cleanup.
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101201` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100131` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35167,21 +33937,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_janitor_audit_logs_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.95, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_total
+#### completions: chat_completions_p75_upstream_roundtrip_duration

-Job invocation operations every 5m
+Stream: first byte sent -> received (p75)
+
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101210` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100132` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35189,21 +33960,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_99th_percentile_duration
+#### completions: chat_completions_p50_upstream_roundtrip_duration

-99th percentile successful job invocation operation duration over 5m
+Stream: first byte sent -> received (p50)
+
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101211` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100133` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35211,21 +33983,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_audit_logs_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+histogram_quantile(0.50, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="chat_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_errors_total
+#### completions: chat_completions_p99_http_connect_total

-Job invocation operation errors every 5m
+Stream: HTTP connect: total (p99)
+
+Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101212` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100140` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35233,21 +34006,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_errors_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider))
```
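
This query also groups by `connection_type`, which, per the description above, should separate connections served from an existing pool from those that performed fresh DNS/TCP/TLS work. A sketch (label semantics assumed from the panel description, 5m window assumed) showing how often each type occurs, using the histogram's `_count` series:

```
# Rate of upstream connection acquisitions, split by connection type and provider.
sum by (connection_type, provider) (rate(src_completions_upstream_connection_total_duration_seconds_count[5m]))
```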

-#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_error_rate
+#### completions: chat_completions_p95_http_connect_total

-Job invocation operation error rate over 5m
+Stream: HTTP connect: total (p95)
+
+Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101213` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100141` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35255,25 +34029,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_audit_logs_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Janitor task > Codeintel uploads janitor scip documents
-
-#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_records_scanned_total
+#### completions: chat_completions_p75_http_connect_total

-Records scanned every 5m
+Stream: HTTP connect: total (p75)

-The number of candidate records considered for cleanup.
+Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101300` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100142` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35281,23 +34052,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_janitor_scip_documents_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider))
```

-#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_records_altered_total
+#### completions: chat_completions_p50_http_connect_total

-Records altered every 5m
+Stream: HTTP connect: total (p50)

-The number of candidate records altered as part of cleanup.
+Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101301` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100143` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35305,21 +34075,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_janitor_scip_documents_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider))
```

-#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_total
+#### completions: chat_completions_p99_http_connect_dns

-Job invocation operations every 5m
+Stream: HTTP connect: dns (p99)
+
+Portion of time spent on DNS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101310` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100150` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35327,21 +34098,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```
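
Because the dns, tls, and dial histograms measure phases of the same connection setup tracked by the total-connect histogram, dividing their `_sum` rates estimates the average share of connection time spent in each phase. A sketch for DNS (the same pattern applies to the tls and dial panels that follow; 5m window assumed):

```
# Average fraction of upstream connection time spent in DNS, per provider.
sum by (provider) (rate(src_completions_upstream_connection_dns_duration_seconds_sum[5m]))
/
sum by (provider) (rate(src_completions_upstream_connection_total_duration_seconds_sum[5m]))
```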

-#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_99th_percentile_duration
+#### completions: chat_completions_p95_http_connect_dns

-99th percentile successful job invocation operation duration over 5m
+Stream: HTTP connect: dns (p95)
+
+Portion of time spent on DNS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101311` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100151` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35349,21 +34121,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_janitor_scip_documents_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_errors_total
+#### completions: chat_completions_p75_http_connect_dns

-Job invocation operation errors every 5m
+Stream: HTTP connect: dns (p75)
+
+Portion of time spent on DNS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101312` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100152` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35371,21 +34144,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_errors_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_error_rate
+#### completions: chat_completions_p50_http_connect_dns

-Job invocation operation error rate over 5m
+Stream: HTTP connect: dns (p50)
+
+Portion of time spent on DNS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101313` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100153` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35393,25 +34167,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_janitor_scip_documents_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Reconciler task > Codeintel uploads reconciler scip metadata
-
-#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_records_scanned_total
+#### completions: chat_completions_p99_http_connect_tls

-Records scanned every 5m
+Stream: HTTP connect: tls (p99)

-The number of candidate records considered for cleanup.
+Portion of time spent on TLS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101400` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100160` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35419,23 +34190,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_reconciler_scip_metadata_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_records_altered_total
+#### completions: chat_completions_p95_http_connect_tls

-Records altered every 5m
+Stream: HTTP connect: tls (p95)

-The number of candidate records altered as part of cleanup.
+Portion of time spent on TLS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101401` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100161` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35443,21 +34213,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_reconciler_scip_metadata_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_total
+#### completions: chat_completions_p75_http_connect_tls

-Job invocation operations every 5m
+Stream: HTTP connect: tls (p75)
+
+Portion of time spent on TLS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101410` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100162` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35465,21 +34236,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_99th_percentile_duration
+#### completions: chat_completions_p50_http_connect_tls

-99th percentile successful job invocation operation duration over 5m
+Stream: HTTP connect: tls (p50)
+
+Portion of time spent on TLS when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101411` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100163` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35487,21 +34259,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_reconciler_scip_metadata_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_errors_total
+#### completions: chat_completions_p99_http_connect_dial

-Job invocation operation errors every 5m
+Stream: HTTP connect: dial (p99)
+
+Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101412` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100170` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35509,21 +34282,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_errors_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_error_rate
+#### completions: chat_completions_p95_http_connect_dial

-Job invocation operation error rate over 5m
+Stream: HTTP connect: dial (p95)
+
+Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101413` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100171` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35531,25 +34305,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_reconciler_scip_metadata_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-### Code Intelligence > Uploads: Codeintel: Uploads > Reconciler task > Codeintel uploads reconciler scip data
-
-#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_records_scanned_total
+#### completions: chat_completions_p75_http_connect_dial

-Records scanned every 5m
+Stream: HTTP connect: dial (p75)

-The number of candidate records considered for cleanup.
+Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101500` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100172` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35557,23 +34328,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_reconciler_scip_data_records_scanned_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_records_altered_total
+#### completions: chat_completions_p50_http_connect_dial

-Records altered every 5m
+Stream: HTTP connect: dial (p50)

-The number of candidate records altered as part of cleanup.
+Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101501` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100173` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35581,21 +34351,24 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum(increase(src_codeintel_uploads_reconciler_scip_data_records_altered_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_total
+### Completions: Code completions

-Job invocation operations every 5m
+#### completions: code_completions_p99_stream_duration
+
+Stream: total time (p99)
+
+Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101510` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100200` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35603,21 +34376,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.99, sum(rate(src_completions_stream_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```
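
Chat and code completions record into the same histograms and differ only in the `feature` label, so the two workloads can be compared in a single query rather than across the two dashboard sections. A non-generated sketch (5m window assumed):

```
# p99 total stream time, one series per feature (chat vs. code completions).
histogram_quantile(0.99, sum by (le, feature) (rate(src_completions_stream_duration_seconds_bucket{feature=~"chat_completions|code_completions"}[5m])))
```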

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_99th_percentile_duration
+#### completions: code_completions_p95_stream_duration

-99th percentile successful job invocation operation duration over 5m
+Stream: total time (p95)
+
+Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101511` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100201` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35625,21 +34399,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-histogram_quantile(0.99, sum by (le,op)(rate(src_codeintel_uploads_reconciler_scip_data_duration_seconds_bucket{job=~"^${source:regex}.*"}[5m])))
+histogram_quantile(0.95, sum(rate(src_completions_stream_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_errors_total
+#### completions: code_completions_p75_stream_duration

-Job invocation operation errors every 5m
+Stream: total time (p75)
+
+Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101512` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100202` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35647,21 +34422,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_errors_total{job=~"^${source:regex}.*"}[5m]))
+histogram_quantile(0.75, sum(rate(src_completions_stream_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_error_rate
+#### completions: code_completions_p50_stream_duration

-Job invocation operation error rate over 5m
+Stream: total time (p50)
+
+Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101513` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100203` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*

Technical details

@@ -35669,31 +34445,22 @@ To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads

Query:

```
-sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_errors_total{job=~"^${source:regex}.*"}[5m])) / (sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_total{job=~"^${source:regex}.*"}[5m])) + sum by (op)(increase(src_codeintel_uploads_reconciler_scip_data_errors_total{job=~"^${source:regex}.*"}[5m]))) * 100
+histogram_quantile(0.50, sum(rate(src_completions_stream_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-## Telemetry
-
-Monitoring telemetry services in Sourcegraph.
-
-To see this dashboard, visit `/-/debug/grafana/d/telemetry/telemetry` on your Sourcegraph instance.
-
-### Telemetry: Telemetry Gateway Exporter: Export and queue metrics
-
-#### telemetry: telemetry_gateway_exporter_queue_size
+#### completions: code_completions_p99_non_stream_overhead_duration

-Telemetry event payloads pending export
+Non-stream overhead (p99)

-The number of events queued to be exported.
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100000` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100210` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35701,23 +34468,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10000

Query:

```
-sum(src_telemetrygatewayexporter_queue_size)
+histogram_quantile(0.99, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```

-#### telemetry: telemetry_gateway_exporter_queue_growth
+#### completions: code_completions_p95_non_stream_overhead_duration

-Rate of growth of export queue over 30m
+Non-stream overhead (p95)

-A positive value indicates the queue is growing.
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

-Refer to the [alerts reference](alerts#telemetry-telemetry-gateway-exporter-queue-growth) for 2 alerts related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100001` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100211` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35725,23 +34491,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10000

Query:

```
-max(deriv(src_telemetrygatewayexporter_queue_size[30m]))
+histogram_quantile(0.95, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```

-#### telemetry: src_telemetrygatewayexporter_exported_events
+#### completions: code_completions_p75_non_stream_overhead_duration

-Events exported from queue per hour
+Non-stream overhead (p75)

-The number of events being exported.
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100010` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100212` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35749,24 +34514,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10001

Query:

```
-max(increase(src_telemetrygatewayexporter_exported_events[1h]))
+histogram_quantile(0.75, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```

-#### telemetry: telemetry_gateway_exporter_batch_size
+#### completions: code_completions_p50_non_stream_overhead_duration

-Number of events exported per batch over 30m
+Non-stream overhead (p50)

-The number of events exported in each batch. The largest bucket is the maximum number of events exported per batch.
-If the distribution trends to the maximum bucket, then events export throughput is at or approaching saturation - try increasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE` or decreasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_INTERVAL`.
+Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100011` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100213` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35774,23 +34537,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10001

Query:

```
-sum by (le) (rate(src_telemetrygatewayexporter_batch_size_bucket[30m]))
+histogram_quantile(0.50, sum(rate(src_completions_handler_overhead_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le,model))
```

-### Telemetry: Telemetry Gateway Exporter: Export job operations
+#### completions: code_completions_p99_stream_first_event_duration

-#### telemetry: telemetrygatewayexporter_exporter_total
-
-Events exporter operations every 30m
+Stream: time to first event (p99)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100100` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100220` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35798,21 +34560,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10010

Query:

```
-sum(increase(src_telemetrygatewayexporter_exporter_total{job=~"^worker.*"}[30m]))
+histogram_quantile(0.99, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### telemetry: telemetrygatewayexporter_exporter_99th_percentile_duration
+#### completions: code_completions_p95_stream_first_event_duration

-Aggregate successful events exporter operation duration distribution over 30m
+Stream: time to first event (p95)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100101` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100221` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35820,21 +34583,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10010

Query:

```
-sum by (le)(rate(src_telemetrygatewayexporter_exporter_duration_seconds_bucket{job=~"^worker.*"}[30m]))
+histogram_quantile(0.95, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### telemetry: telemetrygatewayexporter_exporter_errors_total
+#### completions: code_completions_p75_stream_first_event_duration

-Events exporter operation errors every 30m
+Stream: time to first event (p75)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

-Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter-exporter-errors-total) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100102` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100222` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35842,21 +34606,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10010

Query:

```
-sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m]))
+histogram_quantile(0.75, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-#### telemetry: telemetrygatewayexporter_exporter_error_rate
+#### completions: code_completions_p50_stream_first_event_duration

-Events exporter operation error rate over 30m
+Stream: time to first event (p50)
+
+Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100103` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100223` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35864,23 +34629,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10010

Query:

```
-sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_telemetrygatewayexporter_exporter_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_exporter_errors_total{job=~"^worker.*"}[30m]))) * 100
+histogram_quantile(0.50, sum(rate(src_completions_stream_first_event_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, model))
```

-### Telemetry: Telemetry Gateway Exporter: Export queue cleanup job operations
+#### completions: code_completions_p99_upstream_roundtrip_duration

-#### telemetry: telemetrygatewayexporter_queue_cleanup_total
-
-Export queue cleanup operations every 30m
+Stream: first byte sent -> received (p99)
+
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100200` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100230` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35888,21 +34652,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10020

Query:

```
-sum(increase(src_telemetrygatewayexporter_queue_cleanup_total{job=~"^worker.*"}[30m]))
+histogram_quantile(0.99, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```

-#### telemetry: telemetrygatewayexporter_queue_cleanup_99th_percentile_duration
+#### completions: code_completions_p95_upstream_roundtrip_duration

-Aggregate successful export queue cleanup operation duration distribution over 30m
+Stream: first byte sent -> received (p95)
+
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100201` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100231` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35910,21 +34675,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10020

Query:

```
-sum by (le)(rate(src_telemetrygatewayexporter_queue_cleanup_duration_seconds_bucket{job=~"^worker.*"}[30m]))
+histogram_quantile(0.95, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```

-#### telemetry: telemetrygatewayexporter_queue_cleanup_errors_total
+#### completions: code_completions_p75_upstream_roundtrip_duration

-Export queue cleanup operation errors every 30m
+Stream: first byte sent -> received (p75)
+
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

-Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter-queue-cleanup-errors-total) for 1 alert related to this panel.
+This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100202` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100232` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35932,21 +34698,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10020

Query:

```
-sum(increase(src_telemetrygatewayexporter_queue_cleanup_errors_total{job=~"^worker.*"}[30m]))
+histogram_quantile(0.75, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```

-#### telemetry: telemetrygatewayexporter_queue_cleanup_error_rate
+#### completions: code_completions_p50_upstream_roundtrip_duration

-Export queue cleanup operation error rate over 30m
+Stream: first byte sent -> received (p50)
+
+Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.

This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100203` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100233` on your Sourcegraph instance.

-*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*

Technical details

@@ -35954,23 +34721,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10020

Query:

```
-sum(increase(src_telemetrygatewayexporter_queue_cleanup_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_telemetrygatewayexporter_queue_cleanup_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_queue_cleanup_errors_total{job=~"^worker.*"}[30m]))) * 100
+histogram_quantile(0.50, sum(rate(src_completions_upstream_roundtrip_duration_seconds_bucket{feature="code_completions",model=~'${model}'}[$sampling_duration])) by (le, provider))
```

-### Telemetry: Telemetry Gateway Exporter: Export queue metrics reporting job operations +#### completions: code_completions_p99_http_connect_total -#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_total +

Stream: HTTP connect: total (p99)

-

Export backlog metrics reporting operations every 30m

+Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100240` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -35978,21 +34744,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10030 Query: ``` -sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_total{job=~"^worker.*"}[30m])) +histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider)) ```
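+
+Because these panels are backed by a standard Prometheus histogram, the companion `_sum` and `_count` series can complement the quantiles. A minimal sketch computing the mean (rather than p99) connection-setup time per provider, assuming the conventional `_sum`/`_count` series exist alongside the `_bucket` series shown above:
+
+```
+# mean total connect time per provider over a 5m window (a sketch; mean, not a quantile)
+  sum by (provider) (rate(src_completions_upstream_connection_total_duration_seconds_sum[5m]))
+/ sum by (provider) (rate(src_completions_upstream_connection_total_duration_seconds_count[5m]))
+```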

-#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_99th_percentile_duration +#### completions: code_completions_p95_http_connect_total -

Aggregate successful export backlog metrics reporting operation duration distribution over 30m

+

Stream: HTTP connect: total (p95)

+ +Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100301` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100241` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36000,21 +34767,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10030 Query: ``` -sum by (le)(rate(src_telemetrygatewayexporter_queue_metrics_reporter_duration_seconds_bucket{job=~"^worker.*"}[30m])) +histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider)) ```

-#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_errors_total +#### completions: code_completions_p75_http_connect_total -

Export backlog metrics reporting operation errors every 30m

+

Stream: HTTP connect: total (p75)

-Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter-queue-metrics-reporter-errors-total) for 1 alert related to this panel. +Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100302` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100242` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36022,21 +34790,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10030 Query: ``` -sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_errors_total{job=~"^worker.*"}[30m])) +histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider)) ```

-#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_error_rate +#### completions: code_completions_p50_http_connect_total -

Export backlog metrics reporting operation error rate over 30m

+

Stream: HTTP connect: total (p50)

+ +Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100303` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100243` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36044,23 +34813,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10030 Query: ``` -sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_errors_total{job=~"^worker.*"}[30m])) / (sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_total{job=~"^worker.*"}[30m])) + sum(increase(src_telemetrygatewayexporter_queue_metrics_reporter_errors_total{job=~"^worker.*"}[30m]))) * 100 +histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_total_duration_seconds_bucket[$sampling_duration])) by (le, connection_type, provider)) ```

-### Telemetry: Usage data exporter (legacy): Job operations +#### completions: code_completions_p99_http_connect_dns -#### telemetry: telemetry_job_total +

Stream: HTTP connect: dns (p99)

-

Aggregate usage data exporter operations every 5m

+Portion of time spent on DNS when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100250` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36068,21 +34836,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10040 Query: ``` -sum(increase(src_telemetry_job_total{job=~"^worker.*"}[5m])) +histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```
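+
+To estimate how much of overall connection setup is attributable to DNS, the phase histogram can be compared against the total. A rough sketch, assuming the conventional Prometheus `_sum` series exist for both histograms and a 5m window:
+
+```
+# approximate share of connection-setup time spent in DNS, per provider
+  sum by (provider) (rate(src_completions_upstream_connection_dns_duration_seconds_sum[5m]))
+/ sum by (provider) (rate(src_completions_upstream_connection_total_duration_seconds_sum[5m]))
+```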

-#### telemetry: telemetry_job_99th_percentile_duration +#### completions: code_completions_p95_http_connect_dns -

Aggregate successful usage data exporter operation duration distribution over 5m

+

Stream: HTTP connect: dns (p95)

+ +Portion of time spent on DNS when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100251` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36090,21 +34859,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10040 Query: ``` -sum by (le)(rate(src_telemetry_job_duration_seconds_bucket{job=~"^worker.*"}[5m])) +histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### telemetry: telemetry_job_errors_total +#### completions: code_completions_p75_http_connect_dns + +

Stream: HTTP connect: dns (p75)

-

Aggregate usage data exporter operation errors every 5m

+Portion of time spent on DNS when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100252` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36112,21 +34882,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10040 Query: ``` -sum(increase(src_telemetry_job_errors_total{job=~"^worker.*"}[5m])) +histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### telemetry: telemetry_job_error_rate +#### completions: code_completions_p50_http_connect_dns + +

Stream: HTTP connect: dns (p50)

-

Aggregate usage data exporter operation error rate over 5m

+Portion of time spent on DNS when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100403` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100253` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36134,21 +34905,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10040 Query: ``` -sum(increase(src_telemetry_job_errors_total{job=~"^worker.*"}[5m])) / (sum(increase(src_telemetry_job_total{job=~"^worker.*"}[5m])) + sum(increase(src_telemetry_job_errors_total{job=~"^worker.*"}[5m]))) * 100 +histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_dns_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### telemetry: telemetry_job_total +#### completions: code_completions_p99_http_connect_tls + +

Stream: HTTP connect: tls (p99)

-

Usage data exporter operations every 5m

+Portion of time spent on TLS when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100410` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100260` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36156,21 +34928,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10041 Query: ``` -sum by (op)(increase(src_telemetry_job_total{job=~"^worker.*"}[5m])) +histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### telemetry: telemetry_job_99th_percentile_duration +#### completions: code_completions_p95_http_connect_tls + +

Stream: HTTP connect: tls (p95)

-

99th percentile successful usage data exporter operation duration over 5m

+Portion of time spent on TLS when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100411` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100261` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36178,21 +34951,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10041 Query: ``` -histogram_quantile(0.99, sum by (le,op)(rate(src_telemetry_job_duration_seconds_bucket{job=~"^worker.*"}[5m]))) +histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### telemetry: telemetry_job_errors_total +#### completions: code_completions_p75_http_connect_tls + +

Stream: HTTP connect: tls (p75)

-

Usage data exporter operation errors every 5m

+Portion of time spent on TLS when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100412` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100262` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36200,21 +34974,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10041 Query: ``` -sum by (op)(increase(src_telemetry_job_errors_total{job=~"^worker.*"}[5m])) +histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### telemetry: telemetry_job_error_rate +#### completions: code_completions_p50_http_connect_tls + +

Stream: HTTP connect: tls (p50)

-

Usage data exporter operation error rate over 5m

+Portion of time spent on TLS when acquiring an HTTP connection to the upstream. -Refer to the [alerts reference](alerts#telemetry-telemetry-job-error-rate) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100413` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100263` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36222,23 +34997,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10041 Query: ``` -sum by (op)(increase(src_telemetry_job_errors_total{job=~"^worker.*"}[5m])) / (sum by (op)(increase(src_telemetry_job_total{job=~"^worker.*"}[5m])) + sum by (op)(increase(src_telemetry_job_errors_total{job=~"^worker.*"}[5m]))) * 100 +histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_tls_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-### Telemetry: Usage data exporter (legacy): Queue size +#### completions: code_completions_p99_http_connect_dial -#### telemetry: telemetry_job_queue_size_queue_size +

Stream: HTTP connect: dial (p99)

-

Event level usage data queue size

+Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100270` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -36246,27 +35020,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10050 Query: ``` -max(src_telemetry_job_queue_size_total{job=~"^worker.*"}) +histogram_quantile(0.99, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### telemetry: telemetry_job_queue_size_queue_growth_rate +#### completions: code_completions_p95_http_connect_dial -

Event level usage data queue growth rate over 30m

+

Stream: HTTP connect: dial (p95)

-This value compares the rate of enqueues against the rate of finished jobs. - - - A value < than 1 indicates that process rate > enqueue rate - - A value = than 1 indicates that process rate = enqueue rate - - A value > than 1 indicates that process rate < enqueue rate +Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100501` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100271` on your Sourcegraph instance. -*Managed by the [Sourcegraph Code Search team](https://handbook.sourcegraph.com/departments/engineering/teams/code-search).*
Technical details @@ -36274,23 +35043,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10050 Query: ``` -sum(increase(src_telemetry_job_queue_size_total{job=~"^worker.*"}[30m])) / sum(increase(src_telemetry_job_queue_size_processor_total{job=~"^worker.*"}[30m])) +histogram_quantile(0.95, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-### Telemetry: Usage data exporter (legacy): Utilization +#### completions: code_completions_p75_http_connect_dial -#### telemetry: telemetry_job_utilized_throughput +

Stream: HTTP connect: dial (p75)

-

Utilized percentage of maximum throughput

+Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream. -Refer to the [alerts reference](alerts#telemetry-telemetry-job-utilized-throughput) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100272` on your Sourcegraph instance. -*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
Technical details @@ -36298,40 +35066,22 @@ To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=10060 Query: ``` -rate(src_telemetry_job_total{op="SendEvents"}[1h]) / on() group_right() src_telemetry_job_max_throughput * 100 +histogram_quantile(0.75, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-## OpenTelemetry Collector - -

The OpenTelemetry collector ingests OpenTelemetry data from Sourcegraph and exports it to the configured backends.

- -To see this dashboard, visit `/-/debug/grafana/d/otel-collector/otel-collector` on your Sourcegraph instance. - -### OpenTelemetry Collector: Receivers - -#### otel-collector: otel_span_receive_rate - -

Spans received per receiver per minute

- -Shows the rate of spans accepted by the configured reveiver - -A Trace is a collection of spans and a span represents a unit of work or operation. Spans are the building blocks of Traces. -The spans have only been accepted by the receiver, which means they still have to move through the configured pipeline to be exported. -For more information on tracing and configuration of a OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers. +#### completions: code_completions_p50_http_connect_dial -See the Exporters section see spans that have made it through the pipeline and are exported. +

Stream: HTTP connect: dial (p50)

-Depending the configured processors, received spans might be dropped and not exported. For more information on configuring processors see -https://opentelemetry.io/docs/collector/configuration/#processors. +Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100273` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).*
Technical details @@ -36339,23 +35089,23 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (receiver) (rate(otelcol_receiver_accepted_spans[1m])) +histogram_quantile(0.50, sum(rate(src_completions_upstream_connection_dial_duration_seconds_bucket[$sampling_duration])) by (le, provider)) ```

-#### otel-collector: otel_span_refused - -

Spans refused per receiver

+### Completions: Completion credits entitlements +#### completions: completion_credits_check_entitlement_duration_p95 +

95th percentile completion credits entitlement check duration

-Refer to the [alerts reference](alerts#otel-collector-otel-span-refused) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#completions-completion-credits-check-entitlement-duration-p95) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100300` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details @@ -36363,30 +35113,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (receiver) (rate(otelcol_receiver_refused_spans[1m])) +histogram_quantile(0.95, sum(rate(src_completion_credits_check_entitlement_duration_ms_bucket[5m])) by (le)) ```

-### OpenTelemetry Collector: Exporters - -#### otel-collector: otel_span_export_rate - -

Spans exported per exporter per minute

- -Shows the rate of spans being sent by the exporter - -A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces. -The rate of spans here indicates spans that have made it through the configured pipeline and have been sent to the configured export destination. +#### completions: completion_credits_consume_credits_duration_p95 -For more information on configuring a exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters. +

95th percentile completion credits consume duration

-This panel has no related alerts. +Refer to the [alerts reference](alerts#completions-completion-credits-consume-credits-duration-p95) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100301` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details @@ -36394,25 +35135,24 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (exporter) (rate(otelcol_exporter_sent_spans[1m])) +histogram_quantile(0.95, sum(rate(src_completion_credits_consume_duration_ms_bucket[5m])) by (le)) ```

-#### otel-collector: otel_span_export_failures - -

Span export failures by exporter

+#### completions: completion_credits_check_entitlement_durations -Shows the rate of spans failed to be sent by the configured reveiver. A number higher than 0 for a long period can indicate a problem with the exporter configuration or with the service that is being exported too +

Completion credits entitlement check duration over 5m

-For more information on configuring a exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters. +- This metric tracks pre-completion-request latency for checking if completion credits entitlement has been exceeded. + - If this value is high, this latency may be noticeable to users. -Refer to the [alerts reference](alerts#otel-collector-otel-span-export-failures) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100310` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Core Services team](https://handbook.sourcegraph.com/departments/engineering/teams).*
Technical details @@ -36420,23 +35160,30 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (exporter) (rate(otelcol_exporter_send_failed_spans[1m])) +sum by (le) (rate(src_completion_credits_check_entitlement_duration_ms_bucket[5m])) ```
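+
+Because this panel exposes the raw histogram buckets, the same series can answer threshold questions directly. A minimal sketch estimating the fraction of entitlement checks that complete within 50ms; this assumes the instrumented histogram defines a 50ms bucket boundary, and the available `le` values depend on its configured buckets:
+
+```
+# fraction of entitlement checks completing within 50ms (assumes a le="50" bucket exists)
+  sum(rate(src_completion_credits_check_entitlement_duration_ms_bucket{le="50"}[5m]))
+/ sum(rate(src_completion_credits_check_entitlement_duration_ms_count[5m]))
+```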

-### OpenTelemetry Collector: Queue Length +## Periodic Goroutines -#### otel-collector: otelcol_exporter_queue_capacity +

Overview of all periodic background routines across Sourcegraph services.

-

Exporter queue capacity

+To see this dashboard, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines` on your Sourcegraph instance. -Shows the the capacity of the retry queue (in batches). +### Periodic Goroutines: Periodic Goroutines Overview + +#### periodic-goroutines: total_running_goroutines + +

Total number of running periodic goroutines across all services

+ +The total number of running periodic goroutines across all services. +This provides a high-level overview of system activity. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100000` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36446,21 +35193,22 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (exporter) (rate(otelcol_exporter_queue_capacity{job=~"^.*"}[1m])) +sum(src_periodic_goroutine_running) ```
-#### otel-collector: otelcol_exporter_queue_size +#### periodic-goroutines: goroutines_by_service -

Exporter queue size

+

Number of running periodic goroutines by service

-Shows the current size of retry queue +The number of running periodic goroutines broken down by service. +This helps identify which services are running the most background routines. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100001` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36470,21 +35218,22 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (exporter) (rate(otelcol_exporter_queue_size{job=~"^.*"}[1m])) +sum by (job) (src_periodic_goroutine_running) ```
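+
+To drill into which routines inside a single service account for this count, the same gauge can be grouped by routine name instead. A minimal sketch, assuming a hypothetical `job` value of `worker`:
+
+```
+# running periodic goroutines within one service, by routine name ('worker' is a hypothetical job value)
+sum by (name) (src_periodic_goroutine_running{job="worker"})
+```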
-#### otel-collector: otelcol_exporter_enqueue_failed_spans +#### periodic-goroutines: top_error_producers -

Exporter enqueue failed spans

+

Top 10 periodic goroutines by error rate

-Shows the rate of spans failed to be enqueued by the configured exporter. A number higher than 0 for a long period can indicate a problem with the exporter configuration +The top 10 periodic goroutines with the highest error rates. +These routines may require immediate attention or investigation. -Refer to the [alerts reference](alerts#otel-collector-otelcol-exporter-enqueue-failed-spans) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100010` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36494,23 +35243,22 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (exporter) (rate(otelcol_exporter_enqueue_failed_spans{job=~"^.*"}[1m])) +topk(10, sum by (name, job) (rate(src_periodic_goroutine_errors_total[5m]))) ```
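+
+An absolute error rate is easiest to act on as a ratio of failed to attempted runs. A minimal sketch for a single routine, mirroring the error-rate formulas used elsewhere in this document and assuming a hypothetical routine name of `insights-backfill`:
+
+```
+# error ratio for one routine ('insights-backfill' is a hypothetical name)
+  sum(rate(src_periodic_goroutine_errors_total{name="insights-backfill"}[5m]))
+/ (sum(rate(src_periodic_goroutine_total{name="insights-backfill"}[5m])) + sum(rate(src_periodic_goroutine_errors_total{name="insights-backfill"}[5m])))
+```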
-### OpenTelemetry Collector: Processors - -#### otel-collector: otelcol_processor_dropped_spans +#### periodic-goroutines: top_time_consumers -

Spans dropped per processor per minute

+

Top 10 slowest periodic goroutines

-Shows the rate of spans dropped by the configured processor +The top 10 periodic goroutines with the longest average execution time. +These routines may be candidates for optimization or load distribution. -Refer to the [alerts reference](alerts#otel-collector-otelcol-processor-dropped-spans) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100011` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36520,23 +35268,23 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (processor) (rate(otelcol_processor_dropped_spans[1m])) +topk(10, max by (name, job) (rate(src_periodic_goroutine_duration_seconds_sum[5m]) / rate(src_periodic_goroutine_duration_seconds_count[5m]))) ```
-### OpenTelemetry Collector: Collector resource usage +### Periodic Goroutines: Drill down -#### otel-collector: otel_cpu_usage +#### periodic-goroutines: filtered_success_rate -

Cpu usage of the collector

+

Success rate for selected goroutines

-Shows CPU usage as reported by the OpenTelemetry collector. +The rate of successful executions for the filtered periodic goroutines. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100100` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36546,21 +35294,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (job) (rate(otelcol_process_cpu_seconds{job=~"^.*"}[1m])) +sum by (name, job) (rate(src_periodic_goroutine_total{name=~'${routineName:regex}', job=~'${serviceName:regex}'}[5m])) ```
-#### otel-collector: otel_memory_resident_set_size +#### periodic-goroutines: filtered_error_rate -

Memory allocated to the otel collector

+

Error rate for selected goroutines

-Shows the allocated memory Resident Set Size (RSS) as reported by the OpenTelemetry collector. +The rate of errors for the filtered periodic goroutines. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100101` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36570,27 +35318,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (job) (rate(otelcol_process_memory_rss{job=~"^.*"}[1m])) +sum by (name, job) (rate(src_periodic_goroutine_errors_total{name=~'${routineName:regex}', job=~'${serviceName:regex}'}[5m])) ```
-#### otel-collector: otel_memory_usage - -

Memory used by the collector

- -Shows how much memory is being used by the otel collector. +#### periodic-goroutines: filtered_duration -* High memory usage might indicate thad the configured pipeline is keeping a lot of spans in memory for processing -* Spans failing to be sent and the exporter is configured to retry -* A high batch count by using a batch processor +

95th percentile execution time for selected goroutines

-For more information on configuring processors for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#processors. +The 95th percentile execution time for the filtered periodic goroutines. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100402` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100110` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36600,31 +35342,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by (job) (rate(otelcol_process_runtime_total_alloc_bytes{job=~"^.*"}[1m])) +histogram_quantile(0.95, sum by (name, job, le) (rate(src_periodic_goroutine_duration_seconds_bucket{name=~'${routineName:regex}', job=~'${serviceName:regex}'}[5m]))) ```
-### OpenTelemetry Collector: Container monitoring (not available on server) - -#### otel-collector: container_missing - -

Container missing

+#### periodic-goroutines: filtered_loop_time -This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. +

95th percentile loop time for selected goroutines

-- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod otel-collector` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p otel-collector`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' otel-collector` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the otel-collector container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs otel-collector` (note this will include logs from the previous and currently running container). +The 95th percentile loop time for the filtered periodic goroutines. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100111` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36634,19 +35366,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -count by(name) ((time() - container_last_seen{name=~"^otel-collector.*"}) > 60) +histogram_quantile(0.95, sum by (name, job, le) (rate(src_periodic_goroutine_loop_duration_seconds_bucket{name=~'${routineName:regex}', job=~'${serviceName:regex}'}[5m]))) ```
-#### otel-collector: container_cpu_usage +#### periodic-goroutines: filtered_tenant_count -

Container cpu usage total (1m average) across all cores by instance

+

Number of tenants processed by selected goroutines

-Refer to the [alerts reference](alerts#otel-collector-container-cpu-usage) for 1 alert related to this panel. +Number of tenants processed by each selected periodic goroutine. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100501` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100120` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36656,19 +35390,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^otel-collector.*"} +max by (name, job) (src_periodic_goroutine_tenant_count{name=~'${routineName:regex}', job=~'${serviceName:regex}'}) ```
-#### otel-collector: container_memory_usage +#### periodic-goroutines: filtered_tenant_duration -

Container memory usage by instance

+

95th percentile tenant processing time for selected goroutines

-Refer to the [alerts reference](alerts#otel-collector-container-memory-usage) for 1 alert related to this panel. +The 95th percentile processing time for individual tenants. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100502` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100121` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36678,22 +35414,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^otel-collector.*"} +histogram_quantile(0.95, sum by (name, job, le) (rate(src_periodic_goroutine_tenant_duration_seconds_bucket{name=~'${routineName:regex}', job=~'${serviceName:regex}'}[5m]))) ```
-#### otel-collector: fs_io_operations +#### periodic-goroutines: filtered_tenant_success_rate -

Filesystem reads and writes rate by instance over 1h

+

Tenant success rate for selected goroutines

-This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +The rate of successful tenant processing operations. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100503` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100130` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36703,21 +35438,21 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^otel-collector.*"}[1h]) + rate(container_fs_writes_total{name=~"^otel-collector.*"}[1h])) +sum by (name, job) (rate(src_periodic_goroutine_tenant_success_total{name=~'${routineName:regex}', job=~'${serviceName:regex}'}[5m])) ```
-### OpenTelemetry Collector: Kubernetes monitoring (only available on Kubernetes) +#### periodic-goroutines: filtered_tenant_error_rate -#### otel-collector: pods_available_percentage +

Tenant error rate for selected goroutines

-

Percentage pods available

+The rate of tenant processing operations resulting in errors. -Refer to the [alerts reference](alerts#otel-collector-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/periodic-goroutines/periodic-goroutines?viewPanel=100131` on your Sourcegraph instance. *Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* @@ -36727,31 +35462,31 @@ To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewP Query: ``` -sum by(app) (up{app=~".*otel-collector"}) / count by (app) (up{app=~".*otel-collector"}) * 100 +sum by (name, job) (rate(src_periodic_goroutine_tenant_errors_total{name=~'${routineName:regex}', job=~'${serviceName:regex}'}[5m])) ```
-## Embeddings +## Background Jobs Dashboard -

Handles embeddings searches.

+

Overview of all background jobs in the system.

-To see this dashboard, visit `/-/debug/grafana/d/embeddings/embeddings` on your Sourcegraph instance. +To see this dashboard, visit `/-/debug/grafana/d/background-jobs/background-jobs` on your Sourcegraph instance. -### Embeddings: Site configuration client update latency +### Background Jobs Dashboard: DBWorker Store Operations -#### embeddings: embeddings_site_configuration_duration_since_last_successful_update_by_instance +#### background-jobs: operation_rates_by_method -

Duration since last successful site configuration update (by instance)

+

Rate of operations by method (5m)

-The duration since the configuration client used by the "embeddings" service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration. +Shows the rate of different dbworker store operations. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100000` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36759,21 +35494,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -src_conf_client_time_since_last_successful_update_seconds{job=~`.*embeddings`,instance=~`${instance:regex}`} +sum by (op) (rate(src_workerutil_dbworker_store_total{domain=~"$dbworker_domain"}[5m])) ```

-#### embeddings: embeddings_site_configuration_duration_since_last_successful_update_by_instance +#### background-jobs: error_rates + +

Rate of errors by method (5m)

-

Maximum duration since last successful site configuration update (all "embeddings" instances)

+Rate of errors by operation type. Check specific operations with high error rates. -Refer to the [alerts reference](alerts#embeddings-embeddings-site-configuration-duration-since-last-successful-update-by-instance) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100001` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100001` on your Sourcegraph instance. -*Managed by the [Sourcegraph Infrastructure Org team](https://handbook.sourcegraph.com/departments/engineering/infrastructure).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36781,23 +35518,25 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*embeddings`,instance=~`${instance:regex}`}[1m])) +sum by (op) (rate(src_workerutil_dbworker_store_errors_total{domain=~"$dbworker_domain"}[5m])) ```

-### Embeddings: Database connections +#### background-jobs: p90_duration_by_method -#### embeddings: max_open_conns +

90th percentile duration by method

-

Maximum open

+90th percentile latency for dbworker store operations. + +Investigate database query performance and indexing for the affected operations. Look for slow queries in database logs. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100010` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36805,21 +35544,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_max_open{app_name="embeddings"}) +histogram_quantile(0.9, sum by(le, op) (rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain=~"$dbworker_domain"}[5m]))) ```

-#### embeddings: open_conns +#### background-jobs: p50_duration_by_method -

Established

+

Median duration by method

+Median latency for dbworker store operations. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100011` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36827,21 +35568,25 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_open{app_name="embeddings"}) +histogram_quantile(0.5, sum by(le, op) (rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain=~"$dbworker_domain"}[5m]))) ```

-#### embeddings: in_use +#### background-jobs: p90_duration_by_domain -

Used

+

90th percentile duration by domain

+ +90th percentile latency for dbworker store operations. + +Investigate database performance for the specific domain. May indicate issues with specific database tables or query patterns. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100012` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36849,21 +35594,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_in_use{app_name="embeddings"}) +histogram_quantile(0.9, sum by(le, domain) (rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain=~"$dbworker_domain"}[5m]))) ```

-#### embeddings: idle +#### background-jobs: p50_duration_by_method -

Idle

+

Median operation duration by method

+Median latency for dbworker store operations by method. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100013` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36871,21 +35618,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (src_pgsql_conns_idle{app_name="embeddings"}) +histogram_quantile(0.5, sum by(le, op) (rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain=~"$dbworker_domain"}[5m]))) ```

-#### embeddings: mean_blocked_seconds_per_conn_request +#### background-jobs: dequeue_performance -

Mean blocked seconds per conn request

+

Dequeue operation metrics

+Rate of dequeue operations by domain; this is critical for worker performance. This panel has no related alerts. -Refer to the [alerts reference](alerts#embeddings-mean-blocked-seconds-per-conn-request) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100120` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100020` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36893,21 +35642,21 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name="embeddings"}[5m])) / sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name="embeddings"}[5m])) +sum by (domain) (rate(src_workerutil_dbworker_store_total{op="Dequeue", domain=~"$dbworker_domain"}[5m])) ```
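+
+Dequeue throughput is most meaningful next to dequeue latency. A minimal sketch reusing the duration histogram from the latency panels above, filtered to the Dequeue operation:
+
+```
+# p90 Dequeue latency per domain (a sketch; the same histogram as the duration panels, filtered by op)
+histogram_quantile(0.9, sum by (le, domain) (rate(src_workerutil_dbworker_store_duration_seconds_bucket{op="Dequeue", domain=~"$dbworker_domain"}[5m])))
+```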

-#### embeddings: closed_max_idle +#### background-jobs: error_percentage_by_method -

Closed by SetMaxIdleConns

+

Percentage of operations resulting in error by method

-This panel has no related alerts. +Refer to the [alerts reference](alerts#background-jobs-error-percentage-by-method) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100130` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100021` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36915,21 +35664,21 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle{app_name="embeddings"}[5m])) +(sum by (op) (rate(src_workerutil_dbworker_store_errors_total{domain=~"$dbworker_domain"}[5m])) / sum by (op) (rate(src_workerutil_dbworker_store_total{domain=~"$dbworker_domain"}[5m]))) * 100 ```

-#### embeddings: closed_max_lifetime +#### background-jobs: error_percentage_by_domain -

Closed by SetConnMaxLifetime

+

Percentage of operations resulting in error by domain

-This panel has no related alerts. +Refer to the [alerts reference](alerts#background-jobs-error-percentage-by-domain) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100131` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100022` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36937,21 +35686,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_lifetime{app_name="embeddings"}[5m])) +(sum by (domain) (rate(src_workerutil_dbworker_store_errors_total{domain=~"$dbworker_domain"}[5m])) / sum by (domain) (rate(src_workerutil_dbworker_store_total{domain=~"$dbworker_domain"}[5m]))) * 100 ```

-#### embeddings: closed_max_idle_time +#### background-jobs: operation_latency_heatmap -

Closed by SetConnMaxIdleTime

+

Distribution of operation durations

+Distribution of operation durations, showing the spread of latencies across all operations. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100132` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100023` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36959,33 +35710,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by (app_name, db_name) (increase(src_pgsql_conns_closed_max_idle_time{app_name="embeddings"}[5m])) +sum by (le) (rate(src_workerutil_dbworker_store_duration_seconds_bucket{domain=~"$dbworker_domain"}[5m])) ```

-### Embeddings: Container monitoring (not available on server) - -#### embeddings: container_missing - -

Container missing

+### Background Jobs Dashboard: DBWorker Resetter -This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. +#### background-jobs: resetter_duration -- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod embeddings` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p embeddings`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' embeddings` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the embeddings container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs embeddings` (note this will include logs from the previous and currently running container). +

Time spent running the resetter

-This panel has no related alerts. +Refer to the [alerts reference](alerts#background-jobs-resetter-duration) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100100` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -36993,21 +35734,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -count by(name) ((time() - container_last_seen{name=~"^embeddings.*"}) > 60) +histogram_quantile(0.95, sum by(le, domain) (rate(src_dbworker_resetter_duration_seconds_bucket{domain=~"$resetter_domain"}[5m]))) ```

-#### embeddings: container_cpu_usage +#### background-jobs: resetter_runs -

Container cpu usage total (1m average) across all cores by instance

+

Number of times the resetter ran

+The number of times the resetter ran in the last 5 minutes. This panel has no related alerts. -Refer to the [alerts reference](alerts#embeddings-container-cpu-usage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100201` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100101` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37015,21 +35758,21 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -cadvisor_container_cpu_usage_percentage_total{name=~"^embeddings.*"} +sum by (domain) (increase(src_dbworker_resetter_total{domain=~"$resetter_domain"}[5m])) ```

-#### embeddings: container_memory_usage +#### background-jobs: resetter_failures -

Container memory usage by instance

+

Number of times the resetter failed to run

-Refer to the [alerts reference](alerts#embeddings-container-memory-usage) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#background-jobs-resetter-failures) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100202` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100102` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37037,24 +35780,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -cadvisor_container_memory_usage_percentage_total{name=~"^embeddings.*"} +sum by (domain) (increase(src_dbworker_resetter_errors_total{domain=~"$resetter_domain"}[5m])) ```
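To read failures as a proportion rather than an absolute count, the error counter can be divided by the run counter. A sketch (not part of the generated dashboard), using the two metrics shown in this section:

```
# Percentage of resetter runs that failed, per domain, over 5m
100 *
  sum by (domain) (increase(src_dbworker_resetter_errors_total{domain=~"$resetter_domain"}[5m]))
/
  sum by (domain) (increase(src_dbworker_resetter_total{domain=~"$resetter_domain"}[5m]))
```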

-#### embeddings: fs_io_operations +#### background-jobs: reset_records -

Filesystem reads and writes rate by instance over 1h

+

Number of stalled records reset back to 'queued' state

-This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. +the number of stalled records that were reset back to the queued state in the last 5 minutes This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100203` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100110` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37062,23 +35804,21 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by(name) (rate(container_fs_reads_total{name=~"^embeddings.*"}[1h]) + rate(container_fs_writes_total{name=~"^embeddings.*"}[1h])) +sum by (domain) (increase(src_dbworker_resetter_record_resets_total{domain=~"$resetter_domain"}[5m])) ```

-### Embeddings: Provisioning indicators (not available on server) +#### background-jobs: failed_records -#### embeddings: provisioning_container_cpu_usage_long_term +

Number of stalled records marked as 'failed'

-

Container cpu usage total (90th percentile over 1d) across all cores by instance

- -Refer to the [alerts reference](alerts#embeddings-provisioning-container-cpu-usage-long-term) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#background-jobs-failed-records) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100111` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37086,21 +35826,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -quantile_over_time(0.9, cadvisor_container_cpu_usage_percentage_total{name=~"^embeddings.*"}[1d]) +sum by (domain) (increase(src_dbworker_resetter_record_reset_failures_total{domain=~"$resetter_domain"}[5m])) ```

-#### embeddings: provisioning_container_memory_usage_long_term +#### background-jobs: stall_duration -

Container memory usage (1d maximum) by instance

+

Duration jobs were stalled before being reset

-Refer to the [alerts reference](alerts#embeddings-provisioning-container-memory-usage-long-term) for 1 alert related to this panel. +median time a job was stalled before being reset -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100301` on your Sourcegraph instance. +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100120` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37108,21 +35850,21 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^embeddings.*"}[1d]) +sum by (le) (rate(src_dbworker_resetter_stall_duration_seconds_bucket{domain=~"$resetter_domain"}[5m])) ```
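The query above emits raw per-bucket rates (suited to a heatmap); the "median time a job was stalled" wording corresponds to wrapping those buckets in `histogram_quantile`. A sketch using the same metric:

```
# Median (p50) stall duration per domain, derived from the same buckets
histogram_quantile(0.5, sum by (le, domain) (rate(src_dbworker_resetter_stall_duration_seconds_bucket{domain=~"$resetter_domain"}[5m])))
```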

-#### embeddings: provisioning_container_cpu_usage_short_term +#### background-jobs: stall_duration_p90 -

Container cpu usage total (5m maximum) across all cores by instance

+

90th percentile of stall duration

-Refer to the [alerts reference](alerts#embeddings-provisioning-container-cpu-usage-short-term) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#background-jobs-stall-duration-p90) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100121` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37130,21 +35872,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -max_over_time(cadvisor_container_cpu_usage_percentage_total{name=~"^embeddings.*"}[5m]) +histogram_quantile(0.9, sum by(le, domain) (rate(src_dbworker_resetter_stall_duration_seconds_bucket{domain=~"$resetter_domain"}[5m]))) ```

-#### embeddings: provisioning_container_memory_usage_short_term +#### background-jobs: reset_vs_failure_ratio -

Container memory usage (5m maximum) by instance

+

Ratio of jobs reset to queued versus marked as failed

-Refer to the [alerts reference](alerts#embeddings-provisioning-container-memory-usage-short-term) for 1 alert related to this panel. +ratio of reset jobs to failed jobs - higher values indicate healthier job processing + +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100122` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37152,24 +35896,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -max_over_time(cadvisor_container_memory_usage_percentage_total{name=~"^embeddings.*"}[5m]) +(sum by (domain) (increase(src_dbworker_resetter_record_resets_total{domain=~"$resetter_domain"}[1h]))) / on(domain) (sum by (domain) (increase(src_dbworker_resetter_record_reset_failures_total{domain=~"$resetter_domain"}[1h]) > 0) or on(domain) sum by (domain) (increase(src_dbworker_resetter_record_resets_total{domain=~"$resetter_domain"}[1h]) * 0 + 1)) ```
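The denominator above looks convoluted because it guards against division by zero: when a domain had no reset failures in the hour, the `or on(domain) ... * 0 + 1` arm substitutes a constant 1 per domain so the ratio stays defined. Stripped of that guard, the intent reads as below (a sketch for understanding only; unlike the panel query, it returns no data for domains without failures):

```
# Reset-to-failure ratio without the zero-denominator guard
sum by (domain) (increase(src_dbworker_resetter_record_resets_total{domain=~"$resetter_domain"}[1h]))
/
sum by (domain) (increase(src_dbworker_resetter_record_reset_failures_total{domain=~"$resetter_domain"}[1h]))
```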

-#### embeddings: container_oomkill_events_total +### Background Jobs Dashboard: Worker Queue Metrics -

Container OOMKILL events total by instance

+#### background-jobs: aggregate_queue_size -This value indicates the total number of times the container main process or child processes were terminated by OOM killer. -When it occurs frequently, it is an indicator of underprovisioning. +

Total number of jobs queued across all domains

-Refer to the [alerts reference](alerts#embeddings-container-oomkill-events-total) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#background-jobs-aggregate-queue-size) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100200` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37177,25 +35920,21 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -max by (name) (container_oom_events_total{name=~"^embeddings.*"}) +sum(max by (domain) (src_workerutil_queue_depth)) ```
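For ad-hoc checks, a threshold can be applied directly to the same expression. The `1000` below is a hypothetical cutoff for illustration, not the alert's actual configured level:

```
# Returns a value only while total queued jobs exceed 1000
sum(max by (domain) (src_workerutil_queue_depth)) > 1000
```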

-### Embeddings: Golang runtime monitoring - -#### embeddings: go_goroutines - -

Maximum active goroutines

+#### background-jobs: max_queue_duration -A high value here indicates a possible goroutine leak. +

Maximum time a job has been in queue across all domains

-Refer to the [alerts reference](alerts#embeddings-go-goroutines) for 1 alert related to this panel. +Refer to the [alerts reference](alerts#background-jobs-max-queue-duration) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100201` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37203,21 +35942,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -max by(instance) (go_goroutines{job=~".*embeddings"}) +max(src_workerutil_queue_duration_seconds) ```

-#### embeddings: go_gc_duration_seconds +#### background-jobs: queue_growth_rate -

Maximum go garbage collection duration

+

Rate of queue growth/decrease

+ +Rate at which queue is growing. Positive values indicate more jobs are being added than processed. -Refer to the [alerts reference](alerts#embeddings-go-gc-duration-seconds) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100202` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37225,23 +35966,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -max by(instance) (go_gc_duration_seconds{job=~".*embeddings"}) +sum(increase(src_workerutil_queue_depth[30m]))/1800 ```
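The division by 1800 converts a 30-minute change into a per-second rate (30 minutes = 1,800 seconds). Since queue depth is a gauge (it falls as jobs complete), `delta()` and `deriv()` are the gauge-oriented alternatives to `increase()`; a sketch:

```
# Per-second queue growth via delta(), which is defined for gauges
sum(delta(src_workerutil_queue_depth[30m])) / 1800

# Or a least-squares per-second slope over the same window
sum(deriv(src_workerutil_queue_depth[30m]))
```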

-### Embeddings: Kubernetes monitoring (only available on Kubernetes) +#### background-jobs: queue_depth_by_domain -#### embeddings: pods_available_percentage +

Number of jobs in queue by domain

-

Percentage pods available

+Number of queued jobs per domain. Large values may indicate workers are not keeping up with incoming jobs. -Refer to the [alerts reference](alerts#embeddings-pods-available-percentage) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100500` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100210` on your Sourcegraph instance. -*Managed by the [Sourcegraph Cody team](https://handbook.sourcegraph.com/departments/engineering/teams/cody).* +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37249,24 +35990,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -sum by(app) (up{app=~".*embeddings"}) / count by (app) (up{app=~".*embeddings"}) * 100 +sum by (domain) (max by (domain) (src_workerutil_queue_depth)) ```

-### Embeddings: Cache - -#### embeddings: hit_ratio +#### background-jobs: queue_duration_by_domain -

Hit ratio of the embeddings cache

+

Maximum queue time by domain

-A low hit rate indicates your cache is not well utilized. Consider increasing the cache size. +Maximum time a job has been waiting in queue per domain. Long durations indicate potential worker stalls. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100600` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100211` on your Sourcegraph instance. +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37274,22 +36014,23 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -rate(src_embeddings_cache_hit_count[30m]) / (rate(src_embeddings_cache_hit_count[30m]) + rate(src_embeddings_cache_miss_count[30m])) +sum by (domain) (max by (domain) (src_workerutil_queue_duration_seconds)) ```

-#### embeddings: missed_bytes +#### background-jobs: queue_growth_by_domain -

Bytes fetched due to a cache miss

+

Rate of change in queue size by domain

-A high volume of misses indicates that the many searches are not hitting the cache. Consider increasing the cache size. +Rate of change in queue size per domain. Consistently positive values indicate jobs are being queued faster than processed. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100601` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/background-jobs/background-jobs?viewPanel=100212` on your Sourcegraph instance. +*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*
Technical details @@ -37297,7 +36038,7 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100 Query: ``` -rate(src_embeddings_cache_miss_bytes[10m]) +sum by (domain) (idelta(src_workerutil_queue_depth[10m])) / 600 ```
diff --git a/docs/cli/references/api.mdx b/docs/cli/references/api.mdx index 2cabe7906..2204e1b1f 100644 --- a/docs/cli/references/api.mdx +++ b/docs/cli/references/api.mdx @@ -9,7 +9,7 @@ | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-query` | GraphQL query to execute, e.g. 'query \{ currentUser \{ username \} \}' (stdin otherwise) | | -| `-trace` | Log the trace ID for requests. See [docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | | `-vars` | GraphQL query variables to include as JSON string, e.g. '\{"var": "val", "var2": "val2"\}' | | @@ -27,7 +27,7 @@ Usage of 'src api': -query string GraphQL query to execute, e.g. 'query { currentUser { username } }' (stdin otherwise) -trace - Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) -vars string @@ -60,3 +60,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/batch/apply.mdx b/docs/cli/references/batch/apply.mdx index 1613d9105..d33ec6143 100644 --- a/docs/cli/references/batch/apply.mdx +++ b/docs/cli/references/batch/apply.mdx @@ -12,6 +12,7 @@ | `-clear-cache` | If true, clears the execution cache and executes all steps anew. | `false` | | `-dump-requests` | Log GraphQL requests and responses to stdout | `false` | | `-f` | The batch spec file to read, or - to read from standard input. | | +| `-fail-fast` | Halts execution immediately upon first error instead of continuing with other tasks. | `false` | | `-force-override-ignore` | Do not ignore repositories that have a .batchignore file. | `false` | | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | @@ -20,11 +21,11 @@ | `-n` | Alias for -namespace. | | | `-namespace` | The user or organization namespace to place the batch change within. Default is the currently authenticated user. | | | `-run-as-root` | If true, forces all step containers to run as root. | `false` | -| `-skip-errors` | If true, errors encountered while executing steps in a repository won't stop the execution of the batch spec but only cause that repository to be skipped. | `false` | +| `-skip-errors` | If true, errors encountered won't stop the program, but only log them. | `false` | | `-text-only` | INTERNAL USE ONLY. EXPERIMENTAL. Switches off the TUI to only print JSON lines. | `false` | | `-timeout` | The maximum duration a single batch spec step can take. | `1h0m0s` | | `-tmp` | Directory for storing temporary data, such as log files. Default is /tmp. Can also be set with environment variable SRC_BATCH_TMP_DIR; if both are set, this flag will be used and not the environment variable. | `/tmp` | -| `-trace` | Log the trace ID for requests. 
[See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | | `-v` | print verbose output | `false` | | `-workspace` | Workspace mode to use ("auto", "bind", or "volume") | `auto` | @@ -48,6 +49,8 @@ Usage of 'src batch apply': Log GraphQL requests and responses to stdout -f string The batch spec file to read, or - to read from standard input. + -fail-fast + Halts execution immediately upon first error instead of continuing with other tasks. -force-override-ignore Do not ignore repositories that have a .batchignore file. -get-curl @@ -65,7 +68,7 @@ Usage of 'src batch apply': -run-as-root If true, forces all step containers to run as root. -skip-errors - If true, errors encountered while executing steps in a repository won't stop the execution of the batch spec but only cause that repository to be skipped. + If true, errors encountered won't stop the program, but only log them. -text-only INTERNAL USE ONLY. EXPERIMENTAL. Switches off the TUI to only print JSON lines. -timeout duration @@ -73,7 +76,7 @@ Usage of 'src batch apply': -tmp string Directory for storing temporary data, such as log files. Default is /tmp. Can also be set with environment variable SRC_BATCH_TMP_DIR; if both are set, this flag will be used and not the environment variable. (default "/tmp") -trace - Log the trace ID for requests. # See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) -v print verbose output @@ -99,3 +102,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/batch/new.mdx b/docs/cli/references/batch/new.mdx index a123c0f50..c6972b00a 100644 --- a/docs/cli/references/batch/new.mdx +++ b/docs/cli/references/batch/new.mdx @@ -9,7 +9,8 @@ | `-f` | The name of the batch spec file to create. | `batch.yaml` | | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-skip-errors` | If true, errors encountered won't stop the program, but only log them. | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -25,8 +26,10 @@ Usage of 'src batch new': Print the curl command for executing this query and exit (WARNING: includes printing your access token!) -insecure-skip-verify Skip validation of TLS certificates against trusted chains + -skip-errors + If true, errors encountered won't stop the program, but only log them. -trace - Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. 
See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -45,3 +48,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/batch/preview.mdx b/docs/cli/references/batch/preview.mdx index bb2e07b4d..682f0f0f8 100644 --- a/docs/cli/references/batch/preview.mdx +++ b/docs/cli/references/batch/preview.mdx @@ -12,6 +12,7 @@ | `-clear-cache` | If true, clears the execution cache and executes all steps anew. | `false` | | `-dump-requests` | Log GraphQL requests and responses to stdout | `false` | | `-f` | The batch spec file to read, or - to read from standard input. | | +| `-fail-fast` | Halts execution immediately upon first error instead of continuing with other tasks. | `false` | | `-force-override-ignore` | Do not ignore repositories that have a .batchignore file. | `false` | | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | @@ -20,11 +21,11 @@ | `-n` | Alias for -namespace. | | | `-namespace` | The user or organization namespace to place the batch change within. Default is the currently authenticated user. | | | `-run-as-root` | If true, forces all step containers to run as root. | `false` | -| `-skip-errors` | If true, errors encountered while executing steps in a repository won't stop the execution of the batch spec but only cause that repository to be skipped. | `false` | +| `-skip-errors` | If true, errors encountered won't stop the program, but only log them. | `false` | | `-text-only` | INTERNAL USE ONLY. EXPERIMENTAL. Switches off the TUI to only print JSON lines. | `false` | | `-timeout` | The maximum duration a single batch spec step can take. | `1h0m0s` | | `-tmp` | Directory for storing temporary data, such as log files. Default is /tmp. Can also be set with environment variable SRC_BATCH_TMP_DIR; if both are set, this flag will be used and not the environment variable. | `/tmp` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | | `-v` | print verbose output | `false` | | `-workspace` | Workspace mode to use ("auto", "bind", or "volume") | `auto` | @@ -48,6 +49,8 @@ Usage of 'src batch preview': Log GraphQL requests and responses to stdout -f string The batch spec file to read, or - to read from standard input. + -fail-fast + Halts execution immediately upon first error instead of continuing with other tasks. -force-override-ignore Do not ignore repositories that have a .batchignore file. -get-curl @@ -65,7 +68,7 @@ Usage of 'src batch preview': -run-as-root If true, forces all step containers to run as root. -skip-errors - If true, errors encountered while executing steps in a repository won't stop the execution of the batch spec but only cause that repository to be skipped. + If true, errors encountered won't stop the program, but only log them. -text-only INTERNAL USE ONLY. EXPERIMENTAL. Switches off the TUI to only print JSON lines. 
-timeout duration @@ -73,7 +76,7 @@ Usage of 'src batch preview': -tmp string Directory for storing temporary data, such as log files. Default is /tmp. Can also be set with environment variable SRC_BATCH_TMP_DIR; if both are set, this flag will be used and not the environment variable. (default "/tmp") -trace - Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) -v print verbose output @@ -97,3 +100,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/batch/remote.mdx b/docs/cli/references/batch/remote.mdx index d7e5e5901..650eee96b 100644 --- a/docs/cli/references/batch/remote.mdx +++ b/docs/cli/references/batch/remote.mdx @@ -14,7 +14,8 @@ | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-n` | Alias for -namespace. | | | `-namespace` | The user or organization namespace to place the batch change within. Default is the currently authenticated user. | | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-skip-errors` | If true, errors encountered won't stop the program, but only log them. | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -40,8 +41,10 @@ Usage of 'src batch remote': Alias for -namespace. -namespace string The user or organization namespace to place the batch change within. Default is the currently authenticated user. + -skip-errors + If true, errors encountered won't stop the program, but only log them. -trace - Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) 'src batch remote' runs a batch spec on the Sourcegraph instance. @@ -58,3 +61,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/batch/repositories.mdx b/docs/cli/references/batch/repositories.mdx index 23b0f0776..961723322 100644 --- a/docs/cli/references/batch/repositories.mdx +++ b/docs/cli/references/batch/repositories.mdx @@ -11,7 +11,8 @@ | `-force-override-ignore` | Do not ignore repositories that have a .batchignore file. | `false` | | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-skip-errors` | If true, errors encountered won't stop the program, but only log them. | `false` | +| `-trace` | Log the trace ID for requests. 
See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -31,8 +32,10 @@ Usage of 'src batch repositories': Print the curl command for executing this query and exit (WARNING: includes printing your access token!) -insecure-skip-verify Skip validation of TLS certificates against trusted chains + -skip-errors + If true, errors encountered won't stop the program, but only log them. -trace - Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -52,3 +55,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/batch/validate.mdx b/docs/cli/references/batch/validate.mdx index 9a0982fad..8896b1689 100644 --- a/docs/cli/references/batch/validate.mdx +++ b/docs/cli/references/batch/validate.mdx @@ -11,7 +11,8 @@ | `-force-override-ignore` | Do not ignore repositories that have a .batchignore file. | `false` | | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-skip-errors` | If true, errors encountered won't stop the program, but only log them. | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -31,8 +32,10 @@ Usage of 'src batch validate': Print the curl command for executing this query and exit (WARNING: includes printing your access token!) -insecure-skip-verify Skip validation of TLS certificates against trusted chains + -skip-errors + If true, errors encountered won't stop the program, but only log them. -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -51,3 +54,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/code-intel/upload.mdx b/docs/cli/references/code-intel/upload.mdx index 6502589ef..c1630c95b 100644 --- a/docs/cli/references/code-intel/upload.mdx +++ b/docs/cli/references/code-intel/upload.mdx @@ -7,23 +7,22 @@ |------|-------------|---------------| | `-associated-index-id` | ID of the associated index record for this upload. For internal use only. | `-1` | | `-commit` | The 40-character hash of the commit. Defaults to the currently checked-out commit. | | -| `-file` | The path to the LSIF dump file. | | +| `-file` | The path to the SCIP index file. | | | `-github-token` | A GitHub access token with 'public_repo' scope that Sourcegraph uses to verify you have access to the repository. | | | `-gitlab-token` | A GitLab access token with 'read_api' scope that Sourcegraph uses to verify you have access to the repository. 
| | | `-ignore-upload-failure` | Exit with status code zero on upload failure. | `false` | -| `-indexer` | The name of the indexer that generated the dump. This will override the 'toolInfo.name' field in the metadata vertex of the LSIF dump file. This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). | | -| `-indexerVersion` | The version of the indexer that generated the dump. This will override the 'toolInfo.version' field in the metadata vertex of the LSIF dump file. This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). | | +| `-indexer` | The name of the indexer that generated the dump. This will override the 'toolInfo.name' field in the metadata section of SCIP index. This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). | | +| `-indexerVersion` | The version of the indexer that generated the dump. This will override the 'toolInfo.version' field in the metadata section of SCIP index. This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). | | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-json` | Output relevant state in JSON on success. | `false` | | `-max-concurrency` | The maximum number of concurrent uploads. Only relevant for multipart uploads. Defaults to all parts concurrently. | `-1` | | `-max-payload-size` | The maximum upload size (in megabytes). Indexes exceeding this limit will be uploaded over multiple HTTP requests. | `100` | | `-no-progress` | Do not display progress updates. | `false` | -| `-open` | Open the LSIF upload page in your browser. | `false` | +| `-open` | Open the SCIP upload page in your browser. | `false` | | `-repo` | The name of the repository (e.g. github.com/gorilla/mux). By default, derived from the origin remote. | | -| `-root` | The path in the repository that matches the LSIF projectRoot (e.g. cmd/project1). Defaults to the directory where the dump file is located. | | -| `-skip-scip` | Skip converting LSIF index to SCIP if the instance supports it; this option should only used for debugging | `false` | +| `-root` | The path in the repository that matches the SCIP projectRoot (e.g. cmd/project1). Defaults to the directory where the SCIP index file is located. | | | `-trace` | -trace=0 shows no logs; -trace=1 shows requests and response metadata; -trace=2 shows headers, -trace=3 shows response body | `0` | -| `-upload-route` | The path of the upload route. For internal use only. | `/.api/lsif/upload` | +| `-upload-route` | The path of the upload route. For internal use only. | `/.api/scip/upload` | ## Usage @@ -35,7 +34,7 @@ Usage of 'src code-intel upload': -commit string The 40-character hash of the commit. Defaults to the currently checked-out commit. -file string - The path to the LSIF dump file. + The path to the SCIP index file. -github-token string A GitHub access token with 'public_repo' scope that Sourcegraph uses to verify you have access to the repository. -gitlab-token string @@ -43,9 +42,9 @@ Usage of 'src code-intel upload': -ignore-upload-failure Exit with status code zero on upload failure. -indexer string - The name of the indexer that generated the dump. This will override the 'toolInfo.name' field in the metadata vertex of the LSIF dump file. 
This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). + The name of the indexer that generated the dump. This will override the 'toolInfo.name' field in the metadata section of SCIP index. This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). -indexerVersion string - The version of the indexer that generated the dump. This will override the 'toolInfo.version' field in the metadata vertex of the LSIF dump file. This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). + The version of the indexer that generated the dump. This will override the 'toolInfo.version' field in the metadata section of SCIP index. This must be supplied if the indexer does not set this field (in which case the upload will fail with an explicit message). -insecure-skip-verify Skip validation of TLS certificates against trusted chains -json @@ -57,22 +56,20 @@ Usage of 'src code-intel upload': -no-progress Do not display progress updates. -open - Open the LSIF upload page in your browser. + Open the SCIP upload page in your browser. -repo string The name of the repository (e.g. github.com/gorilla/mux). By default, derived from the origin remote. -root string - The path in the repository that matches the LSIF projectRoot (e.g. cmd/project1). Defaults to the directory where the dump file is located. - -skip-scip - Skip converting LSIF index to SCIP if the instance supports it; this option should only used for debugging + The path in the repository that matches the SCIP projectRoot (e.g. cmd/project1). Defaults to the directory where the SCIP index file is located. -trace int -trace=0 shows no logs; -trace=1 shows requests and response metadata; -trace=2 shows headers, -trace=3 shows response body -upload-route string - The path of the upload route. For internal use only. (default "/.api/lsif/upload") + The path of the upload route. For internal use only. (default "/.api/scip/upload") Examples: Before running any of these, first use src auth to authenticate. Alternately, use the SRC_ACCESS_TOKEN environment variable for - individual src-cli invocations. + individual src-cli invocations. If run from within the project itself, src-cli will infer various flags based on git metadata. @@ -93,9 +90,6 @@ Examples: $ src code-intel upload -github-token=BAZ, or $ src code-intel upload -gitlab-token=BAZ - For any of these commands, an LSIF index (default name: dump.lsif) can be - used instead of a SCIP index (default name: index.scip). - ``` \ No newline at end of file diff --git a/docs/cli/references/config/edit.mdx b/docs/cli/references/config/edit.mdx index 6e98b13a8..67959780b 100644 --- a/docs/cli/references/config/edit.mdx +++ b/docs/cli/references/config/edit.mdx @@ -11,7 +11,7 @@ | `-overwrite` | Overwrite the entire settings with the value given in -value (not just a single property). | `false` | | `-property` | The name of the settings property to set. | | | `-subject` | The ID of the settings subject whose settings to edit. (default: authenticated user) | | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. 
See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | | `-value` | The value for the settings property (when used with -property). | | | `-value-file` | Read the value from this file instead of from the -value command-line option. | | @@ -34,7 +34,7 @@ Usage of 'src config edit': -subject string The ID of the settings subject whose settings to edit. (default: authenticated user) -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) -value string @@ -70,3 +70,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/config/get.mdx b/docs/cli/references/config/get.mdx index 050dbe863..42972c2fe 100644 --- a/docs/cli/references/config/get.mdx +++ b/docs/cli/references/config/get.mdx @@ -10,7 +10,7 @@ | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-subject` | The ID of the settings subject whose settings to get. (default: authenticated user) | | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -29,7 +29,7 @@ Usage of 'src config get': -subject string The ID of the settings subject whose settings to get. (default: authenticated user) -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -50,3 +50,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/config/list.mdx b/docs/cli/references/config/list.mdx index 2f531ef07..ab6ee82b8 100644 --- a/docs/cli/references/config/list.mdx +++ b/docs/cli/references/config/list.mdx @@ -10,7 +10,7 @@ | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-subject` | The ID of the settings subject whose settings to list. (default: authenticated user) | | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -29,7 +29,7 @@ Usage of 'src config list': -subject string The ID of the settings subject whose settings to list. (default: authenticated user) -trace - Log the trace ID for requests. 
See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -46,3 +46,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/extensions/copy.mdx b/docs/cli/references/extensions/copy.mdx index 0b8a5c807..069d8d042 100644 --- a/docs/cli/references/extensions/copy.mdx +++ b/docs/cli/references/extensions/copy.mdx @@ -10,7 +10,7 @@ | `-extension-id` | The <extID> in https://sourcegraph.com/extensions/<extID> (e.g. sourcegraph/java) | | | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -29,7 +29,7 @@ Usage of 'src extensions copy': -insecure-skip-verify Skip validation of TLS certificates against trusted chains -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -37,3 +37,4 @@ Copy an extension from Sourcegraph.com to your private registry. ``` + \ No newline at end of file diff --git a/docs/cli/references/extensions/delete.mdx b/docs/cli/references/extensions/delete.mdx index 2da01bfb7..2222034c8 100644 --- a/docs/cli/references/extensions/delete.mdx +++ b/docs/cli/references/extensions/delete.mdx @@ -9,7 +9,7 @@ | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-id` | The ID (GraphQL API ID, not extension ID) of the extension to delete. | | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -26,7 +26,7 @@ Usage of 'src extensions delete': -insecure-skip-verify Skip validation of TLS certificates against trusted chains -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. 
See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -43,3 +43,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/extensions/get.mdx b/docs/cli/references/extensions/get.mdx index 8d014547d..935ce9ffd 100644 --- a/docs/cli/references/extensions/get.mdx +++ b/docs/cli/references/extensions/get.mdx @@ -10,7 +10,7 @@ | `-f` | Format for the output, using the syntax of Go package text/template. (e.g. "\{\{.ExtensionID\}\}: \{\{.Manifest.Title\}\} (\{\{.RemoteURL\}\})" or "\{\{.\|json\}\}") | `{{.\|json}}` | | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -29,7 +29,7 @@ Usage of 'src extensions get': -insecure-skip-verify Skip validation of TLS certificates against trusted chains -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -43,3 +43,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/extensions/list.mdx b/docs/cli/references/extensions/list.mdx index 4484a1438..093295a01 100644 --- a/docs/cli/references/extensions/list.mdx +++ b/docs/cli/references/extensions/list.mdx @@ -11,7 +11,7 @@ | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-query` | Returns extensions whose extension IDs match the query. (e.g. "myextension") | | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -32,7 +32,7 @@ Usage of 'src extensions list': -query string Returns extensions whose extension IDs match the query. (e.g. "myextension") -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. 
See https://docs.sourcegraph.com/admin/observability/tracing -user-agent-telemetry Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true) @@ -53,3 +53,4 @@ Examples: ``` + \ No newline at end of file diff --git a/docs/cli/references/extensions/publish.mdx b/docs/cli/references/extensions/publish.mdx index 61461fd50..938695278 100644 --- a/docs/cli/references/extensions/publish.mdx +++ b/docs/cli/references/extensions/publish.mdx @@ -12,7 +12,7 @@ | `-git-head` | Override the current git commit for the bundle. (default: uses `git rev-parse head`) | | | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-manifest` | The extension manifest file. | `package.json` | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-url` | Override the URL for the bundle. (example: set to http://localhost:1234/myext.js for local dev with parcel) | | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -36,7 +36,7 @@ Usage of 'src extensions publish': -manifest string The extension manifest file. (default "package.json") -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing -url string Override the URL for the bundle. (example: set to http://localhost:1234/myext.js for local dev with parcel) -user-agent-telemetry @@ -66,3 +66,4 @@ Notes: ``` + \ No newline at end of file diff --git a/docs/cli/references/extsvc/create.mdx b/docs/cli/references/extsvc/create.mdx index 3d36811fa..6d1b3d88b 100644 --- a/docs/cli/references/extsvc/create.mdx +++ b/docs/cli/references/extsvc/create.mdx @@ -10,7 +10,7 @@ | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` | | `-kind` | kind of the external service to create | | | `-name` | exact name of the external service to create | | -| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` | +| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` | | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` | @@ -29,7 +29,7 @@ Usage of 'src extsvc create': -name string exact name of the external service to create -trace - Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing + Log the trace ID for requests. 
See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -39,6 +39,7 @@ Usage of 'src extsvc create':
 
   $ cat new-config.json | src extsvc create
   $ src extsvc create -name 'My GitHub connection' new-config.json
-
+
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/extsvc/edit.mdx b/docs/cli/references/extsvc/edit.mdx
index e3753b80a..856304c6e 100644
--- a/docs/cli/references/extsvc/edit.mdx
+++ b/docs/cli/references/extsvc/edit.mdx
@@ -12,7 +12,7 @@
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-name` | exact name of the external service to edit | |
 | `-rename` | when specified, renames the external service | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -35,7 +35,7 @@ Usage of 'src extsvc edit':
   -rename string
     when specified, renames the external service
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -56,3 +56,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/extsvc/list.mdx b/docs/cli/references/extsvc/list.mdx
index 7777d2ef1..6222b4fe5 100644
--- a/docs/cli/references/extsvc/list.mdx
+++ b/docs/cli/references/extsvc/list.mdx
@@ -10,7 +10,7 @@
 | `-first` | Return only the first n external services. (use -1 for unlimited) | `-1` |
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -29,7 +29,7 @@ Usage of 'src extsvc list':
   -insecure-skip-verify
     Skip validation of TLS certificates against trusted chains
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -46,3 +46,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/index.mdx b/docs/cli/references/index.mdx
index 11abeac68..b384b3fe4 100644
--- a/docs/cli/references/index.mdx
+++ b/docs/cli/references/index.mdx
@@ -1,8 +1,8 @@
-# Command reference
-Most commands require that the user first [authenticate](quickstart#connect-to-sourcegraph) against their Sourcegraph instance. See `src --help` for a full list of the sub commands available for your release of the `src` cli.
+# `src`
 
 ## Subcommands
+
 * [`admin`](references/admin)
 * [`api`](references/api)
 * [`batch`](references/batch)
@@ -10,19 +10,19 @@ Most commands require that the user first [authenticate](quickstart#connect-to-s
 * [`codeowners`](references/codeowners)
 * [`config`](references/config)
 * [`debug`](references/debug)
-* [`extensions`](references/extensions)
 * [`extsvc`](references/extsvc)
 * [`login`](references/login)
-* [`lsif`](references/lsif)
 * [`orgs`](references/orgs)
-* [`prompts`](references/prompts)
 * [`repos`](references/repos)
+* [`sbom`](references/sbom)
 * [`scout`](references/scout)
 * [`search`](references/search)
 * [`search-jobs`](references/search-jobs)
 * [`serve-git`](references/serve-git)
+* [`signature`](references/signature)
 * [`snapshot`](references/snapshot)
 * [`teams`](references/teams)
 * [`users`](references/users)
 * [`validate`](references/validate)
 * [`version`](references/version)
+
\ No newline at end of file
diff --git a/docs/cli/references/login.mdx b/docs/cli/references/login.mdx
index 78ba45a01..a53c76fd7 100644
--- a/docs/cli/references/login.mdx
+++ b/docs/cli/references/login.mdx
@@ -8,7 +8,7 @@
 | `-dump-requests` | Log GraphQL requests and responses to stdout | `false` |
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -38,8 +38,9 @@ Examples:
   -insecure-skip-verify
     Skip validation of TLS certificates against trusted chains
   -trace
-    Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/orgs/create.mdx b/docs/cli/references/orgs/create.mdx
index 781fd339e..2842349a3 100644
--- a/docs/cli/references/orgs/create.mdx
+++ b/docs/cli/references/orgs/create.mdx
@@ -10,7 +10,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-name` | The new organization's name. (required) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -29,7 +29,7 @@ Usage of 'src orgs create':
   -name string
     The new organization's name. (required)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -42,3 +42,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/orgs/delete.mdx b/docs/cli/references/orgs/delete.mdx
index f4137449f..5175de533 100644
--- a/docs/cli/references/orgs/delete.mdx
+++ b/docs/cli/references/orgs/delete.mdx
@@ -9,7 +9,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-id` | The ID of the organization to delete. | |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -26,7 +26,7 @@ Usage of 'src orgs delete':
   -insecure-skip-verify
     Skip validation of TLS certificates against trusted chains
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -47,3 +47,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/orgs/get.mdx b/docs/cli/references/orgs/get.mdx
index d59ed0a25..64e0c0295 100644
--- a/docs/cli/references/orgs/get.mdx
+++ b/docs/cli/references/orgs/get.mdx
@@ -10,7 +10,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-name` | Look up organization by name. (e.g. "abc-org") | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -29,7 +29,7 @@ Usage of 'src orgs get':
   -name string
     Look up organization by name. (e.g. "abc-org")
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -46,3 +46,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/orgs/list.mdx b/docs/cli/references/orgs/list.mdx
index 094823e48..96f5b8b92 100644
--- a/docs/cli/references/orgs/list.mdx
+++ b/docs/cli/references/orgs/list.mdx
@@ -11,7 +11,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-query` | Returns organizations whose names match the query. (e.g. "alice") | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -32,7 +32,7 @@ Usage of 'src orgs list':
   -query string
     Returns organizations whose names match the query. (e.g. "alice")
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -53,3 +53,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/orgs/members/add.mdx b/docs/cli/references/orgs/members/add.mdx
index 6cbdc30d7..3d87f9135 100644
--- a/docs/cli/references/orgs/members/add.mdx
+++ b/docs/cli/references/orgs/members/add.mdx
@@ -9,7 +9,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-org-id` | ID of organization to which to add member. (required) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 | `-username` | Username of user to add as member. (required) | |
 
@@ -27,7 +27,7 @@ Usage of 'src orgs members add':
   -org-id string
     ID of organization to which to add member. (required)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
   -username string
@@ -42,3 +42,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/orgs/members/remove.mdx b/docs/cli/references/orgs/members/remove.mdx
index 7d40a9af5..e4f09aa23 100644
--- a/docs/cli/references/orgs/members/remove.mdx
+++ b/docs/cli/references/orgs/members/remove.mdx
@@ -9,7 +9,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-org-id` | ID of organization from which to remove member. (required) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 | `-user-id` | ID of user to remove as member. (required) | |
 
@@ -27,7 +27,7 @@ Usage of 'src orgs members remove':
   -org-id string
     ID of organization from which to remove member. (required)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
   -user-id string
@@ -41,3 +41,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/repos/add-metadata.mdx b/docs/cli/references/repos/add-metadata.mdx
index 03dafc4eb..43ea04782 100644
--- a/docs/cli/references/repos/add-metadata.mdx
+++ b/docs/cli/references/repos/add-metadata.mdx
@@ -11,7 +11,7 @@
 | `-key` | The name of the metadata key to add (required) | |
 | `-repo` | The ID of the repo to add the key-value pair metadata to (required if -repo-name is not specified) | |
 | `-repo-name` | The name of the repo to add the key-value pair metadata to (required if -repo is not specified) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 | `-value` | The metadata value associated with the metadata key. Defaults to null. | |
 
@@ -33,7 +33,7 @@ Usage of 'src repos add-metadata':
   -repo-name string
     The name of the repo to add the key-value pair metadata to (required if -repo is not specified)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
   -value string
@@ -51,3 +51,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/repos/delete-metadata.mdx b/docs/cli/references/repos/delete-metadata.mdx
index 6da9f300c..c4a132d4d 100644
--- a/docs/cli/references/repos/delete-metadata.mdx
+++ b/docs/cli/references/repos/delete-metadata.mdx
@@ -11,7 +11,7 @@
 | `-key` | The name of the metadata key to be deleted (required) | |
 | `-repo` | The ID of the repo with the key-value pair metadata to be deleted (required if -repo-name is not specified) | |
 | `-repo-name` | The name of the repo to add the key-value pair metadata to (required if -repo is not specified) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -32,7 +32,7 @@ Usage of 'src repos delete-metadata':
   -repo-name string
     The name of the repo to add the key-value pair metadata to (required if -repo is not specified)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -46,3 +46,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/repos/delete.mdx b/docs/cli/references/repos/delete.mdx
index d005eb249..dce1a97d3 100644
--- a/docs/cli/references/repos/delete.mdx
+++ b/docs/cli/references/repos/delete.mdx
@@ -8,7 +8,7 @@
 | `-dump-requests` | Log GraphQL requests and responses to stdout | `false` |
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -23,7 +23,7 @@ Usage of 'src repos delete'
   -insecure-skip-verify
     Skip validation of TLS certificates against trusted chains
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -34,3 +34,4 @@ Examples:
   $ src repos delete github.com/my/repo github.com/my/repo2
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/repos/get.mdx b/docs/cli/references/repos/get.mdx
index 31b5b530b..554e5710b 100644
--- a/docs/cli/references/repos/get.mdx
+++ b/docs/cli/references/repos/get.mdx
@@ -10,7 +10,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-name` | The name of the repository. (required) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -29,7 +29,7 @@ Usage of 'src repos get':
   -name string
     The name of the repository. (required)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -42,3 +42,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/repos/list.mdx b/docs/cli/references/repos/list.mdx
index 2988e3fde..75d0b4801 100644
--- a/docs/cli/references/repos/list.mdx
+++ b/docs/cli/references/repos/list.mdx
@@ -18,7 +18,7 @@
 | `-not-indexed` | Include repositories that do not have a text search index. | `true` |
 | `-order-by` | How to order the results; possible choices are: "name", "created-at" | `name` |
 | `-query` | Returns repositories whose names match the query. (e.g. "myorg/") | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -53,7 +53,7 @@ Usage of 'src repos list':
   -query string
     Returns repositories whose names match the query. (e.g. "myorg/")
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -78,3 +78,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/repos/update-metadata.mdx b/docs/cli/references/repos/update-metadata.mdx
index 9fcd6bf21..6669a18fa 100644
--- a/docs/cli/references/repos/update-metadata.mdx
+++ b/docs/cli/references/repos/update-metadata.mdx
@@ -11,7 +11,7 @@
 | `-key` | The name of the metadata key to be updated (required) | |
 | `-repo` | The ID of the repo with the metadata key to be updated (required if -repo-name is not specified) | |
 | `-repo-name` | The name of the repo to add the key-value pair metadata to (required if -repo is not specified) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 | `-value` | The new metadata value of the metadata key to be set. Defaults to null. | |
 
@@ -33,7 +33,7 @@ Usage of 'src repos update-metadata':
   -repo-name string
     The name of the repo to add the key-value pair metadata to (required if -repo is not specified)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
   -value string
@@ -51,3 +51,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/search.mdx b/docs/cli/references/search.mdx
index b2fae52ca..bca07eb39 100644
--- a/docs/cli/references/search.mdx
+++ b/docs/cli/references/search.mdx
@@ -13,7 +13,7 @@
 | `-json` | Whether or not to output results as JSON. | `false` |
 | `-less` | Pipe output to 'less -R' (only if stdout is terminal, and not json flag). | `true` |
 | `-stream` | Consume results as stream. Streaming search only supports a subset of flags and parameters: trace, insecure-skip-verify, display, json. | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -38,7 +38,7 @@ Usage of 'src search':
   -stream
     Consume results as stream. Streaming search only supports a subset of flags and parameters: trace, insecure-skip-verify, display, json.
   -trace
-    Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -63,7 +63,7 @@ Other tips:
 
   Force color output on (not on by default when piped to other programs) by setting COLOR=t
 
-  Query syntax: https://sourcegraph.com/docs/code_search/reference/queries
+  Query syntax: https://docs.sourcegraph.com/code_search/reference/queries
 
   Be careful with search strings including negation: a search with an initial negated term may be parsed as a flag rather than as a search string. You can
@@ -73,3 +73,4 @@
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/serve-git.mdx b/docs/cli/references/serve-git.mdx
index b7d6a2d21..ec46e8dd5 100644
--- a/docs/cli/references/serve-git.mdx
+++ b/docs/cli/references/serve-git.mdx
@@ -22,6 +22,7 @@ By default 'src serve-git' will recursively serve your current directory on the
 
 'src serve-git -list' will not start up the server. Instead it will write to stdout a list of repository names it would serve.
 
-Read Documentation here https://sourcegraph.com/docs/admin/code_host_connection/src_serve_git
+Documentation at https://sourcegraph.com/docs/admin/code_hosts/src_serve_git
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/users/create.mdx b/docs/cli/references/users/create.mdx
index c1e03702c..096cfb496 100644
--- a/docs/cli/references/users/create.mdx
+++ b/docs/cli/references/users/create.mdx
@@ -10,7 +10,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-reset-password-url` | Print the reset password URL to manually send to the new user. | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 | `-username` | The new user's username. (required) | |
 
@@ -30,7 +30,7 @@ Usage of 'src users create':
   -reset-password-url
     Print the reset password URL to manually send to the new user.
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
   -username string
@@ -45,3 +45,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/users/delete.mdx b/docs/cli/references/users/delete.mdx
index 137b95339..72b0db301 100644
--- a/docs/cli/references/users/delete.mdx
+++ b/docs/cli/references/users/delete.mdx
@@ -9,7 +9,7 @@
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-id` | The ID of the user to delete. | |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -26,7 +26,7 @@ Usage of 'src users delete':
   -insecure-skip-verify
     Skip validation of TLS certificates against trusted chains
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -47,3 +47,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/users/get.mdx b/docs/cli/references/users/get.mdx
index 72531e8e5..15054f468 100644
--- a/docs/cli/references/users/get.mdx
+++ b/docs/cli/references/users/get.mdx
@@ -10,7 +10,7 @@
 | `-f` | Format for the output, using the syntax of Go package text/template. (e.g. "\{\{.ID\}\}: \{\{.Username\}\} (\{\{.DisplayName\}\})") | `{{.\|json}}` |
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 | `-username` | Look up user by username. (e.g. "alice") | |
 
@@ -30,7 +30,7 @@ Usage of 'src users get':
   -insecure-skip-verify
     Skip validation of TLS certificates against trusted chains
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
   -username string
@@ -45,3 +45,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/users/list.mdx b/docs/cli/references/users/list.mdx
index 688b9f996..2b9cb2566 100644
--- a/docs/cli/references/users/list.mdx
+++ b/docs/cli/references/users/list.mdx
@@ -12,7 +12,7 @@
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-query` | Returns users whose names match the query. (e.g. "alice") | |
 | `-tag` | Returns users with the given tag. | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -35,7 +35,7 @@ Usage of 'src users list':
   -tag string
     Returns users with the given tag.
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -60,3 +60,4 @@ Examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/users/prune.mdx b/docs/cli/references/users/prune.mdx
index 1c8f211ea..471fb7b90 100644
--- a/docs/cli/references/users/prune.mdx
+++ b/docs/cli/references/users/prune.mdx
@@ -13,7 +13,7 @@
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
 | `-remove-admin` | prune admin accounts | `false` |
 | `-remove-null-users` | removes users with no last active value | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -38,17 +38,18 @@ Usage of 'src users prune':
   -remove-null-users
     removes users with no last active value
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
 This command removes users from a Sourcegraph instance who have been inactive for 60 or more days. Admin accounts are omitted by default.
-
+
 Examples:
 
   $ src users prune -days 182
-
+
   $ src users prune -remove-admin -remove-null-users
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/users/tag.mdx b/docs/cli/references/users/tag.mdx
index b4affd3d5..5ec8cd5d2 100644
--- a/docs/cli/references/users/tag.mdx
+++ b/docs/cli/references/users/tag.mdx
@@ -8,9 +8,9 @@
 | `-dump-requests` | Log GraphQL requests and responses to stdout | `false` |
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-remove` | Remove the tag. (default: add the tag) | `false` |
+| `-remove` | Remove the tag. (default: add the tag) | `false` |
 | `-tag` | The tag to set on the user. (required) | |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 | `-user-id` | The ID of the user to tag. (required) | |
 
@@ -30,7 +30,7 @@ Usage of 'src users tag':
   -tag string
     The tag to set on the user. (required)
   -trace
-    Log the trace ID for requests. See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
   -user-id string
@@ -55,3 +55,4 @@ Related examples:
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/validate.mdx b/docs/cli/references/validate.mdx
index 1fa8fe6da..47cc1d4f3 100644
--- a/docs/cli/references/validate.mdx
+++ b/docs/cli/references/validate.mdx
@@ -9,7 +9,7 @@
 EXPERIMENTAL: 'validate' is an experimental command in the 'src' tool.
 
-Please visit https://sourcegraph.com/docs/admin/validation for documentation of the validate command.
+Please visit https://docs.sourcegraph.com/admin/validation for documentation of the validate command.
 
 Usage:
 
@@ -24,3 +24,4 @@ Use "src validate [command] -h" for more information about a command.
 
 ```
+
\ No newline at end of file
diff --git a/docs/cli/references/version.mdx b/docs/cli/references/version.mdx
index f7a851a3c..8971177a3 100644
--- a/docs/cli/references/version.mdx
+++ b/docs/cli/references/version.mdx
@@ -9,7 +9,7 @@
 | `-dump-requests` | Log GraphQL requests and responses to stdout | `false` |
 | `-get-curl` | Print the curl command for executing this query and exit (WARNING: includes printing your access token!) | `false` |
 | `-insecure-skip-verify` | Skip validation of TLS certificates against trusted chains | `false` |
-| `-trace` | Log the trace ID for requests. [See docs](/admin/observability/tracing) | `false` |
+| `-trace` | Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing | `false` |
 | `-user-agent-telemetry` | Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph | `true` |
 
@@ -26,7 +26,7 @@ Usage of 'src version':
   -insecure-skip-verify
     Skip validation of TLS certificates against trusted chains
   -trace
-    Log the trace ID for requests. #See https://sourcegraph.com/docs/admin/observability/tracing
+    Log the trace ID for requests. See https://docs.sourcegraph.com/admin/observability/tracing
   -user-agent-telemetry
     Include the operating system and architecture in the User-Agent sent with requests to Sourcegraph (default true)
 
@@ -38,3 +38,4 @@ Examples:
 
 ```
+
\ No newline at end of file
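All of the regenerated pages above share the same global flag set (`-trace`, `-insecure-skip-verify`, `-user-agent-telemetry`, `-dump-requests`, `-get-curl`). As a minimal sanity check of the updated references, the sketch below combines only flags that are documented in the tables above; the endpoint, repository name, metadata key/value, tag, and user ID are placeholder values for illustration, not values taken from this patch.

```
# Placeholder endpoint: src conventionally reads the target instance from
# SRC_ENDPOINT (and an access token from SRC_ACCESS_TOKEN) for authenticated commands.
$ export SRC_ENDPOINT=https://sourcegraph.example.com

# List the repositories 'src serve-git' would serve, without starting the server.
$ src serve-git -list

# Prune users inactive for 182 or more days, logging a trace ID per request.
$ src users prune -days 182 -trace

# Attach metadata to a repository by name (hypothetical repo, key, and value).
$ src repos add-metadata -repo-name 'github.com/my/repo' -key 'team' -value 'security'

# Add a tag to a user, then remove the same tag with -remove (hypothetical user ID).
$ src users tag -user-id 'VXNlcjox' -tag 'beta-tester'
$ src users tag -remove -user-id 'VXNlcjox' -tag 'beta-tester'
```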