diff --git a/README.md b/README.md index 99560e4..e7caa7c 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,63 @@ monitoring and observability infrastructure for the PyTorch Foundation This infrastructure-as-code setup manages: - **Datadog Users**: User accounts and role assignments -- **Datadog Roles**: Custom roles with specific permissions -- **Monitoring Resources**: Various Datadog monitoring components +- **Datadog Roles**: Custom role definitions and permissions +- **Monitoring Resources**: Datadog monitors, dashboards, synthetics + +[Back to top](#table-of-contents) + +## Table of Contents + +- [Overview](#overview) +- [Table of Contents](#table-of-contents) +- [Prerequisites](#prerequisites) +- [Structure](#structure) +- [Configuration](#configuration) + - [Variables Reference](#variables-reference) + - [User Variables (`dd_users`)](#user-variables-dd_users) + - [Role Variables (`dd_roles`)](#role-variables-dd_roles) + - [Available Permissions](#available-permissions) + - [Custom Roles](#custom-roles) +- [Usage](#usage) + - [Adding Yourself as a User](#adding-yourself-as-a-user) + - [Using Existing Datadog Roles](#using-existing-datadog-roles) + - [Adding Multiple Users](#adding-multiple-users) + - [Creating Custom Roles](#creating-custom-roles) +- [Monitoring and Alerts](#monitoring-and-alerts) + - [Synthetics website and API checks](#synthetics-website-and-api-checks) + - [GitHub ci-sev issues check](#github-ci-sev-issues-check) + - [Synthetics queue checks (scripts/)](#synthetics-queue-checks-scripts) + - [Datadog monitors (ALI/GitHub API)](#datadog-monitors-aligithub-api) +- [Deployment](#deployment) + - [Automated Deployment via GitHub Actions](#automated-deployment-via-github-actions) + - [GitHub Actions Workflow](#github-actions-workflow) + - [Code Quality Requirements](#code-quality-requirements) + - [Manual Validation (Optional)](#manual-validation-optional) + - [Manual Deployment Steps (for testing)](#manual-deployment-steps-for-testing) +- 
[Accessing Datadog](#accessing-datadog) + - [Single Sign-On (SSO) Login](#single-sign-on-sso-login) + - [First-Time Access](#first-time-access) + - [Troubleshooting Access](#troubleshooting-access) + - [Role Capabilities](#role-capabilities) +- [Security Considerations](#security-considerations) +- [Troubleshooting](#troubleshooting) + - [Common Issues](#common-issues) + - [Getting Help](#getting-help) +- [Contributing](#contributing) + - [Development Workflow](#development-workflow) + - [Pre-commit Requirements](#pre-commit-requirements) + - [Automated Checks](#automated-checks) + - [MegaLinter Configuration](#megalinter-configuration) + +## Prerequisites + +- Terraform >= 1.0 +- Datadog provider configured +- Appropriate Datadog API and APP keys (handled by CI/CD) +- Access to the PyTorch Datadog organization +- **Valid Linux Foundation ID (LFID)** for SSO access + +[Back to top](#table-of-contents) ## Structure @@ -17,21 +72,56 @@ This infrastructure-as-code setup manages: . ├── datadog-users.tf # User management configuration ├── datadog-roles.tf # Custom role definitions +├── datadog-monitors.tf # Monitor and alert definitions +├── datadog-synthetics_tests.tf # Synthetics API tests ├── variables.tf # Variable definitions (if present) -├── terraform.tfvars # Variable values (not committed) -└── README.md # This file +├── terraform.tfvars # Variable values (not committed) +├── scripts/ # Synthetics JavaScript checks +└── README.md # This file ``` -## Prerequisites - -- Terraform >= 1.0 -- Datadog provider configured -- Appropriate Datadog API and APP keys (handled by CI/CD) -- Access to the PyTorch Datadog organization -- **Valid Linux Foundation ID (LFID)** for SSO access +[Back to top](#table-of-contents) ## Configuration +### Variables Reference + +#### User Variables (`dd_users`) + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `email` | string | Yes | User's email address | +| `roles` | list(string) | No | List of 
role IDs to assign (defaults to empty) | +| `disabled` | bool | No | Whether account is disabled (defaults to false) | + +#### Role Variables (`dd_roles`) + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `name` | string | Yes | Display name for the role | +| `permissions` | list(string) | No | List of permission IDs (defaults empty) | + +### Available Permissions + +Common permissions you can use in custom roles: + +**Read Permissions:** + +- `logs_read_data` - Read log data +- `logs_read_index_data` - Read indexed logs +- `synthetics_read` - View synthetic tests +- `cases_read` - View support cases +- `audit_logs_read` - View audit logs + +**Write Permissions:** + +- `dashboards_write` - Create/edit dashboards +- `monitors_write` - Create/edit monitors +- `synthetics_write` - Create/edit synthetic tests +- `cases_write` - Create/edit support cases +- `notebooks_write` - Create/edit notebooks +- `incident_write` - Create/edit incidents + ### Custom Roles The repository defines a "Custom Read Write" role (referenced as @@ -51,6 +141,8 @@ The repository defines a "Custom Read Write" role (referenced as - Case and notebook management - Incident response capabilities +[Back to top](#table-of-contents) + ## Usage ### Adding Yourself as a User @@ -87,6 +179,26 @@ dd_users = { } ``` +### Using Existing Datadog Roles + +To assign existing Datadog roles instead of custom ones: + +```hcl +# terraform.tfvars +dd_users = { + "readonly-user" = { + email = "readonly@example.com" + roles = [data.datadog_role.ro_role.id] # Datadog Read Only Role + disabled = false + }, + "standard-user" = { + email = "standard@example.com" + roles = [data.datadog_role.standard_role.id] # Datadog Standard Role + disabled = false + } +} +``` + ### Adding Multiple Users You can add multiple users at once: @@ -112,26 +224,6 @@ dd_users = { } ``` -### Using Existing Datadog Roles - -To assign existing Datadog roles instead of custom ones: - -```hcl -# 
terraform.tfvars -dd_users = { - "readonly-user" = { - email = "readonly@example.com" - roles = [data.datadog_role.ro_role.id] # Datadog Read Only Role - disabled = false - }, - "standard-user" = { - email = "standard@example.com" - roles = [data.datadog_role.standard_role.id] # Datadog Standard Role - disabled = false - } -} -``` - ### Creating Custom Roles To define additional custom roles: @@ -160,6 +252,147 @@ dd_roles = { } ``` +[Back to top](#table-of-contents) + +## Monitoring and Alerts + +### Synthetics website and API checks + +These lightweight API checks verify availability and basic correctness for +public PyTorch properties every 5 minutes: + +- pytorch.org + - GET → status 200 and body contains + "Install PyTorch" + - Alerts: @slack-pytorch-infra-alerts + +- docs.pytorch.org + - GET → status 200 and + body contains "PyTorch documentation" + - Alerts: @slack-pytorch-infra-alerts + +- pytorch.org/docs redirect + - GET → status 301; headers: + - location is + - server is nginx + - Alerts: @slack-pytorch-infra-alerts + +- download.pytorch.org (CDN index) + - GET → status 200 and body contains + "pytorch" + - Alerts: @slack-pytorch-infra-alerts + +- hud.pytorch.org + - GET → status 200 and body contains + "pytorch/pytorch" + - Alerts: @slack-pytorch-infra-alerts + +- landscape.pytorch.org + - GET → status 200 and body contains + "landscape" + - Alerts: @slack-pytorch-infra-alerts + +- discuss.pytorch.org + - GET → status 200 and body contains + "PyTorch Forums" + - Alerts: @webhook-lf-incident-io (follow LF runbook) + +- dev-discuss.pytorch.org + - GET → status 200 and body contains + "PyTorch releases" + - Alerts: @slack-pytorch-infra-alerts + +Cadence: tick_every = 300s; retries: 3 attempts, 300,000 ms interval. + +### GitHub ci-sev issues check + +Watches for open issues labeled "ci: sev" in pytorch/pytorch. Fails if any +are found. 
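The gate behind this check is simple enough to sketch as a stand-alone predicate (a simplified, hypothetical illustration; `ciSevCheckPasses` and `pageBody` are stand-ins rather than code from this repository):

```javascript
// Illustrative sketch only: GitHub's issue list renders "No results"
// when no open issues match the "ci: sev" label, so the check passes
// exactly when that marker appears in the fetched page body.
function ciSevCheckPasses(pageBody) {
  return pageBody.includes('No results');
}

console.log(ciSevCheckPasses('No results matched your search.')); // true
console.log(ciSevCheckPasses('3 Open  ci: sev  issues'));         // false
```

The real synthetics test expresses the same rule as a body-contains assertion on the issues page.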
+ +- GET +- Expect status 200 and body contains "No results" +- Alerts: @slack-pytorch-infra-alerts +- Cadence: tick_every = 300s + +### Synthetics queue checks (scripts/) + +These API tests detect long GitHub Actions runner queues and alert Slack. + +How it works: + +- Each test calls the HUD endpoint + +- The script expects HTTP 200, parses JSON, and filters by machine_type + pattern +- If any item exceeds a per-vendor queue time threshold, the test fails +- On failure, the script logs a human message which is included in the + Datadog alert and sent to Slack + +Scripts and thresholds: + +- [check-long-queue-lf.js](./scripts/check-long-queue-lf.js) + - Filter: machine_type startsWith 'lf.' + - Threshold: > 10,800s (3h) + +- [check-long-queue-nvidia.js](./scripts/check-long-queue-nvidia.js) + - Filter: machine_type includes '.dgx.' + - Threshold: > 10,800s (3h) + +- [check-long-queue-rocm.js](./scripts/check-long-queue-rocm.js) + - Filter: machine_type includes '.rocm.' + - Threshold: > 14,400s (4h) + +- [check-long-queue-s390x.js](./scripts/check-long-queue-s390x.js) + - Filter: machine_type includes '.s390x' + - Threshold: > 7,200s (2h) + +- [check-long-queue-intel.js](./scripts/check-long-queue-intel.js) + - Filter: machine_type includes '.idc.' 
- Threshold: > 10,800s (3h)
+
+- [check-long-queue-meta-h100.js](./scripts/check-long-queue-meta-h100.js)
+  - Filter: machine_type equals 'linux.aws.h100'
+  - Threshold: > 21,600s (6h)
+
+- [check-long-queue-meta.js](./scripts/check-long-queue-meta.js)
+  - Filter: excludes '.dgx.', '.idc.', '.rocm.', '.s390x', '^lf\\.',
+    '^linux.aws.h100'
+  - Threshold: > 10,800s (3h)
+
+Example failure message (from script stderr):
+
+```text
+High queue detected for machine types containing .s390x: linux.s390x (7300s)
+```
+
+### Datadog monitors (ALI/GitHub API)
+
+Event and metric-based monitors supporting autoscaler and GitHub API health:
+
+- ALI AutoScaler Dead Letter Queue High Number Of Messages
+  - Query: sum(last_5m):max:aws.sqs.number_of_messages_sent{
+    queuename:ghci-lf-queued-builds-dead-letter}.as_count() > 5000
+  - Thresholds: warning 1000; critical 5000
+  - Action: check scale-up logs; alerts to @webhook-lf-incident-io,
+    @slack-PyTorch-pytorch-infra-alerts, @slack-Linux_Foundation-pytorch-alerts
+
+- ALI ValidationException Detected
+  - Type: event-v2 alert on SNS event with title "ALI ValidationException
+    Detected" in last 5 minutes
+  - Critical when count > 0
+  - Action: review scale-up Lambda logs; possibly revert test-infra release
+  - Alerts: @slack-PyTorch-pytorch-infra-alerts,
+    @slack-Linux_Foundation-pytorch-alerts, @webhook-lf-incident-io
+
+- GitHub API usage unusually high
+  - Type: event-v2 alert on SNS event with title "GitHub API usage unusually
+    high" in last 5 minutes
+  - Critical when count > 0
+  - Action: review ALI rate limit metrics and API call counts
+  - Alerts: @slack-PyTorch-pytorch-infra-alerts,
+    @slack-Linux_Foundation-pytorch-alerts, @webhook-lf-incident-io
+
+[Back to top](#table-of-contents)
+
 ## Deployment
 
 ### Automated Deployment via GitHub Actions
 
@@ -168,10 +401,35 @@ All infrastructure changes are deployed automatically through GitHub Actions
 workflows. The deployment process includes:
 
 1. 
**Code Quality Checks**: All commits must pass MegaLinter validation
-2. **Terraform Planning**: Changes are planned and validated before deployment
+2. **Terraform Planning**: Changes are planned and validated before
+   deployment
 3. **Automated Apply**: Approved changes are automatically applied to the
    Datadog organization
+
+### GitHub Actions Workflow
+
+The repository uses GitHub Actions with MegaLinter for continuous
+deployment:
+
+- **On Pull Request**: Runs the MegaLinter suite (includes `tflint`,
+  `tofu fmt`, and security checks)
+- **On Merge to Main**: Automatically applies changes after all checks pass
+- **Manual Triggers**: The infrastructure team can manually trigger
+  deployments when needed
+
+All commits pushed to any branch must pass the complete MegaLinter
+validation suite:
+
+- ✅ **Terraform Formatting** (`tofu fmt`) - Enforces canonical code
+  formatting
+- ✅ **Terraform Linting** (`tflint`) - Best practices and error detection
+- ✅ **Security Scanning** - Infrastructure security checks
+- ✅ **Documentation** - README and code documentation validation
+- ✅ **Configuration Validation** (`terraform plan`) - Syntax and logic
+  validation
+
+Commits that fail MegaLinter checks will be rejected and cannot be merged.
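As a rough illustration, the pull-request gate described above could be wired to the published MegaLinter GitHub Action. This is a hypothetical sketch; the action version, flavor path, and job layout are assumptions, and the repository's actual files under `.github/workflows/` are authoritative:

```yaml
# Hypothetical sketch of the pull-request validation job.
name: validate
on: [pull_request]
jobs:
  megalinter:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Terraform flavor of MegaLinter (tflint, formatting, security checks)
      - uses: oxsecurity/megalinter/flavors/terraform@v8
```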
+ ### Code Quality Requirements Before any deployment, all code must pass **MegaLinter** validation, which @@ -197,29 +455,6 @@ tofu fmt -check tflint ``` -### GitHub Actions Workflow - -The repository uses GitHub Actions with MegaLinter for continuous deployment: - -- **On Pull Request**: Runs MegaLinter suite (includes `tflint`, `tofu fmt`, - security checks) -- **On Merge to Main**: Automatically applies changes after all checks pass -- **Manual Triggers**: Infrastructure team can manually trigger deployments - when needed - -All commits pushed to any branch must pass the complete MegaLinter validation -suite: - -- ✅ **Terraform Formatting** (`tofu fmt`) - Code formatting with - `tofu fmt` -- ✅ **Terraform Linting** (`tflint`) - Best practices and error detection -- ✅ **Security Scanning** - Infrastructure security checks -- ✅ **Documentation** - README and code documentation validation -- ✅ **Configuration Validation** (`terraform plan`) - Syntax and logic - validation - -Commits that fail MegaLinter checks will be rejected and cannot be merged. - ### Manual Deployment Steps (for testing) If you need to test changes locally after MegaLinter validation: @@ -245,6 +480,8 @@ If you need to test changes locally after MegaLinter validation: 4. **Verify deployment:** Check the Datadog UI to confirm users and roles were created correctly. +[Back to top](#table-of-contents) + ## Accessing Datadog Once your user account has been provisioned through this Terraform @@ -269,8 +506,8 @@ When accessing Datadog for the first time: 1. **Ensure your user is provisioned**: Your email must be added to this Terraform configuration and deployed -2. **Use your LFID**: Login with the same email address that was provisioned - in the Terraform config +2. **Use your LFID**: Login with the same email address that was + provisioned in the Terraform config 3. 
**Verify permissions**: Check that you can access the appropriate dashboards and features based on your assigned role @@ -289,60 +526,28 @@ If you cannot access Datadog: ### Role Capabilities -After logging in with SSO, your access will be determined by your assigned role: +After logging in with SSO, your access will be determined by your assigned +role: - **Limited Read Write Role**: Can view all monitoring data and create/edit dashboards, monitors, and incidents -- **Admin Role**: Full administrative access (reserved for infrastructure team) +- **Admin Role**: Full administrative access (reserved for infrastructure + team) - **Read Only Role**: View-only access to monitoring data - **Standard Role**: Basic Datadog access with limited write permissions -## Variables Reference - -### User Variables (`dd_users`) - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `email` | string | Yes | User's email address | -| `roles` | list(string) | No | List of role IDs to assign (defaults to empty) | -| `disabled` | bool | No | Whether account is disabled (defaults to false) | - -### Role Variables (`dd_roles`) - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `name` | string | Yes | Display name for the role | -| `permissions` | list(string) | No | List of permission IDs (defaults empty) | - -## Available Permissions - -Common permissions you can use in custom roles: - -**Read Permissions:** - -- `logs_read_data` - Read log data -- `logs_read_index_data` - Read indexed logs -- `synthetics_read` - View synthetic tests -- `cases_read` - View support cases -- `audit_logs_read` - View audit logs - -**Write Permissions:** - -- `dashboards_write` - Create/edit dashboards -- `monitors_write` - Create/edit monitors -- `synthetics_write` - Create/edit synthetic tests -- `cases_write` - Create/edit support cases -- `notebooks_write` - Create/edit notebooks -- `incident_write` - Create/edit incidents +[Back 
to top](#table-of-contents) ## Security Considerations - **Principle of Least Privilege**: Only assign necessary permissions - **Regular Review**: Periodically audit user access and roles -- **Disabled Accounts**: Use `disabled = true` instead of deleting users when - access is temporarily revoked -- **External Users**: Consider using separate roles for contractors/external - users +- **Disabled Accounts**: Use `disabled = true` instead of deleting users + when access is temporarily revoked +- **External Users**: Consider using separate roles for + contractors/external users + +[Back to top](#table-of-contents) ## Troubleshooting @@ -367,6 +572,8 @@ Common permissions you can use in custom roles: - Review Datadog provider documentation - Contact the PyTorch infrastructure team +[Back to top](#table-of-contents) + ## Contributing ### Development Workflow @@ -378,8 +585,10 @@ Common permissions you can use in custom roles: 5. **Test locally**: Run `terraform plan` to validate your changes 6. **Commit and push**: Push your branch to trigger GitHub Actions checks 7. **Submit a pull request** with a clear description of changes -8. **Address feedback**: Fix any issues identified by reviewers or MegaLinter -9. **Merge after approval**: Once approved and all checks pass, merge to main +8. **Address feedback**: Fix any issues identified by reviewers or + MegaLinter +9. **Merge after approval**: Once approved and all checks pass, merge to + main ### Pre-commit Requirements @@ -414,5 +623,7 @@ The repository uses MegaLinter's Terraform flavor, which includes: - Documentation linters - General code quality tools -For detailed configuration, see `.mega-linter.yml` (if present) or the default -Terraform flavor settings. +For detailed configuration, see `.mega-linter.yml` (if present) or the +default Terraform flavor settings. 
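For orientation, a minimal `.mega-linter.yml` for a setup like this could look as follows. This is a hypothetical fragment; the keys shown (`APPLY_FIXES`, `ENABLE_LINTERS`) come from MegaLinter's documentation, but the linter selection is an assumption, not this repository's actual settings:

```yaml
# Hypothetical example, not this repository's real configuration.
APPLY_FIXES: none    # report issues only; never auto-commit fixes
ENABLE_LINTERS:      # restrict the run to the Terraform toolchain
  - TERRAFORM_TFLINT
  - TERRAFORM_TERRAFORM_FMT
```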
+
+[Back to top](#table-of-contents)
diff --git a/datadog-synthetics_tests.tf b/datadog-synthetics_tests.tf
index 08fde64..53af083 100644
--- a/datadog-synthetics_tests.tf
+++ b/datadog-synthetics_tests.tf
@@ -399,6 +399,35 @@ EOT
 }
 }
+resource "datadog_synthetics_test" "pytorch-gha-runners-queue-check-intel" {
+  type = "api"
+  name = "GHA Runner Queue Check - Intel Runners"
+  message = <
diff --git a/scripts/check-long-queue-intel.js b/scripts/check-long-queue-intel.js
new file mode 100644
--- /dev/null
+++ b/scripts/check-long-queue-intel.js
+dd.expect(dd.response.statusCode).to.equal(200);
+const MACHINE_TYPE_FILTER = '.idc.';
+const jsonData = dd.response.body;
+const parsedData = JSON.parse(jsonData);
+const highQueueItems = parsedData
+  .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800)
+  .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s }));
+
+if (highQueueItems.length > 0) {
+  const machineDetails = highQueueItems
+    .map(item => `${item.machine_type} (${item.avg_queue_s}s)`)
+    .join(', ');
+  const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`;
+  console.error(message);
+}
+
+dd.expect(highQueueItems.length > 0).to.be.false;
diff --git a/scripts/check-long-queue-meta.js b/scripts/check-long-queue-meta.js
index c62c8f5..6ee37a6 100644
--- a/scripts/check-long-queue-meta.js
+++ b/scripts/check-long-queue-meta.js
@@ -1,5 +1,5 @@
 dd.expect(dd.response.statusCode).to.equal(200);
-const EXCLUDED_MACHINE_PATTERNS = ['.dgx.', '.rocm.', '.s390x', '^lf\\.', '^linux.aws.h100'];
+const EXCLUDED_MACHINE_PATTERNS = ['.dgx.', '.idc.', '.rocm.', '.s390x', '^lf\\.', '^linux.aws.h100'];
 const jsonData = dd.response.body;
 const parsedData = JSON.parse(jsonData);
 const highQueueItems = parsedData