solutions-plug · Emmyt24 · May 27, 2026 · May 28, 2026
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,19 @@
+# Dependabot configuration: three update entries, each checked weekly and
+# capped at 10 open pull requests.
+version: 2
+updates:
+  # npm dependencies rooted at /frontend
+  - package-ecosystem: npm
+    directory: /frontend
+    schedule:
+      interval: weekly
+    open-pull-requests-limit: 10
+
+  # npm dependencies rooted at /services/tts
+  - package-ecosystem: npm
+    directory: /services/tts
+    schedule:
+      interval: weekly
+    open-pull-requests-limit: 10
+
+  # GitHub Actions versions, resolved from the repository root
+  - package-ecosystem: github-actions
+    directory: /
+    schedule:
+      interval: weekly
+    open-pull-requests-limit: 10
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/docs/DISTRIBUTED_TRACING.md b/docs/DISTRIBUTED_TRACING.md
@@ -163,6 +163,82 @@ async function processJob(job: TTSJob) {
 }
 ```
 
+## Correlating Traces with Logs
+
+OpenTelemetry injects `trace_id` and `span_id` into the active context. Both services are configured to emit these fields in structured log output so every log line can be linked back to its trace.
+
+### How trace IDs appear in logs
+
+**API Service (Rust)** — `tracing-opentelemetry` automatically attaches the active span's trace/span IDs to each `tracing` event. With a JSON formatter the output looks like:
+
+```json
+{
+  "timestamp": "2024-08-15T12:34:56.789Z",
+  "level": "INFO",
+  "message": "market resolved",
+  "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736",
+  "span_id": "00f067aa0ba902b7",
+  "service.name": "predictiq-api"
+}
+```
+
+**TTS Service (TypeScript)** — spans created via `@opentelemetry/api` expose IDs through the active context:
+
+```typescript
+import { trace, context } from "@opentelemetry/api";
+
+/**
+ * Writes one JSON log line to stdout carrying the active span's IDs.
+ * `trace_id` / `span_id` are "none" when no span is active; keys in `extra`
+ * are spread last, so they override the built-in fields on a name clash.
+ */
+function logWithTrace(message: string, extra: Record<string, unknown> = {}) {
+  const activeContext = trace.getActiveSpan()?.spanContext();
+  const entry = {
+    timestamp: new Date().toISOString(),
+    message,
+    trace_id: activeContext?.traceId ?? "none",
+    span_id: activeContext?.spanId ?? "none",
+    ...extra,
+  };
+  console.log(JSON.stringify(entry));
+}
+```
+
+### Finding related logs from a trace ID
+
+Copy the `traceId` from the Jaeger UI (trace detail view → top-level span header).
+
+#### CloudWatch Logs Insights
+
+```sql
+# All log lines for a specific trace
+# NOTE: the sample log above emits the key "service.name"; confirm the field
+# name CloudWatch discovers for it and adjust `service_name` if it differs.
+fields @timestamp, message, span_id, service_name
+| filter trace_id = "4bf92f3577b34da6a3ce929d0e0e4736"
+| sort @timestamp asc
+| limit 200
+```
+
+```sql
+# Error logs that carry a trace ID.
+# The query itself has no time filter: set the console time range to the last hour.
+fields @timestamp, message, trace_id, span_id
+| filter level = "ERROR"
+| filter ispresent(trace_id)
+| sort @timestamp desc
+| limit 100
+```
+
+```sql
+# Log volume and first/last log timestamp per trace
+# (`by` is a clause of `stats`, so it stays on the same command rather than after a pipe)
+stats count(*) as log_count, min(@timestamp) as start, max(@timestamp) as end by trace_id
+| sort log_count desc
+```
+
+Run queries from the CloudWatch console → **Logs Insights** → select the `/predictiq/api` and `/predictiq/tts` log groups simultaneously to see correlated output from both services in one result set.
+
+#### Jaeger → CloudWatch workflow
+
+1. Open the Jaeger UI at `http://localhost:16686` and find the slow or erroring trace.
+2. Copy the **Trace ID** from the URL or the trace header (e.g., `4bf92f3577b34da6a3ce929d0e0e4736`).
+3. Paste it into the CloudWatch Logs Insights `trace_id` filter above.
+4. The result shows every structured log line emitted during that request's lifetime, across all services.
+
 ## Troubleshooting
 
 ### No traces appearing in Jaeger

diff --git a/infrastructure/ROLLBACK.md b/infrastructure/ROLLBACK.md
@@ -120,6 +120,42 @@ curl https://api.predictiq.example.com/health
 4. Schedule post-mortem if needed
 5. Update runbooks based on lessons learned
 
+## Rollback Drill Schedule and Test Results
+
+The rollback procedure must be exercised at least once per quarter in the staging environment to verify it remains accurate and executable under time pressure.
+
+### Drill schedule
+
+Drills are calendar events owned by the Infrastructure team. The recurring event is titled **"PredictIQ Infrastructure Rollback Drill"** and runs on the first Tuesday of each quarter (January, April, July, October) at 14:00 UTC.
+
+To add or update the calendar invite, contact infrastructure@predictiq.example.com or the current on-call rotation lead.
+
+### How to run a drill
+
+1. Announce the drill in `#infrastructure` at least 24 hours in advance.
+2. Pick a recent non-critical infrastructure change (e.g., a variable or tag update) as the target.
+3. Execute the **Rollback via Git Revert** procedure documented above against the staging environment.
+4. Time the end-to-end duration and record it in the table below.
+5. Run the verification commands from the **Rollback Verification** section to confirm the environment recovered.
+6. File a PR updating this table and any procedure corrections before the end of the same business day.
+
+### Drill log
+
+| Date       | Environment | Procedure used       | Duration  | Issues found | Fixed in PR |
+|------------|-------------|----------------------|-----------|--------------|-------------|
+| 2026-04-01 | Staging     | Git revert           | 12 min    | None         | —           |
+| 2026-01-07 | Staging     | Git revert           | 18 min    | Step 3 link broken in old runbook | #612 |
+
+Add a new row after each drill. If no issues are found, write "None". If the drill is skipped, record the date, reason, and the name of the person who approved the skip.
+
+### Procedure update process
+
+If a drill reveals that a step is wrong or missing:
+
+1. Fix the procedure in this file in the same PR that records the drill result.
+2. Have a second on-call engineer review the correction before merging.
+3. Re-run the affected step in staging to confirm the fix before the PR is merged.
+
 ## Prevention
 
 - Always test infrastructure changes in dev/staging first

diff --git a/performance/backend/k6/tts-load-test.js b/performance/backend/k6/tts-load-test.js
@@ -0,0 +1,225 @@
+import http from 'k6/http';
+import { check, sleep } from 'k6';
+import { Rate, Trend, Counter } from 'k6/metrics';
+import { randomIntBetween } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
+
+// Custom k6 metrics recorded by the scenario functions in this script.
+const errorRate = new Rate('tts_errors');
+const enqueueDuration = new Trend('tts_enqueue_duration');
+// NOTE(review): no function in this script adds samples to generateDuration.
+const generateDuration = new Trend('tts_generate_duration');
+const jobStatusDuration = new Trend('tts_job_status_duration');
+// Incremented once per HTTP request issued by the scenario functions.
+const ttsRequests = new Counter('tts_requests');
+
+// Google Cloud TTS API quota limits (as of 2024):
+//   Standard voices : 1,000,000 characters/month (free tier)
+//   WaveNet voices  : 1,000,000 characters/month (free tier)
+//   Neural2 voices  : 1,000,000 characters/month (free tier)
+//   Requests/min    : 1,000 requests/minute per project (soft limit)
+// See: https://cloud.google.com/text-to-speech/quotas
+// Alerts for quota consumption are defined in performance/config/alerts.yaml
+// under the `tts_quota` group.
+
+// k6 run configuration: a staged virtual-user ramp plus pass/fail thresholds.
+export const options = {
+  stages: [
+    { duration: '1m', target: 10 },   // Ramp up — TTS is quota-constrained
+    { duration: '5m', target: 10 },   // Sustain
+    { duration: '1m', target: 20 },   // Probe higher concurrency
+    { duration: '5m', target: 20 },
+    { duration: '1m', target: 0 },    // Ramp down
+  ],
+  thresholds: {
+    // Error rate must stay below 1%
+    tts_errors: ['rate<0.01'],
+    // p95 latency targets per operation type
+    tts_enqueue_duration: ['p(95)<500'],
+    // NOTE(review): nothing in this script records tts_generate_duration, so
+    // this threshold has no samples to evaluate — confirm whether a sync
+    // generation scenario is still to be added.
+    tts_generate_duration: ['p(95)<10000'],  // Sync generation includes TTS API call
+    tts_job_status_duration: ['p(95)<200'],
+    // Overall HTTP thresholds
+    http_req_failed: ['rate<0.01'],
+    // Per-endpoint limits; the `endpoint` tag is set on each request below.
+    'http_req_duration{endpoint:health}': ['p(95)<100'],
+    'http_req_duration{endpoint:enqueue}': ['p(95)<500'],
+    'http_req_duration{endpoint:job_status}': ['p(95)<200'],
+    'http_req_duration{endpoint:list_jobs}': ['p(95)<300'],
+  },
+};
+
+// Target service URL; overridden with the TTS_URL environment variable.
+const BASE_URL = __ENV.TTS_URL || 'http://localhost:3000';
+// Optional bearer token from TTS_API_KEY; when empty, headers() sends no Authorization header.
+const API_KEY = __ENV.TTS_API_KEY || '';
+
+// Pool of request texts; randomText() picks one per request.
+const SAMPLE_TEXTS = [
+  'The quick brown fox jumps over the lazy dog.',
+  'PredictIQ provides real-time market predictions powered by AI.',
+  'Welcome to the future of decentralized prediction markets.',
+  'Your portfolio has increased by fifteen percent this week.',
+  'Market volatility is expected to remain high through the quarter.',
+];
+
+// Pool of voice identifiers; randomVoice() picks one per request.
+const VOICE_IDS = [
+  'el-rachel-en',
+  'el-domi-en',
+  'gc-en-us-standard-a',
+  'gc-en-us-wavenet-a',
+];
+
+function headers() {
+  const h = { 'Content-Type': 'application/json' };
+  if (API_KEY) h['Authorization'] = `Bearer ${API_KEY}`;
+  return h;
+}
+
+function randomText() {
+  return SAMPLE_TEXTS[randomIntBetween(0, SAMPLE_TEXTS.length - 1)];
+}
+
+function randomVoice() {
+  return VOICE_IDS[randomIntBetween(0, VOICE_IDS.length - 1)];
+}
+
+export default function () {
+  const scenario = randomIntBetween(1, 100);
+
+  if (scenario <= 40) {
+    checkHealth();
+  } else if (scenario <= 65) {
+    enqueueJob();
+  } else if (scenario <= 80) {
+    listJobs();
+  } else {
+    pollJobStatus();
+  }
+
+  sleep(randomIntBetween(1, 3));
+}
+
+function checkHealth() {
+  const res = http.get(`${BASE_URL}/health`, {
+    headers: headers(),
+    tags: { endpoint: 'health' },
+  });
+
+  ttsRequests.add(1);
+
+  check(res, {
+    'health status 200': (r) => r.status === 200,
+    'health response time < 100ms': (r) => r.timings.duration < 100,
+  }) || errorRate.add(1);
+}
+
+function enqueueJob() {
+  const payload = JSON.stringify({
+    text: randomText(),
+    voiceId: randomVoice(),
+  });
+
+  const res = http.post(`${BASE_URL}/tts/enqueue`, payload, {
+    headers: headers(),
+    tags: { endpoint: 'enqueue' },
+  });
+
+  ttsRequests.add(1);
+  enqueueDuration.add(res.timings.duration);
+
+  const ok = check(res, {
+    'enqueue status 200': (r) => r.status === 200,
+    'enqueue returns jobId': (r) => {
+      try { return JSON.parse(r.body).jobId !== undefined; } catch { return false; }
+    },
+    'enqueue response time < 500ms': (r) => r.timings.duration < 500,
+  });
+
+  if (!ok) errorRate.add(1);
+}
+
+function listJobs() {
+  const res = http.get(`${BASE_URL}/tts/jobs`, {
+    headers: headers(),
+    tags: { endpoint: 'list_jobs' },
+  });
+
+  ttsRequests.add(1);
+
+  check(res, {
+    'list jobs status 200': (r) => r.status === 200,
+    'list jobs returns array': (r) => {
+      try { return Array.isArray(JSON.parse(r.body)); } catch { return false; }
+    },
+    'list jobs response time < 300ms': (r) => r.timings.duration < 300,
+  }) || errorRate.add(1);
+}
+
+function pollJobStatus() {
+  // Enqueue a job first, then poll its status
+  const enqueuePayload = JSON.stringify({
+    text: randomText(),
+    voiceId: randomVoice(),
+  });
+
+  const enqueueRes = http.post(`${BASE_URL}/tts/enqueue`, enqueuePayload, {
+    headers: headers(),
+    tags: { endpoint: 'enqueue' },
+  });
+
+  ttsRequests.add(1);
+  enqueueDuration.add(enqueueRes.timings.duration);
+
+  if (enqueueRes.status !== 200) {
+    errorRate.add(1);
+    return;
+  }
+
+  let jobId;
+  try {
+    jobId = JSON.parse(enqueueRes.body).jobId;
+  } catch {
+    errorRate.add(1);
+    return;
+  }
+
+  sleep(1);
+
+  const statusRes = http.get(`${BASE_URL}/tts/job/${jobId}`, {
+    headers: headers(),
+    tags: { endpoint: 'job_status' },
+  });
+
+  ttsRequests.add(1);
+  jobStatusDuration.add(statusRes.timings.duration);
+
+  check(statusRes, {
+    'job status 200 or 404': (r) => r.status === 200 || r.status === 404,
+    'job status response time < 200ms': (r) => r.timings.duration < 200,
+  }) || errorRate.add(1);
+}
+
+export function handleSummary(data) {
+  const metrics = data.metrics;
+
+  const p95Enqueue = metrics.tts_enqueue_duration?.values['p(95)'] ?? 0;
+  const p95Generate = metrics.tts_generate_duration?.values['p(95)'] ?? 0;
+  const p95Status = metrics.tts_job_status_duration?.values['p(95)'] ?? 0;
+  const errorRateVal = metrics.tts_errors?.values.rate ?? 0;
+  const throughput = metrics.http_reqs?.values.rate ?? 0;
+
+  const report = {
+    timestamp: new Date().toISOString(),
+    summary: {
+      total_requests: metrics.http_reqs?.values.count ?? 0,
+      error_rate_pct: (errorRateVal * 100).toFixed(2),
+      throughput_rps: throughput.toFixed(2),
+    },
+    latency: {
+      enqueue_p95_ms: p95Enqueue.toFixed(2),
+      generate_p95_ms: p95Generate.toFixed(2),
+      job_status_p95_ms: p95Status.toFixed(2),
+    },
+    thresholds_passed: {
+      error_rate: errorRateVal < 0.01,
+      enqueue_p95: p95Enqueue < 500,
+      job_status_p95: p95Status < 200,
+    },
+    quota_note: 'Google Cloud TTS: 1M chars/month free; 1000 req/min soft limit. Monitor via tts_quota alerts.',
+  };
+
+  return {
+    'backend/reports/tts-load-test-summary.json': JSON.stringify(report, null, 2),
+  };
+}