From 62e53e0ff087485505d779e0453870e50401a596 Mon Sep 17 00:00:00 2001
From: stackhpc-ci <22933334+stackhpc-ci@users.noreply.github.com>
Date: Tue, 14 Oct 2025 11:42:20 +0100
Subject: [PATCH] ci: Add better/longer retries to AIO workflow TF

---
 .github/workflows/stackhpc-all-in-one.yml | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml
index 650bea184e..8cdd2e6381 100644
--- a/.github/workflows/stackhpc-all-in-one.yml
+++ b/.github/workflows/stackhpc-all-in-one.yml
@@ -190,7 +190,10 @@ jobs:
       - name: Terraform Apply
         id: tf_apply
         run: |
-          for attempt in $(seq 5); do
+          # Try up to 6 times to create the infrastructure, destroying and retrying if it fails.
+          # After the 3rd failure wait 2 hours (the cloud is likely at capacity, so let other jobs
+          # finish); otherwise wait 60-180s. No wait after the last attempt (keep 6 in sync below).
+          for attempt in $(seq 6); do
               if terraform apply -auto-approve; then
                   echo "Created infrastructure on attempt $attempt"
                   exit 0
@@ -198,7 +201,12 @@ jobs:
               echo "Failed to create infrastructure on attempt $attempt"
               sleep 10
               terraform destroy -auto-approve
-              sleep 60
+              if [ "$attempt" -eq 3 ]; then
+                  echo "Sleeping for 2 hours after 3 failed attempts..."
+                  sleep 7200
+              elif [ "$attempt" -lt 6 ]; then
+                  sleep "$(shuf -i 60-180 -n 1)"
+              fi
           done
           echo "Failed to create infrastructure after $attempt attempts"
           exit 1