diff --git a/tests/e2e/lib.sh b/tests/e2e/lib.sh
index 994f0b3a17612..ff3d988501a27 100755
--- a/tests/e2e/lib.sh
+++ b/tests/e2e/lib.sh
@@ -439,6 +439,52 @@ setup_automation_flavor_e2e_cluster() {
     fi
 }
 
+# Wait for the central-db deployment in the stackrox namespace to report one
+# ready replica, polling every 5 seconds. Exits the script with status 1 (after
+# printing pod and deployment state) if that does not happen in time.
+# Arguments: $1 - optional timeout in seconds (default: 300)
+wait_for_central_db() {
+    info "Waiting for Central DB to start"
+
+    local max_seconds="${1:-300}"
+    local start_time curr_time elapsed_seconds
+    local central_db_json replicas ready_replicas
+    start_time="$(date '+%s')"
+
+    while true; do
+        # A failed query is treated as "not ready yet" so that a transient
+        # kubectl error is retried until the timeout instead of aborting.
+        if ! central_db_json="$(kubectl -n stackrox get deploy/central-db -o json)"; then
+            central_db_json='{}'
+        fi
+        replicas="$(jq '.status.replicas' <<<"$central_db_json")"
+        ready_replicas="$(jq '.status.readyReplicas' <<<"$central_db_json")"
+        curr_time="$(date '+%s')"
+        elapsed_seconds=$(( curr_time - start_time ))
+
+        # Ready case
+        if [[ "$replicas" == 1 && "$ready_replicas" == 1 ]]; then
+            # NOTE(review): presumably a settle delay after first ready - confirm.
+            sleep 30
+            break
+        fi
+
+        # Timeout case
+        if (( elapsed_seconds > max_seconds )); then
+            kubectl -n stackrox get pod -o wide
+            kubectl -n stackrox get deploy -o wide
+            echo >&2 "wait_for_central_db() timeout after $max_seconds seconds."
+            exit 1
+        fi
+
+        # Otherwise report and retry
+        echo "waiting ($elapsed_seconds/$max_seconds)"
+        sleep 5
+    done
+
+    info "Central DB deployment is ready."
+}
+
 if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
     if [[ "$#" -lt 1 ]]; then
         usage
diff --git a/tests/upgrade/postgres_run.sh b/tests/upgrade/postgres_run.sh
index 14f4adbd81edb..499af94eaebca 100755
--- a/tests/upgrade/postgres_run.sh
+++ b/tests/upgrade/postgres_run.sh
@@ -153,8 +153,9 @@ test_upgrade_paths() {
     verifyNoPostgresAccessScopes
 
     # Now go back up to Postgres
+    CURRENT_TAG="$(make --quiet tag)"
     kubectl -n stackrox set env deploy/central ROX_POSTGRES_DATASTORE=true
-    kubectl -n stackrox set image deploy/central "central=$REGISTRY/main:$(make --quiet tag)"
+    kubectl -n stackrox set image deploy/central "central=$REGISTRY/main:$CURRENT_TAG"
     wait_for_api
     wait_for_scanner_to_be_ready
 
@@ -182,12 +183,19 @@ test_upgrade_paths() {
     wait_for_api
     kubectl -n stackrox delete po "$(kubectl -n stackrox get po -l app=central-db -o=jsonpath='{.items[0].metadata.name}')" --grace-period=0
     wait_for_api
+    wait_for_central_db
 
     checkForRocksAccessScopes
     checkForPostgresAccessScopes
 
-    validate_upgrade "01-bounce-db-after-upgrade" "bounce central db after postgres upgrade" "268c98c6-e983-4f4e-95d2-9793cebddfd7"
-    collect_and_check_stackrox_logs "$log_output_dir" "01_post_bounce-db"
+    validate_upgrade "02-bounce-db-after-upgrade" "bounce central db after postgres upgrade" "268c98c6-e983-4f4e-95d2-9793cebddfd7"
+
+    # Since we bounced the DB we may see some errors. Those need to be allowed in the case of this test ONLY.
+    echo "# postgres was bounced, may see some connection errors" >> scripts/ci/logcheck/allowlist-patterns
+    echo "FATAL: terminating connection due to administrator command \(SQLSTATE 57P01\)" >> scripts/ci/logcheck/allowlist-patterns
+    echo >> scripts/ci/logcheck/allowlist-patterns
+
+    collect_and_check_stackrox_logs "$log_output_dir" "02_post_bounce-db"
 
     info "Fetching a sensor bundle for cluster 'remote'"
     rm -rf sensor-remote
@@ -196,10 +204,10 @@ test_upgrade_paths() {
 
     info "Installing sensor"
     ./sensor-remote/sensor.sh
-    kubectl -n stackrox set image deploy/sensor "*=$REGISTRY/main:$(make --quiet tag)"
-    kubectl -n stackrox set image deploy/admission-control "*=$REGISTRY/main:$(make --quiet tag)"
-    kubectl -n stackrox set image ds/collector "collector=$REGISTRY/collector:$(cat COLLECTOR_VERSION)" \
-        "compliance=$REGISTRY/main:$(make --quiet tag)"
+    kubectl -n stackrox set image deploy/sensor "*=$REGISTRY/main:$CURRENT_TAG"
+    kubectl -n stackrox set image deploy/admission-control "*=$REGISTRY/main:$CURRENT_TAG"
+    kubectl -n stackrox set image ds/collector "collector=$REGISTRY/collector:$CURRENT_TAG" \
+        "compliance=$REGISTRY/main:$CURRENT_TAG"
 
     sensor_wait
 