diff --git a/src/runpod_flash/cli/commands/deploy.py b/src/runpod_flash/cli/commands/deploy.py index 18417065..da6b6ead 100644 --- a/src/runpod_flash/cli/commands/deploy.py +++ b/src/runpod_flash/cli/commands/deploy.py @@ -88,9 +88,19 @@ def deploy_command( console.print(f"\n[red]Error:[/red] {e}") raise typer.Exit(1) except Exception as e: - console.print(f"\n[red]Deploy failed:[/red] {e}") - logger.exception("Deploy failed") + _handle_deploy_error(e) + + +def _handle_deploy_error(exc: Exception) -> None: + """Handle deploy errors, providing targeted guidance for SSL issues.""" + from requests.exceptions import SSLError + + if isinstance(exc, SSLError): + console.print(f"\n[red]SSL error:[/red] {exc}") raise typer.Exit(1) + console.print(f"\n[red]Deploy failed:[/red] {exc}") + logger.exception("Deploy failed") + raise typer.Exit(1) def _print_curl_example(url: str, method: str = "POST") -> None: diff --git a/src/runpod_flash/core/resources/app.py b/src/runpod_flash/core/resources/app.py index 2d065761..d8eba4e0 100644 --- a/src/runpod_flash/core/resources/app.py +++ b/src/runpod_flash/core/resources/app.py @@ -12,6 +12,10 @@ MAX_TARBALL_SIZE_MB, VALID_TARBALL_EXTENSIONS, GZIP_MAGIC_BYTES, + UPLOAD_TIMEOUT_SECONDS, + UPLOAD_MAX_RETRIES, + UPLOAD_BACKOFF_BASE_SECONDS, + UPLOAD_BACKOFF_MAX_SECONDS, ) if TYPE_CHECKING: @@ -112,6 +116,98 @@ def _validate_tarball_file(tar_path: Path) -> None: ) +def _is_cert_verification_error(exc: Exception) -> bool: + """Check if an SSL error is a certificate verification failure. + + Cert verification errors are not retryable because they indicate a + system configuration problem, not a transient network issue. + """ + msg = str(exc).lower() + return "certificate_verify_failed" in msg or "certificate verify failed" in msg + + +def _upload_tarball(tar_path: Path, url: str, tarball_size: int) -> None: + """Upload a tarball to a presigned URL with retry on transient errors. + + Retries on SSL record errors, connection resets, and timeouts. + Does not retry on certificate verification failures (those indicate + a system configuration problem). + + Args: + tar_path: path to the tarball file + url: presigned upload URL (auth is in query params) + tarball_size: file size in bytes, sent as Content-Length + + Raises: + requests.SSLError: on cert verification failure (with guidance) + requests.ConnectionError: after exhausting retries + requests.Timeout: after exhausting retries + requests.HTTPError: on non-retryable HTTP status + """ + import time + + import requests as _requests + from requests.exceptions import ConnectionError, SSLError, Timeout + + from runpod_flash.core.utils.backoff import get_backoff_delay + from runpod_flash.core.utils.user_agent import get_user_agent + + # presigned URLs already carry auth in query params, so an + # Authorization header causes R2/S3 to reject the request. + upload_headers = { + "User-Agent": get_user_agent(), + "Content-Type": TARBALL_CONTENT_TYPE, + "Content-Length": str(tarball_size), + } + + last_exc: Optional[Exception] = None + + for attempt in range(UPLOAD_MAX_RETRIES): + try: + with tar_path.open("rb") as fh: + resp = _requests.put( + url, + data=fh, + headers=upload_headers, + timeout=UPLOAD_TIMEOUT_SECONDS, + ) + resp.raise_for_status() + resp.close() + return + except SSLError as exc: + if _is_cert_verification_error(exc): + raise SSLError( + "SSL certificate verification failed. This usually means " + "Python cannot find your system's CA certificates.\n\n" + "Try one of:\n" + " export REQUESTS_CA_BUNDLE=" + '$(python -c "import certifi; print(certifi.where())")\n' + " pip install certifi\n" + " # macOS: run 'Install Certificates.command' from your " + "Python installation folder\n" + ) from exc + last_exc = exc + except (ConnectionError, Timeout) as exc: + last_exc = exc + + if attempt < UPLOAD_MAX_RETRIES - 1: + delay = get_backoff_delay( + attempt, + base=UPLOAD_BACKOFF_BASE_SECONDS, + max_seconds=UPLOAD_BACKOFF_MAX_SECONDS, + ) + log.warning( + "Upload attempt %d/%d failed: %s. Retrying in %.1fs...", + attempt + 1, + UPLOAD_MAX_RETRIES, + last_exc, + delay, + ) + time.sleep(delay) + + raise last_exc # type: ignore[misc] + + class FlashApp: """Flash app resource for managing applications, environments, and builds. @@ -427,6 +523,8 @@ async def upload_build(self, tar_path: Union[str, Path]) -> Dict[str, Any]: Manifest is read from .flash/flash_manifest.json during deployment, not extracted from tarball. + Retries on transient SSL/connection errors with exponential backoff. + Args: tar_path: Path to the tarball file (string or Path object) Must be .tar.gz or .tgz, under 1500MB @@ -438,13 +536,8 @@ async def upload_build(self, tar_path: Union[str, Path]) -> Dict[str, Any]: RuntimeError: If app is not hydrated (no ID available) FileNotFoundError: If tar_path does not exist ValueError: If file is invalid (extension, magic bytes, or size) - requests.HTTPError: If upload fails - - TODO: Add integration tests for tarball upload flow including: - - Network failures and retry behavior - - Large file uploads (edge cases near size limit) - - Corrupted tarball handling - - Pre-signed URL expiration scenarios + requests.HTTPError: If upload fails after all retries + requests.SSLError: If SSL certificate verification fails (non-retryable) """ # Convert to Path and validate before hydrating if isinstance(tar_path, str): @@ -470,24 +563,8 @@ async def upload_build(self, tar_path: Union[str, Path]) -> Dict[str, Any]: url = result["uploadUrl"] object_key = result["objectKey"] - # presigned URLs already carry auth in query params, so an - # Authorization header causes R2/S3 to reject the request. - import requests as _requests - - from runpod_flash.core.utils.user_agent import get_user_agent - - upload_headers = { - "User-Agent": get_user_agent(), - "Content-Type": TARBALL_CONTENT_TYPE, - } + _upload_tarball(tar_path, url, tarball_size) - with tar_path.open("rb") as fh: - resp = _requests.put(url, data=fh, headers=upload_headers) - - try: - resp.raise_for_status() - finally: - resp.close() resp = await self._finalize_upload_build(object_key, manifest) return resp diff --git a/src/runpod_flash/core/resources/constants.py b/src/runpod_flash/core/resources/constants.py index e02fcee1..7d826534 100644 --- a/src/runpod_flash/core/resources/constants.py +++ b/src/runpod_flash/core/resources/constants.py @@ -152,5 +152,11 @@ def get_image_name( VALID_TARBALL_EXTENSIONS = (".tar.gz", ".tgz") # Valid tarball file extensions GZIP_MAGIC_BYTES = (0x1F, 0x8B) # Magic bytes for gzip files +# tarball upload retry/timeout settings +UPLOAD_TIMEOUT_SECONDS = 600 # 10 minutes per attempt +UPLOAD_MAX_RETRIES = 3 +UPLOAD_BACKOFF_BASE_SECONDS = 2.0 +UPLOAD_BACKOFF_MAX_SECONDS = 30.0 + # Load balancer stub timeout (seconds) DEFAULT_LB_STUB_TIMEOUT = 60.0 diff --git a/tests/unit/core/resources/test_app.py b/tests/unit/core/resources/test_app.py index 09700283..a30e175b 100644 --- a/tests/unit/core/resources/test_app.py +++ b/tests/unit/core/resources/test_app.py @@ -5,10 +5,14 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest +from requests.exceptions import ConnectionError as RequestsConnectionError +from requests.exceptions import SSLError, Timeout from runpod_flash.core.resources.app import ( FlashApp, FlashEnvironmentNotFoundError, + _is_cert_verification_error, + _upload_tarball, _validate_exclusive_params, _validate_tarball_file, ) @@ -482,3 +486,166 @@ async def test_create_environment(self): result = await app.create_environment("production") assert result["id"] == "env-new" + + +class TestIsCertVerificationError: + """Test _is_cert_verification_error classifier.""" + + def test_detects_cert_verify_failed(self): + exc = SSLError( + "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: " + "unable to get local issuer certificate" + ) + assert _is_cert_verification_error(exc) is True + + def test_ignores_bad_record_mac(self): + exc = SSLError("[SSL: SSLV3_ALERT_BAD_RECORD_MAC] ssl/tls alert bad record mac") + assert _is_cert_verification_error(exc) is False + + def test_ignores_generic_ssl_error(self): + exc = SSLError("connection reset by peer") + assert _is_cert_verification_error(exc) is False + + +class TestUploadTarball: + """Test _upload_tarball retry and error handling.""" + + def _make_tarball(self, tmp_path: Path) -> Path: + tar_path = tmp_path / "build.tar.gz" + with gzip.open(tar_path, "wb") as f: + f.write(b"tarball content") + return tar_path + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_success_on_first_attempt(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.close = MagicMock() + + with patch("requests.put", return_value=mock_resp) as mock_put: + _upload_tarball(tar_path, "https://example.com/upload", 100) + + mock_put.assert_called_once() + mock_resp.raise_for_status.assert_called_once() + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_retries_on_ssl_error_then_succeeds(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.close = MagicMock() + + ssl_exc = SSLError("[SSL: SSLV3_ALERT_BAD_RECORD_MAC] bad record mac") + + with ( + patch("requests.put", side_effect=[ssl_exc, mock_resp]) as mock_put, + patch("time.sleep") as mock_sleep, + ): + _upload_tarball(tar_path, "https://example.com/upload", 100) + + assert mock_put.call_count == 2 + mock_sleep.assert_called_once() + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_retries_on_connection_error(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.close = MagicMock() + + conn_exc = RequestsConnectionError("Connection reset by peer") + + with ( + patch("requests.put", side_effect=[conn_exc, mock_resp]) as mock_put, + patch("time.sleep"), + ): + _upload_tarball(tar_path, "https://example.com/upload", 100) + + assert mock_put.call_count == 2 + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_retries_on_timeout(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.close = MagicMock() + + timeout_exc = Timeout("Read timed out") + + with ( + patch("requests.put", side_effect=[timeout_exc, mock_resp]) as mock_put, + patch("time.sleep"), + ): + _upload_tarball(tar_path, "https://example.com/upload", 100) + + assert mock_put.call_count == 2 + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 2) + def test_raises_after_exhausting_retries(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + ssl_exc = SSLError("[SSL: SSLV3_ALERT_BAD_RECORD_MAC] bad record mac") + + with ( + patch("requests.put", side_effect=ssl_exc) as mock_put, + patch("time.sleep"), + pytest.raises(SSLError, match="bad record mac"), + ): + _upload_tarball(tar_path, "https://example.com/upload", 100) + + assert mock_put.call_count == 2 + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_cert_verification_error_not_retried(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + cert_exc = SSLError( + "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed" + ) + + with ( + patch("requests.put", side_effect=cert_exc) as mock_put, + pytest.raises(SSLError, match="CA certificates"), + ): + _upload_tarball(tar_path, "https://example.com/upload", 100) + + # no retry, fails immediately + mock_put.assert_called_once() + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_sends_content_length_header(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.close = MagicMock() + + with patch("requests.put", return_value=mock_resp) as mock_put: + _upload_tarball(tar_path, "https://example.com/upload", 12345) + + _, kwargs = mock_put.call_args + assert kwargs["headers"]["Content-Length"] == "12345" + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_sets_timeout(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.close = MagicMock() + + with patch("requests.put", return_value=mock_resp) as mock_put: + _upload_tarball(tar_path, "https://example.com/upload", 100) + + _, kwargs = mock_put.call_args + assert kwargs["timeout"] == 600 + + @patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3) + def test_no_authorization_header(self, tmp_path): + tar_path = self._make_tarball(tmp_path) + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.close = MagicMock() + + with patch("requests.put", return_value=mock_resp) as mock_put: + _upload_tarball(tar_path, "https://example.com/upload", 100) + + _, kwargs = mock_put.call_args + assert "Authorization" not in kwargs["headers"]