Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/runpod_flash/cli/commands/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,19 @@ def deploy_command(
console.print(f"\n[red]Error:[/red] {e}")
raise typer.Exit(1)
except Exception as e:
console.print(f"\n[red]Deploy failed:[/red] {e}")
logger.exception("Deploy failed")
_handle_deploy_error(e)


def _handle_deploy_error(exc: Exception) -> None:
"""Handle deploy errors, providing targeted guidance for SSL issues."""
from requests.exceptions import SSLError

if isinstance(exc, SSLError):
console.print(f"\n[red]SSL error:[/red] {exc}")
raise typer.Exit(1)
console.print(f"\n[red]Deploy failed:[/red] {exc}")
logger.exception("Deploy failed")
raise typer.Exit(1)


def _print_curl_example(url: str, method: str = "POST") -> None:
Expand Down
125 changes: 101 additions & 24 deletions src/runpod_flash/core/resources/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
MAX_TARBALL_SIZE_MB,
VALID_TARBALL_EXTENSIONS,
GZIP_MAGIC_BYTES,
UPLOAD_TIMEOUT_SECONDS,
UPLOAD_MAX_RETRIES,
UPLOAD_BACKOFF_BASE_SECONDS,
UPLOAD_BACKOFF_MAX_SECONDS,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -112,6 +116,98 @@ def _validate_tarball_file(tar_path: Path) -> None:
)


def _is_cert_verification_error(exc: Exception) -> bool:
"""Check if an SSL error is a certificate verification failure.

Cert verification errors are not retryable because they indicate a
system configuration problem, not a transient network issue.
"""
msg = str(exc).lower()
return "certificate_verify_failed" in msg or "certificate verify failed" in msg


def _upload_tarball(tar_path: Path, url: str, tarball_size: int) -> None:
"""Upload a tarball to a presigned URL with retry on transient errors.

Retries on SSL record errors, connection resets, and timeouts.
Does not retry on certificate verification failures (those indicate
a system configuration problem).

Args:
tar_path: path to the tarball file
url: presigned upload URL (auth is in query params)
tarball_size: file size in bytes, sent as Content-Length

Raises:
requests.SSLError: on cert verification failure (with guidance)
requests.ConnectionError: after exhausting retries
requests.Timeout: after exhausting retries
requests.HTTPError: on non-retryable HTTP status
"""
import time

import requests as _requests
from requests.exceptions import ConnectionError, SSLError, Timeout

from runpod_flash.core.utils.backoff import get_backoff_delay
from runpod_flash.core.utils.user_agent import get_user_agent

# presigned URLs already carry auth in query params, so an
# Authorization header causes R2/S3 to reject the request.
upload_headers = {
"User-Agent": get_user_agent(),
"Content-Type": TARBALL_CONTENT_TYPE,
"Content-Length": str(tarball_size),
}

last_exc: Optional[Exception] = None

for attempt in range(UPLOAD_MAX_RETRIES):
try:
with tar_path.open("rb") as fh:
resp = _requests.put(
url,
data=fh,
headers=upload_headers,
timeout=UPLOAD_TIMEOUT_SECONDS,
)
resp.raise_for_status()
resp.close()
return
except SSLError as exc:
if _is_cert_verification_error(exc):
raise SSLError(
"SSL certificate verification failed. This usually means "
"Python cannot find your system's CA certificates.\n\n"
"Try one of:\n"
" export REQUESTS_CA_BUNDLE="
'$(python -c "import certifi; print(certifi.where())")\n'
" pip install certifi\n"
" # macOS: run 'Install Certificates.command' from your "
"Python installation folder\n"
) from exc
last_exc = exc
except (ConnectionError, Timeout) as exc:
last_exc = exc

if attempt < UPLOAD_MAX_RETRIES - 1:
delay = get_backoff_delay(
attempt,
base=UPLOAD_BACKOFF_BASE_SECONDS,
max_seconds=UPLOAD_BACKOFF_MAX_SECONDS,
)
log.warning(
"Upload attempt %d/%d failed: %s. Retrying in %.1fs...",
attempt + 1,
UPLOAD_MAX_RETRIES,
last_exc,
delay,
)
time.sleep(delay)

raise last_exc # type: ignore[misc]


class FlashApp:
"""Flash app resource for managing applications, environments, and builds.

Expand Down Expand Up @@ -427,6 +523,8 @@ async def upload_build(self, tar_path: Union[str, Path]) -> Dict[str, Any]:
Manifest is read from .flash/flash_manifest.json during deployment, not extracted
from tarball.

Retries on transient SSL/connection errors with exponential backoff.

Args:
tar_path: Path to the tarball file (string or Path object)
Must be .tar.gz or .tgz, under 1500MB
Expand All @@ -438,13 +536,8 @@ async def upload_build(self, tar_path: Union[str, Path]) -> Dict[str, Any]:
RuntimeError: If app is not hydrated (no ID available)
FileNotFoundError: If tar_path does not exist
ValueError: If file is invalid (extension, magic bytes, or size)
requests.HTTPError: If upload fails

TODO: Add integration tests for tarball upload flow including:
- Network failures and retry behavior
- Large file uploads (edge cases near size limit)
- Corrupted tarball handling
- Pre-signed URL expiration scenarios
requests.HTTPError: If upload fails after all retries
requests.SSLError: If SSL certificate verification fails (non-retryable)
"""
# Convert to Path and validate before hydrating
if isinstance(tar_path, str):
Expand All @@ -470,24 +563,8 @@ async def upload_build(self, tar_path: Union[str, Path]) -> Dict[str, Any]:
url = result["uploadUrl"]
object_key = result["objectKey"]

# presigned URLs already carry auth in query params, so an
# Authorization header causes R2/S3 to reject the request.
import requests as _requests

from runpod_flash.core.utils.user_agent import get_user_agent

upload_headers = {
"User-Agent": get_user_agent(),
"Content-Type": TARBALL_CONTENT_TYPE,
}
_upload_tarball(tar_path, url, tarball_size)

with tar_path.open("rb") as fh:
resp = _requests.put(url, data=fh, headers=upload_headers)

try:
resp.raise_for_status()
finally:
resp.close()
resp = await self._finalize_upload_build(object_key, manifest)
return resp

Expand Down
6 changes: 6 additions & 0 deletions src/runpod_flash/core/resources/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,5 +152,11 @@ def get_image_name(
VALID_TARBALL_EXTENSIONS = (".tar.gz", ".tgz") # Valid tarball file extensions
GZIP_MAGIC_BYTES = (0x1F, 0x8B) # Magic bytes for gzip files

# tarball upload retry/timeout settings
UPLOAD_TIMEOUT_SECONDS = 600 # 10 minutes per attempt
UPLOAD_MAX_RETRIES = 3
UPLOAD_BACKOFF_BASE_SECONDS = 2.0
UPLOAD_BACKOFF_MAX_SECONDS = 30.0

# Load balancer stub timeout (seconds)
DEFAULT_LB_STUB_TIMEOUT = 60.0
167 changes: 167 additions & 0 deletions tests/unit/core/resources/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from requests.exceptions import ConnectionError as RequestsConnectionError
from requests.exceptions import SSLError, Timeout

from runpod_flash.core.resources.app import (
FlashApp,
FlashEnvironmentNotFoundError,
_is_cert_verification_error,
_upload_tarball,
_validate_exclusive_params,
_validate_tarball_file,
)
Expand Down Expand Up @@ -482,3 +486,166 @@ async def test_create_environment(self):

result = await app.create_environment("production")
assert result["id"] == "env-new"


class TestIsCertVerificationError:
"""Test _is_cert_verification_error classifier."""

def test_detects_cert_verify_failed(self):
exc = SSLError(
"[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: "
"unable to get local issuer certificate"
)
assert _is_cert_verification_error(exc) is True

def test_ignores_bad_record_mac(self):
exc = SSLError("[SSL: SSLV3_ALERT_BAD_RECORD_MAC] ssl/tls alert bad record mac")
assert _is_cert_verification_error(exc) is False

def test_ignores_generic_ssl_error(self):
exc = SSLError("connection reset by peer")
assert _is_cert_verification_error(exc) is False


class TestUploadTarball:
"""Test _upload_tarball retry and error handling."""

def _make_tarball(self, tmp_path: Path) -> Path:
tar_path = tmp_path / "build.tar.gz"
with gzip.open(tar_path, "wb") as f:
f.write(b"tarball content")
return tar_path

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_success_on_first_attempt(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
mock_resp = MagicMock()
mock_resp.raise_for_status = MagicMock()
mock_resp.close = MagicMock()

with patch("requests.put", return_value=mock_resp) as mock_put:
_upload_tarball(tar_path, "https://example.com/upload", 100)

mock_put.assert_called_once()
mock_resp.raise_for_status.assert_called_once()

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_retries_on_ssl_error_then_succeeds(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
mock_resp = MagicMock()
mock_resp.raise_for_status = MagicMock()
mock_resp.close = MagicMock()

ssl_exc = SSLError("[SSL: SSLV3_ALERT_BAD_RECORD_MAC] bad record mac")

with (
patch("requests.put", side_effect=[ssl_exc, mock_resp]) as mock_put,
patch("time.sleep") as mock_sleep,
):
_upload_tarball(tar_path, "https://example.com/upload", 100)

assert mock_put.call_count == 2
mock_sleep.assert_called_once()

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_retries_on_connection_error(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
mock_resp = MagicMock()
mock_resp.raise_for_status = MagicMock()
mock_resp.close = MagicMock()

conn_exc = RequestsConnectionError("Connection reset by peer")

with (
patch("requests.put", side_effect=[conn_exc, mock_resp]) as mock_put,
patch("time.sleep"),
):
_upload_tarball(tar_path, "https://example.com/upload", 100)

assert mock_put.call_count == 2

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_retries_on_timeout(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
mock_resp = MagicMock()
mock_resp.raise_for_status = MagicMock()
mock_resp.close = MagicMock()

timeout_exc = Timeout("Read timed out")

with (
patch("requests.put", side_effect=[timeout_exc, mock_resp]) as mock_put,
patch("time.sleep"),
):
_upload_tarball(tar_path, "https://example.com/upload", 100)

assert mock_put.call_count == 2

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 2)
def test_raises_after_exhausting_retries(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
ssl_exc = SSLError("[SSL: SSLV3_ALERT_BAD_RECORD_MAC] bad record mac")

with (
patch("requests.put", side_effect=ssl_exc) as mock_put,
patch("time.sleep"),
pytest.raises(SSLError, match="bad record mac"),
):
_upload_tarball(tar_path, "https://example.com/upload", 100)

assert mock_put.call_count == 2

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_cert_verification_error_not_retried(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
cert_exc = SSLError(
"[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed"
)

with (
patch("requests.put", side_effect=cert_exc) as mock_put,
pytest.raises(SSLError, match="CA certificates"),
):
_upload_tarball(tar_path, "https://example.com/upload", 100)

# no retry, fails immediately
mock_put.assert_called_once()

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_sends_content_length_header(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
mock_resp = MagicMock()
mock_resp.raise_for_status = MagicMock()
mock_resp.close = MagicMock()

with patch("requests.put", return_value=mock_resp) as mock_put:
_upload_tarball(tar_path, "https://example.com/upload", 12345)

_, kwargs = mock_put.call_args
assert kwargs["headers"]["Content-Length"] == "12345"

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_sets_timeout(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
mock_resp = MagicMock()
mock_resp.raise_for_status = MagicMock()
mock_resp.close = MagicMock()

with patch("requests.put", return_value=mock_resp) as mock_put:
_upload_tarball(tar_path, "https://example.com/upload", 100)

_, kwargs = mock_put.call_args
assert kwargs["timeout"] == 600

@patch("runpod_flash.core.resources.app.UPLOAD_MAX_RETRIES", 3)
def test_no_authorization_header(self, tmp_path):
tar_path = self._make_tarball(tmp_path)
mock_resp = MagicMock()
mock_resp.raise_for_status = MagicMock()
mock_resp.close = MagicMock()

with patch("requests.put", return_value=mock_resp) as mock_put:
_upload_tarball(tar_path, "https://example.com/upload", 100)

_, kwargs = mock_put.call_args
assert "Authorization" not in kwargs["headers"]
Loading