diff --git a/README.md b/README.md index 180f822bc..14e808a10 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,10 @@ For a higher-level, Pythonic interface, check out the new [`RunloopSDK`](README- ```python from runloop_api_client import RunloopSDK -sdk = RunloopSDK() # Uses RUNLOOP_API_KEY environment variable by default +runloop = RunloopSDK() # Uses RUNLOOP_API_KEY environment variable by default # Create a devbox and execute commands with a clean, object-oriented interface -with sdk.devbox.create(name="my-devbox") as devbox: +with runloop.devbox.create(name="my-devbox") as devbox: result = devbox.cmd.exec("echo 'Hello from Runloop!'") print(result.stdout()) ``` diff --git a/src/runloop_api_client/sdk/__init__.py b/src/runloop_api_client/sdk/__init__.py index 1b95e585b..5773b9d53 100644 --- a/src/runloop_api_client/sdk/__init__.py +++ b/src/runloop_api_client/sdk/__init__.py @@ -7,6 +7,7 @@ from .sync import AgentOps, DevboxOps, ScorerOps, RunloopSDK, ScenarioOps, SnapshotOps, BlueprintOps, StorageObjectOps from .agent import Agent +from ._types import ScenarioPreview from .async_ import ( AsyncAgentOps, AsyncDevboxOps, @@ -33,9 +34,11 @@ from .async_blueprint import AsyncBlueprint from .async_execution import AsyncExecution from .execution_result import ExecutionResult +from .scenario_builder import ScenarioBuilder from .async_scenario_run import AsyncScenarioRun from .async_storage_object import AsyncStorageObject from .async_execution_result import AsyncExecutionResult +from .async_scenario_builder import AsyncScenarioBuilder __all__ = [ # Main SDK entry points @@ -71,6 +74,9 @@ "AsyncScenario", "ScenarioRun", "AsyncScenarioRun", + "ScenarioBuilder", + "AsyncScenarioBuilder", + "ScenarioPreview", "Scorer", "AsyncScorer", "Snapshot", diff --git a/src/runloop_api_client/sdk/_types.py b/src/runloop_api_client/sdk/_types.py index 31d543abb..be09f6eed 100644 --- a/src/runloop_api_client/sdk/_types.py +++ b/src/runloop_api_client/sdk/_types.py @@ -5,6 +5,8 @@ from ..lib.polling import PollingConfig from ..types.devboxes import DiskSnapshotListParams, DiskSnapshotUpdateParams from ..types.scenarios import ScorerListParams, ScorerCreateParams, ScorerUpdateParams, ScorerValidateParams +from ..types.input_context import InputContext +from ..types.scenario_view import ScenarioView from ..types.agent_list_params import AgentListParams from ..types.devbox_list_params import DevboxListParams from ..types.object_list_params import ObjectListParams @@ -186,3 +188,18 @@ class SDKScenarioRunAsyncParams(ScenarioStartRunBaseParams, LongRequestOptions): class SDKScenarioRunParams(ScenarioStartRunBaseParams, LongPollingRequestOptions): pass + + +class InputContextPreview(InputContext): + problem_statement: Optional[str] = None # type: ignore[assignment] + """The problem statement for the Scenario.""" + + +class ScenarioPreview(ScenarioView): + """Preview of scenario configuration with all fields optional.""" + + id: Optional[str] = None # type: ignore[assignment] + """The ID of the Scenario.""" + + input_context: InputContextPreview # type: ignore[assignment] + """The input context for the Scenario.""" diff --git a/src/runloop_api_client/sdk/async_.py b/src/runloop_api_client/sdk/async_.py index b54e831a5..4bcd08fc1 100644 --- a/src/runloop_api_client/sdk/async_.py +++ b/src/runloop_api_client/sdk/async_.py @@ -37,6 +37,7 @@ from .async_blueprint import AsyncBlueprint from ..lib.context_loader import TarFilter, build_directory_tar from .async_storage_object import AsyncStorageObject +from 
.async_scenario_builder import AsyncScenarioBuilder from ..types.object_create_params import ContentType from ..types.shared_params.agent_source import Git, Npm, Pip, Object @@ -773,6 +774,16 @@ class AsyncScenarioOps: >>> scenario = runloop.scenario.from_id("scn-xxx") >>> run = await scenario.run() >>> scenarios = await runloop.scenario.list() + + Example using builder: + >>> builder = ( + ... runloop.scenario.builder("my-scenario") + ... .from_blueprint(blueprint) + ... .with_problem_statement("Fix the bug") + ... .add_test_command_scorer("tests", test_command="pytest") + ... ) + >>> params = builder.build() + >>> scenario = await runloop.scenario.create(**params) # equivalent to builder.push() """ def __init__(self, client: AsyncRunloop) -> None: @@ -783,6 +794,16 @@ def __init__(self, client: AsyncRunloop) -> None: """ self._client = client + def builder(self, name: str) -> AsyncScenarioBuilder: + """Create a new scenario builder. + + :param name: Name for the scenario + :type name: str + :return: A new AsyncScenarioBuilder instance + :rtype: AsyncScenarioBuilder + """ + return AsyncScenarioBuilder(name, self._client) + def from_id(self, scenario_id: str) -> AsyncScenario: """Get an AsyncScenario instance for an existing scenario ID. diff --git a/src/runloop_api_client/sdk/async_scenario_builder.py b/src/runloop_api_client/sdk/async_scenario_builder.py new file mode 100644 index 000000000..37a3aa4b5 --- /dev/null +++ b/src/runloop_api_client/sdk/async_scenario_builder.py @@ -0,0 +1,480 @@ +"""AsyncScenarioBuilder for constructing scenarios with a fluent API.""" + +from __future__ import annotations + +from typing import Dict, List, Iterable, Optional +from typing_extensions import Self, Unpack, Literal, override + +from ..types import ScenarioCreateParams, ScenarioEnvironmentParam +from ._types import ScenarioPreview, LongRequestOptions +from .._client import AsyncRunloop +from .async_scenario import AsyncScenario +from .async_snapshot import AsyncSnapshot +from .async_blueprint import AsyncBlueprint +from ..types.scoring_function_param import ( + Scorer, + ScoringFunctionParam, + ScorerCustomScoringFunction, + ScorerAstGrepScoringFunction, + ScorerCommandScoringFunction, + ScorerTestBasedScoringFunction, + ScorerBashScriptScoringFunction, + ScorerPythonScriptScoringFunction, + ScorerTestBasedScoringFunctionTestFile, +) + + +class AsyncScenarioBuilder: + """Async builder for constructing scenarios with a fluent API. + + Provides a step-by-step interface for configuring all aspects of a scenario + before pushing it to the platform. + + Example: + >>> builder = ( + ... runloop.scenario.builder("my-scenario") + ... .from_blueprint(blueprint) + ... .with_working_directory("/app") + ... .with_problem_statement("Fix the bug in main.py") + ... .add_test_command_scorer("tests", test_command="pytest") + ... ) + >>> params = builder.build() + >>> scenario = await runloop.scenario.create(**params) # equivalent to builder.push() + """ + + def __init__(self, name: str, client: AsyncRunloop) -> None: + """Initialize the builder. 
+ + :param name: Name for the scenario + :type name: str + :param client: AsyncRunloop client instance + :type client: AsyncRunloop + """ + self._client = client + self._name = name + + # Environment configuration + self._blueprint: Optional[AsyncBlueprint] = None + self._snapshot: Optional[AsyncSnapshot] = None + self._working_directory: Optional[str] = None + + # Input context + self._problem_statement: Optional[str] = None + self._additional_context: Optional[object] = None + + # Scoring + self._scorers: List[ScoringFunctionParam] = [] + + # Metadata and other options + self._metadata: Dict[str, str] = {} + self._reference_output: Optional[str] = None + self._required_env_vars: Optional[List[str]] = None + self._required_secrets: Optional[List[str]] = None + self._validation_type: Optional[Literal["UNSPECIFIED", "FORWARD", "REVERSE", "EVALUATION"]] = None + + @override + def __repr__(self) -> str: + return f"" + + @property + def name(self) -> str: + """Return the scenario name. + + :return: Scenario name + :rtype: str + """ + return self._name + + def from_blueprint(self, blueprint: AsyncBlueprint) -> Self: + """Set a blueprint to define the baseline environment for the scenario. + + :param blueprint: Blueprint to use + :type blueprint: AsyncBlueprint + :return: Self for method chaining + :rtype: Self + """ + self._blueprint = blueprint + self._snapshot = None # Clear snapshot if blueprint is set + return self + + def from_snapshot(self, snapshot: AsyncSnapshot) -> Self: + """Set a snapshot to define the baseline environment for the scenario. + + :param snapshot: Snapshot to use + :type snapshot: AsyncSnapshot + :return: Self for method chaining + :rtype: Self + """ + self._snapshot = snapshot + self._blueprint = None # Clear blueprint if snapshot is set + return self + + def with_working_directory(self, directory: str) -> Self: + """Set the working directory for the scenario. + + :param directory: Working directory path + :type directory: str + :return: Self for method chaining + :rtype: Self + """ + self._working_directory = directory + return self + + def with_problem_statement(self, statement: str) -> Self: + """Set the problem statement for the scenario; this will be provided as input context to the agent. + + :param statement: Problem statement text + :type statement: str + :return: Self for method chaining + :rtype: Self + """ + self._problem_statement = statement + return self + + def with_additional_context(self, context: object) -> Self: + """Set additional structured context for the scenario. + This can be used to provide additional information to the agent, such as hints, examples, or other relevant information. + + :param context: Additional context (JSON-serializable) + :type context: object + :return: Self for method chaining + :rtype: Self + """ + self._additional_context = context + return self + + def _add_scorer(self, name: str, weight: float, scorer: Scorer) -> Self: + """Internal helper to add a scorer to the list. + + :raises ValueError: If weight is not positive + """ + if weight <= 0: + raise ValueError(f"Scorer weight must be positive, got {weight}") + self._scorers.append({"name": name, "weight": weight, "scorer": scorer}) + return self + + def add_test_command_scorer( + self, + name: str, + *, + test_command: str, + weight: float = 1.0, + test_files: Optional[Iterable[ScorerTestBasedScoringFunctionTestFile]] = None, + ) -> Self: + """Add a test-based scorer that runs a test command. 
+ + :param name: Name of the scoring function + :type name: str + :param test_command: Command to run tests (e.g., "pytest") + :type test_command: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param test_files: Optional test files to create before running + :type test_files: Optional[Iterable[ScorerTestBasedScoringFunctionTestFile]] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerTestBasedScoringFunction = { + "type": "test_based_scorer", + "test_command": test_command, + } + if test_files: + scorer["test_files"] = test_files + return self._add_scorer(name, weight, scorer) + + def add_shell_command_scorer( + self, + name: str, + *, + command: str, + weight: float = 1.0, + ) -> Self: + """Add a command scorer that runs a shell command. + + :param name: Name of the scoring function + :type name: str + :param command: Shell command to execute + :type command: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerCommandScoringFunction = { + "type": "command_scorer", + "command": command, + } + return self._add_scorer(name, weight, scorer) + + def add_bash_script_scorer( + self, + name: str, + *, + bash_script: str, + weight: float = 1.0, + ) -> Self: + """Add a standalone bash script scorer. + + The script should output "score=X.X" where X.X is a float between 0.0 and 1.0, inclusive. + + :param name: Name of the scoring function + :type name: str + :param bash_script: Bash script content + :type bash_script: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerBashScriptScoringFunction = { + "type": "bash_script_scorer", + "bash_script": bash_script, + } + return self._add_scorer(name, weight, scorer) + + def add_python_script_scorer( + self, + name: str, + *, + python_script: str, + weight: float = 1.0, + python_version_constraint: Optional[str] = None, + requirements_contents: Optional[str] = None, + ) -> Self: + """Add a standalone Python script scorer. + + The script is run in an isolated uv environment, and the dependencies are declared in the + `uv script header `__. + + The script should print the score in the range [0.0, 1.0] to stdout. + + :param name: Name of the scoring function + :type name: str + :param python_script: Python script content + :type python_script: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param python_version_constraint: Python version (default "==3.12.10") + :type python_version_constraint: Optional[str] + :param requirements_contents: pip requirements.txt content + :type requirements_contents: Optional[str] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerPythonScriptScoringFunction = { + "type": "python_script_scorer", + "python_script": python_script, + } + if python_version_constraint: + scorer["python_version_constraint"] = python_version_constraint + if requirements_contents: + scorer["requirements_contents"] = requirements_contents + return self._add_scorer(name, weight, scorer) + + def add_ast_grep_scorer( + self, + name: str, + *, + pattern: str, + weight: float = 1.0, + search_directory: str = ".", + lang: Optional[str] = None, + ) -> Self: + """Add an AST grep scorer that matches code patterns. 
+ + :param name: Name of the scoring function + :type name: str + :param pattern: AST pattern to match + :type pattern: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param search_directory: Directory to search (default ".") + :type search_directory: str + :param lang: Language of the pattern (optional) + :type lang: Optional[str] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerAstGrepScoringFunction = { + "type": "ast_grep_scorer", + "pattern": pattern, + "search_directory": search_directory, + } + if lang: + scorer["lang"] = lang + return self._add_scorer(name, weight, scorer) + + def add_custom_scorer( + self, + name: str, + *, + custom_scorer_type: str, + weight: float = 1.0, + scorer_params: Optional[object] = None, + ) -> Self: + """Add a custom scorer registered with Runloop. + + :param name: Name of the scoring function + :type name: str + :param custom_scorer_type: Type identifier registered with Runloop + :type custom_scorer_type: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param scorer_params: Additional JSON parameters for the scorer + :type scorer_params: Optional[object] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerCustomScoringFunction = { + "type": "custom_scorer", + "custom_scorer_type": custom_scorer_type, + } + if scorer_params: + scorer["scorer_params"] = scorer_params + return self._add_scorer(name, weight, scorer) + + def with_metadata(self, metadata: Dict[str, str]) -> Self: + """Set metadata for the scenario. + + :param metadata: Key-value metadata + :type metadata: Dict[str, str] + :return: Self for method chaining + :rtype: Self + """ + self._metadata = metadata + return self + + def with_reference_output(self, output: str) -> Self: + """Set the reference solution or gold patch for validation. + After application, the scorer is expected to return a score of 1.0. + + :param output: Reference solution or gold patch (e.g., git diff) + :type output: str + :return: Self for method chaining + :rtype: Self + """ + self._reference_output = output + return self + + def with_required_env_vars(self, env_vars: List[str]) -> Self: + """Set required environment variables. + + :param env_vars: List of required environment variable names + :type env_vars: List[str] + :return: Self for method chaining + :rtype: Self + """ + self._required_env_vars = env_vars + return self + + def with_required_secrets(self, secrets: List[str]) -> Self: + """Set required secrets. + + :param secrets: List of required secret names + :type secrets: List[str] + :return: Self for method chaining + :rtype: Self + """ + self._required_secrets = secrets + return self + + def with_validation_type(self, validation_type: Literal["UNSPECIFIED", "FORWARD", "REVERSE", "EVALUATION"]) -> Self: + """Set the validation strategy to specify how the reference solution or gold patch is applied to the scenario. 
+ + :param validation_type: Validation type + :type validation_type: Literal["UNSPECIFIED", "FORWARD", "REVERSE", "EVALUATION"] + :return: Self for method chaining + :rtype: Self + """ + self._validation_type = validation_type + return self + + def _build_normalized_scorers(self) -> List[ScoringFunctionParam]: + """Build normalized scorers list.""" + total_weight = sum(s["weight"] for s in self._scorers) + return [{**s, "weight": s["weight"] / total_weight} for s in self._scorers] + + def _build_environment_params(self) -> Optional[ScenarioEnvironmentParam]: + """Build environment parameters.""" + if not self._blueprint and not self._snapshot and not self._working_directory: + return None + return { + "blueprint_id": self._blueprint.id if self._blueprint else None, + "snapshot_id": self._snapshot.id if self._snapshot else None, + "working_directory": self._working_directory if self._working_directory else None, + } + + def build(self) -> ScenarioCreateParams: + """Build the scenario creation parameters. + + Weights are automatically normalized to sum to 1.0. + + :raises ValueError: If required fields are missing + :return: Parameters for scenario creation + :rtype: ScenarioCreateParams + """ + if not self._problem_statement: + raise ValueError("Problem statement is required. Call with_problem_statement() first.") + + if not self._scorers: + raise ValueError( + "At least one scorer is required. " + "Call add_test_command_scorer(), add_bash_script_scorer(), or another scorer method first." + ) + + return { + "name": self._name, + "input_context": { + "problem_statement": self._problem_statement, + "additional_context": self._additional_context, + }, + "scoring_contract": { + "scoring_function_parameters": self._build_normalized_scorers(), + }, + "environment_parameters": self._build_environment_params(), + "metadata": self._metadata, + "reference_output": self._reference_output, + "required_environment_variables": self._required_env_vars, + "required_secret_names": self._required_secrets, + "validation_type": self._validation_type, + } + + def preview(self) -> ScenarioPreview: + """Preview the scenario configuration without pushing to the platform. + + Returns the current configuration state as a ScenarioPreview object. + Does not validate or raise errors for missing required fields. + + :return: Preview of the scenario configuration + :rtype: ScenarioPreview + """ + return ScenarioPreview.model_validate( + { + "name": self._name, + "input_context": { + "problem_statement": self._problem_statement, + "additional_context": self._additional_context, + }, + "scoring_contract": { + "scoring_function_parameters": self._build_normalized_scorers(), + }, + "environment": self._build_environment_params(), + "metadata": self._metadata, + "reference_output": self._reference_output, + "required_environment_variables": self._required_env_vars, + "required_secret_names": self._required_secrets, + "validation_type": self._validation_type, + } + ) + + async def push(self, **options: Unpack[LongRequestOptions]) -> AsyncScenario: + """Create the scenario on the platform. 
+ + :param options: Optional long-running request configuration + :raises ValueError: If required fields are missing + :return: Created scenario wrapper + :rtype: AsyncScenario + """ + params = self.build() + scenario_view = await self._client.scenarios.create(**params, **options) + return AsyncScenario(self._client, scenario_view.id) diff --git a/src/runloop_api_client/sdk/async_scorer.py b/src/runloop_api_client/sdk/async_scorer.py index 3df4fb4e0..91ced0c38 100644 --- a/src/runloop_api_client/sdk/async_scorer.py +++ b/src/runloop_api_client/sdk/async_scorer.py @@ -16,7 +16,7 @@ class AsyncScorer: """A custom scorer for evaluating scenario outputs (async). - Scorers define bash scripts that produce a score (0.0-1.0) for scenario runs. + Scorers define bash scripts that produce a score in the range [0.0, 1.0] for scenario runs. Obtain instances via ``runloop.scorer.create()`` or ``runloop.scorer.from_id()``. Example: diff --git a/src/runloop_api_client/sdk/scenario_builder.py b/src/runloop_api_client/sdk/scenario_builder.py new file mode 100644 index 000000000..e2fc15de4 --- /dev/null +++ b/src/runloop_api_client/sdk/scenario_builder.py @@ -0,0 +1,480 @@ +"""ScenarioBuilder for constructing scenarios with a fluent API.""" + +from __future__ import annotations + +from typing import Dict, List, Iterable, Optional +from typing_extensions import Self, Unpack, Literal, override + +from ..types import ScenarioCreateParams, ScenarioEnvironmentParam +from ._types import ScenarioPreview, LongRequestOptions +from .._client import Runloop +from .scenario import Scenario +from .snapshot import Snapshot +from .blueprint import Blueprint +from ..types.scoring_function_param import ( + Scorer, + ScoringFunctionParam, + ScorerCustomScoringFunction, + ScorerAstGrepScoringFunction, + ScorerCommandScoringFunction, + ScorerTestBasedScoringFunction, + ScorerBashScriptScoringFunction, + ScorerPythonScriptScoringFunction, + ScorerTestBasedScoringFunctionTestFile, +) + + +class ScenarioBuilder: + """Builder for constructing scenarios with a fluent API. + + Provides a step-by-step interface for configuring all aspects of a scenario + before pushing it to the platform. + + Example: + >>> builder = ( + ... runloop.scenario.builder("my-scenario") + ... .from_blueprint(blueprint) + ... .with_working_directory("/app") + ... .with_problem_statement("Fix the bug in main.py") + ... .add_test_command_scorer("tests", test_command="pytest") + ... ) + >>> params = builder.build() + >>> scenario = runloop.scenario.create(**params) # equivalent to builder.push() + """ + + def __init__(self, name: str, client: Runloop) -> None: + """Initialize the builder. 
+ + :param name: Name for the scenario + :type name: str + :param client: Runloop client instance + :type client: Runloop + """ + self._client = client + self._name = name + + # Environment configuration + self._blueprint: Optional[Blueprint] = None + self._snapshot: Optional[Snapshot] = None + self._working_directory: Optional[str] = None + + # Input context + self._problem_statement: Optional[str] = None + self._additional_context: Optional[object] = None + + # Scoring + self._scorers: List[ScoringFunctionParam] = [] + + # Metadata and other options + self._metadata: Dict[str, str] = {} + self._reference_output: Optional[str] = None + self._required_env_vars: Optional[List[str]] = None + self._required_secrets: Optional[List[str]] = None + self._validation_type: Optional[Literal["UNSPECIFIED", "FORWARD", "REVERSE", "EVALUATION"]] = None + + @override + def __repr__(self) -> str: + return f"" + + @property + def name(self) -> str: + """Return the scenario name. + + :return: Scenario name + :rtype: str + """ + return self._name + + def from_blueprint(self, blueprint: Blueprint) -> Self: + """Set a blueprint to define the baseline environment for the scenario. + + :param blueprint: Blueprint to use + :type blueprint: Blueprint + :return: Self for method chaining + :rtype: Self + """ + self._blueprint = blueprint + self._snapshot = None # Clear snapshot if blueprint is set + return self + + def from_snapshot(self, snapshot: Snapshot) -> Self: + """Set a snapshot to define the baseline environment for the scenario. + + :param snapshot: Snapshot to use + :type snapshot: Snapshot + :return: Self for method chaining + :rtype: Self + """ + self._snapshot = snapshot + self._blueprint = None # Clear blueprint if snapshot is set + return self + + def with_working_directory(self, directory: str) -> Self: + """Set the working directory for the scenario. + + :param directory: Working directory path + :type directory: str + :return: Self for method chaining + :rtype: Self + """ + self._working_directory = directory + return self + + def with_problem_statement(self, statement: str) -> Self: + """Set the problem statement for the scenario; this will be provided as input context to the agent. + + :param statement: Problem statement text + :type statement: str + :return: Self for method chaining + :rtype: Self + """ + self._problem_statement = statement + return self + + def with_additional_context(self, context: object) -> Self: + """Set additional structured context for the scenario. + This can be used to provide additional information to the agent, such as hints, examples, or other relevant information. + + :param context: Additional context (JSON-serializable) + :type context: object + :return: Self for method chaining + :rtype: Self + """ + self._additional_context = context + return self + + def _add_scorer(self, name: str, weight: float, scorer: Scorer) -> Self: + """Internal helper to add a scorer to the list. + + :raises ValueError: If weight is not positive + """ + if weight <= 0: + raise ValueError(f"Scorer weight must be positive, got {weight}") + self._scorers.append({"name": name, "weight": weight, "scorer": scorer}) + return self + + def add_test_command_scorer( + self, + name: str, + *, + test_command: str, + weight: float = 1.0, + test_files: Optional[Iterable[ScorerTestBasedScoringFunctionTestFile]] = None, + ) -> Self: + """Add a test-based scorer that runs a test command. 
+ + :param name: Name of the scoring function + :type name: str + :param test_command: Command to run tests (e.g., "pytest") + :type test_command: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param test_files: Optional test files to create before running + :type test_files: Optional[Iterable[ScorerTestBasedScoringFunctionTestFile]] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerTestBasedScoringFunction = { + "type": "test_based_scorer", + "test_command": test_command, + } + if test_files: + scorer["test_files"] = test_files + return self._add_scorer(name, weight, scorer) + + def add_shell_command_scorer( + self, + name: str, + *, + command: str, + weight: float = 1.0, + ) -> Self: + """Add a command scorer that runs a shell command. + + :param name: Name of the scoring function + :type name: str + :param command: Shell command to execute + :type command: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerCommandScoringFunction = { + "type": "command_scorer", + "command": command, + } + return self._add_scorer(name, weight, scorer) + + def add_bash_script_scorer( + self, + name: str, + *, + bash_script: str, + weight: float = 1.0, + ) -> Self: + """Add a standalone bash script scorer. + + The script should output "score=X.X" where X.X is a float between 0.0 and 1.0, inclusive. + + :param name: Name of the scoring function + :type name: str + :param bash_script: Bash script content + :type bash_script: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerBashScriptScoringFunction = { + "type": "bash_script_scorer", + "bash_script": bash_script, + } + return self._add_scorer(name, weight, scorer) + + def add_python_script_scorer( + self, + name: str, + *, + python_script: str, + weight: float = 1.0, + python_version_constraint: Optional[str] = None, + requirements_contents: Optional[str] = None, + ) -> Self: + """Add a standalone Python script scorer. + + The script is run in an isolated uv environment, and the dependencies are declared in the + `uv script header `__. + + The script should print the score in the range [0.0, 1.0] to stdout. + + :param name: Name of the scoring function + :type name: str + :param python_script: Python script content + :type python_script: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param python_version_constraint: Python version (default "==3.12.10") + :type python_version_constraint: Optional[str] + :param requirements_contents: pip requirements.txt content + :type requirements_contents: Optional[str] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerPythonScriptScoringFunction = { + "type": "python_script_scorer", + "python_script": python_script, + } + if python_version_constraint: + scorer["python_version_constraint"] = python_version_constraint + if requirements_contents: + scorer["requirements_contents"] = requirements_contents + return self._add_scorer(name, weight, scorer) + + def add_ast_grep_scorer( + self, + name: str, + *, + pattern: str, + weight: float = 1.0, + search_directory: str = ".", + lang: Optional[str] = None, + ) -> Self: + """Add an AST grep scorer that matches code patterns. 
+ + :param name: Name of the scoring function + :type name: str + :param pattern: AST pattern to match + :type pattern: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param search_directory: Directory to search (default ".") + :type search_directory: str + :param lang: Language of the pattern (optional) + :type lang: Optional[str] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerAstGrepScoringFunction = { + "type": "ast_grep_scorer", + "pattern": pattern, + "search_directory": search_directory, + } + if lang: + scorer["lang"] = lang + return self._add_scorer(name, weight, scorer) + + def add_custom_scorer( + self, + name: str, + *, + custom_scorer_type: str, + weight: float = 1.0, + scorer_params: Optional[object] = None, + ) -> Self: + """Add a custom scorer registered with Runloop. + + :param name: Name of the scoring function + :type name: str + :param custom_scorer_type: Type identifier registered with Runloop + :type custom_scorer_type: str + :param weight: Weight for this scorer (normalized automatically) + :type weight: float + :param scorer_params: Additional JSON parameters for the scorer + :type scorer_params: Optional[object] + :return: Self for method chaining + :rtype: Self + """ + scorer: ScorerCustomScoringFunction = { + "type": "custom_scorer", + "custom_scorer_type": custom_scorer_type, + } + if scorer_params: + scorer["scorer_params"] = scorer_params + return self._add_scorer(name, weight, scorer) + + def with_metadata(self, metadata: Dict[str, str]) -> Self: + """Set metadata for the scenario. + + :param metadata: Key-value metadata + :type metadata: Dict[str, str] + :return: Self for method chaining + :rtype: Self + """ + self._metadata = metadata + return self + + def with_reference_output(self, output: str) -> Self: + """Set the reference solution or gold patch for validation. + After application, the scorer is expected to return a score of 1.0. + + :param output: Reference solution or gold patch (e.g., git diff) + :type output: str + :return: Self for method chaining + :rtype: Self + """ + self._reference_output = output + return self + + def with_required_env_vars(self, env_vars: List[str]) -> Self: + """Set required environment variables. + + :param env_vars: List of required environment variable names + :type env_vars: List[str] + :return: Self for method chaining + :rtype: Self + """ + self._required_env_vars = env_vars + return self + + def with_required_secrets(self, secrets: List[str]) -> Self: + """Set required secrets. + + :param secrets: List of required secret names + :type secrets: List[str] + :return: Self for method chaining + :rtype: Self + """ + self._required_secrets = secrets + return self + + def with_validation_type(self, validation_type: Literal["UNSPECIFIED", "FORWARD", "REVERSE", "EVALUATION"]) -> Self: + """Set the validation strategy to specify how the reference solution or gold patch is applied to the scenario. 
+ + :param validation_type: Validation type + :type validation_type: Literal["UNSPECIFIED", "FORWARD", "REVERSE", "EVALUATION"] + :return: Self for method chaining + :rtype: Self + """ + self._validation_type = validation_type + return self + + def _build_normalized_scorers(self) -> List[ScoringFunctionParam]: + """Build normalized scorers list.""" + total_weight = sum(s["weight"] for s in self._scorers) + return [{**s, "weight": s["weight"] / total_weight} for s in self._scorers] + + def _build_environment_params(self) -> Optional[ScenarioEnvironmentParam]: + """Build environment parameters""" + if not self._blueprint and not self._snapshot and not self._working_directory: + return None + return { + "blueprint_id": self._blueprint.id if self._blueprint else None, + "snapshot_id": self._snapshot.id if self._snapshot else None, + "working_directory": self._working_directory if self._working_directory else None, + } + + def build(self) -> ScenarioCreateParams: + """Build the scenario creation parameters. + + Weights are automatically normalized to sum to 1.0. + + :raises ValueError: If required fields are missing + :return: Parameters for scenario creation + :rtype: ScenarioCreateParams + """ + if not self._problem_statement: + raise ValueError("Problem statement is required. Call with_problem_statement() first.") + + if not self._scorers: + raise ValueError( + "At least one scorer is required. " + "Call add_test_command_scorer(), add_bash_script_scorer(), or another scorer method first." + ) + + return { + "name": self._name, + "input_context": { + "problem_statement": self._problem_statement, + "additional_context": self._additional_context, + }, + "scoring_contract": { + "scoring_function_parameters": self._build_normalized_scorers(), + }, + "environment_parameters": self._build_environment_params(), + "metadata": self._metadata, + "reference_output": self._reference_output, + "required_environment_variables": self._required_env_vars, + "required_secret_names": self._required_secrets, + "validation_type": self._validation_type, + } + + def preview(self) -> ScenarioPreview: + """Preview the scenario configuration without pushing to the platform. + + Returns the current configuration state as a ScenarioPreview object. + Does not validate or raise errors for missing required fields. + + :return: Preview of the scenario configuration + :rtype: ScenarioPreview + """ + return ScenarioPreview.model_validate( + { + "name": self._name, + "input_context": { + "problem_statement": self._problem_statement, + "additional_context": self._additional_context, + }, + "scoring_contract": { + "scoring_function_parameters": self._build_normalized_scorers(), + }, + "environment": self._build_environment_params(), + "metadata": self._metadata, + "reference_output": self._reference_output, + "required_environment_variables": self._required_env_vars, + "required_secret_names": self._required_secrets, + "validation_type": self._validation_type, + } + ) + + def push(self, **options: Unpack[LongRequestOptions]) -> Scenario: + """Create the scenario on the platform. 
+ + :param options: Optional long-running request configuration + :raises ValueError: If required fields are missing + :return: Created scenario wrapper + :rtype: Scenario + """ + params = self.build() + scenario_view = self._client.scenarios.create(**params, **options) + return Scenario(self._client, scenario_view.id) diff --git a/src/runloop_api_client/sdk/scorer.py b/src/runloop_api_client/sdk/scorer.py index a25bb44a8..8df57ac05 100644 --- a/src/runloop_api_client/sdk/scorer.py +++ b/src/runloop_api_client/sdk/scorer.py @@ -16,7 +16,7 @@ class Scorer: """A custom scorer for evaluating scenario outputs. - Scorers define bash scripts that produce a score (0.0-1.0) for scenario runs. + Scorers define bash scripts that produce a score in the range [0.0, 1.0] for scenario runs. Obtain instances via ``runloop.scorer.create()`` or ``runloop.scorer.from_id()``. Example: diff --git a/src/runloop_api_client/sdk/sync.py b/src/runloop_api_client/sdk/sync.py index 6b38b5091..f215c8116 100644 --- a/src/runloop_api_client/sdk/sync.py +++ b/src/runloop_api_client/sdk/sync.py @@ -35,6 +35,7 @@ from .snapshot import Snapshot from .blueprint import Blueprint from .storage_object import StorageObject +from .scenario_builder import ScenarioBuilder from ..lib.context_loader import TarFilter, build_directory_tar from ..types.object_create_params import ContentType from ..types.shared_params.agent_source import Git, Npm, Pip, Object @@ -794,6 +795,16 @@ class ScenarioOps: >>> scenario = runloop.scenario.from_id("scn-xxx") >>> run = scenario.run() >>> scenarios = runloop.scenario.list() + + Example using builder: + >>> builder = ( + ... runloop.scenario.builder("my-scenario") + ... .from_blueprint(blueprint) + ... .with_problem_statement("Fix the bug") + ... .add_test_command_scorer("tests", test_command="pytest") + ... ) + >>> params = builder.build() + >>> scenario = runloop.scenario.create(**params) # equivalent to builder.push() """ def __init__(self, client: Runloop) -> None: @@ -804,6 +815,16 @@ def __init__(self, client: Runloop) -> None: """ self._client = client + def builder(self, name: str) -> ScenarioBuilder: + """Create a new scenario builder. + + :param name: Name for the scenario + :type name: str + :return: A new ScenarioBuilder instance + :rtype: ScenarioBuilder + """ + return ScenarioBuilder(name, self._client) + def from_id(self, scenario_id: str) -> Scenario: """Get a Scenario instance for an existing scenario ID. 
diff --git a/tests/sdk/test_async_execution.py b/tests/sdk/test_async_execution.py index b33b4cf1f..06629cf63 100644 --- a/tests/sdk/test_async_execution.py +++ b/tests/sdk/test_async_execution.py @@ -14,7 +14,8 @@ TASK_COMPLETION_SHORT, MockExecutionView, ) -from runloop_api_client.sdk.async_execution import AsyncExecution, _AsyncStreamingGroup +from runloop_api_client.sdk import AsyncExecution +from runloop_api_client.sdk.async_execution import _AsyncStreamingGroup # Legacy aliases for backward compatibility SHORT_SLEEP = TASK_COMPLETION_SHORT diff --git a/tests/sdk/test_async_execution_result.py b/tests/sdk/test_async_execution_result.py index 2a71da1c7..cf8a23caa 100644 --- a/tests/sdk/test_async_execution_result.py +++ b/tests/sdk/test_async_execution_result.py @@ -8,7 +8,7 @@ import pytest from tests.sdk.conftest import MockExecutionView -from runloop_api_client.sdk.async_execution_result import AsyncExecutionResult +from runloop_api_client.sdk import AsyncExecutionResult class TestAsyncExecutionResult: diff --git a/tests/sdk/test_async_ops.py b/tests/sdk/test_async_ops.py index b276f29ee..f8a16e1c0 100644 --- a/tests/sdk/test_async_ops.py +++ b/tests/sdk/test_async_ops.py @@ -24,17 +24,17 @@ AsyncAgent, AsyncDevbox, AsyncScorer, + AsyncAgentOps, + AsyncScenario, AsyncSnapshot, AsyncBlueprint, - AsyncStorageObject, -) -from runloop_api_client.sdk.async_ import ( - AsyncAgentOps, AsyncDevboxOps, AsyncScorerOps, AsyncRunloopSDK, + AsyncScenarioOps, AsyncSnapshotOps, AsyncBlueprintOps, + AsyncStorageObject, AsyncStorageObjectOps, ) from runloop_api_client.lib.polling import PollingConfig @@ -1122,8 +1122,6 @@ class TestAsyncScenarioOps: def test_from_id(self, mock_async_client: AsyncMock) -> None: """Test from_id method.""" - from runloop_api_client.sdk import AsyncScenario - from runloop_api_client.sdk.async_ import AsyncScenarioOps ops = AsyncScenarioOps(mock_async_client) scenario = ops.from_id("scn_123") @@ -1134,7 +1132,6 @@ def test_from_id(self, mock_async_client: AsyncMock) -> None: @pytest.mark.asyncio async def test_list_empty(self, mock_async_client: AsyncMock) -> None: """Test list method with empty results.""" - from runloop_api_client.sdk.async_ import AsyncScenarioOps async def async_iter(): return @@ -1151,8 +1148,6 @@ async def async_iter(): @pytest.mark.asyncio async def test_list_single(self, mock_async_client: AsyncMock, scenario_view: MockScenarioView) -> None: """Test list method with single result.""" - from runloop_api_client.sdk import AsyncScenario - from runloop_api_client.sdk.async_ import AsyncScenarioOps async def async_iter(): yield scenario_view @@ -1170,8 +1165,6 @@ async def async_iter(): @pytest.mark.asyncio async def test_list_multiple(self, mock_async_client: AsyncMock) -> None: """Test list method with multiple results.""" - from runloop_api_client.sdk import AsyncScenario - from runloop_api_client.sdk.async_ import AsyncScenarioOps scenario_view1 = MockScenarioView(id="scn_001", name="scenario-1") scenario_view2 = MockScenarioView(id="scn_002", name="scenario-2") @@ -1198,33 +1191,33 @@ class TestAsyncRunloopSDK: def test_init(self) -> None: """Test AsyncRunloopSDK initialization.""" - sdk = AsyncRunloopSDK(bearer_token="test-token") - assert sdk.api is not None - assert isinstance(sdk.agent, AsyncAgentOps) - assert isinstance(sdk.devbox, AsyncDevboxOps) - assert isinstance(sdk.scorer, AsyncScorerOps) - assert isinstance(sdk.snapshot, AsyncSnapshotOps) - assert isinstance(sdk.blueprint, AsyncBlueprintOps) - assert isinstance(sdk.storage_object, 
AsyncStorageObjectOps) + runloop = AsyncRunloopSDK(bearer_token="test-token") + assert runloop.api is not None + assert isinstance(runloop.agent, AsyncAgentOps) + assert isinstance(runloop.devbox, AsyncDevboxOps) + assert isinstance(runloop.scorer, AsyncScorerOps) + assert isinstance(runloop.snapshot, AsyncSnapshotOps) + assert isinstance(runloop.blueprint, AsyncBlueprintOps) + assert isinstance(runloop.storage_object, AsyncStorageObjectOps) @pytest.mark.asyncio async def test_aclose(self) -> None: """Test aclose method.""" - sdk = AsyncRunloopSDK(bearer_token="test-token") + runloop = AsyncRunloopSDK(bearer_token="test-token") # Verify aclose doesn't raise - await sdk.aclose() + await runloop.aclose() @pytest.mark.asyncio async def test_context_manager(self) -> None: """Test context manager behavior.""" - async with AsyncRunloopSDK(bearer_token="test-token") as sdk: - assert sdk.api is not None + async with AsyncRunloopSDK(bearer_token="test-token") as runloop: + assert runloop.api is not None # Verify context manager properly closes (implementation detail of context manager protocol) def test_api_property(self) -> None: """Test api property access.""" - sdk = AsyncRunloopSDK(bearer_token="test-token") - assert sdk.api is not None - assert hasattr(sdk.api, "devboxes") - assert hasattr(sdk.api, "blueprints") - assert hasattr(sdk.api, "objects") + runloop = AsyncRunloopSDK(bearer_token="test-token") + assert runloop.api is not None + assert hasattr(runloop.api, "devboxes") + assert hasattr(runloop.api, "blueprints") + assert hasattr(runloop.api, "objects") diff --git a/tests/sdk/test_async_scenario_builder.py b/tests/sdk/test_async_scenario_builder.py new file mode 100644 index 000000000..e20d99843 --- /dev/null +++ b/tests/sdk/test_async_scenario_builder.py @@ -0,0 +1,292 @@ +"""Unit tests for AsyncScenarioBuilder class.""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from runloop_api_client.sdk import AsyncSnapshot, AsyncBlueprint, ScenarioPreview, AsyncScenarioBuilder +from runloop_api_client.types.scoring_function_param import ScorerTestBasedScoringFunctionTestFile + + +class TestAsyncScenarioBuilder: + """Tests for the asynchronous AsyncScenarioBuilder.""" + + @pytest.fixture + def mock_async_client(self) -> MagicMock: + """Create a mock AsyncRunloop client.""" + client = MagicMock() + client.scenarios = MagicMock() + client.scenarios.create = AsyncMock() + return client + + @pytest.fixture + def mock_blueprint(self, mock_async_client: MagicMock) -> AsyncBlueprint: + """Create a mock AsyncBlueprint object.""" + return AsyncBlueprint(mock_async_client, "bp-123") + + @pytest.fixture + def mock_snapshot(self, mock_async_client: MagicMock) -> AsyncSnapshot: + """Create a mock AsyncSnapshot object.""" + return AsyncSnapshot(mock_async_client, "snap-123") + + @pytest.fixture + def mock_builder(self, mock_async_client: MagicMock) -> AsyncScenarioBuilder: + """Create an AsyncScenarioBuilder instance with mock client.""" + return AsyncScenarioBuilder("test-scenario", mock_async_client) + + def test_instantiation(self, mock_async_client: MagicMock) -> None: + """Test builder initialization and repr.""" + builder = AsyncScenarioBuilder("my-scenario", mock_async_client) + + assert builder._client is mock_async_client + assert builder._name == "my-scenario" + assert builder.name == "my-scenario" + assert repr(builder) == "" + + def test_from_blueprint_and_snapshot( + self, mock_builder: AsyncScenarioBuilder, mock_blueprint: 
AsyncBlueprint, mock_snapshot: AsyncSnapshot + ) -> None: + """Test blueprint/snapshot setting returns self and are mutually exclusive.""" + # from_blueprint returns self and sets blueprint + result = mock_builder.from_blueprint(mock_blueprint) + assert result is mock_builder + assert mock_builder._blueprint is mock_blueprint + assert mock_builder._snapshot is None + + # from_snapshot returns self, sets snapshot, and clears blueprint + result = mock_builder.from_snapshot(mock_snapshot) + assert result is mock_builder + assert mock_builder._snapshot is mock_snapshot + assert mock_builder._blueprint is None + + # from_blueprint clears snapshot + mock_builder.from_blueprint(mock_blueprint) + assert mock_builder._blueprint is mock_blueprint + assert mock_builder._snapshot is None + + def test_scorers(self, mock_builder: AsyncScenarioBuilder) -> None: + """Test all scorer types, optional params, and multiple scorers.""" + # Test scorer with test files + test_files: list[ScorerTestBasedScoringFunctionTestFile] = [ + {"file_path": "test_main.py", "file_contents": "def test_foo(): pass"} + ] + result = mock_builder.add_test_command_scorer( + "test-scorer", test_command="pytest", weight=2.0, test_files=test_files + ) + assert result is mock_builder + assert mock_builder._scorers[0]["name"] == "test-scorer" + assert mock_builder._scorers[0]["weight"] == 2.0 + assert mock_builder._scorers[0]["scorer"]["type"] == "test_based_scorer" + assert mock_builder._scorers[0]["scorer"].get("test_command") == "pytest" + assert mock_builder._scorers[0]["scorer"].get("test_files") == test_files + + # Command scorer + mock_builder.add_shell_command_scorer("cmd-scorer", command="./check.sh") + assert mock_builder._scorers[1]["scorer"]["type"] == "command_scorer" + assert mock_builder._scorers[1]["scorer"].get("command") == "./check.sh" + + # Bash scorer + mock_builder.add_bash_script_scorer("bash-scorer", bash_script="echo 'score=1.0'") + assert mock_builder._scorers[2]["scorer"]["type"] == "bash_script_scorer" + assert mock_builder._scorers[2]["scorer"].get("bash_script") == "echo 'score=1.0'" + + # Python scorer with optional params + mock_builder.add_python_script_scorer( + "python-scorer", + python_script="print('1.0')", + python_version_constraint=">=3.10", + requirements_contents="numpy", + ) + assert mock_builder._scorers[3]["scorer"]["type"] == "python_script_scorer" + assert mock_builder._scorers[3]["scorer"].get("python_version_constraint") == ">=3.10" + assert mock_builder._scorers[3]["scorer"].get("requirements_contents") == "numpy" + + # AST grep scorer with optional lang + mock_builder.add_ast_grep_scorer("ast-scorer", pattern="$A.foo()", search_directory="/src", lang="python") + assert mock_builder._scorers[4]["scorer"]["type"] == "ast_grep_scorer" + assert mock_builder._scorers[4]["scorer"].get("pattern") == "$A.foo()" + assert mock_builder._scorers[4]["scorer"].get("lang") == "python" + + # Custom scorer with optional params + mock_builder.add_custom_scorer( + "custom-scorer", custom_scorer_type="my_scorer", scorer_params={"threshold": 0.5} + ) + assert mock_builder._scorers[5]["scorer"]["type"] == "custom_scorer" + assert mock_builder._scorers[5]["scorer"].get("custom_scorer_type") == "my_scorer" + assert mock_builder._scorers[5]["scorer"].get("scorer_params") == {"threshold": 0.5} + + # Verify multiple scorers accumulated + assert len(mock_builder._scorers) == 6 + + def test_add_scorer_rejects_invalid_weight(self, mock_builder: AsyncScenarioBuilder) -> None: + """Test that adding a scorer with 
zero or negative weight raises ValueError.""" + with pytest.raises(ValueError, match="Scorer weight must be positive"): + mock_builder.add_bash_script_scorer("bad", bash_script="echo 1", weight=0.0) + + with pytest.raises(ValueError, match="Scorer weight must be positive"): + mock_builder.add_bash_script_scorer("bad", bash_script="echo 1", weight=-1.0) + + def test_build_validation(self, mock_builder: AsyncScenarioBuilder) -> None: + """Test build raises for missing required fields.""" + # Missing problem statement + mock_builder.add_test_command_scorer("test", test_command="pytest") + with pytest.raises(ValueError, match="Problem statement is required"): + mock_builder.build() + + # Missing scorer (new builder) + builder2 = AsyncScenarioBuilder("test2", mock_builder._client) + builder2.with_problem_statement("Fix the bug") + with pytest.raises(ValueError, match="At least one scorer is required"): + builder2.build() + + def test_build_with_all_options(self, mock_builder: AsyncScenarioBuilder, mock_blueprint: AsyncBlueprint) -> None: + """Test build with all optional fields set.""" + mock_builder.with_problem_statement("Fix the bug") + mock_builder.with_additional_context({"hint": "line 42"}) + mock_builder.add_test_command_scorer("tests", test_command="pytest") + mock_builder.from_blueprint(mock_blueprint) + mock_builder.with_working_directory("/app") + mock_builder.with_metadata({"team": "infra"}) + mock_builder.with_reference_output("diff content") + mock_builder.with_required_env_vars(["API_KEY"]) + mock_builder.with_required_secrets(["db_pass"]) + mock_builder.with_validation_type("FORWARD") + + params = mock_builder.build() + + assert params["name"] == "test-scenario" + assert params["input_context"]["problem_statement"] == "Fix the bug" + assert params["input_context"].get("additional_context") == {"hint": "line 42"} + env_params = params.get("environment_parameters") + assert env_params is not None + assert env_params.get("blueprint_id") == "bp-123" + assert env_params.get("working_directory") == "/app" + assert params.get("metadata") == {"team": "infra"} + assert params.get("reference_output") == "diff content" + assert params.get("required_environment_variables") == ["API_KEY"] + assert params.get("required_secret_names") == ["db_pass"] + assert params.get("validation_type") == "FORWARD" + + def test_build_normalizes_weights(self, mock_builder: AsyncScenarioBuilder) -> None: + """Test that build normalizes scorer weights to sum to 1.0.""" + mock_builder.with_problem_statement("Fix the bug") + mock_builder.add_bash_script_scorer("scorer1", bash_script="echo 1", weight=1.0) + mock_builder.add_bash_script_scorer("scorer2", bash_script="echo 2", weight=2.0) + mock_builder.add_bash_script_scorer("scorer3", bash_script="echo 3", weight=3.0) + + params = mock_builder.build() + scorers = list(params["scoring_contract"]["scoring_function_parameters"]) + + # Weights 1, 2, 3 should normalize to 1/6, 2/6, 3/6 + assert len(scorers) == 3 + assert abs(scorers[0]["weight"] - 1 / 6) < 0.0001 + assert abs(scorers[1]["weight"] - 2 / 6) < 0.0001 + assert abs(scorers[2]["weight"] - 3 / 6) < 0.0001 + + # Total should be 1.0 + total = sum(s["weight"] for s in scorers) + assert abs(total - 1.0) < 0.0001 + + @pytest.mark.asyncio + async def test_push_calls_api_and_returns_scenario( + self, mock_builder: AsyncScenarioBuilder, mock_async_client: MagicMock + ) -> None: + """Test push() calls API with correct params and returns AsyncScenario.""" + mock_async_client.scenarios.create.return_value.id = 
"scn-new-123" + + mock_builder.with_problem_statement("Fix the bug") + mock_builder.add_test_command_scorer("tests", test_command="pytest") + + scenario = await mock_builder.push() + + mock_async_client.scenarios.create.assert_called_once() + call_kwargs = mock_async_client.scenarios.create.call_args.kwargs + assert call_kwargs["name"] == "test-scenario" + assert call_kwargs["input_context"]["problem_statement"] == "Fix the bug" + + assert scenario.id == "scn-new-123" + + def test_fluent_chaining(self, mock_builder: AsyncScenarioBuilder, mock_blueprint: AsyncBlueprint) -> None: + """Test that all builder methods can be chained fluently.""" + result = ( + mock_builder.from_blueprint(mock_blueprint) + .with_working_directory("/app") + .with_problem_statement("Fix the bug") + .with_additional_context({"hint": "check main.py"}) + .add_test_command_scorer("tests", test_command="pytest") + .with_metadata({"team": "infra"}) + .with_reference_output("diff content") + .with_required_env_vars(["API_KEY"]) + .with_required_secrets(["secret"]) + .with_validation_type("FORWARD") + ) + + assert result is mock_builder + assert mock_builder._blueprint is mock_blueprint + assert mock_builder._working_directory == "/app" + assert mock_builder._problem_statement == "Fix the bug" + assert len(mock_builder._scorers) == 1 + + def test_preview_with_no_config(self, mock_builder: AsyncScenarioBuilder) -> None: + """Test preview() works with no configuration (only name from constructor).""" + preview = mock_builder.preview() + + assert isinstance(preview, ScenarioPreview) + assert preview.name == "test-scenario" + assert preview.input_context is not None + assert preview.input_context.problem_statement is None + assert preview.input_context.additional_context is None + assert preview.scoring_contract is not None + assert len(preview.scoring_contract.scoring_function_parameters) == 0 + assert preview.environment is None + assert len(preview.metadata) == 0 + assert preview.reference_output is None + assert preview.required_environment_variables is None + assert preview.required_secret_names is None + assert preview.validation_type is None + + def test_preview_with_full_config(self, mock_builder: AsyncScenarioBuilder, mock_blueprint: AsyncBlueprint) -> None: + """Test preview() with all fields configured, including weight normalization.""" + mock_builder.with_problem_statement("Fix the bug") + mock_builder.with_additional_context({"hint": "line 42"}) + mock_builder.from_blueprint(mock_blueprint) + mock_builder.with_working_directory("/app") + mock_builder.with_metadata({"team": "infra"}) + mock_builder.with_reference_output("diff content") + mock_builder.with_required_env_vars(["API_KEY"]) + mock_builder.with_required_secrets(["db_pass"]) + mock_builder.with_validation_type("FORWARD") + # Add multiple scorers with different weights to test normalization + mock_builder.add_bash_script_scorer("scorer1", bash_script="echo 1", weight=1.0) + mock_builder.add_bash_script_scorer("scorer2", bash_script="echo 2", weight=2.0) + mock_builder.add_bash_script_scorer("scorer3", bash_script="echo 3", weight=3.0) + + preview = mock_builder.preview() + + # Verify it returns ScenarioPreview + assert isinstance(preview, ScenarioPreview) + + # Verify all fields are populated + assert preview.name == "test-scenario" + assert preview.input_context is not None + assert preview.input_context.problem_statement == "Fix the bug" + assert preview.input_context.additional_context == {"hint": "line 42"} + assert preview.environment is not None + 
assert preview.environment.blueprint_id == "bp-123" + assert preview.environment.working_directory == "/app" + assert preview.metadata == {"team": "infra"} + assert preview.reference_output == "diff content" + assert preview.required_environment_variables == ["API_KEY"] + assert preview.required_secret_names == ["db_pass"] + assert preview.validation_type == "FORWARD" + + # Verify weights are normalized (1, 2, 3 -> 1/6, 2/6, 3/6) + assert preview.scoring_contract is not None + scorers = preview.scoring_contract.scoring_function_parameters + assert len(scorers) == 3 + assert abs(scorers[0].weight - 1 / 6) < 0.0001 + assert abs(scorers[1].weight - 2 / 6) < 0.0001 + assert abs(scorers[2].weight - 3 / 6) < 0.0001 + assert abs(sum(s.weight for s in scorers) - 1.0) < 0.0001 diff --git a/tests/sdk/test_execution.py b/tests/sdk/test_execution.py index fa2aaca2f..63b244d0e 100644 --- a/tests/sdk/test_execution.py +++ b/tests/sdk/test_execution.py @@ -12,7 +12,8 @@ TASK_COMPLETION_SHORT, MockExecutionView, ) -from runloop_api_client.sdk.execution import Execution, _StreamingGroup +from runloop_api_client.sdk import Execution +from runloop_api_client.sdk.execution import _StreamingGroup # Legacy aliases for backward compatibility during transition SHORT_SLEEP = THREAD_STARTUP_DELAY diff --git a/tests/sdk/test_execution_result.py b/tests/sdk/test_execution_result.py index 60d51827f..689b108d5 100644 --- a/tests/sdk/test_execution_result.py +++ b/tests/sdk/test_execution_result.py @@ -6,7 +6,7 @@ from unittest.mock import Mock from tests.sdk.conftest import MockExecutionView -from runloop_api_client.sdk.execution_result import ExecutionResult +from runloop_api_client.sdk import ExecutionResult class TestExecutionResult: diff --git a/tests/sdk/test_ops.py b/tests/sdk/test_ops.py index fcca7bcbf..7ac503933 100644 --- a/tests/sdk/test_ops.py +++ b/tests/sdk/test_ops.py @@ -20,14 +20,21 @@ MockBlueprintView, create_mock_httpx_response, ) -from runloop_api_client.sdk import Agent, Devbox, Scorer, Snapshot, Blueprint, StorageObject -from runloop_api_client.sdk.sync import ( +from runloop_api_client.sdk import ( + Agent, + Devbox, + Scorer, AgentOps, + Scenario, + Snapshot, + Blueprint, DevboxOps, ScorerOps, RunloopSDK, + ScenarioOps, SnapshotOps, BlueprintOps, + StorageObject, StorageObjectOps, ) from runloop_api_client.lib.polling import PollingConfig @@ -1015,8 +1022,6 @@ class TestScenarioOps: def test_from_id(self, mock_client: Mock) -> None: """Test from_id method.""" - from runloop_api_client.sdk import Scenario - from runloop_api_client.sdk.sync import ScenarioOps ops = ScenarioOps(mock_client) scenario = ops.from_id("scn_123") @@ -1026,7 +1031,6 @@ def test_from_id(self, mock_client: Mock) -> None: def test_list_empty(self, mock_client: Mock) -> None: """Test list method with empty results.""" - from runloop_api_client.sdk.sync import ScenarioOps mock_client.scenarios.list.return_value = [] @@ -1038,8 +1042,6 @@ def test_list_empty(self, mock_client: Mock) -> None: def test_list_single(self, mock_client: Mock, scenario_view: MockScenarioView) -> None: """Test list method with single result.""" - from runloop_api_client.sdk import Scenario - from runloop_api_client.sdk.sync import ScenarioOps mock_client.scenarios.list.return_value = [scenario_view] @@ -1053,8 +1055,6 @@ def test_list_single(self, mock_client: Mock, scenario_view: MockScenarioView) - def test_list_multiple(self, mock_client: Mock) -> None: """Test list method with multiple results.""" - from runloop_api_client.sdk import Scenario - 
from runloop_api_client.sdk.sync import ScenarioOps scenario_view1 = MockScenarioView(id="scn_001", name="scenario-1") scenario_view2 = MockScenarioView(id="scn_002", name="scenario-2") @@ -1076,36 +1076,36 @@ class TestRunloopSDK: def test_init(self) -> None: """Test RunloopSDK initialization.""" - sdk = RunloopSDK(bearer_token="test-token") - assert sdk.api is not None - assert isinstance(sdk.agent, AgentOps) - assert isinstance(sdk.devbox, DevboxOps) - assert isinstance(sdk.scorer, ScorerOps) - assert isinstance(sdk.snapshot, SnapshotOps) - assert isinstance(sdk.blueprint, BlueprintOps) - assert isinstance(sdk.storage_object, StorageObjectOps) + runloop = RunloopSDK(bearer_token="test-token") + assert runloop.api is not None + assert isinstance(runloop.agent, AgentOps) + assert isinstance(runloop.devbox, DevboxOps) + assert isinstance(runloop.scorer, ScorerOps) + assert isinstance(runloop.snapshot, SnapshotOps) + assert isinstance(runloop.blueprint, BlueprintOps) + assert isinstance(runloop.storage_object, StorageObjectOps) def test_init_with_max_retries(self) -> None: """Test RunloopSDK initialization with max_retries.""" - sdk = RunloopSDK(bearer_token="test-token", max_retries=3) - assert sdk.api is not None + runloop = RunloopSDK(bearer_token="test-token", max_retries=3) + assert runloop.api is not None def test_close(self) -> None: """Test close method.""" - sdk = RunloopSDK(bearer_token="test-token") + runloop = RunloopSDK(bearer_token="test-token") # Verify close doesn't raise - sdk.close() + runloop.close() def test_context_manager(self) -> None: """Test context manager behavior.""" - with RunloopSDK(bearer_token="test-token") as sdk: - assert sdk.api is not None + with RunloopSDK(bearer_token="test-token") as runloop: + assert runloop.api is not None # Verify context manager properly closes (implementation detail of context manager protocol) def test_api_property(self) -> None: """Test api property access.""" - sdk = RunloopSDK(bearer_token="test-token") - assert sdk.api is not None - assert hasattr(sdk.api, "devboxes") - assert hasattr(sdk.api, "blueprints") - assert hasattr(sdk.api, "objects") + runloop = RunloopSDK(bearer_token="test-token") + assert runloop.api is not None + assert hasattr(runloop.api, "devboxes") + assert hasattr(runloop.api, "blueprints") + assert hasattr(runloop.api, "objects") diff --git a/tests/sdk/test_scenario_builder.py b/tests/sdk/test_scenario_builder.py new file mode 100644 index 000000000..75597d5f3 --- /dev/null +++ b/tests/sdk/test_scenario_builder.py @@ -0,0 +1,287 @@ +"""Unit tests for ScenarioBuilder class.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest + +from runloop_api_client.sdk import Snapshot, Blueprint, ScenarioBuilder, ScenarioPreview +from runloop_api_client.types.scoring_function_param import ScorerTestBasedScoringFunctionTestFile + + +class TestScenarioBuilder: + """Tests for the synchronous ScenarioBuilder.""" + + @pytest.fixture + def mock_client(self) -> MagicMock: + """Create a mock Runloop client.""" + client = MagicMock() + return client + + @pytest.fixture + def mock_blueprint(self, mock_client: MagicMock) -> Blueprint: + """Create a mock Blueprint object.""" + return Blueprint(mock_client, "bp-123") + + @pytest.fixture + def mock_snapshot(self, mock_client: MagicMock) -> Snapshot: + """Create a mock Snapshot object.""" + return Snapshot(mock_client, "snap-123") + + @pytest.fixture + def mock_builder(self, mock_client: MagicMock) -> ScenarioBuilder: + """Create a 
ScenarioBuilder instance with mock client."""
+        return ScenarioBuilder("test-scenario", mock_client)
+
+    def test_instantiation(self, mock_client: MagicMock) -> None:
+        """Test builder initialization and repr."""
+        builder = ScenarioBuilder("my-scenario", mock_client)
+
+        assert builder._client is mock_client
+        assert builder._name == "my-scenario"
+        assert builder.name == "my-scenario"
+        assert "my-scenario" in repr(builder)  # repr should surface the scenario name
+
+    def test_from_blueprint_and_snapshot(
+        self, mock_builder: ScenarioBuilder, mock_blueprint: Blueprint, mock_snapshot: Snapshot
+    ) -> None:
+        """Test blueprint/snapshot setting returns self and are mutually exclusive."""
+        # from_blueprint returns self and sets blueprint
+        result = mock_builder.from_blueprint(mock_blueprint)
+        assert result is mock_builder
+        assert mock_builder._blueprint is mock_blueprint
+        assert mock_builder._snapshot is None
+
+        # from_snapshot returns self, sets snapshot, and clears blueprint
+        result = mock_builder.from_snapshot(mock_snapshot)
+        assert result is mock_builder
+        assert mock_builder._snapshot is mock_snapshot
+        assert mock_builder._blueprint is None
+
+        # from_blueprint clears snapshot
+        mock_builder.from_blueprint(mock_blueprint)
+        assert mock_builder._blueprint is mock_blueprint
+        assert mock_builder._snapshot is None
+
+    def test_scorers(self, mock_builder: ScenarioBuilder) -> None:
+        """Test all scorer types, optional params, and multiple scorers."""
+        # Test scorer with test files
+        test_files: list[ScorerTestBasedScoringFunctionTestFile] = [
+            {"file_path": "test_main.py", "file_contents": "def test_foo(): pass"}
+        ]
+        result = mock_builder.add_test_command_scorer(
+            "test-scorer", test_command="pytest", weight=2.0, test_files=test_files
+        )
+        assert result is mock_builder
+        assert mock_builder._scorers[0]["name"] == "test-scorer"
+        assert mock_builder._scorers[0]["weight"] == 2.0
+        assert mock_builder._scorers[0]["scorer"]["type"] == "test_based_scorer"
+        assert mock_builder._scorers[0]["scorer"].get("test_command") == "pytest"
+        assert mock_builder._scorers[0]["scorer"].get("test_files") == test_files
+
+        # Command scorer
+        mock_builder.add_shell_command_scorer("cmd-scorer", command="./check.sh")
+        assert mock_builder._scorers[1]["scorer"]["type"] == "command_scorer"
+        assert mock_builder._scorers[1]["scorer"].get("command") == "./check.sh"
+
+        # Bash scorer
+        mock_builder.add_bash_script_scorer("bash-scorer", bash_script="echo 'score=1.0'")
+        assert mock_builder._scorers[2]["scorer"]["type"] == "bash_script_scorer"
+        assert mock_builder._scorers[2]["scorer"].get("bash_script") == "echo 'score=1.0'"
+
+        # Python scorer with optional params
+        mock_builder.add_python_script_scorer(
+            "python-scorer",
+            python_script="print('1.0')",
+            python_version_constraint=">=3.10",
+            requirements_contents="numpy",
+        )
+        assert mock_builder._scorers[3]["scorer"]["type"] == "python_script_scorer"
+        assert mock_builder._scorers[3]["scorer"].get("python_version_constraint") == ">=3.10"
+        assert mock_builder._scorers[3]["scorer"].get("requirements_contents") == "numpy"
+
+        # AST grep scorer with optional lang
+        mock_builder.add_ast_grep_scorer("ast-scorer", pattern="$A.foo()", search_directory="/src", lang="python")
+        assert mock_builder._scorers[4]["scorer"]["type"] == "ast_grep_scorer"
+        assert mock_builder._scorers[4]["scorer"].get("pattern") == "$A.foo()"
+        assert mock_builder._scorers[4]["scorer"].get("lang") == "python"
+
+        # Custom scorer with optional params
+        mock_builder.add_custom_scorer(
+            "custom-scorer", custom_scorer_type="my_scorer",
scorer_params={"threshold": 0.5} + ) + assert mock_builder._scorers[5]["scorer"]["type"] == "custom_scorer" + assert mock_builder._scorers[5]["scorer"].get("custom_scorer_type") == "my_scorer" + assert mock_builder._scorers[5]["scorer"].get("scorer_params") == {"threshold": 0.5} + + # Verify multiple scorers accumulated + assert len(mock_builder._scorers) == 6 + + def test_add_scorer_rejects_invalid_weight(self, mock_builder: ScenarioBuilder) -> None: + """Test that adding a scorer with zero or negative weight raises ValueError.""" + with pytest.raises(ValueError, match="Scorer weight must be positive"): + mock_builder.add_bash_script_scorer("bad", bash_script="echo 1", weight=0.0) + + with pytest.raises(ValueError, match="Scorer weight must be positive"): + mock_builder.add_bash_script_scorer("bad", bash_script="echo 1", weight=-1.0) + + def test_build_validation(self, mock_builder: ScenarioBuilder) -> None: + """Test build raises for missing required fields.""" + # Missing problem statement + mock_builder.add_test_command_scorer("test", test_command="pytest") + with pytest.raises(ValueError, match="Problem statement is required"): + mock_builder.build() + + # Missing scorer (new builder) + builder2 = ScenarioBuilder("test2", mock_builder._client) + builder2.with_problem_statement("Fix the bug") + with pytest.raises(ValueError, match="At least one scorer is required"): + builder2.build() + + def test_build_with_all_options(self, mock_builder: ScenarioBuilder, mock_blueprint: Blueprint) -> None: + """Test build with all optional fields set.""" + mock_builder.with_problem_statement("Fix the bug") + mock_builder.with_additional_context({"hint": "line 42"}) + mock_builder.add_test_command_scorer("tests", test_command="pytest") + mock_builder.from_blueprint(mock_blueprint) + mock_builder.with_working_directory("/app") + mock_builder.with_metadata({"team": "infra"}) + mock_builder.with_reference_output("diff content") + mock_builder.with_required_env_vars(["API_KEY"]) + mock_builder.with_required_secrets(["db_pass"]) + mock_builder.with_validation_type("FORWARD") + + params = mock_builder.build() + + assert params["name"] == "test-scenario" + assert params["input_context"]["problem_statement"] == "Fix the bug" + assert params["input_context"].get("additional_context") == {"hint": "line 42"} + env_params = params.get("environment_parameters") + assert env_params is not None + assert env_params.get("blueprint_id") == "bp-123" + assert env_params.get("working_directory") == "/app" + assert params.get("metadata") == {"team": "infra"} + assert params.get("reference_output") == "diff content" + assert params.get("required_environment_variables") == ["API_KEY"] + assert params.get("required_secret_names") == ["db_pass"] + assert params.get("validation_type") == "FORWARD" + + def test_build_normalizes_weights(self, mock_builder: ScenarioBuilder) -> None: + """Test that build normalizes scorer weights to sum to 1.0.""" + mock_builder.with_problem_statement("Fix the bug") + mock_builder.add_bash_script_scorer("scorer1", bash_script="echo 1", weight=1.0) + mock_builder.add_bash_script_scorer("scorer2", bash_script="echo 2", weight=2.0) + mock_builder.add_bash_script_scorer("scorer3", bash_script="echo 3", weight=3.0) + + params = mock_builder.build() + scorers = list(params["scoring_contract"]["scoring_function_parameters"]) + + # Weights 1, 2, 3 should normalize to 1/6, 2/6, 3/6 + assert len(scorers) == 3 + assert abs(scorers[0]["weight"] - 1 / 6) < 0.0001 + assert abs(scorers[1]["weight"] - 2 / 6) < 
0.0001 + assert abs(scorers[2]["weight"] - 3 / 6) < 0.0001 + + # Total should be 1.0 + total = sum(s["weight"] for s in scorers) + assert abs(total - 1.0) < 0.0001 + + def test_push_calls_api_and_returns_scenario(self, mock_builder: ScenarioBuilder, mock_client: MagicMock) -> None: + """Test push() calls API with correct params and returns Scenario.""" + mock_client.scenarios.create.return_value.id = "scn-new-123" + + mock_builder.with_problem_statement("Fix the bug") + mock_builder.add_test_command_scorer("tests", test_command="pytest") + + scenario = mock_builder.push() + + mock_client.scenarios.create.assert_called_once() + call_kwargs = mock_client.scenarios.create.call_args.kwargs + assert call_kwargs["name"] == "test-scenario" + assert call_kwargs["input_context"]["problem_statement"] == "Fix the bug" + + assert scenario.id == "scn-new-123" + + def test_fluent_chaining(self, mock_builder: ScenarioBuilder, mock_blueprint: Blueprint) -> None: + """Test that all builder methods can be chained fluently.""" + result = ( + mock_builder.from_blueprint(mock_blueprint) + .with_working_directory("/app") + .with_problem_statement("Fix the bug") + .with_additional_context({"hint": "check main.py"}) + .add_test_command_scorer("tests", test_command="pytest") + .with_metadata({"team": "infra"}) + .with_reference_output("diff content") + .with_required_env_vars(["API_KEY"]) + .with_required_secrets(["secret"]) + .with_validation_type("FORWARD") + ) + + assert result is mock_builder + assert mock_builder._blueprint is mock_blueprint + assert mock_builder._working_directory == "/app" + assert mock_builder._problem_statement == "Fix the bug" + assert len(mock_builder._scorers) == 1 + + def test_preview_with_no_config(self, mock_builder: ScenarioBuilder) -> None: + """Test preview() works with no configuration (only name from constructor).""" + preview = mock_builder.preview() + + assert isinstance(preview, ScenarioPreview) + assert preview.name == "test-scenario" + assert preview.input_context is not None + assert preview.input_context.problem_statement is None + assert preview.input_context.additional_context is None + assert preview.scoring_contract is not None + assert len(preview.scoring_contract.scoring_function_parameters) == 0 + assert preview.environment is None + assert len(preview.metadata) == 0 + assert preview.reference_output is None + assert preview.required_environment_variables is None + assert preview.required_secret_names is None + assert preview.validation_type is None + + def test_preview_with_full_config(self, mock_builder: ScenarioBuilder, mock_blueprint: Blueprint) -> None: + """Test preview() with all fields configured, including weight normalization.""" + mock_builder.with_problem_statement("Fix the bug") + mock_builder.with_additional_context({"hint": "line 42"}) + mock_builder.from_blueprint(mock_blueprint) + mock_builder.with_working_directory("/app") + mock_builder.with_metadata({"team": "infra"}) + mock_builder.with_reference_output("diff content") + mock_builder.with_required_env_vars(["API_KEY"]) + mock_builder.with_required_secrets(["db_pass"]) + mock_builder.with_validation_type("FORWARD") + # Add multiple scorers with different weights to test normalization + mock_builder.add_bash_script_scorer("scorer1", bash_script="echo 1", weight=1.0) + mock_builder.add_bash_script_scorer("scorer2", bash_script="echo 2", weight=2.0) + mock_builder.add_bash_script_scorer("scorer3", bash_script="echo 3", weight=3.0) + + preview = mock_builder.preview() + + # Verify it returns 
ScenarioPreview + assert isinstance(preview, ScenarioPreview) + + # Verify all fields are populated + assert preview.name == "test-scenario" + assert preview.input_context is not None + assert preview.input_context.problem_statement == "Fix the bug" + assert preview.input_context.additional_context == {"hint": "line 42"} + assert preview.environment is not None + assert preview.environment.blueprint_id == "bp-123" + assert preview.environment.working_directory == "/app" + assert preview.metadata == {"team": "infra"} + assert preview.reference_output == "diff content" + assert preview.required_environment_variables == ["API_KEY"] + assert preview.required_secret_names == ["db_pass"] + assert preview.validation_type == "FORWARD" + + # Verify weights are normalized (1, 2, 3 -> 1/6, 2/6, 3/6) + assert preview.scoring_contract is not None + scorers = preview.scoring_contract.scoring_function_parameters + assert len(scorers) == 3 + assert abs(scorers[0].weight - 1 / 6) < 0.0001 + assert abs(scorers[1].weight - 2 / 6) < 0.0001 + assert abs(scorers[2].weight - 3 / 6) < 0.0001 + assert abs(sum(s.weight for s in scorers) - 1.0) < 0.0001 diff --git a/tests/smoketests/sdk/conftest.py b/tests/smoketests/sdk/conftest.py index 003b0f314..b17a4cc1c 100644 --- a/tests/smoketests/sdk/conftest.py +++ b/tests/smoketests/sdk/conftest.py @@ -24,16 +24,16 @@ def sdk_client() -> Iterator[RunloopSDK]: if not bearer_token: pytest.skip("RUNLOOP_API_KEY environment variable not set") - client = RunloopSDK( + runloop = RunloopSDK( bearer_token=bearer_token, base_url=base_url, ) try: - yield client + yield runloop finally: try: - client.close() + runloop.close() except Exception: pass @@ -52,17 +52,17 @@ async def async_sdk_client() -> AsyncIterator[AsyncRunloopSDK]: if not bearer_token: pytest.skip("RUNLOOP_API_KEY environment variable not set") - client = AsyncRunloopSDK( + runloop = AsyncRunloopSDK( bearer_token=bearer_token, base_url=base_url, ) try: - async with client: - yield client + async with runloop: + yield runloop except Exception: # If context manager fails, try manual cleanup try: - await client.aclose() + await runloop.aclose() except Exception: pass diff --git a/tests/smoketests/sdk/test_async_scenario.py b/tests/smoketests/sdk/test_async_scenario.py index 1a6a6a6e7..b0abf6a41 100644 --- a/tests/smoketests/sdk/test_async_scenario.py +++ b/tests/smoketests/sdk/test_async_scenario.py @@ -4,12 +4,63 @@ import pytest -from runloop_api_client.sdk import AsyncRunloopSDK +from runloop_api_client.sdk import AsyncRunloopSDK, AsyncScenarioBuilder +from tests.smoketests.utils import unique_name +from runloop_api_client.types import ScenarioView +from runloop_api_client.sdk._types import SDKScenarioUpdateParams +from runloop_api_client.sdk._helpers import filter_params pytestmark = [pytest.mark.smoketest] TWO_MINUTE_TIMEOUT = 120 FIVE_MINUTE_TIMEOUT = 300 +TEN_MINUTE_TIMEOUT = 600 + +# Metadata tag for all smoketest scenarios (for easy identification/cleanup) +SMOKETEST_METADATA = {"smoketest": "true"} + + +async def push_or_update_scenario(sdk_client: AsyncRunloopSDK, builder: AsyncScenarioBuilder) -> ScenarioView: + """Push a new scenario or update existing one with the same name. + + This is a workaround until scenario delete endpoint is available. + Uses fixed scenario names to avoid littering the platform with test scenarios. + + When updating an existing scenario, this function will delete the OLD blueprint/snapshot + that's no longer needed (if different from the new one). 
The NEW blueprint/snapshot + is kept so the scenario remains runnable. + """ + # Check if scenario already exists + scenarios = await sdk_client.scenario.list(name=builder.name, limit=1) + + if scenarios: + # Get old scenario info to find old blueprint/snapshot IDs + scenario = scenarios[0] + old_scenario_info = await scenario.get_info() + old_env = old_scenario_info.environment + old_blueprint_id = old_env.blueprint_id if old_env else None + old_snapshot_id = old_env.snapshot_id if old_env else None + + # Get new blueprint/snapshot IDs from builder + new_blueprint_id = builder._blueprint.id if builder._blueprint else None + new_snapshot_id = builder._snapshot.id if builder._snapshot else None + + # Update existing scenario with builder's params + params = builder.build() + result = await scenario.update(**filter_params(params, SDKScenarioUpdateParams)) + + # Delete OLD blueprint/snapshot if they're being replaced + if old_blueprint_id and old_blueprint_id != new_blueprint_id: + await sdk_client.blueprint.from_id(old_blueprint_id).delete() + + if old_snapshot_id and old_snapshot_id != new_snapshot_id: + await sdk_client.snapshot.from_id(old_snapshot_id).delete() + + return result + else: + # Create new scenario - keep the blueprint/snapshot (scenario needs them) + scenario = await builder.push() + return await scenario.get_info() class TestAsyncScenarioRetrieval: @@ -52,7 +103,7 @@ class TestAsyncScenarioRun: """Test async scenario run operations.""" @pytest.mark.timeout(FIVE_MINUTE_TIMEOUT) - async def test_scenario_run_lifecycle(self, async_sdk_client: AsyncRunloopSDK) -> None: + async def test_scenario_run_async_lifecycle(self, async_sdk_client: AsyncRunloopSDK) -> None: """Test running a scenario and accessing the devbox. This test: @@ -63,7 +114,7 @@ async def test_scenario_run_lifecycle(self, async_sdk_client: AsyncRunloopSDK) - 5. 
Cancels the run """ # Find a scenario to run - scenarios = await async_sdk_client.scenario.list(limit=5) + scenarios = await async_sdk_client.scenario.list(limit=1) if not scenarios: pytest.skip("No scenarios available to test run") @@ -72,6 +123,7 @@ async def test_scenario_run_lifecycle(self, async_sdk_client: AsyncRunloopSDK) - # Start a run run = await scenario.run_async(run_name="sdk-smoketest-async-run") + devbox = None try: assert run.id is not None @@ -82,7 +134,8 @@ async def test_scenario_run_lifecycle(self, async_sdk_client: AsyncRunloopSDK) - # Access devbox devbox = run.devbox - assert devbox.id == run.devbox_id + info = await devbox.get_info() + assert info.status == "running" # Get run info info = await run.get_info() @@ -94,13 +147,14 @@ async def test_scenario_run_lifecycle(self, async_sdk_client: AsyncRunloopSDK) - try: await run.cancel() except Exception: - pass # Best effort cleanup + if devbox: + await devbox.shutdown() @pytest.mark.timeout(FIVE_MINUTE_TIMEOUT) - async def test_scenario_run_and_await_env_ready(self, async_sdk_client: AsyncRunloopSDK) -> None: - """Test run_and_await_env_ready convenience method.""" + async def test_scenario_run(self, async_sdk_client: AsyncRunloopSDK) -> None: + """Test run convenience method.""" # Find a scenario to run - scenarios = await async_sdk_client.scenario.list(limit=5) + scenarios = await async_sdk_client.scenario.list(limit=1) if not scenarios: pytest.skip("No scenarios available to test run") @@ -108,6 +162,7 @@ async def test_scenario_run_and_await_env_ready(self, async_sdk_client: AsyncRun # Start a run and wait for environment in one call run = await scenario.run(run_name="sdk-smoketest-async-await") + devbox = None try: assert run.id is not None @@ -123,4 +178,83 @@ async def test_scenario_run_and_await_env_ready(self, async_sdk_client: AsyncRun try: await run.cancel() except Exception: - pass + if devbox: + await devbox.shutdown() + + +class TestAsyncScenarioBuilder: + """Test AsyncScenarioBuilder operations.""" + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + async def test_scenario_builder_minimal(self, async_sdk_client: AsyncRunloopSDK) -> None: + """Test creating/updating a minimal scenario with just problem statement and scorer.""" + builder = ( + async_sdk_client.scenario.builder("sdk-smoketest-async-builder-minimal") + .with_problem_statement("Async minimal test problem statement") + .with_metadata(SMOKETEST_METADATA) + .add_shell_command_scorer("async-minimal-scorer", command="echo 1.0") + ) + + info = await push_or_update_scenario(async_sdk_client, builder) + + assert info.name == "sdk-smoketest-async-builder-minimal" + assert info.input_context.problem_statement == "Async minimal test problem statement" + assert len(info.scoring_contract.scoring_function_parameters) == 1 + assert info.scoring_contract.scoring_function_parameters[0].name == "async-minimal-scorer" + + @pytest.mark.timeout(FIVE_MINUTE_TIMEOUT) + async def test_scenario_builder_with_blueprint(self, async_sdk_client: AsyncRunloopSDK) -> None: + """Test creating/updating a scenario from a blueprint.""" + blueprint = await async_sdk_client.blueprint.create( + name=unique_name("sdk-smoketest-async-scenario-bp"), + dockerfile="FROM ubuntu:20.04", + ) + + builder = ( + async_sdk_client.scenario.builder("sdk-smoketest-async-builder-blueprint") + .from_blueprint(blueprint) + .with_working_directory("/home/user") + .with_problem_statement("Async blueprint test problem") + .with_metadata(SMOKETEST_METADATA) + 
.add_shell_command_scorer("async-blueprint-scorer", command="echo 1.0") + ) + + info = await push_or_update_scenario(async_sdk_client, builder) + + assert info.name == "sdk-smoketest-async-builder-blueprint" + assert info.input_context.problem_statement == "Async blueprint test problem" + assert info.environment is not None + assert info.environment.blueprint_id == blueprint.id + assert info.environment.working_directory == "/home/user" + + @pytest.mark.timeout(TEN_MINUTE_TIMEOUT) + async def test_scenario_builder_with_snapshot(self, async_sdk_client: AsyncRunloopSDK) -> None: + """Test creating/updating a scenario from a snapshot.""" + # Create blueprint -> devbox -> snapshot chain + blueprint = await async_sdk_client.blueprint.create( + name=unique_name("sdk-smoketest-async-scenario-snap-bp"), + dockerfile="FROM ubuntu:20.04", + ) + devbox = await async_sdk_client.devbox.create(blueprint_id=blueprint.id) + snapshot = await devbox.snapshot_disk(name=unique_name("sdk-smoketest-async-scenario-snap")) + + # Shut down the devbox - it's not needed after creating the snapshot + try: + await devbox.shutdown() + except Exception: + pass + + builder = ( + async_sdk_client.scenario.builder("sdk-smoketest-async-builder-snapshot") + .from_snapshot(snapshot) + .with_problem_statement("Async snapshot test problem") + .with_metadata(SMOKETEST_METADATA) + .add_shell_command_scorer("async-snapshot-scorer", command="echo 1.0") + ) + + info = await push_or_update_scenario(async_sdk_client, builder) + + assert info.name == "sdk-smoketest-async-builder-snapshot" + assert info.input_context.problem_statement == "Async snapshot test problem" + assert info.environment is not None + assert info.environment.snapshot_id == snapshot.id diff --git a/tests/smoketests/sdk/test_async_sdk.py b/tests/smoketests/sdk/test_async_sdk.py index 49f7e961d..fd8c03ca8 100644 --- a/tests/smoketests/sdk/test_async_sdk.py +++ b/tests/smoketests/sdk/test_async_sdk.py @@ -16,12 +16,15 @@ class TestAsyncRunloopSDKInitialization: @pytest.mark.timeout(THIRTY_SECOND_TIMEOUT) async def test_sdk_instance_creation(self, async_sdk_client: AsyncRunloopSDK) -> None: - """Test that async SDK instance is created successfully with all client properties.""" + """Test that async SDK instance is created successfully with all operations.""" assert async_sdk_client is not None assert async_sdk_client.devbox is not None assert async_sdk_client.blueprint is not None assert async_sdk_client.snapshot is not None assert async_sdk_client.storage_object is not None + assert async_sdk_client.scorer is not None + assert async_sdk_client.agent is not None + assert async_sdk_client.scenario is not None @pytest.mark.timeout(THIRTY_SECOND_TIMEOUT) async def test_legacy_api_access(self, async_sdk_client: AsyncRunloopSDK) -> None: diff --git a/tests/smoketests/sdk/test_scenario.py b/tests/smoketests/sdk/test_scenario.py index af8d81486..4128cfa29 100644 --- a/tests/smoketests/sdk/test_scenario.py +++ b/tests/smoketests/sdk/test_scenario.py @@ -4,12 +4,62 @@ import pytest -from runloop_api_client.sdk import RunloopSDK +from runloop_api_client.sdk import RunloopSDK, ScenarioBuilder +from tests.smoketests.utils import unique_name +from runloop_api_client.types import ScenarioView +from runloop_api_client.sdk._types import SDKScenarioUpdateParams +from runloop_api_client.sdk._helpers import filter_params pytestmark = [pytest.mark.smoketest] TWO_MINUTE_TIMEOUT = 120 FIVE_MINUTE_TIMEOUT = 300 +TEN_MINUTE_TIMEOUT = 600 + +# Metadata tag for all smoketest scenarios (for easy 
identification/cleanup) +SMOKETEST_METADATA = {"smoketest": "true"} + + +def push_or_update_scenario(sdk_client: RunloopSDK, builder: ScenarioBuilder) -> ScenarioView: + """Push a new scenario or update existing one with the same name. + + This is a workaround until scenario delete endpoint is available. + Uses fixed scenario names to avoid littering the platform with test scenarios. + + When updating an existing scenario, this function will delete the OLD blueprint/snapshot + that's no longer needed (if different from the new one). The NEW blueprint/snapshot + is kept so the scenario remains runnable. + """ + # Check if scenario already exists + scenarios = sdk_client.scenario.list(name=builder.name, limit=1) + + if scenarios: + # Get old scenario info to find old blueprint/snapshot IDs + scenario = scenarios[0] + env = scenario.get_info().environment + old_blueprint_id = env.blueprint_id if env else None + old_snapshot_id = env.snapshot_id if env else None + + # Get new blueprint/snapshot IDs from builder + new_blueprint_id = builder._blueprint.id if builder._blueprint else None + new_snapshot_id = builder._snapshot.id if builder._snapshot else None + + # Update existing scenario with builder's params + params = builder.build() + result = scenario.update(**filter_params(params, SDKScenarioUpdateParams)) + + # Delete OLD blueprint/snapshot if they're being replaced + if old_blueprint_id and old_blueprint_id != new_blueprint_id: + sdk_client.blueprint.from_id(old_blueprint_id).delete() + + if old_snapshot_id and old_snapshot_id != new_snapshot_id: + sdk_client.snapshot.from_id(old_snapshot_id).delete() + + return result + else: + # Create new scenario - keep the blueprint/snapshot (scenario needs them) + scenario = builder.push() + return scenario.get_info() class TestScenarioRetrieval: @@ -52,7 +102,7 @@ class TestScenarioRun: """Test scenario run operations.""" @pytest.mark.timeout(FIVE_MINUTE_TIMEOUT) - def test_scenario_run_lifecycle(self, sdk_client: RunloopSDK) -> None: + def test_scenario_run_async_lifecycle(self, sdk_client: RunloopSDK) -> None: """Test running a scenario and accessing the devbox. This test: @@ -63,7 +113,7 @@ def test_scenario_run_lifecycle(self, sdk_client: RunloopSDK) -> None: 5. 
Cancels the run """ # Find a scenario to run - scenarios = sdk_client.scenario.list(limit=5) + scenarios = sdk_client.scenario.list(limit=1) if not scenarios: pytest.skip("No scenarios available to test run") @@ -72,6 +122,7 @@ def test_scenario_run_lifecycle(self, sdk_client: RunloopSDK) -> None: # Start a run run = scenario.run_async(run_name="sdk-smoketest-run") + devbox = None try: assert run.id is not None @@ -82,7 +133,8 @@ def test_scenario_run_lifecycle(self, sdk_client: RunloopSDK) -> None: # Access devbox devbox = run.devbox - assert devbox.id == run.devbox_id + info = devbox.get_info() + assert info.status == "running" # Get run info info = run.get_info() @@ -94,13 +146,14 @@ def test_scenario_run_lifecycle(self, sdk_client: RunloopSDK) -> None: try: run.cancel() except Exception: - pass # Best effort cleanup + if devbox: + devbox.shutdown() @pytest.mark.timeout(FIVE_MINUTE_TIMEOUT) - def test_scenario_run_and_await_env_ready(self, sdk_client: RunloopSDK) -> None: - """Test run_and_await_env_ready convenience method.""" + def test_scenario_run(self, sdk_client: RunloopSDK) -> None: + """Test run convenience method.""" # Find a scenario to run - scenarios = sdk_client.scenario.list(limit=5) + scenarios = sdk_client.scenario.list(limit=1) if not scenarios: pytest.skip("No scenarios available to test run") @@ -108,6 +161,7 @@ def test_scenario_run_and_await_env_ready(self, sdk_client: RunloopSDK) -> None: # Start a run and wait for environment in one call run = scenario.run(run_name="sdk-smoketest-await") + devbox = None try: assert run.id is not None @@ -123,4 +177,83 @@ def test_scenario_run_and_await_env_ready(self, sdk_client: RunloopSDK) -> None: try: run.cancel() except Exception: - pass + if devbox: + devbox.shutdown() + + +class TestScenarioBuilder: + """Test ScenarioBuilder operations.""" + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + def test_scenario_builder_minimal(self, sdk_client: RunloopSDK) -> None: + """Test creating/updating a minimal scenario with just problem statement and scorer.""" + builder = ( + sdk_client.scenario.builder("sdk-smoketest-builder-minimal") + .with_problem_statement("Minimal test problem statement") + .with_metadata(SMOKETEST_METADATA) + .add_shell_command_scorer("minimal-scorer", command="echo 1.0") + ) + + info = push_or_update_scenario(sdk_client, builder) + + assert info.name == "sdk-smoketest-builder-minimal" + assert info.input_context.problem_statement == "Minimal test problem statement" + assert len(info.scoring_contract.scoring_function_parameters) == 1 + assert info.scoring_contract.scoring_function_parameters[0].name == "minimal-scorer" + + @pytest.mark.timeout(FIVE_MINUTE_TIMEOUT) + def test_scenario_builder_with_blueprint(self, sdk_client: RunloopSDK) -> None: + """Test creating/updating a scenario from a blueprint.""" + blueprint = sdk_client.blueprint.create( + name=unique_name("sdk-smoketest-scenario-bp"), + dockerfile="FROM ubuntu:20.04", + ) + + builder = ( + sdk_client.scenario.builder("sdk-smoketest-builder-blueprint") + .from_blueprint(blueprint) + .with_working_directory("/home/user") + .with_problem_statement("Blueprint test problem") + .with_metadata(SMOKETEST_METADATA) + .add_shell_command_scorer("blueprint-scorer", command="echo 1.0") + ) + + info = push_or_update_scenario(sdk_client, builder) + + assert info.name == "sdk-smoketest-builder-blueprint" + assert info.input_context.problem_statement == "Blueprint test problem" + assert info.environment is not None + assert info.environment.blueprint_id == blueprint.id + 
assert info.environment.working_directory == "/home/user" + + @pytest.mark.timeout(TEN_MINUTE_TIMEOUT) + def test_scenario_builder_with_snapshot(self, sdk_client: RunloopSDK) -> None: + """Test creating/updating a scenario from a snapshot.""" + # Create blueprint -> devbox -> snapshot chain + blueprint = sdk_client.blueprint.create( + name=unique_name("sdk-smoketest-scenario-snap-bp"), + dockerfile="FROM ubuntu:20.04", + ) + devbox = sdk_client.devbox.create(blueprint_id=blueprint.id) + snapshot = devbox.snapshot_disk(name=unique_name("sdk-smoketest-scenario-snap")) + + # Shut down the devbox - it's not needed after creating the snapshot + try: + devbox.shutdown() + except Exception: + pass + + builder = ( + sdk_client.scenario.builder("sdk-smoketest-builder-snapshot") + .from_snapshot(snapshot) + .with_problem_statement("Snapshot test problem") + .with_metadata(SMOKETEST_METADATA) + .add_shell_command_scorer("snapshot-scorer", command="echo 1.0") + ) + + info = push_or_update_scenario(sdk_client, builder) + + assert info.name == "sdk-smoketest-builder-snapshot" + assert info.input_context.problem_statement == "Snapshot test problem" + assert info.environment is not None + assert info.environment.snapshot_id == snapshot.id diff --git a/tests/smoketests/sdk/test_sdk.py b/tests/smoketests/sdk/test_sdk.py index b55a98112..f79b88d43 100644 --- a/tests/smoketests/sdk/test_sdk.py +++ b/tests/smoketests/sdk/test_sdk.py @@ -16,12 +16,15 @@ class TestRunloopSDKInitialization: @pytest.mark.timeout(THIRTY_SECOND_TIMEOUT) def test_sdk_instance_creation(self, sdk_client: RunloopSDK) -> None: - """Test that SDK instance is created successfully with all client properties.""" + """Test that SDK instance is created successfully with all operations.""" assert sdk_client is not None assert sdk_client.devbox is not None assert sdk_client.blueprint is not None assert sdk_client.snapshot is not None assert sdk_client.storage_object is not None + assert sdk_client.scorer is not None + assert sdk_client.agent is not None + assert sdk_client.scenario is not None @pytest.mark.timeout(THIRTY_SECOND_TIMEOUT) def test_legacy_api_access(self, sdk_client: RunloopSDK) -> None:
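
Taken together, the unit tests and smoketests above describe the intended usage of the new builder API. Below is a minimal, hedged sketch of that flow using the synchronous SDK. The scenario name, blueprint name, problem statement, and scorer commands are illustrative placeholders (not values from this change), and the behavior noted in the comments (weight normalization, blueprint/snapshot exclusivity, `push()` sending the built params to `scenarios.create`) is taken from the assertions in these tests.

```python
# Sketch only: names and values below are placeholders, not part of this change.
from runloop_api_client.sdk import RunloopSDK

runloop = RunloopSDK()  # assumes RUNLOOP_API_KEY is set in the environment

# An environment source is optional; when used, from_blueprint() and
# from_snapshot() are mutually exclusive (setting one clears the other).
blueprint = runloop.blueprint.create(
    name="example-scenario-bp",  # placeholder name
    dockerfile="FROM ubuntu:20.04",
)

builder = (
    runloop.scenario.builder("example-scenario")
    .from_blueprint(blueprint)
    .with_working_directory("/app")
    .with_problem_statement("Fix the bug in main.py")
    .with_metadata({"team": "infra"})
    # Weights are relative; build()/preview() normalize them to sum to 1.0.
    .add_test_command_scorer("tests", test_command="pytest", weight=2.0)
    .add_bash_script_scorer("sanity", bash_script="echo 1.0", weight=1.0)
)

# preview() performs no API call; it returns a ScenarioPreview for inspection.
preview = builder.preview()
weights = [s.weight for s in preview.scoring_contract.scoring_function_parameters]
assert abs(sum(weights) - 1.0) < 1e-4

# push() validates the configuration (a problem statement and at least one
# scorer are required) and sends the built params to scenarios.create.
scenario = builder.push()
print(scenario.id)
```

The async variant is the same apart from `await` and the `AsyncScenarioBuilder` type, as exercised by the corresponding async unit tests and smoketests above.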