# Testing stuff

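Scratch notebook for testing the CLI invocation, help/version parsing, and BioContainers container-lookup agents.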

In [None]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain_core.tools import tool
from langchain_ollama import ChatOllama
from langchain.chat_models import init_chat_model
from langgraph.graph import START, StateGraph, END, MessagesState

from typing import List, Optional, Dict, Any
from typing_extensions import TypedDict

import subprocess
import json

from pydantic import BaseModel, Field

In [118]:
# Chat prompt instructing the model to run the two CLI-inspection tools and
# copy their output into the ToolInfo fields. The system text is built from
# adjacent string literals, so each fragment must carry its own trailing space.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Do not make up information, just run the `execute_command_help` and `execute_command_version` tools on the provided input command "
            "and store their output as the 'help_text' and 'version_text' fields of the ToolInfo object, respectively. "
            "Do nothing else.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

# Fixture: `--help` output of a made-up CLI ("toastbox"), used as sample input
# when experimenting with the help-text parsing below.
test_help = """
toastbox - A delightful notification and message management tool

Usage: toastbox [OPTIONS] COMMAND

A command-line tool for creating, managing, and delivering toast notifications
and messages across your system and network.

Options:
  -v, --version          Show version information
  -c, --config PATH      Path to config file (default: ~/.toastbox/config.yml)
  -q, --quiet           Suppress all non-error output
  -h, --help            Show this help message

Commands:
  send      Send a toast notification
  listen    Start listening for incoming toasts
  history   View notification history
  config    Manage toastbox configuration

Run 'toastbox COMMAND --help' for more information on a command.

Examples:
  toastbox send "Hello, World!"
  toastbox listen --port 8080
  toastbox history --last 10
"""

# Fixture: matching `--version` output for the same made-up CLI.
test_version = "toastbox version 2.4"

In [114]:
"""Invocation agent: Executes CLI commands to capture help and version information."""

import subprocess
import json
from typing import Dict, Any
from langgraph.graph import START, StateGraph, END


def capture_help_and_version(executable: str) -> Dict[str, Optional[str]]:
    """Run ``<executable> --help`` and ``<executable> --version`` and return their text.

    The ``--help`` call is mandatory: any failure to run it is re-raised as an
    ``Exception`` with a descriptive message (the original error is chained as
    ``__cause__``). The ``--version`` call is best-effort: any failure simply
    yields ``version_text = None``.

    Args:
        executable: Name or path of the program to run.

    Returns:
        A dict with two keys:
        ``"help_text"`` - stripped stdout of ``--help`` (stderr if stdout is empty);
        ``"version_text"`` - stripped stdout of ``--version``; stderr is used
        instead when stdout is empty and the process exited with status 0;
        ``None`` when nothing was captured or the command could not be run.

    Raises:
        Exception: If the executable is missing, ``--help`` times out (30 s),
            or running ``--help`` fails for any other reason.
    """
    # --help: required, so every failure is surfaced to the caller.
    try:
        help_run = subprocess.run(
            [executable, "--help"],
            capture_output=True,
            text=True,
            # Same decoding policy as the --version call below: undecodable
            # bytes are replaced instead of aborting the capture.
            errors="replace",
            timeout=30,
        )
    except FileNotFoundError as err:
        raise Exception(
            f"Executable '{executable}' not found. Check that it is installed."
        ) from err
    except subprocess.TimeoutExpired as err:
        raise Exception(f"Timeout running {executable} --help") from err
    except Exception as err:
        raise Exception(f"Error running {executable} --help: {err}") from err

    # Some tools print their help on stderr; fall back to it when stdout is empty.
    help_text = (help_run.stdout or help_run.stderr).strip()

    # --version: optional, so failures are deliberately swallowed.
    version_text = None
    try:
        version_run = subprocess.run(
            [executable, "--version"],
            capture_output=True,
            text=True,
            timeout=10,
            errors="replace",
        )
    except Exception:
        # Deliberate best-effort: a tool without a usable --version is not an error.
        version_text = None
    else:
        version_text = version_run.stdout.strip()
        # Only trust stderr on a clean exit, so an "unknown option" error
        # message is never mistaken for a version string.
        if not version_text and version_run.returncode == 0:
            version_text = version_run.stderr.strip()
        version_text = version_text or None

    return {"help_text": help_text, "version_text": version_text}


def invocation_agent(state: WorkflowState) -> Dict[str, Any]:
    """
    Graph node that collects the raw ``--help`` / ``--version`` text of a tool.

    Reads ``state["executable"]``, runs ``capture_help_and_version`` on it and
    returns a state update of the form ``{"tool_info": {...}}``. On success the
    tool info carries ``help_text`` and ``version_text``; if the capture fails,
    it carries an ``error`` message instead. In both cases ``subcommands`` and
    ``global_parameters`` start out as empty lists.

    Raises:
        Exception: If the state has no (or an empty) ``executable`` entry.
    """
    executable = state.get("executable")
    if not executable:
        raise Exception("Executable name missing from state")

    try:
        captured = capture_help_and_version(executable)

        # Raw text only; structured fields are left empty at this stage.
        info: ToolInfo = dict(
            tool=executable,
            help_text=captured["help_text"],
            version_text=captured["version_text"],
            subcommands=[],
            global_parameters=[],
        )

        print("Complete invocation")
        return {"tool_info": info}

    except Exception as exc:
        # Record the failure in the state instead of aborting the graph.
        failed: ToolInfo = dict(
            tool=executable,
            error=str(exc),
            subcommands=[],
            global_parameters=[],
        )
        return {"tool_info": failed}


# Create invocation subgraph: a single-node graph wired as
# START -> invocation_agent -> END, using WorkflowState as the state schema.
invocation_builder = StateGraph(WorkflowState)
invocation_builder.add_node("invocation_agent", invocation_agent)
invocation_builder.add_edge(START, "invocation_agent")
invocation_builder.add_edge("invocation_agent", END)

# Compile the invocation graph for export
invocation_graph = invocation_builder.compile()


if __name__ == "__main__":
    # Simple test: run the graph against `bedtools` and dump the final state.
    # If the executable cannot be run, invocation_agent stores the failure
    # under tool_info["error"] rather than raising.
    test_state = {"executable": "bedtools"}
    result = invocation_graph.invoke(test_state)
    print(json.dumps(result, indent=2))

Complete invocation
{
  "messages": [],
  "executable": "bedtools",
  "tool_info": {
    "tool": "bedtools",
    "help_text": "bedtools is a powerful toolset for genome arithmetic.\n\nVersion:   v2.31.1\nAbout:     developed in the quinlanlab.org and by many contributors worldwide.\nDocs:      http://bedtools.readthedocs.io/\nCode:      https://github.com/arq5x/bedtools2\nMail:      https://groups.google.com/forum/#!forum/bedtools-discuss\n\nUsage:     bedtools <subcommand> [options]\n\nThe bedtools sub-commands include:\n\n[ Genome arithmetic ]\n    intersect     Find overlapping intervals in various ways.\n    window        Find overlapping intervals within a window around an interval.\n    closest       Find the closest, potentially non-overlapping interval.\n    coverage      Compute the coverage over defined intervals.\n    map           Apply a function to a column for each overlapping interval.\n    genomecov     Compute the coverage over an entire genome.\n    merge         Com

In [126]:
from langgraph.graph import MessagesState
from typing import Dict, List, Any, Optional
from typing_extensions import TypedDict

class Parameter(TypedDict, total=False):
    """Represents a CLI parameter.

    Declared with ``total=False``, so every key is optional and a partially
    filled dict is still a valid ``Parameter``.
    """
    name: str
    description: Optional[str]
    type: str  # string, int, double, path, flag, etc.
    required: bool
    default_value: Optional[str]
    is_flag: bool  # True for boolean flags, False for value-taking parameters
    aliases: List[str]  # alternative names like -h, --help

class ContainerInfo(TypedDict, total=False):
    """Container image information for a tool from BioContainers.

    Each value is an image name, or ``None`` when no image of that kind was
    found. All keys are optional (``total=False``).
    """
    bioconda: Optional[str]     # image name of the "conda" image type
    docker: Optional[str]       # image name of the "docker" image type
    singularity: Optional[str]  # image name of the "singularity" image type

class EdamInput(TypedDict, total=False):
    """Represents an EDAM-standardized input.

    All keys are optional (``total=False``).
    """
    name: str
    suffix: str  # file extension like .bam, .fasta
    edam: str    # EDAM ontology term like data_1383
    optional: bool  # NOTE(review): presumably True when the input may be omitted - confirm

class EdamOutput(TypedDict, total=False):
    """Represents an EDAM-standardized output.

    Same key set as ``EdamInput``; all keys are optional (``total=False``).
    """
    name: str
    suffix: str  # file extension like .bam, .sam
    edam: str    # EDAM ontology term like format_2572
    optional: bool  # NOTE(review): presumably True when the output is not always produced - confirm

class StandardizedTool(TypedDict, total=False):
    """Represents a standardized tool with EDAM ontology terms.

    Groups an identifier, its EDAM-annotated inputs/outputs and a command
    template. All keys are optional (``total=False``).
    """
    id: str           # tool_subcommand format like "samtools_view"
    name: str         # uppercase name like "SAMTOOLS_VIEW"
    label: str        # process label like "process_low"
    inputs: List[EdamInput]
    outputs: List[EdamOutput]
    commands: str     # template command string

class Subcommand(TypedDict, total=False):
    """Represents a CLI subcommand.

    All keys are optional (``total=False``).
    """
    name: str
    description: Optional[str]
    parameters: List[Parameter]  # parameters belonging to this subcommand
    usage: Optional[str]         # usage line, e.g. "samtools index [options] <in.bam> [out.index]"

class ToolInfo(TypedDict, total=False):
    """Represents information about a CLI tool.

    The dict is built up step by step by the agents in this file, so all keys
    are optional (``total=False``).
    """
    tool: str  # executable name as given in the workflow state
    version: Optional[str]  # cleaned version number extracted from version_text
    description: Optional[str]
    subcommands: List[Subcommand]
    global_parameters: List[Parameter]  # parameters that apply to all subcommands
    help_text: Optional[str]     # raw `--help` output captured by the invocation agent
    version_text: Optional[str]  # raw `--version` output captured by the invocation agent
    containers: Optional[ContainerInfo]  # filled in by the container agent
    error: Optional[str]  # set when a step failed; later agents pass the dict through unchanged

class WorkflowState(MessagesState):
    """State passed between agents in the flowgen workflow.

    Extends LangGraph's ``MessagesState`` and adds one group of keys per
    agent. Each agent returns a partial dict containing only the keys it
    updates.
    """
    # Input
    executable: Optional[str]
    target_format: Optional[str]  # 'wdl' or 'nextflow'
    
    # Invocation agent outputs
    tool_info: Optional[ToolInfo]
    
    # Parsing agent outputs  
    parsed_subcommands: Optional[List[Subcommand]]
    
    # Standardization agent outputs
    standardized_tools: Optional[List[StandardizedTool]]
    standardized_parameters: Optional[List[Parameter]]
    
    # Troubleshooting agent outputs
    validation_errors: Optional[List[str]]
    suggested_fixes: Optional[List[str]]
    
    # Generator agent outputs
    generated_workflow: Optional[str]
    workflow_metadata: Optional[Dict[str, Any]]

In [123]:
import json
import re
import requests
from typing import Dict, Any, List
from langgraph.graph import START, StateGraph, END
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_ollama import ChatOllama
from sub_agents.schema import WorkflowState, ToolInfo, Subcommand, Parameter


def extract_version_from_text(version_text: Optional[str]) -> Optional[str]:
    """Extract a clean ``X.Y`` or ``X.Y.Z`` version number from version text.

    Args:
        version_text: Raw text such as ``"samtools 1.22.1"`` or ``"v1.22.1"``;
            may be empty or ``None``.

    Returns:
        The first ``major.minor[.patch]`` number found (without any leading
        ``v``), or ``None`` when the input is empty or contains no such number.
    """
    if not version_text:
        return None

    # Look for version patterns like "1.22.1", "v1.22.1", etc.
    version_match = re.search(r'v?(\d+\.\d+(?:\.\d+)?)', version_text)
    return version_match.group(1) if version_match else None


def _stream_ollama_response(url, payload, out_file="llama_response.txt", timeout=120):
    """POST *payload* to *url* and return the concatenated streamed "response" text.

    Each non-empty line of the HTTP response is decoded as JSON; its
    ``"response"`` field is collected, and reading stops at the first object
    whose ``"done"`` field is true. Lines that are not valid JSON are skipped.
    When *out_file* is truthy, every chunk is also written to that file as it
    arrives; the file is closed even if the stream fails midway.
    """
    chunks = []
    with requests.post(url, json=payload, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # Opened only after the status check so a failed request leaves any
        # previous mirror file untouched.
        mirror = open(out_file, "w", encoding="utf-8") if out_file else None
        try:
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    data = json.loads(line.decode("utf-8"))
                except json.JSONDecodeError:
                    # Sometimes partial lines can appear
                    continue

                chunk = data.get("response", "")
                chunks.append(chunk)

                # Write to file immediately so progress is visible while streaming
                if mirror is not None:
                    mirror.write(chunk)
                    mirror.flush()

                if data.get("done", False):
                    break
        finally:
            if mirror is not None:
                mirror.close()

    return "".join(chunks)


def parsing_agent(state: WorkflowState) -> Dict[str, Any]:
    """
    Parsing agent: Uses LLM to parse help text and extract detailed parameter information.
    Takes the raw help/version text from invocation agent and produces structured subcommands and parameters.

    The prompt is sent to a local Ollama server (``/api/generate`` on
    ``localhost:11434``, model ``llama3.1:8b``). The streamed answer is
    mirrored to ``llama_response.txt``, stripped of Markdown code fences and
    parsed as JSON.

    Returns:
        On success, ``{"tool_info": ..., "parsed_subcommands": ...}`` where
        ``tool_info`` is the incoming dict extended with ``version``,
        ``description``, ``subcommands`` and ``global_parameters``.
        If the incoming ``tool_info`` already has an ``error``, it is returned
        unchanged. If the request or the JSON parsing fails, ``tool_info`` is
        returned with an ``error`` of the form ``"LLM parsing failed: ..."``.

    Raises:
        Exception: If the state has no ``tool_info`` or it has no ``help_text``.
    """
    tool_info = state.get("tool_info")
    if not tool_info:
        raise Exception("Tool info missing from state")

    if tool_info.get("error"):
        # Pass through error state
        return {"tool_info": tool_info}

    help_text = tool_info.get("help_text")
    version_text = tool_info.get("version_text")
    executable = tool_info.get("tool")

    if not help_text:
        raise Exception("Help text missing from tool info")

    url = "http://localhost:11434/api/generate"

    # System instruction for detailed parsing
    system_msg = SystemMessage(content="""
You are an expert CLI parser that extracts detailed subcommands and parameters information from help text.
Given a command-line tool's help output, produce structured JSON with subcommands and their parameters.

For each subcommand, extract:
- name: subcommand name
- description: what the subcommand does
- usage: how to use the subcommand
- parameters: list of parameters for the subcommand
- is_flag: true for boolean flags, false for value parameters
- aliases: alternative names like [-h, --help]

Rules:
- Only output valid JSON, no additional text
- Extract ALL parameters for each subcommand
- Infer parameter types from descriptions and usage patterns
- Mark parameters as required/optional based on help text formatting
- For tools without subcommands, put all parameters under a "main" subcommand
- If parsing fails, return error in the JSON
""")

    # Example JSON format
    example_json = {
        "tool": "samtools",
        "version": "1.22.1",
        "description": "Tools for manipulating SAM/BAM/CRAM files",
        "subcommands": [
            {
                "name": "view",
                "description": "SAM<->BAM<->CRAM conversion",
                "usage": "samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]",
                "parameters": []
            },
            {
                "name": "index",
                "description": "Index a BAM file",
                "usage": "samtools index [options] <in.bam> [out.index]",
                "parameters": []
            }
        ]
    }

    # Human input
    human_msg = HumanMessage(content=f"""
Tool: {executable}

Parse the below version help text and extract detailed parameter information for each subcommand. Strictly generate the json in the below format.

--version output:
{version_text}

--help output:
{help_text}

Output JSON should follow this format:

{json.dumps(example_json, indent=2)}

DO NOT GENERATE ANY EXTRA or ADDITIONAL TEXT
""")

    payload = {
        "model": "llama3.1:8b",
        "system": system_msg.content,
        "prompt": human_msg.content,
        "stream": True,
    }

    try:
        raw_response = _stream_ollama_response(
            url, payload, out_file="llama_response.txt", timeout=120
        )

        # Models often wrap JSON in Markdown fences; drop them before parsing.
        raw_response = raw_response.replace('```json', '').replace('```', '').strip()
        parsed = json.loads(raw_response)

        # Prefer the version derived from the real --version output over
        # whatever the model echoed back.
        if parsed.get("version") and version_text:
            clean_version = extract_version_from_text(version_text)
            if clean_version:
                parsed["version"] = clean_version

        # Update tool_info with parsed results
        updated_tool_info: ToolInfo = {
            **tool_info,
            "version": parsed.get("version"),
            "description": parsed.get("description"),
            "subcommands": parsed.get("subcommands", []),
            "global_parameters": parsed.get("global_parameters", [])
        }

        print("Complete parsing")

        return {
            "tool_info": updated_tool_info,
            "parsed_subcommands": parsed.get("subcommands", [])
        }

    except Exception as e:
        # Return error state so the rest of the graph can pass it through
        error_tool_info: ToolInfo = {
            **tool_info,
            "error": f"LLM parsing failed: {str(e)}"
        }
        return {"tool_info": error_tool_info}

# Create the combined invocation + parser graph:
# START -> invocation_agent -> parsing_agent -> END.
graph_builder = StateGraph(WorkflowState)
graph_builder.add_node("invocation_agent", invocation_agent)
graph_builder.add_node("parsing_agent", parsing_agent)
graph_builder.add_edge(START, "invocation_agent")
graph_builder.add_edge("invocation_agent", "parsing_agent")
graph_builder.add_edge("parsing_agent", END)

# Compile the graph for export. NOTE: this rebinds the module-level name
# `invocation_graph`, replacing the invocation-only graph compiled earlier.
invocation_graph = graph_builder.compile()


if __name__ == "__main__":
    # Simple test: run both agents against `bedtools` and dump the final state.
    test_state = {"executable": "bedtools"}
    result = invocation_graph.invoke(test_state)
    print(json.dumps(result, indent=2))

ModuleNotFoundError: No module named 'sub_agents'

In [122]:
import re

# Version body shared by both patterns:
#   \d+(?:\.\d+)*                  - numeric components (major, minor, patch, ...)
#   (?:[-+]ident(?:\.ident)*)*     - pre-release / build metadata segments whose
#                                    dot-separated identifiers are alphanumeric,
#                                    so a trailing sentence period is not captured
_VERSION_BODY = r'\d+(?:\.\d+)*(?:[-+][A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)*'

# Preferred: the number must start a token (optionally behind a "v"/"V"), so
# digits glued to a tool name such as "bowtie2" are not taken as the version.
_STRICT_VERSION_RE = re.compile(r'(?<![A-Za-z0-9])[vV]?(' + _VERSION_BODY + r')')

# Fallback: the first digit run anywhere in the text.
_LENIENT_VERSION_RE = re.compile(_VERSION_BODY)


def parse_version(version_string: str) -> str:
    """
    Extract the version number from a version string.

    Strips common prefixes and extracts the version number pattern.
    Keeps semantic versioning including pre-release identifiers.

    A number that starts its own token (optionally prefixed with ``v``) is
    preferred; only if none exists is the first digit run anywhere in the
    text used.

    Args:
        version_string: Raw version string from command output

    Returns:
        The extracted version (numeric components plus any pre-release/build
        suffix). An empty string for empty/``None`` input. If the text holds
        no digits at all, the whitespace-stripped input is returned.

    Examples:
        >>> parse_version("v0.1.8")
        '0.1.8'
        >>> parse_version("running 9.2")
        '9.2'
        >>> parse_version("version: 1.8.4-a")
        '1.8.4-a'
        >>> parse_version("toastbox version 2.4.1")
        '2.4.1'
        >>> parse_version("Version 3.2.1-beta.1+build.123")
        '3.2.1-beta.1+build.123'
        >>> parse_version("bowtie2-align-s version 2.5.1")
        '2.5.1'
    """
    if not version_string:
        return ""

    strict = _STRICT_VERSION_RE.search(version_string)
    if strict:
        return strict.group(1)

    lenient = _LENIENT_VERSION_RE.search(version_string)
    if lenient:
        return lenient.group(0)

    # If no match found, return cleaned string (strip whitespace)
    return version_string.strip()


# Test cases: a print-based check of parse_version. Each entry is an
# (input text, expected version) pair; mismatches are reported on stdout
# and do not raise.
if __name__ == "__main__":
    test_cases = [
        # Multi-line input: only the leading version should be extracted.
        ("""v0.1.8 rahblah blah
        toast is oh so tasty, yes
        """, "0.1.8"),
        ("running 9.2", "9.2"),
        ("version: 1.8.4-a", "1.8.4-a"),
        ("toastbox version 2.4.1", "2.4.1"),
        ("Version 3.2.1", "3.2.1"),
        ("v1.2.3-beta.1+build.123", "1.2.3-beta.1+build.123"),
        ("Git version 2.39.1", "2.39.1"),
        ("docker 24.0.5", "24.0.5"),
        ("Python 3.11.4", "3.11.4"),
        ("v10.0.0-rc.1", "10.0.0-rc.1"),
        ("1.0", "1.0"),
        ("5", "5"),
    ]
    
    print("Testing version parser:")
    print("-" * 60)
    for input_str, expected in test_cases:
        result = parse_version(input_str)
        # Check mark for a match, cross for a mismatch.
        status = "✓" if result == expected else "✗"
        print(f"{status} parse_version('{input_str}')")
        print(f"  Expected: '{expected}'")
        print(f"  Got:      '{result}'")
        if result != expected:
            print("  FAILED!")
        print()

Testing version parser:
------------------------------------------------------------
✓ parse_version('v0.1.8 rahblah blah
        toast is oh so tasty, yes
        ')
  Expected: '0.1.8'
  Got:      '0.1.8'

✓ parse_version('running 9.2')
  Expected: '9.2'
  Got:      '9.2'

✓ parse_version('version: 1.8.4-a')
  Expected: '1.8.4-a'
  Got:      '1.8.4-a'

✓ parse_version('toastbox version 2.4.1')
  Expected: '2.4.1'
  Got:      '2.4.1'

✓ parse_version('Version 3.2.1')
  Expected: '3.2.1'
  Got:      '3.2.1'

✓ parse_version('v1.2.3-beta.1+build.123')
  Expected: '1.2.3-beta.1+build.123'
  Got:      '1.2.3-beta.1+build.123'

✓ parse_version('Git version 2.39.1')
  Expected: '2.39.1'
  Got:      '2.39.1'

✓ parse_version('docker 24.0.5')
  Expected: '24.0.5'
  Got:      '24.0.5'

✓ parse_version('Python 3.11.4')
  Expected: '3.11.4'
  Got:      '3.11.4'

✓ parse_version('v10.0.0-rc.1')
  Expected: '10.0.0-rc.1'
  Got:      '10.0.0-rc.1'

✓ parse_version('1.0')
  Expected: '1.0'
  Got:   

In [132]:
def parsing_version_agent(state: WorkflowState) -> Dict[str, Any]:
    """
    Version-parsing agent: derives a clean version number from the raw
    ``version_text`` captured by the invocation agent.

    Returns a state update ``{"tool_info": ...}``. When ``version_text`` is
    present, ``tool_info`` gains a ``version`` key produced by
    ``parse_version``; otherwise, or when ``tool_info`` already carries an
    ``error``, it is returned unchanged.

    Raises:
        Exception: If the state has no ``tool_info``.
    """
    tool_info = state.get("tool_info")
    if not tool_info:
        raise Exception("Tool info missing from state")

    # An upstream failure is forwarded untouched.
    if tool_info.get("error"):
        return {"tool_info": tool_info}

    raw_version = tool_info.get("version_text")
    if not raw_version:
        print("No version_text found, skipping version extraction")
        return {"tool_info": tool_info}

    clean_version = parse_version(raw_version)
    # Copy rather than mutate, so the incoming dict stays as it was.
    enriched: ToolInfo = {**tool_info, "version": clean_version}
    print(f"Extracted {clean_version} as version")
    return {"tool_info": enriched}


def request(executable: str, version: str) -> dict:
    """Look up container images for ``<executable>-<version>`` on BioContainers.

    Returns a dict with the keys ``bioconda``, ``docker`` and ``singularity``
    (image name or ``None`` each) when the API answers with HTTP 200.
    Otherwise returns ``{"error": ..., "message": ...}``, where ``error`` is
    the HTTP status code, ``"timeout"`` or ``"exception"``. Never raises for
    request failures.
    """
    # Example: https://api.biocontainers.pro/ga4gh/trs/v2/tools/samtools/versions/samtools-1.19
    url = f"https://api.biocontainers.pro/ga4gh/trs/v2/tools/{executable}/versions/{executable}-{version}"
    print(f"Running request for {executable}-{version}")

    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            # In the case of a non-success HTTP status code
            return {"error": r.status_code, "message": r.text}

        payload = r.json()

        # One slot per recognised image type; if a type occurs more than once,
        # the last entry wins.
        found = {"conda": None, "docker": None, "singularity": None}
        for image in payload.get("images", []):
            kind = image.get("image_type", "").lower()
            if kind in found:
                found[kind] = image.get("image_name")

        return {
            "bioconda": found["conda"],
            "docker": found["docker"],
            "singularity": found["singularity"]
        }
    except requests.Timeout:
        # Handle timeout specifically
        return {"error": "timeout", "message": "Request timed out"}
    except Exception as e:
        # For any other exceptions
        return {"error": "exception", "message": str(e)}


def container_agent(state: WorkflowState) -> Dict[str, Any]:
    """
    Container agent: Captures container environment information for a given tool and version.

    Looks up ``tool_info["tool"]`` / ``tool_info["version"]`` via ``request``
    and returns a state update ``{"tool_info": ...}`` whose ``containers``
    entry holds the ``bioconda``, ``docker`` and ``singularity`` image names.
    If the lookup fails, all three are ``None`` and the failure is reported on
    stdout; ``tool_info["error"]`` is left unset. An incoming ``tool_info``
    that already carries an ``error`` is returned unchanged.

    Raises:
        Exception: If the state has no ``tool_info``, or ``tool_info`` lacks
            the executable name or the version.
    """
    tool_info = state.get("tool_info")
    if not tool_info:
        raise Exception("Tool info missing from state")

    if tool_info.get("error"):
        # Pass through error state
        return {"tool_info": tool_info}

    executable = tool_info.get("tool")
    version = tool_info.get("version")
    if not executable:
        raise Exception("Executable name missing from state")
    if not version:
        raise Exception("Version missing from state")

    try:
        # Capture container info
        lookup = request(executable, version)
        print(lookup)
    except Exception as e:
        print(f"Error in container function: {e}")
        lookup = {}
    else:
        # request() reports failures through an "error" key instead of
        # raising, so check it before announcing success.
        if "error" in lookup:
            print(f"Container request failed: {lookup.get('message')}")
        else:
            print("Complete container request")

    # Missing keys (failed lookup) become None.
    containers: ContainerInfo = {
        "bioconda": lookup.get("bioconda"),
        "docker": lookup.get("docker"),
        "singularity": lookup.get("singularity")
    }

    # Update tool_info with the container results
    updated_tool_info: ToolInfo = {
        **tool_info,
        "containers": containers,
    }

    return {"tool_info": updated_tool_info}

# Create the invocation + version-parsing + container graph:
# START -> invocation_agent -> parsing_version_agent -> container_agent -> END.
graph_builder = StateGraph(WorkflowState)
graph_builder.add_node("invocation_agent", invocation_agent)
graph_builder.add_node("parsing_version_agent", parsing_version_agent)
graph_builder.add_node("container_agent", container_agent)
graph_builder.add_edge(START, "invocation_agent")
graph_builder.add_edge("invocation_agent", "parsing_version_agent")
graph_builder.add_edge("parsing_version_agent", "container_agent")
graph_builder.add_edge("container_agent", END)

# Compile the graph for export. NOTE: this rebinds the module-level name
# `invocation_graph`, replacing any graph compiled under that name earlier.
invocation_graph = graph_builder.compile()


if __name__ == "__main__":
    # Simple test: run all three agents against `bedtools` and dump the final state.
    test_state = {"executable": "bedtools"}
    result = invocation_graph.invoke(test_state)
    print(json.dumps(result, indent=2))

Complete invocation
Extracted 2.31.1 as version
Running request for bedtools-2.31.1
{'bioconda': 'bedtools==2.31.1--h13024bc_3', 'docker': 'quay.io/biocontainers/bedtools:2.31.1--h13024bc_3', 'singularity': 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--h13024bc_3'}
Complete container request
{
  "messages": [],
  "executable": "bedtools",
  "tool_info": {
    "tool": "bedtools",
    "help_text": "bedtools is a powerful toolset for genome arithmetic.\n\nVersion:   v2.31.1\nAbout:     developed in the quinlanlab.org and by many contributors worldwide.\nDocs:      http://bedtools.readthedocs.io/\nCode:      https://github.com/arq5x/bedtools2\nMail:      https://groups.google.com/forum/#!forum/bedtools-discuss\n\nUsage:     bedtools <subcommand> [options]\n\nThe bedtools sub-commands include:\n\n[ Genome arithmetic ]\n    intersect     Find overlapping intervals in various ways.\n    window        Find overlapping intervals within a window around an interval.\n    closest  