In [42]:
# Cell 1 – environment setup (local Mac version)

import os
import sys
from pathlib import Path

def _is_colab() -> bool:
    try:
        import google.colab  # type: ignore
        return True
    except ImportError:
        return False

if _is_colab():
    # Running on Colab – keep the original mount logic
    from google.colab import drive
    drive.mount('/content/drive')

    base = Path("/content/drive/MyDrive/cv_multimodal/project")
else:
    # Running on your Mac – point to the Google Drive mirror
    base = Path(
        "/Users/guyan/Library/CloudStorage/"
        "GoogleDrive-rc989@cornell.edu/我的云端硬盘/"
        "cv_multimodal/project"
    )
    if not base.exists():
        raise FileNotFoundError(f"Project path not found: {base}")
GDRIVE_ROOT = str(base)

PROJECT_ROOT = base / "computer-vision-clean"
MULTI_AGENT_ROOT = PROJECT_ROOT / "multi-agent"

MEMORY_ROOT = MULTI_AGENT_ROOT / "memory"
LEDGERS_ROOT = MULTI_AGENT_ROOT / "ledgers"
REPORTS_ROOT = MULTI_AGENT_ROOT / "reports"
LOGS_ROOT = MULTI_AGENT_ROOT / "logs"

sys.path.insert(0, str(MULTI_AGENT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT))

for path in (
    MEMORY_ROOT / "procedural_cache",
    LEDGERS_ROOT,
    REPORTS_ROOT / "execution_summary",
    LOGS_ROOT / "execution_cycles",
):
    path.mkdir(parents=True, exist_ok=True)

print("✅ Environment ready")
print(f"📁 Project root: {PROJECT_ROOT}")
print(f"🤖 Multi-agent root: {MULTI_AGENT_ROOT}")
print(f"🧠 Memory system: {MEMORY_ROOT}")
print(f"📊 Ledgers: {LEDGERS_ROOT}")
print(f"📄 Reports: {REPORTS_ROOT}")



✅ Environment ready
📁 Project root: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean
🤖 Multi-agent root: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent
🧠 Memory system: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/memory
📊 Ledgers: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/ledgers
📄 Reports: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/reports


In [43]:
#cell_2
import os
from pathlib import Path

print("="*80)
print("🔍 FINDING AND LOADING API KEYS")
print("="*80)

# Search for .env file in multiple locations
search_paths = [
    '/Users/guyan/Desktop/env',
    f'{GDRIVE_ROOT}/.env',
    f'{GDRIVE_ROOT}/computer-vision-clean/.env',
    '/content/drive/MyDrive/cv_multimodal/.env',
    '/content/drive/My Drive/cv_multimodal/project/.env',
    f'{PROJECT_ROOT}/.env',
    str(Path.home() / '.env'),
]

env_file = None
for path in search_paths:
    if Path(path).exists():
        env_file = Path(path)
        print(f"\n✅ Found .env file: {path}")
        print(f"   Size: {env_file.stat().st_size} bytes\n")
        break
    else:
        print(f"   Checking: {path}... not found")

if not env_file:
    print("\n🔍 Searching entire cv_multimodal directory...")
    base = Path('/content/drive/MyDrive/cv_multimodal')
    if base.exists():
        all_env = list(base.rglob('*.env')) + list(base.rglob('.env*'))
        if all_env:
            print(f"\n✅ Found .env files:")
            for f in all_env:
                print(f"   📄 {f}")
            env_file = all_env[0]

    if not env_file:
        print("\n❌ No .env file found!")
        print("\n💡 Options:")
        print("   1. Upload .env to MyDrive/cv_multimodal/project/")
        print("   2. Use Colab Secrets (🔑 icon in left sidebar)")
        raise FileNotFoundError("No .env file found - check Google Drive or use Colab Secrets")

# Load all API keys
loaded_keys = []
with open(env_file, 'r') as f:
    for line in f:
        line = line.strip()
        if line and not line.startswith('#') and '=' in line:
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip().strip('"').strip("'")  # Remove quotes
            os.environ[key] = value
            loaded_keys.append(key)

# Verify required keys for multi-agent system
required = {
    'ANTHROPIC_API_KEY': 'Claude Sonnet (Ops, Quality, Infrastructure)',
    'OPENAI_API_KEY': 'GPT-4 (Critical Evaluator)',
    'GOOGLE_API_KEY': 'Gemini (Research Advisor)'
}

print("Verifying required API keys:\n")
all_loaded = True

for key, usage in required.items():
    value = os.environ.get(key)
    if value and len(value) > 10:
        masked = f"{value[:8]}...{value[-4:]}"
        print(f"✅ {key}")
        print(f"   Value: {masked}")
        print(f"   Used by: {usage}\n")
    else:
        print(f"❌ {key} - NOT FOUND OR INVALID")
        print(f"   Needed by: {usage}\n")
        all_loaded = False

print("="*80)
if all_loaded:
    print("✅ ALL API KEYS LOADED SUCCESSFULLY")
    print(f"✅ Loaded {len(loaded_keys)} total keys from: {env_file}")
    print("✅ Ready to initialize agents")
else:
    print("❌ SOME API KEYS MISSING OR INVALID")
    print("⚠️ Agent execution will fail without all 3 keys")
print("="*80)





🔍 FINDING AND LOADING API KEYS

✅ Found .env file: /Users/guyan/Desktop/env
   Size: 436 bytes

Verifying required API keys:

✅ ANTHROPIC_API_KEY
   Value: sk-ant-a...7gAA
   Used by: Claude Sonnet (Ops, Quality, Infrastructure)

✅ OPENAI_API_KEY
   Value: sk-proj-...-6oA
   Used by: GPT-4 (Critical Evaluator)

✅ GOOGLE_API_KEY
   Value: AIzaSyBL...FVCI
   Used by: Gemini (Research Advisor)

✅ ALL API KEYS LOADED SUCCESSFULLY
✅ Loaded 3 total keys from: /Users/guyan/Desktop/env
✅ Ready to initialize agents


In [3]:
  #cell 3
  # Install dependencies
  !pip install -q anthropic openai google-generativeai python-dotenv pyyaml mlflow tiktoken
  !pip install -q torch torchvision transformers open_clip_torch pillow matplotlib seaborn

  print("✅ Dependencies installed")
  print("✅ V3.5 Reflection Service dependencies included (pyyaml for memory system)")



✅ Dependencies installed
✅ V3.5 Reflection Service dependencies included (pyyaml for memory system)


In [4]:
#cell 4
import json
import yaml
from datetime import datetime
from pathlib import Path

# Read pending actions from Planning Team
pending_actions_file = Path(MULTI_AGENT_ROOT) / 'reports/handoff/pending_actions.json'

if not pending_actions_file.exists():
    print(f"❌ No pending actions found at {pending_actions_file}")
    print("⚠️ Planning Team must generate pending_actions.json first")
    raise FileNotFoundError(f"Missing: {pending_actions_file}")

with open(pending_actions_file, 'r') as f:
    pending_actions = json.load(f)

print("="*80)
print("📥 PENDING ACTIONS FROM PLANNING TEAM")
print("="*80)
print(f"\n📋 Meeting ID: {pending_actions.get('meeting_id')}")
print(f"🗓️  Generated: {pending_actions.get('generated_at')}")
print(f"🎯 Context: {pending_actions.get('context')}")

decisions = pending_actions.get('decisions', [])
print(f"\n📊 Total tasks: {len(decisions)}")

# Group by priority
high_priority = [d for d in decisions if d.get('priority') == 'HIGH']
medium_priority = [d for d in decisions if d.get('priority') == 'MEDIUM']
low_priority = [d for d in decisions if d.get('priority') == 'LOW']

print(f"   ⭐ HIGH: {len(high_priority)}")
print(f"   🟠 MEDIUM: {len(medium_priority)}")
print(f"   🔵 LOW: {len(low_priority)}")

print("\n📋 Task List:")
for i, decision in enumerate(decisions, 1):
    priority = decision.get('priority', 'UNKNOWN')
    action = decision.get('action', 'No action specified')
    owner = decision.get('owner', 'unassigned')
    deadline = decision.get('deadline', 'No deadline')

    priority_icon = '⭐' if priority == 'HIGH' else '🟠' if priority == 'MEDIUM' else '🔵'
    print(f"\n{i}. {priority_icon} [{priority}] {action}")
    print(f"   👤 Owner: {owner}")
    print(f"   ⏰ Deadline: {deadline}")

print("\n" + "="*80)

# V3.5: Load Memory System (Learned Rules from Previous Cycles)
print("\n🧠 V3.5 MEMORY SYSTEM CHECK")
print("="*80)

# Check for previous cycle's reflection summary
reflection_summary_file = Path(REPORTS_ROOT) / 'reflection_summary.md'
if reflection_summary_file.exists():
    print(f"✅ Found reflection summary from previous cycle")
    print(f"   📄 {reflection_summary_file}")
    with open(reflection_summary_file, 'r') as f:
        summary_preview = f.read()[:200]
        print(f"   Preview: {summary_preview}...")
else:
    print(f"ℹ️  No reflection summary from previous cycle (first run)")

# Load semantic memory (long-term learned rules)
semantic_file = Path(MEMORY_ROOT) / 'semantic.yml'
if semantic_file.exists():
    with open(semantic_file, 'r') as f:
        semantic_memory = yaml.safe_load(f)

    learned_patterns = semantic_memory.get('learned_patterns', [])
    if learned_patterns:
        print(f"\n✅ Loaded {len(learned_patterns)} learned rules from semantic memory:")
        for rule in learned_patterns[:3]:  # Show first 3
            print(f"   • {rule.get('pattern')} → {rule.get('fix')}")
            print(f"     Confidence: {rule.get('confidence', 0):.2f}")
        if len(learned_patterns) > 3:
            print(f"   ... and {len(learned_patterns) - 3} more rules")
    else:
        print(f"ℹ️  No learned patterns yet (will be built during execution)")
else:
    print(f"ℹ️  No semantic memory file (first run)")

# Check procedural cache (proven templates)
procedural_cache_dir = Path(MEMORY_ROOT) / 'procedural_cache'
if procedural_cache_dir.exists():
    templates = list(procedural_cache_dir.glob('*.json'))
    if templates:
        print(f"\n✅ Found {len(templates)} proven job templates in procedural cache")
        for template in templates[:2]:  # Show first 2
            print(f"   📄 {template.name}")
    else:
        print(f"ℹ️  No proven templates yet (will be built during execution)")

print("\n" + "="*80)
print("✅ Ready to execute tasks with memory-enhanced decision making")
print("="*80)

# cell 5
# Import multi-agent system components
os.chdir(MULTI_AGENT_ROOT)

from agents.roles import Agent, AgentConfig, AgentTeam
from agents.router import AgentRouter, RoutingStrategy, Message
from tools.file_bridge import FileBridge, create_default_policies

print("✅ Multi-agent system imported")

# V3.5: Initialize Reflection Service and Retry Mechanisms
from reflection_service import ReflectionService, process_execution_reflection
from retry_mechanisms import RetryMechanisms

reflection_service = ReflectionService(MULTI_AGENT_ROOT)
retry_mechanisms = RetryMechanisms(MULTI_AGENT_ROOT)

print("✅ V3.5 Reflection Service initialized")
print(f"   🧠 Memory root: {MEMORY_ROOT}")
print(f"   📊 Confidence threshold (τ): {reflection_service.tau}")
print(f"   🔄 Max retries: Approval={retry_mechanisms.MAX_APPROVAL_RETRIES}, "
      f"Exec={retry_mechanisms.MAX_EXEC_RETRIES}, Verify={retry_mechanisms.MAX_VERIF_RETRIES}")

# Initialize Executive Team (3 agents)
executive_team_agents = {}
prompt_dir = Path(MULTI_AGENT_ROOT) / 'agents/prompts/executive_team'

# Define Executive Team configuration
executive_config = {
    'ops_commander': {
        'name': 'Ops Commander',
        'model': 'claude-sonnet-4-20250514',
        'provider': 'anthropic',
        'role': 'Execute research experiments and deployments',
        'prompt_file': '02_ops_commander.md',
        'max_tokens': 8192  # Increased to allow longer responses
    },
    'quality_safety': {
        'name': 'Quality & Safety Officer',
        'model': 'claude-sonnet-4-20250514',
        'provider': 'anthropic',
        'role': 'Ensure code quality, safety, and reproducibility',
        'prompt_file': '01_quality_safety_officer.md',
        'max_tokens': 8192
    },
    'infrastructure': {
        'name': 'Infrastructure & Performance Monitor',
        'model': 'claude-sonnet-4-20250514',
        'provider': 'anthropic',
        'role': 'Monitor infrastructure and performance',
        'prompt_file': '03_infrastructure_performance_monitor.md',
        'max_tokens': 8192
    }
}

print("\n🤖 Initializing Executive Team:")
init_log = {
    "timestamp": datetime.utcnow().isoformat(),
    "team": "executive",
    "v3_5_features": True,
    "agents": []
}

for agent_id, config in executive_config.items():
    agent_cfg = AgentConfig(
        name=config['name'],
        model=config['model'],
        provider=config['provider'],
        role=config['role'],
        max_tokens=config.get('max_tokens', 8192),  # Increased from default 2000
        prompt_file=config['prompt_file']
    )
    executive_team_agents[agent_id] = Agent(agent_cfg, prompt_dir)
    print(f"   ✅ {config['name']} ({config['model']})")

    # V3.5: Log agent initialization
    init_log["agents"].append({
        "id": agent_id,
        "name": config['name'],
        "model": config['model'],
        "status": "READY"
    })

# Create agent team and router
executive_team = AgentTeam(executive_team_agents)
executive_router = AgentRouter(executive_team)

# V3.5: Save initialization log
init_log_file = Path(LOGS_ROOT) / 'system_init' / f'exec_team_init_{datetime.utcnow().strftime("%Y%m%d_%H%M%S")}.json'
with open(init_log_file, 'w') as f:
    json.dump(init_log, f, indent=2)

print("\n✅ Executive Team initialized (3 agents)")
print(f"✅ V3.5 enhanced execution with:")
print(f"   • Phase 4.5: Approval Retry (confidence-gated)")
print(f"   • Phase 5.5: Execution Retry (sandboxed)")
print(f"   • Phase 6.5: Verification Retry (artifact regeneration)")
print(f"   • Parallel Reflection Service (non-blocking monitoring)")
print(f"\n📝 Initialization log: {init_log_file}")



📥 PENDING ACTIONS FROM PLANNING TEAM

📋 Meeting ID: None
🗓️  Generated: None
🎯 Context: None

📊 Total tasks: 0
   ⭐ HIGH: 0
   🟠 MEDIUM: 0
   🔵 LOW: 0

📋 Task List:


🧠 V3.5 MEMORY SYSTEM CHECK
✅ Found reflection summary from previous cycle
   📄 /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/reports/reflection_summary.md
   Preview: # Reflection Summary

**Total Reflections**: 8
**Average Confidence**: 0.75
**Retry Success Rate**: 100.0%
**Eligible for Retry**: 8/8

## Top Root Causes
- code_error: 8 occurrences
...

✅ Loaded 1 learned rules from semantic memory:
   • code_error → Add error handling and validation
     Confidence: 0.75
ℹ️  No proven templates yet (will be built during execution)

✅ Ready to execute tasks with memory-enhanced decision making
✅ Multi-agent system imported
✅ V3.5 Reflection Service initialized
   🧠 Memory root: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的

  "timestamp": datetime.utcnow().isoformat(),
  init_log_file = Path(LOGS_ROOT) / 'system_init' / f'exec_team_init_{datetime.utcnow().strftime("%Y%m%d_%H%M%S")}.json'


In [5]:
#cell_clip_patch
def configure_clip_attn_defaults() -> None:
    """Force CLIP models to use eager attention so output_attentions is supported."""
    try:
        from transformers import CLIPModel, CLIPTextModel, CLIPVisionModel
    except Exception as err:
        print(f"⚠️ Unable to patch CLIP attention defaults: {err}")
        return

    def _set_config(config):
        if hasattr(config, "attn_implementation"):
            config.attn_implementation = "eager"
        if hasattr(config, "output_attentions"):
            config.output_attentions = True

    def _patch_from_pretrained(cls, original_fn):
        def wrapper(*args, **kwargs):
            model = original_fn(*args, **kwargs)
            if hasattr(model, "text_model") and hasattr(model.text_model, "config"):
                _set_config(model.text_model.config)
            if hasattr(model, "vision_model") and hasattr(model.vision_model, "config"):
                _set_config(model.vision_model.config)
            return model
        return wrapper

    CLIPModel.from_pretrained = _patch_from_pretrained(CLIPModel, CLIPModel.from_pretrained)
    CLIPTextModel.from_pretrained = _patch_from_pretrained(CLIPTextModel, CLIPTextModel.from_pretrained)
    CLIPVisionModel.from_pretrained = _patch_from_pretrained(CLIPVisionModel, CLIPVisionModel.from_pretrained)

    print("✅ Patched CLIP models to use eager attention implementation by default")


configure_clip_attn_defaults()



  from .autonotebook import tqdm as notebook_tqdm


✅ Patched CLIP models to use eager attention implementation by default


In [20]:

#cell_6
# Tracker & shared execution state
from typing import Any, Dict, List, Optional
class TaskExecutionTracker:
    def __init__(self):
        self.task_results = []
        self.start_time = datetime.now()
        self.current_task = None
        self.current_task_id: Optional[str] = None
        self.task_records: Dict[str, Dict[str, Any]] = {}
        self.task_timers: Dict[str, Dict[str, Any]] = {}
        self.retry_stats = {
            'approval_retries': 0,
            'execution_retries': 0,
            'verification_retries': 0,
            'total_retries': 0,
        }

    def start_task(self, task_id, action, priority):
        start_ts = datetime.now()
        task_record = {
            'task_id': task_id,
            'action': action,
            'priority': priority,
            'status': 'in_progress',
            'start_time': start_ts.isoformat(),
            'outputs': [],
            'errors': [],
            'agent_responses': {},
            'retry_attempts': {'approval': 0, 'execution': 0, 'verification': 0},
            'patches_applied': [],
            'reflection_notes': [],
            'confidence_scores': [],
            'phases': {
                'implementation': None,
                'approval': None,
                'execution': None,
                'verification': None,
            },
        }
        self.current_task = task_record
        self.current_task_id = task_id
        self.task_results = [t for t in self.task_results if t['task_id'] != task_id]
        self.task_records[task_id] = task_record
        self.task_timers[task_id] = {
            'start': start_ts,
            'end': None,
            'duration': 0.0,
        }
        print(f"\n🚀 Starting Task {task_id}: {action}")
        print(f"   Priority: {priority}")

    def log_agent_response(self, agent_name, response):
        if self.current_task:
            self.current_task['agent_responses'][agent_name] = response
            print(f"   ✅ {agent_name} responded ({len(response)} chars)")

    def log_retry_attempt(self, phase, patch_id=None, confidence=None):
        if self.current_task:
            self.current_task['retry_attempts'][phase] += 1
            self.retry_stats[f'{phase}_retries'] += 1
            self.retry_stats['total_retries'] += 1
            if patch_id:
                self.current_task['patches_applied'].append(
                    {
                        'phase': phase,
                        'patch_id': patch_id,
                        'confidence': confidence,
                        'timestamp': datetime.now().isoformat(),
                    }
                )
            print(
                f"   🔄 Retry #{self.current_task['retry_attempts'][phase]} ({phase})"
                + (f" - Confidence: {confidence:.2f}" if confidence else "")
            )

    def log_reflection_note(self, reflection_note):
        if self.current_task:
            self.current_task['reflection_notes'].append(reflection_note)
            print(f"   🧠 Reflection: {reflection_note.get('why_failed', 'unknown')}")

    def log_phase_completion(self, phase, status, details=None):
        if self.current_task:
            self.current_task['phases'][phase] = {
                'status': status,
                'timestamp': datetime.now().isoformat(),
                'details': details,
            }
            status_icon = '✅' if status == 'pass' else '❌' if status == 'fail' else '⏭️'
            print(f"   {status_icon} Phase {phase}: {status}")

    def activate_task(self, task_id: str) -> None:
        task_record = self.task_records.get(task_id)
        if task_record:
            self.current_task = task_record
            self.current_task_id = task_id

    def complete_task(self, status='completed', final_status_reason=None, task_id: Optional[str] = None):
        valid_statuses = {'completed', 'failed', 'rejected', 'cancelled', 'skipped'}

        if task_id is None and status not in valid_statuses and final_status_reason in valid_statuses:
            # Support legacy call pattern: complete_task(task_id, 'completed')
            task_id = status
            status = final_status_reason
            final_status_reason = None

        if task_id:
            task_record = self.task_records.get(task_id)
            if task_record:
                self.current_task = task_record
                self.current_task_id = task_id
        else:
            task_record = self.current_task
            task_id = getattr(self, "current_task_id", None)

        if not task_record:
            print("⚠️ Unable to complete task – no active task context.")
            return

        task_record['status'] = status
        task_record['status_reason'] = final_status_reason
        task_record['end_time'] = datetime.now().isoformat()
        total_retries = sum(task_record['retry_attempts'].values())
        task_record['total_retries'] = total_retries

        # Update task timers
        timer_entry = self.task_timers.setdefault(task_id, {})
        end_time = datetime.fromisoformat(task_record['end_time'])
        timer_entry['end'] = end_time
        start_value = timer_entry.get('start')
        if isinstance(start_value, datetime):
            start_dt = start_value
        elif isinstance(start_value, str):
            start_dt = datetime.fromisoformat(start_value)
        else:
            start_dt = datetime.fromisoformat(task_record['start_time'])
            timer_entry['start'] = start_dt
        timer_entry['duration'] = max((end_time - start_dt).total_seconds(), 0.0)

        # Replace existing record for deterministic ordering
        self.task_results = [t for t in self.task_results if t['task_id'] != task_id]
        self.task_results.append(task_record)

        print(f"   ✅ Task completed in {timer_entry['duration']:.1f}s - Status: {status}")
        if total_retries > 0:
            print(f"   🔄 Total retries: {total_retries}")

        if self.current_task_id == task_id:
            self.current_task = None
            self.current_task_id = None

    def get_summary(self):
        completed = len([t for t in self.task_results if t['status'] == 'completed'])
        failed = len([t for t in self.task_results if t['status'] == 'failed'])
        total_duration = (datetime.now() - self.start_time).total_seconds()

        return {
            'total_tasks': len(self.task_results),
            'completed': completed,
            'failed': failed,
            'total_duration_seconds': total_duration,
            'retry_stats': self.retry_stats,
            'task_results': self.task_results,
        }


# Initialize tracker
tracker = TaskExecutionTracker()
print("✅ V3.6 Task execution tracker initialized\n")

✅ V3.6 Task execution tracker initialized



In [12]:
#cell_7


def _fresh_execution_stats() -> Dict[str, Any]:
    return {
        'tasks_processed': 0,
        'tasks_approved': 0,
        'tasks_executed': 0,
        'tasks_completed': 0,
        'retry_attempts': {'approval': 0, 'execution': 0, 'verification': 0},
    }


execution_stats: Dict[str, Any] = _fresh_execution_stats()
task_contexts: Dict[str, Dict[str, Any]] = {}
current_task_order: List[str] = []


def reset_execution_state() -> None:
    """Reset tracker, stats, and cached task contexts."""
    global tracker, execution_stats, task_contexts, current_task_order
    tracker = TaskExecutionTracker()
    execution_stats = _fresh_execution_stats()
    task_contexts = {}
    current_task_order = []
    print("🔄 Execution state reset")


def parse_agent_verdict(response: str) -> Dict[str, Any]:
    """Extract structured verdict from agent response."""
    verdict_patterns = [
        r'\*\*.*?Final Verdict:\*\*\s*(✅\s*APPROVED|⚠️\s*CAUTION|❌\s*BLOCKED)',
        r'Final Verdict:\s*(APPROVED|CAUTION|BLOCKED)',
        r'Status:\s*(APPROVED|REJECTED)',
    ]

    for pattern in verdict_patterns:
        match = re.search(pattern, response, re.IGNORECASE)
        if match:
            verdict_text = match.group(1).upper()
            if 'APPROVED' in verdict_text or '✅' in verdict_text:
                return {'structured': True, 'status': 'APPROVED', 'confidence': 0.9}
            if 'BLOCKED' in verdict_text or '❌' in verdict_text:
                return {'structured': True, 'status': 'REJECTED', 'confidence': 0.9}
            return {'structured': True, 'status': 'CAUTION', 'confidence': 0.5}

    approved_keywords = ['approved', 'looks good', 'ready', 'pass', '✅']
    rejected_keywords = ['blocked', 'rejected', 'unsafe', 'fail', '❌']

    approved_count = sum(1 for kw in approved_keywords if kw in response.lower())
    rejected_count = sum(1 for kw in rejected_keywords if kw in response.lower())

    if approved_count > rejected_count:
        return {'structured': False, 'status': 'APPROVED', 'confidence': 0.6}
    if rejected_count > approved_count:
        return {'structured': False, 'status': 'REJECTED', 'confidence': 0.6}
    return {'structured': False, 'status': 'REJECTED', 'confidence': 0.3}


def determine_task_status_v2(
    ops_response: str,
    quality_response: str,
    infra_response: str,
) -> str:
    """Enhanced approval gate with structured verdict support."""
    ops_verdict = parse_agent_verdict(ops_response)
    quality_verdict = parse_agent_verdict(quality_response)
    infra_verdict = parse_agent_verdict(infra_response)

    ops_approved = ops_verdict['status'] == 'APPROVED'
    quality_approved = quality_verdict['status'] == 'APPROVED'
    infra_approved = infra_verdict['status'] == 'APPROVED'

    print(f"\n   🔍 Approval Gate Analysis:")
    print(f"      Ops: {ops_verdict['status']} (conf: {ops_verdict['confidence']:.2f})")
    print(f"      Quality: {quality_verdict['status']} (conf: {quality_verdict['confidence']:.2f})")
    print(f"      Infrastructure: {infra_verdict['status']} (conf: {infra_verdict['confidence']:.2f})")

    if ops_approved and quality_approved and infra_approved:
        avg_conf = (
            ops_verdict['confidence']
            + quality_verdict['confidence']
            + infra_verdict['confidence']
        ) / 3
        print(f"      ✅ ALL GATES APPROVED (avg confidence: {avg_conf:.2f})")
        return "approved"
    print("      ❌ SOME GATES REJECTED")
    return "rejected"





In [13]:
#cell 7
# ═══════════════════════════════════════════════════════════════════════════
# FINAL COMPLETE PHASES 3-7 LOOP (V3.6 WITH TRACKER INITIALIZATION)
# ═══════════════════════════════════════════════════════════════════════════
# Copy this ENTIRE file into ONE Colab cell after Cell 8 (agent initialization)
# ═══════════════════════════════════════════════════════════════════════════

import os
import re
from pathlib import Path
from datetime import datetime

from reflection_service import ReflectionService, process_execution_reflection
from retry_mechanisms import RetryMechanisms


# Locate multi-agent root (supports Google Drive trajectories)
_candidate_roots = []

env_root = os.environ.get("MULTI_AGENT_ROOT")
if env_root:
    _candidate_roots.append(Path(env_root))

_candidate_roots.extend([
    Path("/Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent"),
    Path("/content/drive/MyDrive/cv_multimodal/project/computer-vision-clean/multi-agent"),
    Path.cwd() / "multi-agent",
    Path.cwd(),
])

MULTI_AGENT_ROOT = next((p for p in _candidate_roots if p and p.exists()), Path.cwd())

print(f"📂 Multi-agent root resolved to: {MULTI_AGENT_ROOT}")

_ledger_paths = {
    "reflections": MULTI_AGENT_ROOT / "ledgers/reflections.jsonl",
    "episodic_memory": MULTI_AGENT_ROOT / "memory/episodic.jsonl",
    "semantic_memory": MULTI_AGENT_ROOT / "memory/semantic.yml",
    "patches_dir": MULTI_AGENT_ROOT / "patches",
}

for name, path in _ledger_paths.items():
    status = "✅" if path.exists() else "⚠️"
    print(f"   {status} {name}: {path}")

reflection_service = ReflectionService(str(MULTI_AGENT_ROOT))
retry_mechanisms = RetryMechanisms(str(MULTI_AGENT_ROOT))
print(f"   🧠 Reflection service τ threshold: {reflection_service.tau}")

#cell_clip_patch
_CLIP_ATTENTION_PATCHED = False


def _apply_clip_attention_patch() -> None:
    """Force CLIP models to use eager attention so output_attentions is supported."""
    global _CLIP_ATTENTION_PATCHED
    if _CLIP_ATTENTION_PATCHED:
        return

    try:
        from transformers import CLIPModel, CLIPTextModel, CLIPVisionModel
    except Exception as err:
        print(f"⚠️ Unable to patch CLIP attention defaults: {err}")
        return

    def _force_eager(module) -> None:
        if module is None:
            return
        if hasattr(module, "set_attn_implementation"):
            try:
                module.set_attn_implementation("eager")
            except TypeError:
                pass
        config = getattr(module, "config", None)
        if config is not None:
            if hasattr(config, "attn_implementation"):
                config.attn_implementation = "eager"
            if hasattr(config, "output_attentions"):
                config.output_attentions = True

    def _configure_model(model):
        _force_eager(model)
        for attr in ("vision_model", "text_model"):
            _force_eager(getattr(model, attr, None))
        for attr in ("vision_model", "text_model"):
            sub = getattr(model, attr, None)
            if sub is None:
                continue
            for child in getattr(sub, "modules", lambda: [])():
                _force_eager(child)
        return model

    def _wrap_from_pretrained(cls, original_fn):
        def wrapper(*args, **kwargs):
            kwargs.setdefault("attn_implementation", "eager")
            model = original_fn(*args, **kwargs)
            return _configure_model(model)

        return wrapper

    CLIPModel.from_pretrained = _wrap_from_pretrained(CLIPModel, CLIPModel.from_pretrained)
    CLIPTextModel.from_pretrained = _wrap_from_pretrained(CLIPTextModel, CLIPTextModel.from_pretrained)
    CLIPVisionModel.from_pretrained = _wrap_from_pretrained(CLIPVisionModel, CLIPVisionModel.from_pretrained)

    _CLIP_ATTENTION_PATCHED = True
    print("✅ Patched CLIP models to use eager attention implementation by default")


def configure_clip_attn_defaults() -> None:
    _apply_clip_attention_patch()


def ensure_clip_attention_patch() -> None:
    _apply_clip_attention_patch()


configure_clip_attn_defaults()

#cell_6
# Tracker & shared execution state
class TaskExecutionTracker:
    def __init__(self):
        self.task_results = []
        self.start_time = datetime.now()
        self.current_task = None
        self.retry_stats = {
            'approval_retries': 0,
            'execution_retries': 0,
            'verification_retries': 0,
            'total_retries': 0,
        }

    def start_task(self, task_id, action, priority):
        self.current_task = {
            'task_id': task_id,
            'action': action,
            'priority': priority,
            'status': 'in_progress',
            'start_time': datetime.now().isoformat(),
            'outputs': [],
            'errors': [],
            'agent_responses': {},
            'retry_attempts': {'approval': 0, 'execution': 0, 'verification': 0},
            'patches_applied': [],
            'reflection_notes': [],
            'confidence_scores': [],
            'phases': {
                'implementation': None,
                'approval': None,
                'execution': None,
                'verification': None,
            },
        }
        print(f"\n🚀 Starting Task {task_id}: {action}")
        print(f"   Priority: {priority}")

    def log_agent_response(self, agent_name, response):
        if self.current_task:
            self.current_task['agent_responses'][agent_name] = response
            print(f"   ✅ {agent_name} responded ({len(response)} chars)")

    def log_retry_attempt(self, phase, patch_id=None, confidence=None):
        if self.current_task:
            self.current_task['retry_attempts'][phase] += 1
            self.retry_stats[f'{phase}_retries'] += 1
            self.retry_stats['total_retries'] += 1
            if patch_id:
                self.current_task['patches_applied'].append(
                    {
                        'phase': phase,
                        'patch_id': patch_id,
                        'confidence': confidence,
                        'timestamp': datetime.now().isoformat(),
                    }
                )
            print(
                f"   🔄 Retry #{self.current_task['retry_attempts'][phase]} ({phase})"
                + (f" - Confidence: {confidence:.2f}" if confidence else "")
            )

    def log_reflection_note(self, reflection_note):
        if self.current_task:
            self.current_task['reflection_notes'].append(reflection_note)
            print(f"   🧠 Reflection: {reflection_note.get('why_failed', 'unknown')}")

    def log_phase_completion(self, phase, status, details=None):
        if self.current_task:
            self.current_task['phases'][phase] = {
                'status': status,
                'timestamp': datetime.now().isoformat(),
                'details': details,
            }
            status_icon = '✅' if status == 'pass' else '❌' if status == 'fail' else '⏭️'
            print(f"   {status_icon} Phase {phase}: {status}")

    def complete_task(self, status='completed', final_status_reason=None):
        if self.current_task:
            self.current_task['status'] = status
            self.current_task['status_reason'] = final_status_reason
            self.current_task['end_time'] = datetime.now().isoformat()
            total_retries = sum(self.current_task['retry_attempts'].values())
            self.current_task['total_retries'] = total_retries
            self.task_results.append(self.current_task)

            duration = (
                datetime.fromisoformat(self.current_task['end_time'])
                - datetime.fromisoformat(self.current_task['start_time'])
            ).total_seconds()

            print(f"   ✅ Task completed in {duration:.1f}s - Status: {status}")
            if total_retries > 0:
                print(f"   🔄 Total retries: {total_retries}")

            self.current_task = None

    def get_summary(self):
        completed = len([t for t in self.task_results if t['status'] == 'completed'])
        failed = len([t for t in self.task_results if t['status'] == 'failed'])
        total_duration = (datetime.now() - self.start_time).total_seconds()

        return {
            'total_tasks': len(self.task_results),
            'completed': completed,
            'failed': failed,
            'total_duration_seconds': total_duration,
            'retry_stats': self.retry_stats,
            'task_results': self.task_results,
        }


# Initialize tracker
tracker = TaskExecutionTracker()
print("✅ V3.6 Task execution tracker initialized\n")


📂 Multi-agent root resolved to: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent
   ✅ reflections: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/ledgers/reflections.jsonl
   ✅ episodic_memory: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/memory/episodic.jsonl
   ✅ semantic_memory: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/memory/semantic.yml
   ✅ patches_dir: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/patches
   🧠 Reflection service τ threshold: 0.7
✅ Patched CLIP models to use eager attention implementation by default
✅ V3.6 Task execution tracker initialized



In [52]:
def ensure_phase_0() -> None:
    """Prepare synthetic assets used across execution phases."""
    if _phase0_state["completed"]:
        return

    print("Phase 0 complete.")
    print("TEST_IMAGE_PATH:", os.environ.get("TEST_IMAGE_PATH"))
    print("TEST_DATA_DIR:", os.environ.get("TEST_DATA_DIR"))

#PHASE 0: Pre-Execution Infrastructure Setup")
    try:
        from PIL import Image
        import numpy as np

        if Path("/content").exists():
            test_dir = Path("/content/test_data")
        else:
            test_dir = MULTI_AGENT_ROOT / "test_data"
        test_dir.mkdir(exist_ok=True)

        print("Creating test images for model validation...")
        for i in range(5):
            test_img = Image.fromarray(
                np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
            )
            test_img.save(test_dir / f"test_image_{i}.png")

        test_images = list(test_dir.glob("*.png"))
        print(f"Created {len(test_images)} test images at {test_dir}")
        print(f" Primary test image: {test_dir / "test_image_0.png"}")

        os.environ["TEST_IMAGE_PATH"] = str((test_dir / "test_image_0.png").resolve())
        os.environ["TEST_DATA_DIR"] = str(test_dir.resolve())

        # Attempt to fetch a real image via Pexels if API key available
        pexels_key = os.environ.get("PEXELS_API_KEY")
        if pexels_key:
            try:
                try:
                    import requests  # type: ignore
                except ImportError:
                    requests = None

                search_url = "https://api.pexels.com/v1/search"
                params = {"query": "architecture", "per_page": 1}
                headers = {"Authorization": pexels_key}

                if requests is not None:
                    resp = requests.get(search_url, params=params, headers=headers, timeout=10)
                    resp.raise_for_status()
                    data = resp.json()
                    image_url = data["photos"][0]["src"]["medium"]
                    image_bytes = requests.get(image_url, timeout=10).content
                else:
                    from urllib.request import Request, urlopen
                    import json as _json

                    req = Request(search_url + "?query=architecture&per_page=1", headers=headers)
                    with urlopen(req, timeout=10) as resp:
                        data = _json.loads(resp.read().decode("utf-8"))
                    image_url = data["photos"][0]["src"]["medium"]
                    with urlopen(image_url, timeout=10) as img_resp:
                        image_bytes = img_resp.read()

                real_path = test_dir / "test_image_pexels.jpg"
                with open(real_path, "wb") as f:
                    f.write(image_bytes)

                os.environ["TEST_IMAGE_PATH"] = str(real_path.resolve())
                print(f"   📷 Downloaded Pexels test image → {real_path}")
            except Exception as dl_err:
                print(f"   ⚠️ Unable to download Pexels image: {dl_err}")

        _phase0_state["completed"] = True
        _phase0_state["data_dir"] = str(test_dir)
        print("  Phase 0 infrastructure setup complete")
    except Exception as e:
        print(f" Phase 0 setup warning: {e}")
        print("  Continuing execution - agents will handle infrastructure setup")
    print()


In [None]:
import os
from pathlib import Path

pics_dir = Path("/Users/guyan/Desktop/pics")
fallback_image = next(pics_dir.glob("*"), None)
if fallback_image:
    os.environ["TEST_IMAGE_PATH"] = str(fallback_image.resolve())
    print("Overrode TEST_IMAGE_PATH →", os.environ["TEST_IMAGE_PATH"])


In [15]:
#cell_8_5
# ═══════════════════════════════════════════════════════════════
# PHASE 4.5: APPROVAL RETRY
# ═══════════════════════════════════════════════════════════════

def phase4_5_approval_retry(ctx: Dict[str, Any]) -> str:
    """Phase 4.5 – Approval retry with Ops Commander receiving agent feedback."""
    task_id = ctx['task_id']
    action = ctx['action']
    priority = ctx['priority']
    task = ctx['task']
    agent_responses = ctx['agent_responses']

    tracker.activate_task(task_id)

    print(f"\n🔧 PHASE 4.5: Approval Retry ({task_id})")

    quality_response = agent_responses['quality_safety']
    infra_response = agent_responses['infrastructure']

    retry_prompt = f"""# PHASE 4.5: APPROVAL RETRY REQUEST

Your implementation for Task {task_id} was **REJECTED** in the approval gate.

## FEEDBACK FROM YOUR TEAM

### Quality & Safety Officer Review:
```
{quality_response}
```

### Infrastructure & Performance Monitor Review:
```
{infra_response}
```

---

## INFRASTRUCTURE REMINDER (Phase 0 Setup Available)
**Test resources for validation only:**
- Test images: `/content/test_data/test_image_*.png` (5 images, 224x224 RGB)
- Primary test image: `os.environ.get('TEST_IMAGE_PATH')`

**CRITICAL - For CLIP validation with test images:**
```python
test_img = Image.open(os.environ.get('TEST_IMAGE_PATH'))
inputs = processor(text=["a photo"], images=test_img, padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs, output_attentions=True)
attention = outputs.attentions
```

**For actual experiments - use real images:**
- Pexels API key in Colab secrets: `userdata.get('PIXEL_API')`

---

## YOUR MISSION: Provide a CORRECTED Implementation

**Address ALL issues raised above:**

1. **If Quality & Safety blocked you:**
   - Fix code structure issues
   - Add missing error handling
   - Remove sys.exit() calls
   - Close code blocks properly (``` on new line!)
   - Ensure code is complete (not truncated)

2. **If Infrastructure blocked you:**
   - Adjust resource requirements
   - Add environment checks

3. **Code Block Requirements (CRITICAL):**
   ```python
   # Your CORRECTED implementation
   ```  ← **MUST end with ``` on new line**

4. **Keep under 16,000 characters**

---

## Response Format:

**Ops Commander Final Verdict:** ✅ APPROVED (retry)

## Changes Made
- [List specific fixes based on feedback]

## Corrected Implementation

```python
# Your complete, corrected code here
```

**Provide your CORRECTED implementation now:**
"""

    print("   📝 Requesting corrected implementation from Ops Commander...")
    corrected_ops_response = ops_commander.respond(retry_prompt)
    agent_responses['ops_commander'] = corrected_ops_response
    tracker.log_retry_attempt('approval', patch_id='approval_retry_1', confidence=0.80)

    # Quality & Safety reviews
    print("   🔍 Quality & Safety reviewing corrected implementation...")
    corrected_quality_response = quality_safety.respond(f"""# REVIEW CORRECTED IMPLEMENTATION

Ops Commander provided corrected implementation. Check if issues were fixed.

Previous feedback: {quality_response[:500]}...

Corrected implementation: {corrected_ops_response[:1000]}...

**Verdict:** [✅ APPROVED / ❌ BLOCKED]
""")
    agent_responses['quality_safety'] = corrected_quality_response

    # Infrastructure reviews
    print("   🔍 Infrastructure reviewing corrected implementation...")
    corrected_infra_response = infrastructure.respond(f"""# REVIEW CORRECTED IMPLEMENTATION

Ops Commander provided corrected implementation. Check if issues were fixed.

Previous feedback: {infra_response[:500]}...

Corrected implementation: {corrected_ops_response[:1000]}...

**Verdict:** [✅ APPROVED / ❌ BLOCKED]
""")
    agent_responses['infrastructure'] = corrected_infra_response

    # Re-run approval gate
    print("   🔐 Re-running approval gate with corrected implementation...")
    new_status = determine_task_status_v2(
        corrected_ops_response,
        corrected_quality_response,
        corrected_infra_response
    )

    if new_status == "approved":
        print("   ✅ APPROVED after correction")
        execution_stats['retry_attempts']['approval'] += 1
        tracker.log_phase_completion('approval', 'pass')
    else:
        print("   ❌ Still REJECTED after correction")
        tracker.log_phase_completion('approval', 'fail')

    ctx['preliminary_status'] = new_status
    return new_status


# ═══════════════════════════════════════════════════════════════
# PHASE 5: EXECUTION
# ═══════════════════════════════════════════════════════════════

def phase5_execute(ctx: Dict[str, Any], rerun: bool = False) -> bool:
    """Phase 5 – Execute approved code blocks."""
    from pathlib import Path

    task_id = ctx['task_id']
    agent_responses = ctx['agent_responses']

    tracker.activate_task(task_id)

    if not rerun:
        print(f"\n⚡ PHASE 5: Execution ({task_id})")

    ops_response = agent_responses.get('ops_commander', '')
    code_blocks = _extract_code_blocks(ops_response)

    if not code_blocks:
        print("   ℹ️  No executable code blocks detected – treating as documentation task")
        tracker.log_phase_completion('execution', 'pass', {'reason': 'no_code_blocks'})
        ctx['execution_success'] = True
        ctx['_execution_runs'] += 1
        return True

    ensure_clip_attention_patch()
    print(f"   📦 Found {len(code_blocks)} code block(s)")

    execution_telemetry = ctx.get('execution_telemetry', {'task_id': task_id, 'exit_code': 0})

    for idx, code in enumerate(code_blocks):
        try:
            # Ensure no dangling MLflow runs carry over between code blocks
            try:
                import mlflow

                active = mlflow.active_run()
                if active:
                    print("      ℹ️ Ending previous MLflow run before executing new code block")
                    mlflow.end_run()
            except ModuleNotFoundError:
                pass
            except Exception as mlflow_err:
                print(f"      ⚠️ Unable to finalize prior MLflow run: {mlflow_err}")

            ensure_clip_attention_patch()
            _ensure_numpy_json_patch()
            exec_globals = {
                '__name__': '__main__',
                'MULTI_AGENT_ROOT': MULTI_AGENT_ROOT,
                'Path': Path,
                'ensure_clip_attention_patch': ensure_clip_attention_patch,
            }
            exec(code, exec_globals)
            print(f"      ✅ Block {idx + 1} executed successfully")

            resolved_run_id = _resolve_mlflow_run_id(exec_globals)
            if resolved_run_id:
                execution_telemetry['run_id'] = resolved_run_id
                if exec_globals.get('run_id') != resolved_run_id:
                    exec_globals['run_id'] = resolved_run_id
                    print(f"      ℹ️ Normalized MLflow run_id → {resolved_run_id}")
            if 'mlflow_run_name' in exec_globals:
                execution_telemetry['run_name'] = exec_globals['mlflow_run_name']
            elif 'run_id' in exec_globals:
                execution_telemetry['run_id'] = exec_globals['run_id']

                try:
                    import mlflow
                    mlflow.get_run(exec_globals['run_id'])
                except Exception:
                    print("      ⚠️ Provided run_id could not be verified with MLflow")

            if 'run_name' not in execution_telemetry:
                for key in ('RUN_TAG', 'run_tag', 'RUN_NAME', 'run_name'):
                    if key in exec_globals:
                        execution_telemetry['run_name'] = exec_globals[key]
                        break

            if 'config' in exec_globals:
                execution_telemetry['config'] = exec_globals['config']

            execution_stats['tasks_executed'] += 1
            tracker.log_phase_completion('execution', 'pass')
            ctx['execution_success'] = True
            ctx['execution_telemetry'] = execution_telemetry
            ctx['_execution_runs'] += 1

            # Post-execution hooks for task-specific artifacts
            try:
                from pathlib import Path
                import shutil

                if task_id == "W1-003":
                    source_path = Path("/content/test_data/data/week1_query_set.json")
                    fallback_path = Path("/content/data/week1_query_set.json")
                    if source_path.exists():
                        fallback_path.parent.mkdir(parents=True, exist_ok=True)
                        shutil.copy(source_path, fallback_path)
                        print(f"      ℹ️ Synced query set to {fallback_path}")
            except Exception as hook_err:
                print(f"      ⚠️ Post-execution hook failed: {hook_err}")

            try:
                _verify_research_outputs(task_id)
            except FileNotFoundError as validation_err:
                raise RuntimeError(str(validation_err)) from validation_err

            return True

        except SystemExit as e:
            print(f"      ❌ Block {idx + 1} called sys.exit({e.code})")
            execution_telemetry['error'] = f"sys.exit({e.code})"
            execution_telemetry['error_type'] = 'SystemExit'
            execution_telemetry['exit_code'] = e.code or 1
            break
        except Exception as e:
            print(f"      ❌ Block {idx + 1} failed: {e}")
            execution_telemetry['error'] = str(e)
            execution_telemetry['error_type'] = type(e).__name__
            execution_telemetry['exit_code'] = 1
            break

    ctx['execution_success'] = False
    ctx['execution_telemetry'] = execution_telemetry
    ctx['_execution_runs'] += 1
    tracker.log_phase_completion('execution', 'fail', {'error': execution_telemetry.get('error')})
    return False


# ═══════════════════════════════════════════════════════════════
# PHASE 5.5: EXECUTION RETRY
# ═══════════════════════════════════════════════════════════════

def phase5_retry(ctx: Dict[str, Any]) -> bool:
    """Phase 5.5 – Execution retry with error feedback and reflection fallback."""
    from pathlib import Path

    task_id = ctx['task_id']
    agent_responses = ctx['agent_responses']
    execution_telemetry = ctx.get('execution_telemetry', {'task_id': task_id})
    error_message = execution_telemetry.get('error', 'Unknown error')
    error_type = execution_telemetry.get('error_type', 'Exception')

    tracker.activate_task(task_id)

    print(f"\n🔄 PHASE 5.5: Execution Retry ({task_id})")

    retry_prompt = f"""# PHASE 5.5: EXECUTION RETRY REQUEST

Your code for Task {task_id} **FAILED DURING EXECUTION**.

## FAILURE DETAILS
**Error Type:** {error_type}
**Error Message:**
```
{error_message}
```
**Exit Code:** {execution_telemetry.get('exit_code', 1)}

---

## TEAM FEEDBACK (Snapshot)
### Quality & Safety Officer
```
{agent_responses.get('quality_safety', '')[:900]}...
```

### Infrastructure Monitor
```
{agent_responses.get('infrastructure', '')[:900]}...
```

---

## BEFORE CODING
- Review execution logs and traceback
- Close any unfinished code blocks (end with ``` on its own line)
- Address Quality & Safety + Infrastructure feedback
- Ensure CLIP validation includes BOTH text and images when applicable
- NEVER call sys.exit(); raise exceptions instead
- For Task W1-004: log per-layer entropy/dispersion statistics to MLflow **and** save `results/week1/clip_baseline_metrics.json`
- For Task W1-006: generate `results/week1/statistical_analysis.md`, `results/week1/hypothesis_tests.json`, and `results/week1/effect_sizes.csv`

---

## Response Format
**Ops Commander Final Verdict:** ✅ APPROVED (execution retry)

## Root Cause Analysis
- [Explain exact failure]
- [Reference peer feedback used in the fix]

## Fixes Applied
- [List code changes]
- [Call out MLflow / CLIP handling if relevant]

## Corrected Implementation
```python
# COMPLETE, EXECUTABLE FIXED CODE
```
"""

    retry_success = False
    use_reflection_retry = False

    print("   🔧 Requesting fixed implementation from Ops Commander (V3.6)...")
    try:
        fixed_ops_response = ops_commander.respond(retry_prompt)
        agent_responses['ops_commander'] = fixed_ops_response
        tracker.log_retry_attempt('execution', patch_id='execution_retry_v3.6', confidence=0.75)

        code_blocks = _extract_code_blocks(fixed_ops_response)
        if not code_blocks:
            print("   ⚠️ No code blocks returned in retry response")
            use_reflection_retry = True
        else:
            # Ensure no dangling MLflow runs carry over between code blocks
            try:
                import mlflow

                active = mlflow.active_run()
                if active:
                    print("      ℹ️ Ending previous MLflow run before executing retry code block")
                    mlflow.end_run()
            except ModuleNotFoundError:
                pass
            except Exception as mlflow_err:
                print(f"      ⚠️ Unable to finalize prior MLflow run before retry: {mlflow_err}")

            ensure_clip_attention_patch()
            _ensure_numpy_json_patch()
            print(f"   📦 Found {len(code_blocks)} code block(s) in retry response")
            for idx, code in enumerate(code_blocks):
                try:
                    ensure_clip_attention_patch()
                    _ensure_numpy_json_patch()
                    exec_globals = {
                        '__name__': '__main__',
                        'MULTI_AGENT_ROOT': MULTI_AGENT_ROOT,
                        'Path': Path,
                        'ensure_clip_attention_patch': ensure_clip_attention_patch,
                    }
                    exec(code, exec_globals)
                    print(f"      ✅ Retry block {idx + 1} executed successfully")

                    if 'run_id' in exec_globals:
                        execution_telemetry['run_id'] = exec_globals['run_id']

                    execution_stats['retry_attempts']['execution'] += 1
                    tracker.log_phase_completion('execution', 'pass')
                    ctx['execution_success'] = True
                    ctx['execution_telemetry'] = execution_telemetry
                    ctx['_execution_runs'] += 1
                    return True
                except SystemExit as e:
                    print(f"      ❌ Retry block {idx + 1} called sys.exit({e.code})")
                    execution_telemetry['error'] = f"sys.exit({e.code})"
                    execution_telemetry['error_type'] = 'SystemExit'
                    execution_telemetry['exit_code'] = e.code or 1
                    use_reflection_retry = True
                    break
                except Exception as e:
                    print(f"      ❌ Retry block {idx + 1} failed: {e}")
                    execution_telemetry['error'] = str(e)
                    execution_telemetry['error_type'] = type(e).__name__
                    execution_telemetry['exit_code'] = 1
                    use_reflection_retry = True
                    break
    except Exception as e:
        print(f"   ⚠️ Retry request failed: {e}")
        use_reflection_retry = True

    if not retry_success and use_reflection_retry:
        print("   🧠 Falling back to reflection-guided retry...")
        error_msg_lower = str(execution_telemetry.get('error', '')).lower()
        error_hints = []

        if 'pixel_values' in error_msg_lower or 'pixel values' in error_msg_lower:
            error_hints.append({
                'pattern': 'pixel_values_missing',
                'diagnosis': 'CLIP/model requires image input',
                'fix': 'Load test image via Image.open(os.environ.get("TEST_IMAGE_PATH")) and pass as images=',
                'confidence': 0.95,
            })
            print("      💡 Hint: Include images when using CLIP processor")

        if any(term in error_msg_lower for term in ['no attention', 'attention layers', 'attentions']):
            error_hints.append({
                'pattern': 'attention_layers_missing',
                'diagnosis': 'CLIP attention tensors not returned',
                'fix': 'Call model(..., output_attentions=True) and set attn_implementation="eager"',
                'confidence': 0.95,
            })
            print("      💡 Hint: Enable output_attentions for CLIP")

        if 'import' in error_msg_lower or 'module' in error_msg_lower:
            error_hints.append({
                'pattern': 'import_error',
                'diagnosis': 'Missing import statement',
                'fix': 'Add required import at top of code block',
                'confidence': 0.90,
            })
            print("      💡 Hint: Missing import detected")

        if 'nameerror' in error_msg_lower or 'not defined' in error_msg_lower:
            error_hints.append({
                'pattern': 'undefined_variable',
                'diagnosis': 'Variable or function referenced before definition',
                'fix': 'Define variable/function before use',
                'confidence': 0.85,
            })
            print("      💡 Hint: Undefined variable or function")

        if error_hints:
            execution_telemetry['error_hints'] = error_hints
            execution_telemetry['error_pattern_detected'] = error_hints[0]['pattern']

        reflection_result = process_execution_reflection(
            multi_agent_root=str(MULTI_AGENT_ROOT),
            task_id=task_id,
            phase='execution',
            telemetry=execution_telemetry,
            attempt_count=ctx.get('_execution_runs', 0),
        )
        tracker.log_reflection_note(reflection_result.get('reflection_note', ''))

        policy = reflection_result.get('policy')
        risk_scores = reflection_result.get('risk_scores', {})
        confidence = risk_scores.get('confidence', 0.0)

        if policy == 'RETRY' and confidence >= retry_mechanisms.tau:
            retry_success, updated_task = retry_mechanisms.phase_5_5_execution_retry(
                task_result=ctx.get('task_result', {}),
                execution_telemetry=execution_telemetry,
                attempt_count=ctx.get('_execution_runs', 0),
            )

            if retry_success:
                print(f"      ✅ Reflection-approved patch applied (confidence {confidence:.2f})")
                tracker.log_retry_attempt('execution', patch_id=updated_task.get('patch_applied'), confidence=confidence)
                ctx.setdefault('task_result', {}).update(updated_task)
                ctx['execution_success'] = True
                ctx['execution_telemetry'] = execution_telemetry
                ctx['_execution_runs'] += 1
                return True
            else:
                print(f"      ⚠️ Reflection retry skipped: {updated_task.get('retry_reason', 'unknown reason')}")
        else:
            print(f"      ⚠️ Reflection policy={policy} confidence={confidence:.2f} – no automated retry")

    ctx['execution_success'] = False
    ctx['execution_telemetry'] = execution_telemetry
    ctx['_execution_runs'] += 1
    tracker.log_phase_completion('execution', 'fail', {'retry': 'exhausted'})
    return False


# ═══════════════════════════════════════════════════════════════
# PHASE 6: VERIFICATION
# ═══════════════════════════════════════════════════════════════

def phase6_verify(ctx: Dict[str, Any]) -> bool:
    """Phase 6 – Verify execution results."""
    task_id = ctx['task_id']
    action = ctx.get('action', '')
    execution_telemetry = ctx.get('execution_telemetry', {})

    tracker.activate_task(task_id)

    print(f"\n🔍 PHASE 6: Verification ({task_id})")

    requires_mlflow = any(
        kw in action.lower() for kw in ['execute', 'run', 'experiment', 'diagnostic', 'train']
    )

    run_id = execution_telemetry.get('run_id')
    run_name = execution_telemetry.get('run_name')

    if requires_mlflow:
        candidate_globals = {'run_id': run_id}
        if run_name:
            candidate_globals['run_name'] = run_name
            candidate_globals['RUN_TAG'] = run_name

        config = execution_telemetry.get('config')
        if isinstance(config, dict):
            experiment_name = config.get('experiment') or config.get('EXPERIMENT_NAME')
            if experiment_name:
                candidate_globals['EXPERIMENT_NAME'] = experiment_name

        if run_id and (len(str(run_id)) != 32 or not re.fullmatch(r'[0-9a-f]{32}', str(run_id))):
            resolved_run_id = _resolve_mlflow_run_id(candidate_globals)
            if resolved_run_id and resolved_run_id != run_id:
                print(f"   ℹ️ Resolved MLflow run_id → {resolved_run_id}")
                run_id = resolved_run_id
                execution_telemetry['run_id'] = resolved_run_id

    if requires_mlflow and not run_id:
        print("   ❌ No run_id found for MLflow-required task")
        tracker.log_phase_completion('verification', 'fail', {'reason': 'no_run_id'})
        ctx['verification_passed'] = False
        return False

    if not requires_mlflow:
        print("   ℹ️  No MLflow verification required for this task")
        tracker.log_phase_completion('verification', 'pass', {'reason': 'no_verification_required'})
        ctx['verification_passed'] = True
        return True

    try:
        import mlflow
        run = mlflow.get_run(run_id)
        print(f"   ✅ MLflow run verified: {run_id}")
        tracker.log_phase_completion('verification', 'pass')
        ctx['verification_passed'] = True
        return True
    except Exception as e:
        print(f"   ❌ Verification failed: {e}")
        tracker.log_phase_completion('verification', 'fail', {'error': str(e)})
        ctx['verification_passed'] = False
        return False


# ═══════════════════════════════════════════════════════════════
# PHASE 6.5: VERIFICATION RETRY
# ═══════════════════════════════════════════════════════════════

def phase6_retry(ctx: Dict[str, Any]) -> bool:
    """Phase 6.5 – Verification retry."""
    task_id = ctx['task_id']
    tracker.activate_task(task_id)
    print(f"\n🔍 PHASE 6.5: Verification Retry ({task_id})")

    print("   🔧 Requesting verification fix...")
    fixed_ops_response = ops_commander.respond(f"""# VERIFICATION FAILED

Provide code with proper MLflow logging:

```python
import mlflow
with mlflow.start_run() as run:
    run_id = run.info.run_id
    mlflow.log_metric("metric", 1.0)
print(f"run_id = {{run_id}}")
```
""")

    code_blocks = _extract_code_blocks(fixed_ops_response)
    if not code_blocks:
        return False

    execution_telemetry = ctx.get('execution_telemetry', {})
    for idx, code in enumerate(code_blocks):
        try:
            exec_globals = {'__name__': '__main__', 'MULTI_AGENT_ROOT': MULTI_AGENT_ROOT, 'Path': Path}
            exec(code, exec_globals)

            if 'run_id' in exec_globals:
                run_id = exec_globals['run_id']
                execution_telemetry['run_id'] = run_id
                print(f"      ✅ Captured run_id: {run_id}")

                import mlflow
                mlflow.get_run(run_id)
                print(f"      ✅ MLflow run verified")

                execution_stats['retry_attempts']['verification'] += 1
                tracker.log_phase_completion('verification', 'pass')
                ctx['verification_passed'] = True
                return True
        except Exception as e:
            print(f"      ❌ Retry failed: {e}")

    return False


# ═══════════════════════════════════════════════════════════════
# PHASE 7: FINAL STATUS DETERMINATION
# ═══════════════════════════════════════════════════════════════

def phase7_finalize(ctx: Dict[str, Any]) -> None:
    """Phase 7 – Determine final task status."""
    task_id = ctx['task_id']
    preliminary_status = ctx.get('preliminary_status')
    execution_success = ctx.get('execution_success', False)
    verification_passed = ctx.get('verification_passed', False)

    tracker.activate_task(task_id)

    print(f"\n📊 PHASE 7: Final Status ({task_id})")

    if preliminary_status == "approved" and execution_success and verification_passed:
        final_status = "completed"
        print(f"   ✅ Task completed successfully")
        execution_stats['tasks_completed'] += 1
        tracker.complete_task(status='completed', task_id=task_id, final_status_reason='Approved + evidence verified')
    elif preliminary_status == "approved" and execution_success:
        final_status = "failed"
        print(f"   ❌ Task FAILED - Execution succeeded but verification failed")
        tracker.complete_task(status='failed', task_id=task_id, final_status_reason='Verification failed')
    elif preliminary_status == "approved":
        final_status = "failed"
        print(f"   ❌ Task FAILED - Execution failed")
        tracker.complete_task(status='failed', task_id=task_id, final_status_reason='Execution failure')
    else:
        final_status = "rejected"
        print(f"   ❌ Task REJECTED - Did not pass approval gate")
        tracker.complete_task(status='rejected', task_id=task_id, final_status_reason='Rejected at approval gate')

    ctx['final_status'] = final_status

    task_timers = getattr(tracker, "task_timers", {})
    duration = task_timers.get(task_id, {}).get('duration', 0)
    total_retries = sum(execution_stats['retry_attempts'].values())

    print(f"   ✅ Task completed in {duration:.1f}s - Status: {final_status}")
    print(f"   🔄 Total retries: {total_retries}")

    tracker.log_phase_completion('finalize', 'pass', {
        'final_status': final_status,
        'duration': duration,
        'total_retries': total_retries
    })



In [16]:
#cell_9

def prepare_execution_cycle(selected_task_ids: Optional[List[str]] = None) -> List[str]:
    """Prepare task contexts (Phase 0/2 artifacts must already exist)."""
    if task_contexts:
        print("\nℹ️  Existing task contexts detected — call reset_execution_state() for a clean slate.")

    ensure_phase_0()

    tasks = pending_actions.get('tasks', [])
    if not tasks:
        print("\n⚠️  WARNING: No tasks found in pending_actions.json!")
        return []

    print(f"\n📋 Found {len(tasks)} tasks to process")
    sorted_tasks = sorted(
        tasks,
        key=lambda t: {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}.get(t.get('priority', 'LOW'), 2),
    )

    prepared_ids: List[str] = []
    for task in sorted_tasks:
        if selected_task_ids and task['task_id'] not in selected_task_ids:
            continue
        ctx = initialize_task_context(task)
        prepared_ids.append(ctx['task_id'])

    if not prepared_ids:
        print("\n⚠️  WARNING: No matching tasks found for the provided filters.")
        return []

    global current_task_order
    current_task_order = prepared_ids

    print(f"   ✅ Prepared {len(prepared_ids)} task(s) for execution phases")
    return prepared_ids


def _iter_task_contexts(task_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Return task contexts respecting the original scheduling order."""
    if not task_contexts:
        raise RuntimeError("No task contexts prepared. Run prepare_execution_cycle() first.")

    if task_ids:
        missing = [tid for tid in task_ids if tid not in task_contexts]
        if missing:
            print(f"\n⚠️  WARNING: Unknown task IDs requested: {', '.join(missing)}")
        ordered_ids = [tid for tid in task_ids if tid in task_contexts]
    else:
        ordered_ids = current_task_order or list(task_contexts.keys())

    return [task_contexts[tid] for tid in ordered_ids]


def run_phase3_batch(task_ids: Optional[List[str]] = None) -> None:
    """Execute Phase 3 (agent collection) for the prepared tasks."""
    for ctx in _iter_task_contexts(task_ids):
        phase3_collect(ctx)


def run_phase4_batch(task_ids: Optional[List[str]] = None) -> Dict[str, str]:
    """Execute Phase 4 (approval gate) and return status per task."""
    results: Dict[str, str] = {}
    for ctx in _iter_task_contexts(task_ids):
        status = phase4_gate(ctx)
        results[ctx['task_id']] = status
    return results


def run_phase4_5_batch(task_ids: Optional[List[str]] = None) -> Dict[str, str]:
    """Execute Phase 4.5 (approval retry) for rejected tasks."""
    results: Dict[str, str] = {}
    for ctx in _iter_task_contexts(task_ids):
        if ctx.get('preliminary_status') != "rejected":
            continue
        status = phase4_5_approval_retry(ctx)
        results[ctx['task_id']] = status
    return results


def run_phase5_batch(task_ids: Optional[List[str]] = None) -> Dict[str, bool]:
    """Execute Phase 5 (implementation execution) for approved tasks."""
    results: Dict[str, bool] = {}
    for ctx in _iter_task_contexts(task_ids):
        if ctx.get('preliminary_status') != "approved":
            continue
        success = phase5_execute(ctx)
        results[ctx['task_id']] = success
    return results


def run_phase5_retry_batch(task_ids: Optional[List[str]] = None) -> Dict[str, bool]:
    """Execute Phase 5.5 (execution retry) for tasks that failed execution."""
    results: Dict[str, bool] = {}
    for ctx in _iter_task_contexts(task_ids):
        if ctx.get('preliminary_status') != "approved" or ctx.get('execution_success'):
            continue
        success = phase5_retry(ctx)
        results[ctx['task_id']] = success
    return results


def run_phase6_batch(task_ids: Optional[List[str]] = None) -> Dict[str, bool]:
    """Execute Phase 6 (verification) for successfully executed tasks."""
    results: Dict[str, bool] = {}
    for ctx in _iter_task_contexts(task_ids):
        if not ctx.get('execution_success'):
            continue
        passed = phase6_verify(ctx)
        results[ctx['task_id']] = passed
    return results


def run_phase6_retry_batch(task_ids: Optional[List[str]] = None) -> Dict[str, bool]:
    """Execute Phase 6.5 (verification retry) for tasks that failed verification."""
    results: Dict[str, bool] = {}
    for ctx in _iter_task_contexts(task_ids):
        if not ctx.get('execution_success') or ctx.get('verification_passed'):
            continue
        passed = phase6_retry(ctx)
        results[ctx['task_id']] = passed
    return results


def run_phase7_batch(task_ids: Optional[List[str]] = None) -> None:
    """Execute Phase 7 (finalization) for the selected tasks."""
    for ctx in _iter_task_contexts(task_ids):
        phase7_finalize(ctx)


def run_execution_cycle(selected_task_ids: Optional[List[str]] = None) -> None:
    """Run Phases 3–7 end-to-end for the pending actions queue."""
    prepared_ids = prepare_execution_cycle(selected_task_ids)
    if not prepared_ids:
        return

    run_phase3_batch(prepared_ids)
    run_phase4_batch(prepared_ids)

    rejected_ids = [
        tid for tid in prepared_ids
        if task_contexts[tid].get('preliminary_status') == "rejected"
    ]
    if rejected_ids:
        run_phase4_5_batch(rejected_ids)

    approved_ids = [
        tid for tid in prepared_ids
        if task_contexts[tid].get('preliminary_status') == "approved"
    ]

    if approved_ids:
        run_phase5_batch(approved_ids)
        run_phase5_retry_batch(approved_ids)

        execution_success_ids = [
            tid for tid in approved_ids
            if task_contexts[tid].get('execution_success')
        ]

        if execution_success_ids:
            run_phase6_batch(execution_success_ids)
            run_phase6_retry_batch(execution_success_ids)

    run_phase7_batch(prepared_ids)

    print("\n" + "=" * 70)
    print("✅ ALL TASKS PROCESSED")
    print("=" * 70)
    print_execution_summary()


def print_execution_summary() -> None:
    summary = tracker.get_summary()
    print(f"   Tasks Processed: {summary['total_tasks']}")
    print(f"   Tasks Completed: {summary['completed']}")
    print(f"   Tasks Failed: {summary['failed']}")
    print(f"   Total Retries: {sum(summary['retry_stats'].values())}")
    print(f"     - Approval: {summary['retry_stats']['approval_retries']}")
    print(f"     - Execution: {summary['retry_stats']['execution_retries']}")
    print(f"     - Verification: {summary['retry_stats']['verification_retries']}")
    print(f"   Total Duration: {summary['total_duration_seconds']:.1f}s")
    print("\n   Continue to Phase 8 (Reporting & Handoff)")




In [17]:
#cell_10

def rerun_execution_phase(task_id: str, phase: str) -> None:
    ctx = task_contexts.get(task_id)
    if not ctx:
        print(f"❌ Task {task_id} not found in cached contexts.")
        return

    phase_key = phase.lower()
    if phase_key in {"5", "phase5", "execution"}:
        phase5_execute(ctx, rerun=True)
    elif phase_key in {"5.5", "phase5.5", "execution_retry"}:
        phase5_retry(ctx)
    elif phase_key in {"6", "phase6", "verification"}:
        phase6_verify(ctx)
    elif phase_key in {"6.5", "phase6.5", "verification_retry"}:
        phase6_retry(ctx)
    else:
        print("⚠️ Unknown phase. Valid options: phase5, phase5.5, phase6, phase6.5")


def export_task_contexts(label: str = "execution_cycle") -> Path:
    export_dir = Path(MULTI_AGENT_ROOT) / "ledgers" / "execution_trajectories"
    export_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
    export_path = export_dir / f"{label}_{timestamp}.json"

    exportable = {}
    for task_id, ctx in task_contexts.items():
        exportable[task_id] = {
            'task': ctx['task'],
            'preliminary_status': ctx.get('preliminary_status'),
            'execution_success': ctx.get('execution_success'),
            'verification_passed': ctx.get('verification_passed'),
            'execution_telemetry': ctx.get('execution_telemetry'),
            'agent_responses': ctx.get('agent_responses'),
        }

    with export_path.open('w') as f:
        json.dump(exportable, f, indent=2)

    print(f"💾 Task contexts exported to {export_path}")
    return export_path


In [18]:
#cell_clip_patch
_CLIP_ATTENTION_PATCHED = False


def _apply_clip_attention_patch() -> None:
    """Force CLIP models to use eager attention so output_attentions is supported."""
    global _CLIP_ATTENTION_PATCHED
    if _CLIP_ATTENTION_PATCHED:
        return

    try:
        from transformers import CLIPModel, CLIPTextModel, CLIPVisionModel
    except Exception as err:
        print(f"⚠️ Unable to patch CLIP attention defaults: {err}")
        return

    def _set_config(config):
        if hasattr(config, "attn_implementation"):
            config.attn_implementation = "eager"
        if hasattr(config, "output_attentions"):
            config.output_attentions = True

    def _patch_from_pretrained(cls, original_fn):
        def wrapper(*args, **kwargs):
            model = original_fn(*args, **kwargs)
            if hasattr(model, "text_model") and hasattr(model.text_model, "config"):
                _set_config(model.text_model.config)
            if hasattr(model, "vision_model") and hasattr(model.vision_model, "config"):
                _set_config(model.vision_model.config)
            return model

        return wrapper

    CLIPModel.from_pretrained = _patch_from_pretrained(CLIPModel, CLIPModel.from_pretrained)
    CLIPTextModel.from_pretrained = _patch_from_pretrained(CLIPTextModel, CLIPTextModel.from_pretrained)
    CLIPVisionModel.from_pretrained = _patch_from_pretrained(CLIPVisionModel, CLIPVisionModel.from_pretrained)

    _CLIP_ATTENTION_PATCHED = True
    print("✅ Patched CLIP models to use eager attention implementation by default")


def configure_clip_attn_defaults() -> None:
    _apply_clip_attention_patch()


def ensure_clip_attention_patch() -> None:
    _apply_clip_attention_patch()


configure_clip_attn_defaults()


✅ Patched CLIP models to use eager attention implementation by default


In [69]:
pending_ids = [
    task_id
    for task_id, ctx in task_contexts.items()
    if ctx.get('final_status') != 'completed'
]

print("Pending tasks:", pending_ids)


Pending tasks: []


In [26]:
hasattr(tracker, "activate_task")


True

In [53]:
# Phase 0 sanity check (run once before executing W1-006)
import os

ensure_phase_0()
print("Phase 0 complete.")
print("TEST_IMAGE_PATH:", os.environ.get("TEST_IMAGE_PATH"))
print("TEST_DATA_DIR:", os.environ.get("TEST_DATA_DIR"))


Phase 0 complete.
TEST_IMAGE_PATH: None
TEST_DATA_DIR: None
Creating test images for model validation...
Created 5 test images at /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/test_data
 Primary test image: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/test_data/test_image_0.png
   📷 Downloaded Pexels test image → /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/test_data/test_image_pexels.jpg
  Phase 0 infrastructure setup complete

Phase 0 complete.
TEST_IMAGE_PATH: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/test_data/test_image_pexels.jpg
TEST_DATA_DIR: /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clea

In [54]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-006']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

print("\n====== EXECUTION SUMMARY ======")
print_execution_summary()


ℹ️  Existing task contexts detected — call reset_execution_state() for a clean slate.

📋 Found 8 tasks to process

🚀 Starting Task W1-006: Statistical significance analysis of Week 1 results
   Priority: MEDIUM
   ✅ Prepared 1 task(s) for execution phases

📋 PHASE 3: Implementation (W1-006)
   ✅ ops_commander responded (17742 chars)
   ✅ quality_safety responded (2995 chars)
   ℹ️ Infrastructure fallback review applied
   ✅ infrastructure responded (906 chars)
   ✅ Phase implementation: pass

🔐 PHASE 4: Approval Gate (W1-006)

   🔍 Approval Gate Analysis:
      Ops: APPROVED (conf: 0.90)
      Quality: APPROVED (conf: 0.90)
      Infrastructure: APPROVED (conf: 0.90)
      ✅ ALL GATES APPROVED (avg confidence: 0.90)
   ✅ Phase approval: pass

⚡ PHASE 5: Execution (W1-006)
   📦 Found 1 code block(s)
🔧 Infrastructure setup - Device: cpu
🔬 Week 1 Statistical Analysis Starting...
🔧 Infrastructure setup - Device: cpu
🔧 Loading CLIP model on cpu
✅ CLIP validated - Vision: 12, Text: 12 layer

In [None]:
# load or paste your report text
big_text_or_file_content = """
... your Week 1 GO/NO-GO report text here ...
"""

# send it through the relay
result = send_go_nogo_report(big_text_or_file_content)
print(result["content"][0]["text"])

In [55]:
import json

print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")


   Tasks Processed: 1
   Tasks Completed: 1
   Tasks Failed: 0
   Total Retries: 10
     - Approval: 3
     - Execution: 2
     - Verification: 0
   Total Duration: 4750.9s

   Continue to Phase 8 (Reporting & Handoff)
💾 Task contexts exported to /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/ledgers/execution_trajectories/week1_run_20251022T030007Z.json
Exported task contexts to /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/ledgers/execution_trajectories/week1_run_20251022T030007Z.json


In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-006']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

print("\n====== EXECUTION SUMMARY ======")
print_execution_summary()

NameError: name 'prepare_execution_cycle' is not defined

In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-006']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

print("\n====== EXECUTION SUMMARY ======")
print_execution_summary()

NameError: name 'prepare_execution_cycle' is not defined

In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-006']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

print("\n====== EXECUTION SUMMARY ======")
print_execution_summary()

NameError: name 'prepare_execution_cycle' is not defined

In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-006']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

print("\n====== EXECUTION SUMMARY ======")
print_execution_summary()

NameError: name 'prepare_execution_cycle' is not defined

In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-004']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

print("\n====== EXECUTION SUMMARY ======")
print_execution_summary()

In [41]:
import json

print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")

   Tasks Processed: 1
   Tasks Completed: 1
   Tasks Failed: 0
   Total Retries: 10
     - Approval: 3
     - Execution: 2
     - Verification: 0
   Total Duration: 2293.7s

   Continue to Phase 8 (Reporting & Handoff)
💾 Task contexts exported to /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/ledgers/execution_trajectories/week1_run_20251022T021910Z.json
Exported task contexts to /Users/guyan/Library/CloudStorage/GoogleDrive-rc989@cornell.edu/我的云端硬盘/cv_multimodal/project/computer-vision-clean/multi-agent/ledgers/execution_trajectories/week1_run_20251022T021910Z.json


In [None]:


# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-005']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

print("\n====== EXECUTION SUMMARY ======")
print_execution_summary()


In [None]:
import json

# Phase 8 summary + exports
print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")


In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-006']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

In [None]:
import json

# Phase 8 summary + exports
print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")


In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-007']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

In [None]:
import json

# Phase 8 summary + exports
print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")

In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-008']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

In [None]:
import json

# Phase 8 summary + exports
print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")

In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-001']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

In [None]:
import json

# Phase 8 summary + exports
print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")

In [None]:
# Only prepare the second task (add more IDs to the list if you want to batch others)
selected_tasks = ['W1-002']
prepared = prepare_execution_cycle(selected_tasks)

if not prepared:
    raise RuntimeError("Task preparation failed; confirm the task ID appears in pending_actions.json")

# Phase 3 → 7, end-to-end
run_phase3_batch(prepared)
run_phase4_batch(prepared)

# Retry cycle only kicks in when Phase 4 rejected something
rejected = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'rejected']
if rejected:
    print("\n🔄 Running approval retries...")
    run_phase4_5_batch(rejected)

approved = [tid for tid in prepared if task_contexts[tid].get('preliminary_status') == 'approved']
if approved:
    run_phase5_batch(approved)
    run_phase5_retry_batch(approved)

    executed = [tid for tid in approved if task_contexts[tid].get('execution_success')]
    if executed:
        run_phase6_batch(executed)
        run_phase6_retry_batch(executed)

run_phase7_batch(prepared)

In [None]:
import json

# Phase 8 summary + exports
print_execution_summary()

summary = tracker.get_summary()
with open('week1_execution_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

snapshot_path = export_task_contexts("week1_run")
print(f"Exported task contexts to {snapshot_path}")