In [1]:
import kagglehub
import pandas as pd
import numpy as np
import os
import getpass
from openai import OpenAI
import scipy
from langchain_openai import ChatOpenAI
from typing import List, Any, Optional, Dict
from typing_extensions import TypedDict
import re
from pprint import pprint
from pydantic import BaseModel, Field, field_validator
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage
from langchain.schema import HumanMessage, SystemMessage
import time
from pathlib import Path
from langchain_community.callbacks import get_openai_callback
import re
import matplotlib.pyplot as plt
import io
import contextlib
import base64
import plotly
from category_encoders import *
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import psutil
import warnings
import seaborn as sns
import sys
warnings.filterwarnings('ignore')
from dotenv import load_dotenv
from dataclasses import dataclass, field

# Import iterative system components
from core.planner_agent import PlannerAgent, PlannerAgentData
from orchestrators.orchestrator import IterativeOrchestrator
from reporting.exporters import IterativeReportExporter
from reporting.task_boards import TaskChecklist
from reporting.summarizer import ReportSummarizer
from core.pipeline_state import PipelineState
from reporting.QA import QualityAssurance
from reporting.validator import unit_test_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure the two chat models used by the pipeline.
#
# The key is read from the OPENAI_API_KEY environment variable. load_dotenv()
# is called first so the key may also live in a local .env file; by default it
# does not override variables that are already set in the environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    print("⚠️ Warning: OPENAI_API_KEY not found. Please set your API key.")

# `llm`: smaller model, higher temperature (handed to the orchestrator as `llm`).
# Alternative model that was tried here: "gpt-4.1-mini".
llm = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0.5,
    openai_api_key=api_key
)

# `llm_coder`: larger model, lower temperature (used for code generation and
# for the summary / QA / checklist reports further down).
# Alternative model that was tried here: "gpt-4.1-nano".
llm_coder = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0.2,
    openai_api_key=api_key
)

# Only claim success when a key was actually found; otherwise the warning
# above would be immediately contradicted by this message.
if api_key:
    print("✅ LLM configured successfully!")

✅ LLM configured successfully!


In [3]:
# Load the pumpkin-seed dataset from the Excel workbook under data/.
# (A separate test split could be loaded the same way, e.g.
#  df_test = pd.read_csv('data/df_test.csv') — currently disabled.)
df = pd.read_excel('data/Pumpkin_Seeds_Dataset.xlsx')

# Quick sanity report: size and column names of what was just loaded.
n_rows, n_cols = df.shape
print(f"✅ Dataset loaded: {n_rows} rows, {n_cols} columns")
print(f"📋 Columns: {list(df.columns)}")


✅ Dataset loaded: 2500 rows, 13 columns
📋 Columns: ['Area', 'Perimeter', 'Major_Axis_Length', 'Minor_Axis_Length', 'Convex_Area', 'Equiv_Diameter', 'Eccentricity', 'Solidity', 'Extent', 'Roundness', 'Aspect_Ration', 'Compactness', 'Class']


In [4]:
# Build a plain-text (Markdown-flavoured) description of the dataset:
# column dtypes, describe() statistics for all columns, and the column names.
column_info = df.dtypes.to_string()
summary_stats = df.describe(include='all').to_string()
col_names = ", ".join(df.columns)

# Sections are joined with newlines; the empty strings produce the blank
# line that separates one section from the next.
context = "\n".join([
    "## Dataset: Pumpkin Seed Data",
    "",
    "### Schema:",
    column_info,
    "",
    "### Summary Statistics:",
    summary_stats,
    "",
    "### Column Names:",
    col_names,
])

In [5]:
# Project configuration

# Name of this run: passed to PipelineState and used as a prefix in the
# generated report filenames.
PROJECT_NAME = "pumpkin_seeds_iterative_v2"

# Pipeline phases, in execution order. The resume logic looks up the last
# completed phase in this list and continues with the entries after it.
PIPELINE_TASKS = [
    "Exploratory Data Analysis (EDA)",
    "Feature Engineering",
    "Model Selection & Evaluation",
]

In [6]:
# Initialize the pipeline state for this project and decide which phases still
# have to run: either all of PIPELINE_TASKS (fresh start) or only those after
# the last phase found in the project's cache (resume).
pipeline_state = PipelineState(project_name=PROJECT_NAME)

# Phases already recorded for this project (empty on a fresh run)
completed_phases = pipeline_state.get_completed_phases()
print(f"\n📁 Iterative Project: {PROJECT_NAME}")

# Defaults: run every phase on the freshly loaded dataframe
remaining_tasks = PIPELINE_TASKS
current_df = df

if completed_phases:
    print(f"✅ Found existing phases: {completed_phases}")

    # Auto-resume from the last completed phase.
    # NOTE(review): assumes get_completed_phases() lists phases in completion
    # order, so [-1] is the most recent one — confirm against PipelineState.
    last_phase = completed_phases[-1]
    print(f"🔄 Auto-resuming from last completed phase: {last_phase}")
    pipeline_state.load_from_phase(last_phase)

    try:
        # .index() raises ValueError when the cached phase name is not one of
        # the configured PIPELINE_TASKS.
        last_index = PIPELINE_TASKS.index(last_phase) + 1
        remaining_tasks = PIPELINE_TASKS[last_index:]
        current_df = pipeline_state.df if pipeline_state.df is not None else df
        print(f"📊 Loaded dataframe shape: {current_df.shape}")
    except (ValueError, IndexError):
        print("❌ Error in resume logic, starting fresh")
        # "Fresh" means all phases on the original dataframe.
        remaining_tasks = PIPELINE_TASKS
        current_df = df
else:
    print("🆕 Starting new iterative project")
    print("🆕 Starting fresh iterative pipeline...")

# Execution Summary
print("\n📋 Iterative Execution Plan:")
print(f"   Project: {PROJECT_NAME}")
print("   Architecture: 3-Agent Iterative (Planner → Developer → Auditor → Developer)")
print(f"   Completed phases: {completed_phases}")
print(f"   Remaining phases: {remaining_tasks}")
print(f"   DataFrame shape: {current_df.shape}")

if not remaining_tasks:
    print("\n✅ All phases completed! Nothing to do.")
    print("💡 Tip: Change PROJECT_NAME or delete cache to start over")
    # Stop here. `exit()` is the interactive-shell helper installed by `site`
    # and, inside a Jupyter kernel, asks the kernel itself to shut down.
    # Raising SystemExit ends a script the same way but, in a notebook, only
    # aborts this cell (and a pending "Run All") while keeping the kernel alive.
    raise SystemExit

print(f"\n🚀 Will execute {len(remaining_tasks)} phase(s) using iterative process")



📁 Iterative Project: pumpkin_seeds_iterative_v2
🆕 Starting new iterative project
🆕 Starting fresh iterative pipeline...

📋 Iterative Execution Plan:
   Project: pumpkin_seeds_iterative_v2
   Architecture: 3-Agent Iterative (Planner → Developer → Auditor → Developer)
   Completed phases: []
   Remaining phases: ['Exploratory Data Analysis (EDA)', 'Feature Engineering', 'Model Selection & Evaluation']
   DataFrame shape: (2500, 13)

🚀 Will execute 3 phase(s) using iterative process


In [7]:
# Main execution loop.
#
# For every remaining phase: run the iterative orchestrator on the current
# dataframe, then produce the summary, cache entry, iterative reports,
# validation report, QA report, checklist report and a metrics file
# (wall-clock time and OpenAI token usage) under <save_dir>/reports.
for task in remaining_tasks:
    print("\n" + "="*60)
    print(f"🔄 Running iterative pipeline phase: {task}")
    print("="*60)

    pipeline_state.clear_validation_log()

    # Per-phase output location and the filesystem-friendly phase name shared
    # by the summary and validation reports.
    reports_dir = pipeline_state.save_dir / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    task_slug = task.lower().replace(' ', '_').replace('&', 'and')

    # Timer start
    start = time.perf_counter()
    print("⏱️ Timer started")

    # Token tracking. get_openai_callback is imported unconditionally, so it is
    # used directly; a `None` fallback could never be entered as a context
    # manager and would only defer the failure to the `with` statement.
    with get_openai_callback() as cb:
        # Get summary of previous phases
        prior_summary = pipeline_state.get_contextual_summary(last_n=1)
        if prior_summary:
            print(f"\n📄 Prior context loaded: {len(prior_summary)} characters")

        # Create and run iterative orchestrator
        print("\n🎯 Starting iterative orchestration...")
        orchestrator = IterativeOrchestrator(
            df=current_df,
            topic=task,
            llm=llm,
            llm_coder=llm_coder,
            summary=prior_summary,
            pipeline_state=pipeline_state
        )

        # Execute the 4-step iterative process
        phase_results = orchestrator.run()

        # Update DataFrames from orchestrator
        pipeline_state.df = orchestrator.executor.df
        # Carry the phase's dataframe forward so the next phase starts from it.
        # This mirrors the resume path, which also starts a phase from
        # pipeline_state.df; without it every phase would restart from the
        # originally loaded dataframe.
        if pipeline_state.df is not None:
            current_df = pipeline_state.df

        # Generate summary using existing summarizer
        summary_path = reports_dir / f"{PROJECT_NAME}_{task_slug}_summary.html"
        summarizer = ReportSummarizer(
            results=phase_results,
            task=task,
            output_path=summary_path,
            llm=llm_coder,
            pipeline_state=pipeline_state
        )
        summarizer.run()

        # Save phase to pipeline cache (the "images" entry of each result is
        # left out of the cached copy)
        pipeline_state.save_phase(
            task,
            [{k: v for k, v in r.items() if k != "images"} for r in phase_results]
        )

        # Generate iterative reports
        print(f"\n📄 Generating iterative reports for {task}...")

        iterative_exporter = IterativeReportExporter(task=task, results=phase_results)
        iterative_exporter.export_all(project_name=PROJECT_NAME, save_dir=reports_dir)

        # Validation reports (only when something was logged during the phase)
        validation_log = pipeline_state.get_validation_log()
        if validation_log:
            validation_filename = reports_dir / f"{PROJECT_NAME}_{task_slug}_validation.html"
            unit_test_report(
                validation_log=validation_log,
                phase_name=task,
                project_name=PROJECT_NAME,
                filename=validation_filename
            )

        # QA reports
        qa_agent = QualityAssurance(pipeline_state, llm=llm_coder)
        qa_agent.validate_results(phase_results, task)
        qa_agent.export_report(phase_name=task, fmt="csv", save_dir=reports_dir)

        # Checklist validation
        # NOTE(review): this filename keeps '&' and has no project prefix,
        # unlike the summary/validation reports; left unchanged so existing
        # artifacts keep their names.
        checklist_validator = TaskChecklist(llm=llm_coder, task=task)
        report = checklist_validator.validate_phase(task, phase_results)
        checklist_validator.export_report(
            report,
            reports_dir / f"{task.lower().replace(' ', '_')}_checklist_report.json"
        )

        # Timer end
        elapsed_sec = time.perf_counter() - start
        print("\n\n")
        print(f"⏱️  {task} took {elapsed_sec:.2f}s")
        print(f"🪙  Tokens — total: {cb.total_tokens}, prompt: {cb.prompt_tokens}, completion: {cb.completion_tokens}")

        # Save metrics. The text contains non-ASCII characters ("—", "→"), so
        # the encoding is fixed to UTF-8 instead of the platform default, which
        # cannot encode "→" on e.g. Windows cp1252.
        # NOTE(review): filename uses the raw phase name (spaces, '&', parens);
        # left unchanged so existing artifacts keep their names.
        metrics_file = reports_dir / f"{task}_iterative_metrics.txt"
        with open(metrics_file, "w", encoding="utf-8") as f:
            f.write(f"Iterative Process: {task} took {elapsed_sec:.2f}s\n")
            f.write(f"Tokens — total: {cb.total_tokens}, prompt: {cb.prompt_tokens}, completion: {cb.completion_tokens}\n")
            if hasattr(cb, "total_cost"):
                f.write(f"Estimated API cost: ${cb.total_cost:.4f}\n")
            f.write("Architecture: 3-Agent Iterative (Planner → Developer → Auditor → Developer)\n")

print("\n" + "="*60)
print("✅ ITERATIVE PIPELINE EXECUTION COMPLETED!")
print("="*60)

print("\n📊 Final Iterative Pipeline Summary:")
summary = pipeline_state.get_project_summary()
for key, value in summary.items():
    print(f"   {key}: {value}")

print("\n🔄 Process Architecture: 3-Agent Iterative")
print("   1. Planner: Strategic planning and task decomposition")
print("   2. Developer: Initial implementation")
print("   3. Auditor: Quality review and feedback")
print("   4. Developer: Refined implementation based on feedback")

print(f"\n📁 Generated Files (Project: {PROJECT_NAME}):")
# Not needed by the Path.glob() call below; kept because cells outside this
# view may rely on `glob` being imported here.
import glob
report_dir = pipeline_state.save_dir / "reports"
if report_dir.exists():
    for file_path in report_dir.glob("*"):
        if file_path.is_file():
            print(f"   📄 {file_path.name}")

print("\n💡 To view results:")
print(f"   📂 Check: {pipeline_state.save_dir}")
print(f"   📊 Reports: {report_dir}")
print("   🔄 Architecture: Iterative 3-Agent System")


🔄 Running iterative pipeline phase: Exploratory Data Analysis (EDA)
⏱️ Timer started

🎯 Starting iterative orchestration...
🔒 Data Integrity Validator initialized:
   Expected shape: (2500, 13)
   Essential columns: 13
   Target column: Class

🧭 Planning phase (single planner)...
📋 Planner produced 8 subtasks.

💻 Developer executing subtasks...
1. Data Overview and Summary Statistics
📝 No Code History Found
plan: Begin by loading the dataset and generating a comprehensive summary of the data including the count, mean, standard deviation, minimum, maximum, and quartiles for each numerical feature. This will provide an initial understanding of the data's central tendency and spread. Also, review the data types of each column to confirm they are appropriate for their content.
🟢 Auditor: ACCEPT
✅ Developer code executed successfully
📄 Extracted Code:
 import pandas as pd
import numpy as np

# Display summary statistics for numerical columns
print("Summary Statistics for Numerical Features

KeyboardInterrupt: 