# # OpenAD Code-Generation Benchmark Notebook
# This notebook benchmarks the OpenAD code-generation pipeline across multiple libraries (PyOD, PyGOD, Darts, sktime).
# It measures success rate, total runtime, InfoMiner durations, and LLM token usage, then exports results.json and summary tables.

In [None]:
import os, sys, types, json
%pip install tiktoken faiss-cpu pandas matplotlib pygod
# ensure project root is on path
sys.path.append(os.getcwd())
sys.path.append(os.path.dirname(os.getcwd()))



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [88]:
# ## 1. Setup Imports and Instrumentation
# Install required packages (if needed) and import modules



import time
import json
import pandas as pd
import matplotlib.pyplot as plt

# Import your instrumentation and pipeline
from benchmark.instrumentation import InstrumentedChatOpenAI, InstrumentedInfoMiner, InstrumentedCoder
from main import compiled_full_graph, FullToolState

import langchain_openai

# Monkey-patch ChatOpenAI to our instrumented version
langchain_openai.ChatOpenAI = InstrumentedChatOpenAI


In [50]:
## 2. Helper Functions

def run_library_benchmark(lib_name, experiment_config):
    """
    Runs the OpenAD pipeline up to code generation for a single library.
    Returns per-algorithm metrics.
    """
    # Prepare initial state
    base_state: FullToolState = {
        "messages": [],
        "current_tool": "",
        "input_parameters": {},
        "data_path_train": experiment_config['dataset_train'],
        "data_path_test": experiment_config['dataset_test'],
        "package_name": lib_name,
        "agent_infominer": InstrumentedInfoMiner(),
        "agent_coder": InstrumentedCoder(),
        "agent_reviewer": None,
        "agent_evaluator": None,
        "agent_optimizer": None,
        "vectorstore": None,
        "code_quality": None,
        "should_rerun": False,
        "agent_preprocessor": None,
        "agent_selector": None,
        "experiment_config": experiment_config,
        "results": None,
        "algorithm_doc": None,
    }
    
    # Invoke only the process_all_tools node to skip preprocessor/selector
    from main import process_all_tools
    final_state = compiled_full_graph.invoke(base_state, config={"recursion_limit": 20})
    # Extract metrics
    metrics = []
    for tool, tstate in final_state['results']:
        coder = base_state['agent_coder']
        infom = base_state['agent_infominer']
        # success if code generated and no exception
        success = hasattr(tstate.get('code_quality'), 'code')
        gen_time = coder.last_generation_duration
        info_time = infom.last_query_duration
        tokens_in = langchain_openai.ChatOpenAI().input_tokens
        tokens_out = langchain_openai.ChatOpenAI().output_tokens
        metrics.append({
            'library': lib_name,
            'algorithm': tool,
            'success': success,
            'generation_time': gen_time,
            'infominer_time': info_time,
            'tokens_in': tokens_in,
            'tokens_out': tokens_out,
        })
    return metrics

In [51]:
%pip install torch_geometric

from pygod.utils import load_data
import os
import torch

os.makedirs('pygod_data', exist_ok=True)
for name in ['weibo']:
    path = f'pygod_data/{name}.pt'
    if not os.path.exists(path):
        print(f"Downloading '{name}' dataset...")
        data = load_data(name)
        torch.save(data, path)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [52]:

# ## 3. Define Experiment Configurations
# Provide dataset paths for each library
exp_configs = {
    'pyod': {
        'algorithm': ['ABOD','LOF','IForest'],
        'dataset_train': './data/glass_train.mat',
        'dataset_test': './data/glass_test.mat',
        'parameters': {'contamination': 0.1}
    },
    'pygod': {
        'algorithm': ['OCGNN','GCN','SCAN'],
        'dataset_train': './pygod_data/graph1.pt',  # clone https://github.com/pygod-team/data
        'dataset_test': './pygod_data/graph2.pt',
        'parameters': {}
    },
    # 'darts': {
    #     'algorithm': ['DifferenceScorer','NormScorer'],
    #     'dataset_train': './data/yahoo_train.csv',
    #     'dataset_test': './data/yahoo_test.csv',
    #     'parameters': {}
    # },
    # 'sktime': {
    #     'algorithm': ['KMeansScorer'],
    #     'dataset_train': './data/yahoo_train.csv',
    #     'dataset_test': './data/yahoo_test.csv',
    #     'parameters': {}
    # }
}

In [147]:
# ## 2. Helper Function for PyOD InfoMiner

def run_pyod_infominer(algorithms, train_path, test_path, params):
    # Directly benchmark InfoMiner.query_docs without running the full pipeline
    infom = InstrumentedInfoMiner()
    results = []
    for algo in algorithms:
        # Time a single documentation query
        _ = infom.query_docs(algo, None, 'pyod')
        results.append({
            'algorithm': algo,
            'infominer_time': infom.last_query_duration
        })
    return results

# %% [markdown]
# ## 3. Run Benchmark for Selected PyOD Algorithms
algos = [
    'MO-GAAL','SO-GAAL','AutoEncoder','VAE','AnoGAN',
    'DeepSVDD','ALAD','AE1SVM','DevNet','LUNAR'
]
train_file = './data/glass_train.mat'
test_file  = './data/glass_test.mat'
params = {'contamination': 0.1}

metrics = run_pyod_infominer(algos, train_file, test_file, params)

# Convert to DataFrame
df = pd.DataFrame(metrics)
df.to_json('pyod_infominer_times.json', orient='records', indent=2)

display(df)

# %% [markdown]
# ## 4. Summary of InfoMiner Time
summary = df['infominer_time'].agg(['mean','std'])
display(summary)


[Cache Hit] Using recent cache for MO-GAAL
The `MO_GAAL` class in PyOD is designed for Multi-Objective Generative Adversarial Active Learning, which generates potential outliers to help classifiers effectively distinguish between normal data and outliers. To prevent mode collapse, it employs multiple generators with different objectives.

**Initialization Function (`__init__`):**

The `__init__` method initializes the `MO_GAAL` class with the following parameters:

- **contamination**: float in (0., 0.5), optional (default=0.1)
  - The proportion of outliers in the dataset. Used to define the threshold on the decision function.

- **k**: int, optional (default=10)
  - The number of sub-generators.

- **stop_epochs**: int, optional (default=20)
  - The number of training epochs. The total number of epochs equals three times this value.

- **lr_d**: float, optional (default=0.01)
  - Learning rate of the discriminator.

- **lr_g**: float, optional (default=0.0001)
  - Learning rate of th

Unnamed: 0,algorithm,infominer_time
0,MO-GAAL,0.004609
1,SO-GAAL,0.000713
2,AutoEncoder,0.00057
3,VAE,0.000506
4,AnoGAN,0.000483
5,DeepSVDD,0.000439
6,ALAD,0.000404
7,AE1SVM,0.000387
8,DevNet,0.000334
9,LUNAR,0.000349


mean    0.000879
std     0.001315
Name: infominer_time, dtype: float64

In [145]:
from instrumentation import InstrumentedInfoMiner
import pandas as pd
from IPython.display import display

# 1) Helper for PyGOD InfoMiner (includes dataset column)
def run_pygod_infominer(algorithms, dataset_name, package_name='pygod'):
    infom = InstrumentedInfoMiner()
    results = []
    for algo in algorithms:
        infom.last_query_duration = 0.0
        _ = infom.query_docs(algo, vectorstore=None, package_name=package_name)
        results.append({
            'dataset':        dataset_name,
            'algorithm':      algo,
            'infominer_time': infom.last_query_duration
        })
    return results

# 2) Specify models & dataset
pygod_algos  = [
    'AdONE','ANOMALOUS','AnomalyDAE','CONAD','DOMINAT',
    'DONE','GAAN','GUIDE','Radar','SCAN'
]
dataset_name = 'inj_cora'

# 3) Run the benchmark
metrics = run_pygod_infominer(pygod_algos, dataset_name)

# 4) Build DataFrame and pivot
df = pd.DataFrame(metrics)
pivot = df.pivot(index='dataset', columns='algorithm', values='infominer_time')

pivot['mean'] = pivot.mean(axis=1)
# 5) Display in Jupyter
display(df)      # shows the raw list of (algorithm, time)
display(pivot)   # shows the single‐row table



The `AdONE` class in PyGOD is designed for adversarial outlier detection in attributed networks. It comprises both attribute and structure autoencoders, optimizing five distinct loss functions: attribute proximity, attribute homophily, structure proximity, structure homophily, and alignment losses. The model computes three outlier scores and averages them to derive an overall score. Notably, `AdONE` operates in a transductive manner, meaning it requires retraining when predicting on unseen data. ([docs.pygod.org](https://docs.pygod.org/en/stable/generated/pygod.detector.AdONE.html?utm_source=openai))

**Initialization Parameters and Default Values:**

The `__init__` method of the `AdONE` class accepts the following parameters:

- `hid_dim` (int, optional): Hidden dimension of the model. Default: `64`.
- `num_layers` (int, optional): Total number of layers in the model. Half (floor) are for the encoder, the other half (ceil) are for decoders. Default: `4`.
- `dropout` (float, optional):

Unnamed: 0,dataset,algorithm,infominer_time
0,inj_cora,AdONE,12.433644
1,inj_cora,ANOMALOUS,8.005516
2,inj_cora,AnomalyDAE,11.383099
3,inj_cora,CONAD,11.859761
4,inj_cora,DOMINAT,13.374995
5,inj_cora,DONE,11.698351
6,inj_cora,GAAN,11.251443
7,inj_cora,GUIDE,49.877362
8,inj_cora,Radar,12.37155
9,inj_cora,SCAN,9.055996


algorithm,ANOMALOUS,AdONE,AnomalyDAE,CONAD,DOMINAT,DONE,GAAN,GUIDE,Radar,SCAN,mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
inj_cora,8.005516,12.433644,11.383099,11.859761,13.374995,11.698351,11.251443,49.877362,12.37155,9.055996,15.131172


##Token Benchmark

In [161]:
# benchmark_pyod_skip_selector.py

import sys, os, asyncio, pandas as pd
from config.config import Config

# 1) Add project root
os.environ["OPENAI_API_KEY"] = Config.OPENAI_API_KEY

# 2) Global counter and patches for ChatOpenAI._call
import  langchain_openai

# 3) Global token counter using the API's usage fields
class TokenCounter:
    def __init__(self):
        self.reset()
    def reset(self):
        self.prompt_tokens     = 0
        self.completion_tokens = 0
        self.total_tokens      = 0
    def add_usage(self, usage):
        self.prompt_tokens     += getattr(usage, "prompt_tokens", 0)
        self.completion_tokens += getattr(usage, "completion_tokens", 0)
        self.total_tokens      += getattr(usage, "total_tokens", 0)

counter = TokenCounter()

# 4) Monkey-patch LangChain ChatOpenAI._call to capture usage
_orig_llm_call = langchain_openai.ChatOpenAI._call
def _patched_llm_call(self, messages, **kwargs):
    resp = _orig_llm_call(self, messages, **kwargs)
    usage = getattr(resp, "usage", None)
    if usage:
        counter.add_usage(usage)
    return resp
langchain_openai.ChatOpenAI._call = _patched_llm_call

# 5) Monkey-patch openai.chat.completions.create for direct OpenAI calls
_orig_create = openai.chat.completions.create
def _patched_create(*args, **kwargs):
    resp = _orig_create(*args, **kwargs)
    usage = getattr(resp, "usage", None)
    if usage:
        counter.add_usage(usage)
    return resp
openai.chat.completions.create = _patched_create

# 6) Monkey-patch OpenAI.responses.create (used by AgentInfoMiner)
_orig_openai_init = openai.OpenAI.__init__
def _patched_openai_init(self, *args, **kwargs):
    _orig_openai_init(self, *args, **kwargs)
    if hasattr(self, "responses") and hasattr(self.responses, "create"):
        orig_resp = self.responses.create
        def _wrapped_responses_create(*a, **kw):
            resp = orig_resp(*a, **kw)
            usage = getattr(resp, "usage", None)
            if usage:
                counter.add_usage(usage)
            return resp
        self.responses.create = _wrapped_responses_create

openai.OpenAI.__init__ = _patched_openai_init




# 4) Import your agents
from agents.agent_infominer import AgentInfoMiner
from agents.agent_coder    import AgentCoder

# 5) Configuration
ALGOS      = [
    'MO-GAAL','SO-GAAL','AutoEncoder','VAE','AnoGAN',
    'DeepSVDD','ALAD','AE1SVM','DevNet','LUNAR'
]
TRAIN_PATH = './data/glass.mat'
TEST_PATH  = './data/glass.mat'
PARAMS     = {'contamination': 0.1}
PKG        = 'pyod'
VECTORSTORE = None

In [165]:
rows = []
for algo in ALGOS:
    # InfoMiner
    counter.reset()
    inf = AgentInfoMiner()
    doc = inf.query_docs(algo, VECTORSTORE, PKG)
    info_in  = counter.prompt_tokens
    info_out = counter.completion_tokens

    # Coder
    counter.reset()
    coder = AgentCoder()
    _ = coder.generate_code(
        algorithm        = algo,
        data_path_train  = TRAIN_PATH,
        data_path_test   = TEST_PATH,
        algorithm_doc    = doc,
        input_parameters = PARAMS,
        package_name     = PKG
    )
    code_in  = counter.prompt_tokens
    code_out = counter.completion_tokens

    rows.append({
        'algorithm':  algo,
        'info_in':    info_in,
        'info_out':   info_out,
        'code_in':    code_in,
        'code_out':   code_out
    })

[Cache Hit] Using recent cache for MO-GAAL
The `MO_GAAL` class in PyOD is designed for Multi-Objective Generative Adversarial Active Learning, which generates potential outliers to help classifiers effectively distinguish between normal data and outliers. To prevent mode collapse, it employs multiple generators with different objectives.

**Initialization Function (`__init__`):**

The `__init__` method initializes the `MO_GAAL` class with the following parameters:

- **contamination**: float in (0., 0.5), optional (default=0.1)
  - The proportion of outliers in the dataset. Used to define the threshold on the decision function.

- **k**: int, optional (default=10)
  - The number of sub-generators.

- **stop_epochs**: int, optional (default=20)
  - The number of training epochs. The total number of epochs equals three times this value.

- **lr_d**: float, optional (default=0.01)
  - Learning rate of the discriminator.

- **lr_g**: float, optional (default=0.0001)
  - Learning rate of th

In [167]:
df = pd.DataFrame(rows).set_index("algorithm")
display(df)


Unnamed: 0_level_0,info_in,info_out,code_in,code_out
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MO-GAAL,0,0,0,0
SO-GAAL,0,0,0,0
AutoEncoder,0,0,0,0
VAE,0,0,0,0
AnoGAN,0,0,0,0
DeepSVDD,0,0,0,0
ALAD,0,0,0,0
AE1SVM,0,0,0,0
DevNet,0,0,0,0
LUNAR,0,0,0,0
