# Imports and Setup

In [1]:
import sys

# Extend the import search path two directories up so the `models` and `utils`
# packages imported below can be resolved (presumably the repo root — confirm).
sys.path.append('../../')

from models.rag import SelfRAG
from models.sda import SelfDiscovery
from utils.utils import set_api_key
from utils.auto_tester import AutomaticTester

  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [2]:
# Register credentials for both providers from the same keys file.
# NOTE(review): presumably set_api_key reads the named entry from keys.json — confirm in utils.utils.
for provider in ('OpenAI', 'Anthropic'):
    set_api_key('../../utils/keys.json', provider)

In [3]:
# One retrieval agent, built once and reused by every test run below
rag_agent = SelfRAG(
    "gpt-4o-2024-08-06",       # presumably the LLM name — confirm in models.rag
    "text-embedding-3-large",  # presumably the embedding model name — confirm in models.rag
)
# Load documents from the CIVIC evidence_kb.json file into the agent
rag_agent.load_documents("../../data/CIVIC/clean/evidence_kb.json")

In [4]:
# Shared test settings: the model name handed to each SelfDiscovery agent and
# the `num_examples` value passed to every test run.
MODEL_NAME = "gpt-4o-2024-08-06"
NUM_EXAMPLES = 30

# Model Tests

### Baseline Agent

In [None]:
# Baseline variant: only the `use_base` flag is enabled.
sda_agent = SelfDiscovery(
    "openai",
    MODEL_NAME,
    use_base=True,
    use_exp=False,
    use_coh=False,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# One config dict per test file; each row below is
# (input path, output path, task type, whether RAG is used).
test_configs = [
    {
        "input_filepath": source_path,
        "output_filepath": result_path,
        "task_type": task,
        "use_rag": with_rag,
    }
    for source_path, result_path, task, with_rag in (
        (
            "../../data/CIVIC/clean/context/baseline.json",
            "../OpenAI/gpt-4o/base/baseline.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/missing_evidence.json",
            "../OpenAI/gpt-4o/base/missing_evidence.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/wrong_evidence.json",
            "../OpenAI/gpt-4o/base/wrong_evidence.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/mixed.json",
            "../OpenAI/gpt-4o/base/mixed.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/selection_test.json",
            "../OpenAI/gpt-4o/base/selection_test.json",
            "selection",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/assignment_test.json",
            "../OpenAI/gpt-4o/base/assignment_test.json",
            "assignment",
            False,
        ),
        (
            "../../data/R4C/clean/full_data_noid.json",
            "../OpenAI/gpt-4o/base/full_data_noid.json",
            "explanation",
            False,
        ),
    )
]


In [None]:
# Run every configured test with the current sda_agent and the shared rag_agent
for config in test_configs:
    tester = AutomaticTester(
        sda_agent,
        rag_agent,
        config["input_filepath"],
        config["output_filepath"],
    )
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    print(f"Test completed for {config['input_filepath']}")

print("All tests completed.")

### EXP Agent

In [None]:
# EXP variant: only the `use_exp` flag is enabled.
sda_agent = SelfDiscovery(
    "openai",
    MODEL_NAME,
    use_base=False,
    use_exp=True,
    use_coh=False,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# One config dict per test file; each row below is
# (input path, output path, task type, whether RAG is used).
test_configs = [
    {
        "input_filepath": source_path,
        "output_filepath": result_path,
        "task_type": task,
        "use_rag": with_rag,
    }
    for source_path, result_path, task, with_rag in (
        (
            "../../data/CIVIC/clean/context/baseline.json",
            "../OpenAI/gpt-4o/exp/baseline.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/missing_evidence.json",
            "../OpenAI/gpt-4o/exp/missing_evidence.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/wrong_evidence.json",
            "../OpenAI/gpt-4o/exp/wrong_evidence.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/mixed.json",
            "../OpenAI/gpt-4o/exp/mixed.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/selection_test.json",
            "../OpenAI/gpt-4o/exp/selection_test.json",
            "selection",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/assignment_test.json",
            "../OpenAI/gpt-4o/exp/assignment_test.json",
            "assignment",
            False,
        ),
        (
            "../../data/R4C/clean/full_data_noid.json",
            "../OpenAI/gpt-4o/exp/full_data_noid.json",
            "explanation",
            False,
        ),
    )
]


In [None]:
# Run every configured test with the current sda_agent and the shared rag_agent
for config in test_configs:
    tester = AutomaticTester(
        sda_agent,
        rag_agent,
        config["input_filepath"],
        config["output_filepath"],
    )
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    print(f"Test completed for {config['input_filepath']}")

print("All tests completed.")

### COH Agent

In [None]:
# COH variant: only the `use_coh` flag is enabled.
sda_agent = SelfDiscovery(
    "openai",
    MODEL_NAME,
    use_base=False,
    use_exp=False,
    use_coh=True,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# One config dict per test file; each row below is
# (input path, output path, task type, whether RAG is used).
test_configs = [
    {
        "input_filepath": source_path,
        "output_filepath": result_path,
        "task_type": task,
        "use_rag": with_rag,
    }
    for source_path, result_path, task, with_rag in (
        (
            "../../data/CIVIC/clean/context/baseline.json",
            "../OpenAI/gpt-4o/coh/baseline.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/missing_evidence.json",
            "../OpenAI/gpt-4o/coh/missing_evidence.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/wrong_evidence.json",
            "../OpenAI/gpt-4o/coh/wrong_evidence.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/mixed.json",
            "../OpenAI/gpt-4o/coh/mixed.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/selection_test.json",
            "../OpenAI/gpt-4o/coh/selection_test.json",
            "selection",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/assignment_test.json",
            "../OpenAI/gpt-4o/coh/assignment_test.json",
            "assignment",
            False,
        ),
        (
            "../../data/R4C/clean/full_data_noid.json",
            "../OpenAI/gpt-4o/coh/full_data_noid.json",
            "explanation",
            False,
        ),
    )
]


In [None]:
# Run every configured test with the current sda_agent and the shared rag_agent
for config in test_configs:
    tester = AutomaticTester(
        sda_agent,
        rag_agent,
        config["input_filepath"],
        config["output_filepath"],
    )
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    print(f"Test completed for {config['input_filepath']}")

print("All tests completed.")

### Base + EXP Agent

In [None]:
# Base + EXP variant: `use_base` and `use_exp` are enabled, `use_coh` is off.
sda_agent = SelfDiscovery(
    "openai",
    MODEL_NAME,
    use_base=True,
    use_exp=True,
    use_coh=False,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# One config dict per test file; each row below is
# (input path, output path, task type, whether RAG is used).
test_configs = [
    {
        "input_filepath": source_path,
        "output_filepath": result_path,
        "task_type": task,
        "use_rag": with_rag,
    }
    for source_path, result_path, task, with_rag in (
        (
            "../../data/CIVIC/clean/context/baseline.json",
            "../OpenAI/gpt-4o/base_exp/baseline.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/missing_evidence.json",
            "../OpenAI/gpt-4o/base_exp/missing_evidence.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/wrong_evidence.json",
            "../OpenAI/gpt-4o/base_exp/wrong_evidence.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/mixed.json",
            "../OpenAI/gpt-4o/base_exp/mixed.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/selection_test.json",
            "../OpenAI/gpt-4o/base_exp/selection_test.json",
            "selection",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/assignment_test.json",
            "../OpenAI/gpt-4o/base_exp/assignment_test.json",
            "assignment",
            False,
        ),
        (
            "../../data/R4C/clean/full_data_noid.json",
            "../OpenAI/gpt-4o/base_exp/full_data_noid.json",
            "explanation",
            False,
        ),
    )
]


In [None]:
# Run every configured test with the current sda_agent and the shared rag_agent
for config in test_configs:
    tester = AutomaticTester(
        sda_agent,
        rag_agent,
        config["input_filepath"],
        config["output_filepath"],
    )
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    print(f"Test completed for {config['input_filepath']}")

print("All tests completed.")

### Base + EXP + COH Agent

In [5]:
# Base + EXP + COH variant: all three flags are enabled.
sda_agent = SelfDiscovery(
    "openai",
    MODEL_NAME,
    use_base=True,
    use_exp=True,
    use_coh=True,
    modules_path="../../models/reasoning_modules/",
)

In [6]:
# One config dict per test file; each row below is
# (input path, output path, task type, whether RAG is used).
test_configs = [
    {
        "input_filepath": source_path,
        "output_filepath": result_path,
        "task_type": task,
        "use_rag": with_rag,
    }
    for source_path, result_path, task, with_rag in (
        (
            "../../data/CIVIC/clean/context/baseline.json",
            "../OpenAI/gpt-4o/base_exp_coh/baseline.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/missing_evidence.json",
            "../OpenAI/gpt-4o/base_exp_coh/missing_evidence.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/wrong_evidence.json",
            "../OpenAI/gpt-4o/base_exp_coh/wrong_evidence.json",
            "explanation",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/mixed.json",
            "../OpenAI/gpt-4o/base_exp_coh/mixed.json",
            "explanation",
            True,
        ),
        (
            "../../data/CIVIC/clean/context/selection_test.json",
            "../OpenAI/gpt-4o/base_exp_coh/selection_test.json",
            "selection",
            False,
        ),
        (
            "../../data/CIVIC/clean/context/assignment_test.json",
            "../OpenAI/gpt-4o/base_exp_coh/assignment_test.json",
            "assignment",
            False,
        ),
        (
            "../../data/R4C/clean/full_data_noid.json",
            "../OpenAI/gpt-4o/base_exp_coh/full_data_noid.json",
            "explanation",
            False,
        ),
    )
]


In [7]:
# Run every configured test with the current sda_agent and the shared rag_agent
for config in test_configs:
    tester = AutomaticTester(
        sda_agent,
        rag_agent,
        config["input_filepath"],
        config["output_filepath"],
    )
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    print(f"Test completed for {config['input_filepath']}")

print("All tests completed.")


--- INITIALIZING AUTOMATIC TESTER ---
Input filepath: ../../data/CIVIC/clean/context/baseline.json
Output filepath: ../OpenAI/gpt-4o/base_exp_coh/baseline.json

--- STARTING RUN_TEST ---
Task Type: explanation
Use RAG: False
Number of Examples: 30

--- FORMATTING ENTRIES ---
Task Type: explanation
Successfully loaded 66 entries from ../../data/CIVIC/clean/context/baseline.json

Formatting entry 1/66

Formatting entry 2/66

Formatting entry 3/66

Formatting entry 4/66

Formatting entry 5/66

Formatting entry 6/66

Formatting entry 7/66

Formatting entry 8/66

Formatting entry 9/66

Formatting entry 10/66

Formatting entry 11/66

Formatting entry 12/66

Formatting entry 13/66

Formatting entry 14/66

Formatting entry 15/66

Formatting entry 16/66

Formatting entry 17/66

Formatting entry 18/66

Formatting entry 19/66

Formatting entry 20/66

Formatting entry 21/66

Formatting entry 22/66

Formatting entry 23/66

Formatting entry 24/66

Formatting entry 25/66

Formatting entry 26/66

For