# Imports and Setup

In [None]:
import sys

sys.path.append('../../')

from models.rag import SelfRAG
from models.sda import SelfDiscovery
from utils.utils import set_api_key
from utils.auto_tester import AutomaticTester

In [None]:
# Call set_api_key once per provider, always pointing at the same keys file
for provider in ('OpenAI', 'Anthropic'):
    set_api_key('../../utils/keys.json', provider)

In [None]:
# Single SelfRAG instance shared by every test run in this notebook.
# The two constructor arguments are passed positionally, exactly as before.
rag_model_args = ("gpt-4o-2024-08-06", "text-embedding-3-large")
rag_agent = SelfRAG(*rag_model_args)

# Load the CIVIC evidence knowledge base into the agent
evidence_kb_path = "../../data/CIVIC/clean/evidence_kb.json"
rag_agent.load_documents(evidence_kb_path)

In [None]:
# Model identifier handed to every SelfDiscovery agent created below
MODEL_NAME = "claude-3-5-sonnet-20240620"

# Value passed as num_examples to each tester.run_test call
NUM_EXAMPLES = 30

# Model Tests

### Baseline Agent

In [None]:
# Baseline configuration: only use_base is switched on
sda_agent = SelfDiscovery(
    "anthropic",
    MODEL_NAME,
    use_base=True,
    use_exp=False,
    use_coh=False,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# Every test reads from a sub-folder of data_dir and writes a file with the
# same name into results_dir.
data_dir = "../../data"
results_dir = "../Anthropic/claude3.5_sonnet/base"

# (input sub-folder, file name, task type, use RAG)
test_cases = [
    ("CIVIC/clean/context", "baseline.json", "explanation", False),
    ("CIVIC/clean/context", "missing_evidence.json", "explanation", True),
    ("CIVIC/clean/context", "wrong_evidence.json", "explanation", False),
    ("CIVIC/clean/context", "mixed.json", "explanation", True),
    ("CIVIC/clean/context", "selection_test.json", "selection", False),
    ("CIVIC/clean/context", "assignment_test.json", "assignment", False),
    ("R4C/clean", "full_data_noid.json", "explanation", False),
]

# Expand the compact table into the dict form consumed by the run loop
test_configs = [
    {
        "input_filepath": f"{data_dir}/{subdir}/{filename}",
        "output_filepath": f"{results_dir}/{filename}",
        "task_type": task_type,
        "use_rag": use_rag,
    }
    for subdir, filename, task_type, use_rag in test_cases
]


In [None]:
# Run tests: build one AutomaticTester per entry in test_configs, using the
# sda_agent and rag_agent currently defined in the notebook.
for config in test_configs:
    input_path = config["input_filepath"]
    output_path = config["output_filepath"]

    tester = AutomaticTester(sda_agent, rag_agent, input_path, output_path)
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    # Progress marker showing which input file just finished
    print(f"Test completed for {input_path}")

print("All tests completed.")

### EXP Agent

In [None]:
# EXP configuration: only use_exp is switched on
sda_agent = SelfDiscovery(
    "anthropic",
    MODEL_NAME,
    use_base=False,
    use_exp=True,
    use_coh=False,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# Every test reads from a sub-folder of data_dir and writes a file with the
# same name into results_dir.
data_dir = "../../data"
results_dir = "../Anthropic/claude3.5_sonnet/exp"

# (input sub-folder, file name, task type, use RAG)
test_cases = [
    ("CIVIC/clean/context", "baseline.json", "explanation", False),
    ("CIVIC/clean/context", "missing_evidence.json", "explanation", True),
    ("CIVIC/clean/context", "wrong_evidence.json", "explanation", False),
    ("CIVIC/clean/context", "mixed.json", "explanation", True),
    ("CIVIC/clean/context", "selection_test.json", "selection", False),
    ("CIVIC/clean/context", "assignment_test.json", "assignment", False),
    ("R4C/clean", "full_data_noid.json", "explanation", False),
]

# Expand the compact table into the dict form consumed by the run loop
test_configs = [
    {
        "input_filepath": f"{data_dir}/{subdir}/{filename}",
        "output_filepath": f"{results_dir}/{filename}",
        "task_type": task_type,
        "use_rag": use_rag,
    }
    for subdir, filename, task_type, use_rag in test_cases
]


In [None]:
# Run tests: build one AutomaticTester per entry in test_configs, using the
# sda_agent and rag_agent currently defined in the notebook.
for config in test_configs:
    input_path = config["input_filepath"]
    output_path = config["output_filepath"]

    tester = AutomaticTester(sda_agent, rag_agent, input_path, output_path)
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    # Progress marker showing which input file just finished
    print(f"Test completed for {input_path}")

print("All tests completed.")

### COH Agent

In [None]:
# COH configuration: only use_coh is switched on
sda_agent = SelfDiscovery(
    "anthropic",
    MODEL_NAME,
    use_base=False,
    use_exp=False,
    use_coh=True,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# Every test reads from a sub-folder of data_dir and writes a file with the
# same name into results_dir.
data_dir = "../../data"
results_dir = "../Anthropic/claude3.5_sonnet/coh"

# (input sub-folder, file name, task type, use RAG)
test_cases = [
    ("CIVIC/clean/context", "baseline.json", "explanation", False),
    ("CIVIC/clean/context", "missing_evidence.json", "explanation", True),
    ("CIVIC/clean/context", "wrong_evidence.json", "explanation", False),
    ("CIVIC/clean/context", "mixed.json", "explanation", True),
    ("CIVIC/clean/context", "selection_test.json", "selection", False),
    ("CIVIC/clean/context", "assignment_test.json", "assignment", False),
    ("R4C/clean", "full_data_noid.json", "explanation", False),
]

# Expand the compact table into the dict form consumed by the run loop
test_configs = [
    {
        "input_filepath": f"{data_dir}/{subdir}/{filename}",
        "output_filepath": f"{results_dir}/{filename}",
        "task_type": task_type,
        "use_rag": use_rag,
    }
    for subdir, filename, task_type, use_rag in test_cases
]


In [None]:
# Run tests: build one AutomaticTester per entry in test_configs, using the
# sda_agent and rag_agent currently defined in the notebook.
for config in test_configs:
    input_path = config["input_filepath"]
    output_path = config["output_filepath"]

    tester = AutomaticTester(sda_agent, rag_agent, input_path, output_path)
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    # Progress marker showing which input file just finished
    print(f"Test completed for {input_path}")

print("All tests completed.")

### Base + EXP Agent

In [None]:
# Base + EXP configuration: use_base and use_exp on, use_coh off
sda_agent = SelfDiscovery(
    "anthropic",
    MODEL_NAME,
    use_base=True,
    use_exp=True,
    use_coh=False,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# Every test reads from a sub-folder of data_dir and writes a file with the
# same name into results_dir.
data_dir = "../../data"
results_dir = "../Anthropic/claude3.5_sonnet/base_exp"

# (input sub-folder, file name, task type, use RAG)
test_cases = [
    ("CIVIC/clean/context", "baseline.json", "explanation", False),
    ("CIVIC/clean/context", "missing_evidence.json", "explanation", True),
    ("CIVIC/clean/context", "wrong_evidence.json", "explanation", False),
    ("CIVIC/clean/context", "mixed.json", "explanation", True),
    ("CIVIC/clean/context", "selection_test.json", "selection", False),
    ("CIVIC/clean/context", "assignment_test.json", "assignment", False),
    ("R4C/clean", "full_data_noid.json", "explanation", False),
]

# Expand the compact table into the dict form consumed by the run loop
test_configs = [
    {
        "input_filepath": f"{data_dir}/{subdir}/{filename}",
        "output_filepath": f"{results_dir}/{filename}",
        "task_type": task_type,
        "use_rag": use_rag,
    }
    for subdir, filename, task_type, use_rag in test_cases
]


In [None]:
# Run tests: build one AutomaticTester per entry in test_configs, using the
# sda_agent and rag_agent currently defined in the notebook.
for config in test_configs:
    input_path = config["input_filepath"]
    output_path = config["output_filepath"]

    tester = AutomaticTester(sda_agent, rag_agent, input_path, output_path)
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    # Progress marker showing which input file just finished
    print(f"Test completed for {input_path}")

print("All tests completed.")

### Base + EXP + COH Agent

In [None]:
# Base + EXP + COH configuration: all three flags switched on
sda_agent = SelfDiscovery(
    "anthropic",
    MODEL_NAME,
    use_base=True,
    use_exp=True,
    use_coh=True,
    modules_path="../../models/reasoning_modules/",
)

In [None]:
# Every test reads from a sub-folder of data_dir and writes a file with the
# same name into results_dir.
data_dir = "../../data"
results_dir = "../Anthropic/claude3.5_sonnet/base_exp_coh"

# (input sub-folder, file name, task type, use RAG)
test_cases = [
    ("CIVIC/clean/context", "baseline.json", "explanation", False),
    ("CIVIC/clean/context", "missing_evidence.json", "explanation", True),
    ("CIVIC/clean/context", "wrong_evidence.json", "explanation", False),
    ("CIVIC/clean/context", "mixed.json", "explanation", True),
    ("CIVIC/clean/context", "selection_test.json", "selection", False),
    ("CIVIC/clean/context", "assignment_test.json", "assignment", False),
    ("R4C/clean", "full_data_noid.json", "explanation", False),
]

# Expand the compact table into the dict form consumed by the run loop
test_configs = [
    {
        "input_filepath": f"{data_dir}/{subdir}/{filename}",
        "output_filepath": f"{results_dir}/{filename}",
        "task_type": task_type,
        "use_rag": use_rag,
    }
    for subdir, filename, task_type, use_rag in test_cases
]


In [None]:
# Run tests: build one AutomaticTester per entry in test_configs, using the
# sda_agent and rag_agent currently defined in the notebook.
for config in test_configs:
    input_path = config["input_filepath"]
    output_path = config["output_filepath"]

    tester = AutomaticTester(sda_agent, rag_agent, input_path, output_path)
    tester.run_test(
        task_type=config["task_type"],
        use_rag=config["use_rag"],
        num_examples=NUM_EXAMPLES,
    )

    # Progress marker showing which input file just finished
    print(f"Test completed for {input_path}")

print("All tests completed.")