# Run multiple evaluation modes for a configuration & set of queries

In [1]:
import yaml

from dbchat import ROOT_DIR

# Load the sample evaluation configuration and echo it back as YAML so the
# settings used for this run are visible in the notebook output.
config_path = ROOT_DIR.parent / "tests/data/inputs/cfg_3.yml"
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

rendered_config = yaml.dump(config)
print(rendered_config)

approach: sql_engine_w_reranking
database:
  metadata:
    document_id_like: '%-2'
    metadata_path: sqlite:///data/chinook.db
    table_name: table_descriptions
  path: sqlite:///data/chinook.db
index:
  class: ollama
  name: llama2reranker
  reranking:
    config_object: ReRankerLLMConfig
    reranker_kwargs:
      top_n: 3
  retriever_kwargs:
    similarity_top_k: 4
llm:
  class: ollama
  name: llama2



### Create 2 datasets, one with target/expected agent responses and one without

Split the evaluation dataset into two files: queries with expected responses, and those without.

```bash
awk -F '|' '{ if ($5 == "") print > "examples/evaluation/queries_no_response.csv"; else print > "examples/evaluation/queries_with_response.csv" }' examples/evaluation/queries.csv
```

In [None]:
from typing import List
from dbchat.evaluation.evaluate import evaluate_synthetic_judge, evaluate_synthetic_judge_with_query, evaluate_table_name_retrieval

# Accumulates the rows returned by every evaluator call below.
pipeline_results: List[dict] = []

# Dataset with only a user query & expected tables.
test_data_path_no_responses = "examples/evaluation/queries_no_response.csv"
# Dataset with a user query and a target desired response.
test_data_path_with_responses = "examples/evaluation/queries_with_response.csv"

# Each dataset is paired with the evaluators to run against it, in the order
# they should be executed.
evaluation_plan = [
    (test_data_path_no_responses, [evaluate_table_name_retrieval]),
    (
        test_data_path_with_responses,
        [
            evaluate_synthetic_judge_with_query,
            evaluate_synthetic_judge,
            evaluate_table_name_retrieval,
        ],
    ),
]

for dataset_path, evaluators in evaluation_plan:
    for evaluate in evaluators:
        pipeline_results.extend(evaluate(dataset_path, config_path))


# Using multiprocessing (untested)
 - Need to check how much of the pipeline's work is CPU-bound versus I/O-bound before relying on this.

In [None]:
import multiprocessing

# Run every evaluator in its own worker process and collect all results into
# `pipeline_results`, in the same order as the sequential version (evaluators
# for the no-response dataset first, then those for the with-response dataset).
#
# NOTE: the evaluator functions are submitted to the pool directly instead of
# being wrapped in a lambda. multiprocessing pickles every task to hand it to a
# worker process, and lambdas cannot be pickled, so the lambda-based version
# fails before any evaluator runs.
pipeline_results = []

# For test data with only user query & expected tables
test_data_path_no_responses = "examples/evaluation/queries_no_response.csv"
eval_funcs_no_responses = [evaluate_table_name_retrieval]

# For test data with user query, and a target desired response
test_data_path_with_responses = "examples/evaluation/queries_with_response.csv"
eval_funcs_with_responses = [evaluate_synthetic_judge_with_query,
                             evaluate_synthetic_judge,
                             evaluate_table_name_retrieval]

# The context manager guarantees the pool is torn down even if an evaluator
# raises; every result is fetched before the `with` block exits.
with multiprocessing.Pool() as pool:
    # Submit all tasks up front so both groups run concurrently.
    pending_no_responses = [
        pool.apply_async(f, (test_data_path_no_responses, config_path))
        for f in eval_funcs_no_responses
    ]
    pending_with_responses = [
        pool.apply_async(f, (test_data_path_with_responses, config_path))
        for f in eval_funcs_with_responses
    ]

    # .get() blocks until the task finishes and re-raises any exception that
    # occurred in the worker.
    results_no_responses = [task.get() for task in pending_no_responses]
    results_with_responses = [task.get() for task in pending_with_responses]

# Extend the pipeline_results with the results from both groups
for r in results_no_responses:
    pipeline_results.extend(r)
for r in results_with_responses:
    pipeline_results.extend(r)