In [None]:
import os
import logging
import json
import sys

from task_oriented_dataset_search.pipeline import TodsEngine, PipelineConfig

# --- Notebook configuration -------------------------------------------------
# The API key is read from the environment so it is never stored in the notebook.
# os.environ.get returns None when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY")
# Folder passed to the pipeline as its input_folder.
SAMPLE_DATA_FOLDER = "sample_docs"

# Route INFO-and-above log records to stdout with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Flag a missing/empty key right here rather than letting it pass silently
# into the pipeline. This only warns (does not raise) because whether a key is
# strictly required depends on PipelineConfig, which is not visible here.
if not API_KEY:
    logger.warning(
        "OPENAI_API_KEY is not set; PipelineConfig will receive api_key=%r.",
        API_KEY,
    )

# Pipeline settings shared by the cells below.
# NOTE(review): the exact meaning of the worker counts, retry_limit and the two
# model names is defined by PipelineConfig — confirm against its documentation.
config = PipelineConfig(
    input_folder=SAMPLE_DATA_FOLDER,
    api_key=API_KEY,
    preprocess_workers=10,
    extract_workers=15,
    retry_limit=5,
    model="gpt-4o-mini",
    qa_model="gpt-4o",
)

In [None]:
# Create the engine from the shared pipeline configuration, then run its build
# step before any search/QA calls are made.
# NOTE(review): build() presumably processes the documents in config.input_folder
# and may be slow / call the API — confirm against TodsEngine's documentation.
engine = TodsEngine(config=config)
engine.build()

In [None]:
# Task descriptions to search for; this list is also iterated by the QA cell.
search_queries = [
    "image classification",
    "natural language inference",
    "object detection",
]
for query in search_queries:
    # Ask the engine for the top 3 datasets matching this query.
    results = engine.search(query, top_k_datasets=3)
    # Print a header so each JSON dump can be matched to the query that produced it.
    print(f"=== {query} ===")
    # ensure_ascii=False keeps any non-ASCII text readable instead of \uXXXX escapes.
    # Assumes `results` is JSON-serializable, as the original call did.
    print(json.dumps(results, indent=2, ensure_ascii=False))

In [None]:
# Send each search query to the engine's QA entry point and show the answer.
for question in search_queries:
    answer = engine.qa(question)
    # Echo the question first so consecutive answers can be told apart.
    print(f"Q: {question}")
    print(answer)
    print()  # blank line between question/answer pairs