In [None]:
from collections import defaultdict
from glob import glob
from pathlib import Path

import promptquality as pq
from dotenv import load_dotenv

import pandas as pd
import numpy as np
from llm_handler import LLMHandler

from IPython.display import clear_output

# Console endpoint that promptquality logs in to.
GALILEO_CONSOLE_URL = "https://console.demo.rungalileo.io"

# Load environment variables from the repo-level .env before logging in
# (presumably the credentials pq.login needs -- confirm).
load_dotenv("../.env")
pq.login(GALILEO_CONSOLE_URL)

In [None]:
# Project whose runs are inspected and scored in this notebook.
project_name = "agent-lb-v1"

# Dataset names are the stems of the parquet files on disk. glob() returns
# paths in arbitrary, filesystem-dependent order, so sort them to keep the
# dataset list (and everything derived from it, e.g. column order of the
# results table) identical across machines and re-runs.
datasets = sorted(Path(file_path).stem for file_path in glob("../data/datasets/*.parquet"))
print(len(datasets))

PROJECT_ID = pq.get_project_from_name(project_name).id
llm_handler = LLMHandler()
models = llm_handler.get_all_models()
datasets

In [None]:
# Group the project's existing runs by model. Runs are expected to be named
# "<model> <dataset>", the same format get_model_score_for_dataset looks up.
runs = pq.get_project_from_name(project_name).runs

run_dict = defaultdict(list)
skipped_run_names = []
for run in runs:
    name_parts = run.name.split(" ")
    if len(name_parts) != 2:
        # Not a "<model> <dataset>" name: unpacking it would raise ValueError
        # and abort the whole cell, so remember it and move on.
        skipped_run_names.append(run.name)
        continue
    model, dataset = name_parts
    run_dict[model].append(dataset)

# Number of datasets already run per model.
for key, value in run_dict.items():
    print(key, len(value))

if skipped_run_names:
    print(f"Skipped {len(skipped_run_names)} run(s) with unexpected names: {skipped_run_names}")

In [None]:
from functools import lru_cache

@lru_cache(maxsize=1000)
def get_model_score_for_dataset(model, dataset):
    """Fetch the tool-selection-quality metrics of one model/dataset run.

    Looks up the run named ``"<model> <dataset>"`` in the project identified by
    ``PROJECT_ID`` and collects the metric from every row that carries a score.
    Results are memoised per ``(model, dataset)`` pair, so both arguments must
    be hashable and the returned dict is shared between callers -- treat it as
    read-only.

    Args:
        model: Model name; first half of the run name.
        dataset: Dataset name; second half of the run name.

    Returns:
        dict with the keys

        * ``"mean_score"``: mean of ``scores`` rounded to 2 decimals, or NaN
          when no row carries a score.
        * ``"scores"``: per-row ``tool_selection_quality`` rounded to 2 decimals.
        * ``"rationales"``: per-row ``tool_selection_quality_rationale``.
        * ``"explanations"``: per-row ``tool_selection_quality_explanation``.

        The three lists are index-aligned and only cover rows whose score is
        not ``None``.
    """
    print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
    run_name = f"{model} {dataset}"
    run_id = pq.get_run_from_name(run_name, PROJECT_ID).id
    # NOTE(review): only one page of at most 1000 rows is requested; a larger
    # run would presumably be truncated -- confirm against the pq.get_rows API.
    rows = pq.get_rows(
        project_id=PROJECT_ID,
        run_id=run_id,
        starting_token=0,
        limit=1000,
    )

    # Filter once so the three result lists cannot drift out of alignment.
    scored_metrics = [row.metrics for row in rows if row.metrics.tool_selection_quality is not None]
    scores = [round(metrics.tool_selection_quality, 2) for metrics in scored_metrics]
    rationales = [metrics.tool_selection_quality_rationale for metrics in scored_metrics]
    explanations = [metrics.tool_selection_quality_explanation for metrics in scored_metrics]

    # np.mean([]) still yields NaN but emits a RuntimeWarning; make the
    # "nothing was scored" case explicit and silent instead.
    mean_score = round(np.mean(scores), 2) if scores else float("nan")
    return {
        "mean_score": mean_score,
        "scores": scores,
        "rationales": rationales,
        "explanations": explanations,
    }

# data = get_model_score_for_dataset(models[0], datasets[0])

In [13]:
import threading
from queue import Queue
from typing import List, Dict
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from threading import Semaphore
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map

@dataclass
class ScoringTask:
    """One unit of scoring work: a single (model, dataset) pair."""
    # Model name; passed as the first argument to get_model_score_for_dataset.
    model: str
    # Dataset name; passed as the second argument to get_model_score_for_dataset.
    dataset: str

class ModelScorer:
    def __init__(self, models: List[str], datasets: List[str], max_concurrent_datasets: int = 3):
        self.models = models
        self.datasets = datasets
        self.results: Dict[tuple, float] = {}
        self.results_lock = threading.Lock()
        self.dataset_semaphore = Semaphore(max_concurrent_datasets)
        self.progress_bars = {}
        
    def create_model_progress_bar(self, model: str, total_datasets: int) -> tqdm:
        """Create a progress bar for a specific model"""
        return tqdm(
            total=total_datasets,
            desc=f"Model {model}",
            position=self.models.index(model),
            leave=True
        )
        
    def process_task(self, task: ScoringTask):
        """
        Process a single scoring task with semaphore control and progress tracking.
        """
        with self.dataset_semaphore:
            # Get or create progress bar for this model
            with self.results_lock:
                if task.model not in self.progress_bars:
                    self.progress_bars[task.model] = self.create_model_progress_bar(
                        task.model, 
                        len(self.datasets)
                    )
            
            # Process the task
            score = get_model_score_for_dataset(task.model, task.dataset)["mean_score"]
            
            # Update results and progress
            with self.results_lock:
                self.results[(task.model, task.dataset)] = score
                self.progress_bars[task.model].update(1)
                self.progress_bars[task.model].set_postfix(
                    last_dataset=task.dataset,
                    last_score=f"{score:.3f}"
                )

    def run_scoring(self) -> Dict[tuple, float]:
        """
        Run the scoring process for all model-dataset combinations with progress tracking.
        """
        tasks = [
            ScoringTask(model, dataset)
            for model in self.models
            for dataset in self.datasets
        ]
        
        # Create overall progress bar
        total_tasks = len(tasks)
        
        # Clear any existing progress bars
        for _ in range(len(self.models)):
            print()
            
        # Use ThreadPoolExecutor to manage the thread pool
        with ThreadPoolExecutor(max_workers=min(len(tasks), 10)) as executor:
            # Submit all tasks
            futures = [executor.submit(self.process_task, task) for task in tasks]
            
            # Create overall progress bar
            with tqdm(
                total=total_tasks,
                desc="Overall Progress",
                position=len(self.models),
                leave=True
            ) as overall_pbar:
                # Wait for all tasks to complete
                for future in futures:
                    future.result()
                    overall_pbar.update(1)
        
        # Close all progress bars
        for pbar in self.progress_bars.values():
            pbar.close()
                
        return self.results

In [None]:
# Models to (re-)score in this run, e.g. ["palmyra-x-004"].
# Leave the list empty to skip scoring entirely.
models = []
scorer = ModelScorer(models, datasets, max_concurrent_datasets=3)

if models:
    print("Starting scoring process...")
    start_time = time.time()

    results = scorer.run_scoring()

    end_time = time.time()
    print(f"\nScoring completed in {end_time - start_time:.2f} seconds")
else:
    # With no models there are no tasks, so don't start a scoring run at all.
    print("No models selected - skipping scoring.")
    results = scorer.results

# Print results in a formatted way
print("\nFinal Scores:")
print("-" * 40)
for (model, dataset), score in sorted(results.items()):
    print(f"{model:10} | {dataset:10} | {score:.3f}")

In [None]:
# Score matrix: outer keys (datasets) become columns, inner keys (models)
# become the index; pairs without a stored score are left as None.
results_matrix = {
    dataset_name: {
        model_name: scorer.results.get((model_name, dataset_name))
        for model_name in scorer.models
    }
    for dataset_name in scorer.datasets
}
df = pd.DataFrame(results_matrix).round(3)

# Collapse the two toolace_single_func_call columns into one averaged column
# (only when both are present).
toolace_cols = ['toolace_single_func_call_1', 'toolace_single_func_call_2']
if set(toolace_cols).issubset(df.columns):
    df['toolace_single_func_call'] = df[toolace_cols].mean(axis=1)
    df = df.drop(columns=toolace_cols)

# Row-wise mean per model first, then column-wise means (which therefore
# also cover the new 'Model Avg' column) appended as a summary row.
df['Model Avg'] = df.mean(axis=1)
dataset_avgs = df.mean()
df.loc['Dataset Avg'] = dataset_avgs

# Rank the model rows best-first; the summary row stays pinned at the bottom.
model_rows = df.iloc[:-1].sort_values('Model Avg', ascending=False)
summary_row = df.iloc[-1:]
df_sorted = pd.concat([model_rows, summary_row])

# Round for display and move 'Model Avg' to the first column.
df_sorted = df_sorted.round(3)
other_columns = [name for name in df_sorted.columns if name != "Model Avg"]
df_sorted = df_sorted[["Model Avg", *other_columns]]
df_sorted