In [None]:
import warnings
warnings.filterwarnings("ignore")

from concurrent.futures import ThreadPoolExecutor
from glob import glob
from pathlib import Path
from pprint import pprint
import threading


from dotenv import load_dotenv
from langchain.globals import set_llm_cache
from langchain_core.caches import InMemoryCache
import pandas as pd
import promptquality as pq
from tqdm.auto import tqdm
from llm_handler import LLMHandler

from IPython.display import clear_output

# Register an in-memory cache as LangChain's global LLM cache.
set_llm_cache(InMemoryCache())

# Load environment variables from the .env file one directory up; this runs
# before the login call below (presumably it supplies the credentials that
# pq.login and the LLM providers need — TODO confirm).
load_dotenv("../.env")
# Log promptquality in to the Galileo console at this host.
pq.login("console.demo.rungalileo.io")

In [None]:
# Shared handler; process_file() asks it for a client per evaluated model.
llm_handler = LLMHandler()
# Model used only for the optional smoke test below.
model = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
# NOTE: this notebook-level `llm` is not used by the evaluation code, which
# builds its own client inside process_file().
llm = llm_handler.get_llm(model, temperature=0.0, max_tokens=4000)
# Uncomment to check that the client responds:
# llm.invoke("Hello, how are you?")

In [None]:
# Discover the benchmark dataset files. glob() returns matches in arbitrary
# (filesystem-dependent) order, so sort them to make the submission order and
# the final summary reproducible across machines and re-runs.
files = sorted(glob("../data/datasets/*.parquet"))
print(len(files))
files

In [4]:
# Galileo project under which every evaluation run is looked up and logged.
project_name = "agent-lb-v1"

# System message placed in front of each conversation sent to a model.
system_msg = dict(
    role="system",
    content='Your job is to use the given tools to answer the query of human. If there is no relevant tool then reply with "I cannot answer the question with given tools". If tool is available but sufficient information is not available, then ask human to get the same. You can call as many tools as you want. Use multiple tools if needed. If the tools need to be called in a sequence then just call the first tool.',
)

# Scorer attached to every run: the tool-selection-quality ChainPoll scorer,
# configured with the gpt_4o model alias.
chainpoll_tool_selection_scorer = pq.CustomizedChainPollScorer(
    scorer_name=pq.CustomizedScorerName.tool_selection_quality,
    model_alias=pq.Models.gpt_4o,
)

# NOTE: despite the name, this is the project object returned by
# promptquality; the run-lookup code reads its `.id` attribute.
project_id = pq.get_project_from_name(project_name)

def check_run_exists(model_name, file_name):
    """Report whether a run for this model/dataset pair is already logged.

    The run is looked up by the name ``"<model_name> <file_name>"`` in the
    project referenced by the module-level ``project_id``.

    Args:
        model_name: Name of the model that was (or would be) evaluated.
        file_name: Dataset identifier used as the second part of the run name.

    Returns:
        True if the lookup succeeds (a message is printed in that case),
        False if the lookup raises ``ValueError``.
    """
    candidate = f"{model_name} {file_name}"
    try:
        pq.get_run_from_name(candidate, project_id.id)
        print(f"Run {candidate} already exists")
    except ValueError:
        # The lookup signals "not found" by raising ValueError.
        found = False
    else:
        found = True
    return found

In [5]:
# Alternative model selections, kept for quick switching:
# models = llm_handler.available_models["together"]
# models = llm_handler.get_all_models()
# Models to evaluate; each one is run against every file in `files`.
models = ["gemini-2.0-flash-lite-001"]

In [None]:
# Presumably meant to serialise console output across worker threads.
# NOTE(review): nothing in the visible code acquires this lock — confirm
# whether it is still needed.
print_lock = threading.Lock()

def process_file(model_name, file_path):
    """Evaluate one model on one dataset file and log the result as a run.

    The run is named ``"<model_name> <file stem>"``. If a run with that name
    can already be looked up in the project, nothing is evaluated.

    Args:
        model_name: Model identifier passed to ``llm_handler.get_llm``.
        file_path: Path to a parquet dataset. Each row is read for its
            ``tools_langchain`` and ``conversation`` fields.

    Returns:
        A ``(model_name, file_name, outputs)`` tuple. ``outputs`` is the list
        of model responses (one per row), or ``None`` when the run already
        existed and was skipped.
    """
    dataset_stem = Path(file_path).stem
    run_label = f"{model_name} {dataset_stem}"

    # Skip work that has been logged before: a successful lookup means the
    # run exists, while ValueError means it still has to be produced.
    try:
        pq.get_run_from_name(run_label, project_id.id)
    except ValueError:
        pass
    else:
        return model_name, dataset_stem, None

    dataset = pd.read_parquet(file_path, engine="fastparquet")

    callback = pq.GalileoPromptCallback(
        project_name=project_name,
        run_name=run_label,
        scorers=[chainpoll_tool_selection_scorer],
    )

    # NOTE(review): no temperature is passed here, unlike the temperature=0.0
    # client built earlier in the notebook — confirm the handler default is
    # what the benchmark intends.
    model_llm = llm_handler.get_llm(model_name, max_tokens=4000)

    progress = tqdm(
        dataset.itertuples(),
        desc=run_label,
        total=len(dataset),
        position=None,
        leave=False,
    )

    outputs = []
    for row in progress:
        # Tools differ per row, so the tool-bound chain is rebuilt each time.
        # No exception filter is given to with_retry, so its default applies.
        tool_chain = model_llm.bind_tools(row.tools_langchain).with_retry(
            wait_exponential_jitter=True,  # add jitter to the exponential backoff
            stop_after_attempt=3,
        )
        messages = [system_msg, *row.conversation]
        response = tool_chain.invoke(messages, config={"callbacks": [callback]})
        outputs.append(response)

    callback.finish()
    return model_name, dataset_stem, outputs

def process_model(model_name):
    """Run ``process_file`` for one model over every path in ``files``.

    Files are processed concurrently on up to five worker threads; results
    are gathered in submission order.

    Args:
        model_name: Model identifier forwarded to ``process_file``.

    Returns:
        A list with one ``process_file`` result tuple per dataset file, in
        the same order as ``files``.
    """
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = []
        for dataset_path in files:
            pending.append(pool.submit(process_file, model_name, dataset_path))

        progress = tqdm(
            pending,
            desc=f"Files for {model_name}",
            position=None,
            leave=False,
        )
        # .result() blocks until that task is done and re-raises its exception.
        collected = [task.result() for task in progress]
    return collected

# Main execution: evaluate up to three models concurrently. Each model in
# turn fans out over the dataset files inside process_model().
with ThreadPoolExecutor(max_workers=3) as model_executor:
    # Submit each model to the executor
    model_futures = [
        model_executor.submit(process_model, model_name) for model_name in models
    ]

    # Collect all results in submission order; .result() re-raises any
    # exception that occurred while processing that model.
    all_results = []
    for model_future in tqdm(
        model_futures,
        desc="Overall Models Progress",
        position=0,
        leave=True,
    ):
        all_results.extend(model_future.result())

# Print summary. Entries whose outputs are None were skipped because the run
# already existed. The loop variables are deliberately NOT called `model` or
# `file`: at notebook top level they are globals, and `model` would overwrite
# the model name defined in the earlier setup cell.
completed_runs = [
    (run_model, run_file)
    for run_model, run_file, run_outputs in all_results
    if run_outputs is not None
]
if completed_runs:
    print("\nNewly completed runs:")
    for run_model, run_file in completed_runs:
        print(f"- {run_file} with {run_model}")
else:
    print("\nNo new runs were processed - all runs already existed")