In [1]:
import os
import time
from datetime import datetime

import gqr
import ollama
import pandas as pd
from tqdm import tqdm

from prompt import system_prompt, user_prompt

  from .autonotebook import tqdm as notebook_tqdm


## Load Prompts
Load the system and user prompt templates from the local `prompt` module.

# GQR-Bench: Multi-LLM Comparison
Compare multiple Ollama models on the GQR benchmark dataset

In [2]:
# Expose only GPUs 0 and 1 to CUDA-aware code started from this kernel.
# NOTE(review): this affects this process and its children only; if the Ollama
# server runs as a separate process it presumably needs the variable set in its
# own environment - confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1")


In [3]:
# Development split (train/eval pair) for initial experimentation.
# Each loader below that is unpacked into two names returns a 2-item result.
dev_train_data, dev_eval_data = gqr.load_dev_dataset()

# Training split (train/eval pair) for model development.
# NOTE(review): neither the dev nor the train split is referenced by the
# visible benchmark cells - confirm whether they still need to be loaded here.
train_data, eval_data = gqr.load_train_dataset()

# Test datasets for final evaluation. The benchmark loop loads fresh copies of
# both test sets for every model, so these two names serve for inspection only.
domain_test_data = gqr.load_id_test_dataset()  # In-domain test data
ood_test_data = gqr.load_ood_test_dataset()    # Out-of-domain test data

In [4]:
# Ollama model tags to benchmark, in run order.
# Tags in the commented section are disabled; uncomment a line to add that
# model back to the run.
models_to_test = [
    "qwen3:8b",
    "llama3.3:70b",
    # --- disabled candidates ---
    # "gemma3:27b",
    # "gemma3:1b",
    # "gemma3:4b",
    # "gemma3:270m",
    # "granite3.3:8b",
    # "granite3.3:2b",
    # "llama3.2:3b",
    # "mistral:7b",
    # "phi3:3.8b",
    # "phi3:14b",
    # "phi4:14b",
    # "gpt-oss:20b",
    # "gpt-oss-safeguard:20b",
    # "phi4-mini:3.8b",
    # "qwen3:14b",
]

## Model Configuration
Define all Ollama models to test

In [5]:
# Progress bar shared with the benchmark loop; None while no bar is active.
current_pbar = None
# Latency in seconds of every successful request for the model under test.
prompt_latencies = []

# Category keyword -> integer label. Insertion order is also match priority:
# when a reply mentions several categories, the earliest entry here wins.
_CATEGORY_TO_LABEL = {
    'law': 0,
    'finance': 1,
    'health': 2,
    'other': 3,
}
# Label returned when the reply names no category or the request fails.
_FALLBACK_LABEL = 3  # 'other'


def _parse_label(reply) -> int:
    """Map a raw model reply to its integer label, defaulting to 3 ('other')."""
    # A missing reply (None) is treated as an empty one.
    normalized = (reply or '').strip().lower()
    for category, label in _CATEGORY_TO_LABEL.items():
        # Substring match: 'law' also matches inside longer words.
        if category in normalized:
            return label
    return _FALLBACK_LABEL


def scoring_function(text: str) -> int:
    """Classify one query with the current Ollama model.

    Reads the module-level names `current_model_name`, `current_pbar`,
    `system_prompt` and `user_prompt` at call time, and appends the request
    latency (seconds) to `prompt_latencies` after each successful call.

    Args:
        text: The query to classify; substituted into `user_prompt` as `query`.

    Returns:
        0 (law), 1 (finance), 2 (health) or 3 (other). 3 is also returned when
        the reply names no category or when any exception occurs.

    NOTE(review): a failed request returns the same label as a genuine 'other'
    prediction, so widespread failures can look like valid predictions - check
    the printed errors and `prompt_latencies` before trusting the scores.
    """
    try:
        formatted_user_prompt = user_prompt.format(query=text)

        # perf_counter is monotonic, so the latency cannot be skewed by
        # wall-clock adjustments during a long run.
        started = time.perf_counter()
        response = ollama.chat(
            model=current_model_name,
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': formatted_user_prompt}
            ],
            # gpt-oss models get 'low' thinking; thinking is off for all others.
            think='low' if current_model_name.startswith('gpt-oss') else False,
            stream=False
        )
        prompt_latencies.append(time.perf_counter() - started)

        return _parse_label(response['message']['content'])

    except Exception as e:
        # Best effort: report the failure and fall back to 'other' so one bad
        # request does not abort the whole benchmark run.
        print(f"Error with model {current_model_name}: {e}")
        return _FALLBACK_LABEL

    finally:
        # Advance the bar exactly once per prompt, on success and on failure.
        # Compare against None: a tqdm bar's truthiness depends on its total.
        if current_pbar is not None:
            current_pbar.update(1)

## Run Benchmarks
Loop through all models and evaluate each one

In [6]:
# One row per model; the CSV is rewritten after every model so partial results
# survive an interrupted run.
results = []
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)
output_file = f'{results_dir}/benchmark_results_{timestamp}.csv'

for model_name in models_to_test:
    print(f"\n{'='*60}\nTesting model: {model_name}\n{'='*60}")

    try:
        # scoring_function reads these module-level names at call time.
        current_model_name = model_name
        prompt_latencies = []

        # Fresh copies per model: a "predictions" column is added to each below.
        id_test_data = gqr.load_id_test_dataset()
        ood_test_data = gqr.load_ood_test_dataset()
        total_size = len(id_test_data) + len(ood_test_data)

        start_time = datetime.now()
        try:
            with tqdm(total=total_size, desc=f"{model_name}") as pbar:
                current_pbar = pbar
                id_test_data["predictions"] = [scoring_function(doc) for doc in id_test_data["text"].values]
                ood_test_data["predictions"] = [scoring_function(doc) for doc in ood_test_data["text"].values]
        finally:
            # Never leave the shared handle pointing at a closed bar.
            current_pbar = None
        elapsed_time = (datetime.now() - start_time).total_seconds()

        id_scores = gqr.evaluate(id_test_data["predictions"], ground_truth=id_test_data["label"])
        id_acc = id_scores["accuracy"]

        ood_scores_df = gqr.evaluate_by_dataset(
            ood_test_data, pred_col="predictions", true_col="label", dataset_col="dataset"
        )
        # Unweighted mean of the per-dataset accuracies.
        ood_acc = ood_scores_df["accuracy"].mean()

        # Harmonic mean of in-domain and out-of-domain accuracy.
        gqr_score = 2 * (id_acc * ood_acc) / (id_acc + ood_acc) if (id_acc + ood_acc) > 0 else 0.0
        avg_latency = sum(prompt_latencies) / len(prompt_latencies) if prompt_latencies else None

        dataset_acc = dict(zip(ood_scores_df['dataset'], ood_scores_df['accuracy']))

        result = {
            'model': model_name,
            'avg_latency': avg_latency,
            'id_acc': id_acc,
            'ood_acc': ood_acc,
            'gqr_score': gqr_score,
            'dataset_acc': str(dataset_acc),
        }
        results.append(result)

        # avg_latency is None when no request succeeded. Formatting None with
        # ':.3f' raises after the row above was appended, which would add a
        # second, contradictory "failed" row for the same model.
        if avg_latency is None:
            latency_text = "n/a"
            print(
                f"Warning: no prompt latencies recorded for {model_name} "
                "(every request failed or the datasets were empty)"
            )
        else:
            latency_text = f"{avg_latency:.3f}s"

        print(f"\nID: {id_acc:.4f} | OOD: {ood_acc:.4f} | GQR: {gqr_score:.4f} | Latency: {latency_text}")
        print(f"Per-dataset: {dataset_acc}")

        pd.DataFrame(results).to_csv(output_file, index=False)
        print(f"✓ Saved to: {output_file}")

    except Exception as e:
        # Record the failure and continue with the next model.
        print(f"Failed: {e}")
        results.append({
            'model': model_name, 'avg_latency': None, 'id_acc': None,
            'ood_acc': None, 'gqr_score': None, 'dataset_acc': None, 'error': str(e)
        })
        pd.DataFrame(results).to_csv(output_file, index=False)

print(f"\n{'='*60}\nAll models tested!\n{'='*60}")


Testing model: qwen3:8b


qwen3:8b: 100%|██████████| 22457/22457 [32:06<00:00, 11.66it/s]



ID: 0.8350 | OOD: 0.9846 | GQR: 0.9036 | Latency: 0.086s
Per-dataset: {'jigsaw': 0.9965774735532047, 'olid': 0.9930232558139535, 'hate_xplain': 0.9971356360572873, 'hate_speech_slovak ': 0.9916579770594369, 'dkhate': 0.9878419452887538, 'web_questions': 0.9335629921259843, 'ml_questions': 0.9921875}
✓ Saved to: ../results/benchmark_results_20251219_191559.csv

Testing model: llama3.3:70b


llama3.3:70b: 100%|██████████| 22457/22457 [1:23:07<00:00,  4.50it/s]


ID: 0.9618 | OOD: 0.9400 | GQR: 0.9508 | Latency: 0.222s
Per-dataset: {'jigsaw': 0.9626633478531424, 'olid': 0.827906976744186, 'hate_xplain': 0.9668070766638585, 'hate_speech_slovak ': 0.9624608967674662, 'dkhate': 0.9483282674772037, 'web_questions': 0.9119094488188977, 'ml_questions': 1.0}
✓ Saved to: ../results/benchmark_results_20251219_191559.csv

All models tested!



