In [1]:
%load_ext autoreload
%autoreload 2

import os
# Pin this kernel to a single GPU by exposing only device index 5 to CUDA.
# The index is machine-specific: change it to a GPU that exists (and is free)
# on the host running the notebook.
# NOTE(review): this is set before the hypothesaes imports below, presumably so
# the variable is in place before any CUDA initialisation — keep it first.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

from hypothesaes.llm_local import _get_engine
from hypothesaes.utils import get_text_for_printing
from time import time

In [2]:
import os
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score, average_precision_score

# Expose the OpenAI key under OPENAI_KEY_SAE (presumably the name hypothesaes
# looks up — confirm against the library). OAI_GENERAL, when present, still
# takes precedence exactly as before; if it is absent we fall back to an
# already-set OPENAI_KEY_SAE, and otherwise fail with an actionable message
# instead of a bare KeyError('OAI_GENERAL').
if 'OAI_GENERAL' in os.environ:
    os.environ['OPENAI_KEY_SAE'] = os.environ['OAI_GENERAL']
elif 'OPENAI_KEY_SAE' not in os.environ:
    raise KeyError(
        "No OpenAI API key found: set OAI_GENERAL (or OPENAI_KEY_SAE) "
        "in the environment before running this notebook."
    )

# Data paths are resolved relative to the parent directory when the kernel's
# working directory ends in "notebooks", and relative to the cwd otherwise.
current_dir = os.getcwd()
prefix = "../" if current_dir.endswith("notebooks") else "./"

# The file is read as JSON Lines (one record per line).
df = pd.read_json(os.path.join(prefix, "demo-data", "yelp-demo-val-2K.json"), lines=True)

texts = df['text'].tolist()
labels = df['stars'].values

from hypothesaes.annotate import annotate

# A single concept is annotated for every text; each task is a (text, concept) pair.
concept = 'mentions a waiter or waitress by name'
texts_to_annotate = texts[:2000]  # cap at 2000 texts
tasks = [(text, concept) for text in texts_to_annotate]

# Collects one column of annotations per annotator configuration (filled in by
# later cells). Re-running this cell resets it to an empty frame.
df_annot = pd.DataFrame()

In [None]:
from hypothesaes.annotate import annotate

# Build the reference labels with gpt-4.1. The timer was previously started
# but never read; report the elapsed wall-clock time so the cost of this cell
# is visible.
start = time()
ground_truth = annotate(tasks, model='gpt-4.1', n_workers=100, max_words_per_example=256)
print(f"Annotated {len(tasks)} tasks with gpt-4.1 in {time() - start:.1f}s")

# `annotate` results are indexed as results[concept][text]; re-order them to
# follow texts_to_annotate so rows line up with every other df_annot column.
y_gt = np.array([ground_truth[concept][text] for text in texts_to_annotate])
df_annot['ground_truth'] = y_gt

Found 0 cached items; annotating 2000 uncached items


Annotating:   0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
from itertools import product

from hypothesaes.llm_local import _LOCAL_ENGINES
import torch

# Sweep grid: every (temperature, thinking, prompt template) combination is run
# for each model and stored as its own column in df_annot.
# models = ['Qwen/Qwen3-0.6B', 'Qwen/Qwen3-8B', 'google/gemma-3-4b-it']
models = ['Qwen/Qwen3-8B']
temperatures = [0.0, 0.5, 1.0]
thinking_options = [True, False]
prompt_templates = ['annotate-simple', 'annotate']

# Generation budgets: thinking runs get a much larger token budget than
# direct-answer runs.
THINKING_MAX_NEW_TOKENS = 1000
DIRECT_MAX_NEW_TOKENS = 10

for model in models:
    # product() iterates in the same order as the equivalent nested loops
    # (temperature outermost, prompt template innermost).
    for temperature, thinking, prompt_template in product(
        temperatures, thinking_options, prompt_templates
    ):
        # Thinking mode is only swept for Qwen models.
        if thinking and 'Qwen' not in model:
            continue

        # Skip configurations that already have a column, so the cell can be
        # re-run to resume a partially completed sweep.
        model_test_name = f'{model}_temperature={temperature}_thinking={thinking}_prompt={prompt_template}'
        if model_test_name in df_annot.columns:
            continue

        max_new_tokens = THINKING_MAX_NEW_TOKENS if thinking else DIRECT_MAX_NEW_TOKENS

        results = annotate(
            tasks,
            model=model,
            max_words_per_example=256,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            prompt_template=prompt_template,
            tokenizer_kwargs={'enable_thinking': thinking},
        )
        # Align results with texts_to_annotate so rows match the other columns.
        df_annot[model_test_name] = np.array([results[concept][text] for text in texts_to_annotate])

    # Drop the cached engine before moving to the next model. pop() with a
    # default avoids the KeyError that `del` raised when no engine was loaded
    # in this pass (e.g. every configuration was already cached or skipped).
    # Assumes _LOCAL_ENGINES is keyed by model name, as the original `del` did.
    _LOCAL_ENGINES.pop(model, None)
    torch.cuda.empty_cache()

Found 0 cached items; annotating 2000 uncached items
Loading Qwen/Qwen3-8B in vLLM...
ERROR 07-20 23:44:09 [core.py:586] EngineCore failed to start.
ERROR 07-20 23:44:09 [core.py:586] Traceback (most recent call last):
ERROR 07-20 23:44:09 [core.py:586]   File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 577, in run_engine_core
ERROR 07-20 23:44:09 [core.py:586]     engine_core = EngineCoreProc(*args, **kwargs)
ERROR 07-20 23:44:09 [core.py:586]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-20 23:44:09 [core.py:586]   File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 404, in __init__
ERROR 07-20 23:44:09 [core.py:586]     super().__init__(vllm_config, executor_class, log_stats,
ERROR 07-20 23:44:09 [core.py:586]   File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 75, in __init__
ERROR 07-20 23:44:09 [c

Process EngineCore_0:
Traceback (most recent call last):
  File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 590, in run_engine_core
    raise e
  File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 577, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 404, in __init__
    super().__init__(vllm_config, executor_class, log_stats,
  File "/nas/ucb/rmovva/anaconda3/envs/hypothesaes/lib/python3.12/site-packages/vll

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}

In [None]:
from sklearn.metrics import f1_score
from scipy.stats import pearsonr

print("Comparison to gpt-4.1:")
print("=" * 50)

# Score every annotator column in df_annot against the gpt-4.1 reference column.
for col in df_annot.columns:
    if col == 'ground_truth':
        continue

    y_pred = df_annot[col].values
    y_true = df_annot['ground_truth'].values

    # A row is scored only when BOTH the prediction and the reference label
    # are a clean 0/1; anything else is treated as a failed annotation.
    pred_is_binary = np.isin(y_pred, [0, 1])
    eval_mask = pred_is_binary & np.isin(y_true, [0, 1])

    if np.sum(eval_mask) == 0:
        print(f"{col}: No valid predictions")
        continue

    # Cast to int: if a column ever held a non-numeric entry the array has
    # object dtype, which scikit-learn rejects as an "unknown" target type.
    y_pred_valid = y_pred[eval_mask].astype(int)
    y_true_valid = y_true[eval_mask].astype(int)

    f1 = f1_score(y_true_valid, y_pred_valid)

    # Pearson r needs at least two samples and is undefined when either side
    # is constant; report nan in those cases instead of raising or warning.
    if (
        len(y_pred_valid) < 2
        or y_pred_valid.min() == y_pred_valid.max()
        or y_true_valid.min() == y_true_valid.max()
    ):
        corr = float('nan')
    else:
        corr = pearsonr(y_true_valid, y_pred_valid)[0]

    print(f"{col}:")
    print(f"  F1 score: {f1:.3f}")
    print(f"  Pearson correlation: {corr:.3f}")
    # Counts predictions that are a valid 0/1, out of all rows.
    print(f"  Valid annotations: {np.sum(pred_is_binary)}/{len(y_pred)}")
    print()


Comparison to gpt-4.1:
Qwen/Qwen3-0.6B_think:
  F1 score: 0.484
  Pearson correlation: 0.483
  Valid annotations: 2000/2000

