In [21]:
from langchain.chat_models import init_chat_model

o3_mini = init_chat_model("openai:o3-mini")
claude_sonnet = init_chat_model("anthropic:claude-3-5-sonnet-latest", temperature=0)
gemma3 = init_chat_model("ollama:gemma3:12b", temperature=0)
llama3_1 = init_chat_model("ollama:llama3.1:latest", temperature=0)
deepseek_r1 = init_chat_model("ollama:deepseek-r1:8b", temperature=0)

In [None]:
query = "what's your name"
print("OpenAI response:", o3_mini.invoke(query))
print("\nClaude response:", claude_sonnet.invoke(query))
print("\nGemma response:", gemma3.invoke(query))
print("\nLlama3.1 response:", llama3_1.invoke(query))
print("\nDeepseek-r1 response:", deepseek_r1.invoke(query))

In [16]:
import json
from typing import List, Dict, Any

file_path = "biggen_bench_instruction_idx0.json"

def load_benchmark_data(file_path: str) -> List[Dict]:
    """Load benchmark data from a JSON file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def prepare_prompts(benchmark_data: List[Dict]) -> List[Dict]:
    """Prepare prompts by excluding reference_answer and score_rubric."""
    prompts = []
    for item in benchmark_data:
        prompt = {
            "id": item["id"],
            "capability": item["capability"],
            "task": item["task"],
            "instance_idx": item["instance_idx"],
            "system_prompt": item["system_prompt"],
            "input": item["input"],
            # Exclude reference_answer and score_rubric
        }
        prompts.append(prompt)
    return prompts

def prepare_rubric(benchmark_data: List[Dict]) -> List[Dict]:
    """Prepare rubric including reference_answer and score_rubric."""
    rubric = []
    for item in benchmark_data:
        prompt = {
            "id": item["id"],
            "reference_answer": item["reference_answer"],
            "score_rubric": item["score_rubric"]
        }
        rubric.append(prompt)
    return rubric

In [18]:
benchmark_data = load_benchmark_data(file_path)
prompts = prepare_prompts(benchmark_data)
rubric = prepare_rubric(benchmark_data)

In [42]:
type(prompts)

list

In [37]:
print(prompts[0])

{'id': 'instruction_following_multi_task_inference_0', 'capability': 'instruction_following', 'task': 'multi_task_inference', 'instance_idx': 0, 'system_prompt': 'You are a helpful, respectful and honest assistant.', 'input': "List all the terms referring to Elon Musk in a given text, and then rephrase each of them, excluding pronouns and 'Elon Musk'.\nText: Elon Musk has had an eventful year. Those events could end up hurting your portfolio. The world's richest man repeatedly stoked controversy in 2023 by placing himself at the center of a number of scandals. Musk has had a penchant for involving himself in problematic issues this year: from endorsing - and then apologizing for - blatantly antisemitic tropes on X, to re-platforming conspiracy theorist Alex Jones and telling concerned advertisers (and Disney in particular) on X to go 'f**k' themselves. And he's suffered for them. X, nee Twitter, is now projected to bring in significantly less than the $4.5 billion it earned in the year

In [19]:
#prompts의 "system_prompt"와 "input"을 출력
for prompt in prompts:
    print(prompt["system_prompt"])
    print(prompt["input"])
    print("-"*100)

You are a helpful, respectful and honest assistant.
List all the terms referring to Elon Musk in a given text, and then rephrase each of them, excluding pronouns and 'Elon Musk'.
Text: Elon Musk has had an eventful year. Those events could end up hurting your portfolio. The world's richest man repeatedly stoked controversy in 2023 by placing himself at the center of a number of scandals. Musk has had a penchant for involving himself in problematic issues this year: from endorsing - and then apologizing for - blatantly antisemitic tropes on X, to re-platforming conspiracy theorist Alex Jones and telling concerned advertisers (and Disney in particular) on X to go 'f**k' themselves. And he's suffered for them. X, nee Twitter, is now projected to bring in significantly less than the $4.5 billion it earned in the year before Musk took over the company.
----------------------------------------------------------------------------------------------------
You are a helpful, and informative assi

In [None]:
#prompts의 "system_prompt"와 "input"을 query로 사용하여 각 모델의 응답을 받아서 저장
# query = prompts[0]["input"] + ", system_prompt: " + prompts[0]["system_prompt"]

system_prompt = prompts.get('system_prompt', '')
user_input = prompts.get('input', '')
print(user_input, system_prompt=system_prompt)

In [35]:
# print("OpenAI response:", o3_mini.invoke(query))
# print("\nClaude response:", claude_sonnet.invoke(query))
print("\nGemma response:", gemma3.invoke(query))
# print("\nLlama3.1 response:", llama3_1.invoke(query))
# print("\nDeepseek-r1 response:", deepseek_r1.invoke(query))


Gemma response: content='Okay, here\'s a list of terms referring to Elon Musk in the provided text, followed by rephrased versions excluding pronouns and "Elon Musk" itself:\n\n**Terms Referring to Elon Musk & Rephrased Versions:**\n\n1.  **Elon Musk** - *The world\'s richest man*\n2.  **Musk** - *He* (This is handled by the rephrasing of "He" below)\n3.  **He** - *The individual* (This refers to actions attributed to him)\n4.  **Those events** - *The occurrences* (Referring to the events impacting the portfolio)\n5.  **The individual** - *The person* (Referring to the person involved in the scandals)\n6.  **The person** - *The figure* (Referring to the person involved in the controversies)\n7.  **The figure** - *The owner* (Referring to the person who took over the company)\n\n\n\n**Important Notes:**\n\n*   I\'ve focused on terms *directly* referring to him or his actions.\n*   The rephrased versions aim to convey the same meaning without using "Elon Musk" or pronouns.\n*   Some ter

In [55]:

gemma3_results = []

# Process each prompt in the list
for prompt in prompts:
    # Extract the required fields
    system_prompt = prompt.get('system_prompt', '')
    user_input = prompt.get('input', '')
    
    # Print for verification
    print(f"Processing prompt: {prompt.get('id', 'unknown')}")
    print(f"System prompt: {system_prompt}")
    print(f"User input: {user_input}")
    
    # Invoke your model with the extracted parameters
    # The exact syntax depends on your model's API
    try:
        # Example invocation (adjust according to your model's API)
        response = gemma3.invoke(
            user_input, 
            system_prompt=system_prompt
            # Add any other parameters your model.invoke() requires
        )
        
        # Store the result along with the prompt information
        result = {
            "prompt_id": prompt.get('id', ''),
            "system_prompt": system_prompt,
            "input": user_input,
            "response": response
        }
        gemma3_results.append(result)
        
    except Exception as e:
        print(f"Error invoking model for prompt {prompt.get('id', '')}: {e}")
        # You might want to add the error to results as well
        gemma3_results.append({
            "prompt_id": prompt.get('id', ''),
            "error": str(e)
        })
    
    print("---")

# Now 'results' contains all the responses
print(f"Processed {len(gemma3_results)} prompts")

Processing prompt: instruction_following_multi_task_inference_0
System prompt: You are a helpful, respectful and honest assistant.
User input: List all the terms referring to Elon Musk in a given text, and then rephrase each of them, excluding pronouns and 'Elon Musk'.
Text: Elon Musk has had an eventful year. Those events could end up hurting your portfolio. The world's richest man repeatedly stoked controversy in 2023 by placing himself at the center of a number of scandals. Musk has had a penchant for involving himself in problematic issues this year: from endorsing - and then apologizing for - blatantly antisemitic tropes on X, to re-platforming conspiracy theorist Alex Jones and telling concerned advertisers (and Disney in particular) on X to go 'f**k' themselves. And he's suffered for them. X, nee Twitter, is now projected to bring in significantly less than the $4.5 billion it earned in the year before Musk took over the company.
Error invoking model for prompt instruction_follo

In [63]:

gemma3_results = []

# Process each prompt in the list
for prompt in prompts:
    # Extract the required fields
    system_prompt = prompt.get('system_prompt', '')
    user_input = prompt.get('input', '')
    
    # Invoke your model with the extracted parameters
    # The exact syntax depends on your model's API
    try:
        # Example invocation (adjust according to your model's API)
        response = gemma3.invoke(
            user_input, 
            config={"system_prompt": system_prompt}
            # Add any other parameters your model.invoke() requires
        )
        
        # Store the result along with the prompt information
        result = {
            "prompt_id": prompt.get('id', ''),
            "response": response
        }
        gemma3_results.append(result)
        
    except Exception as e:
        print(f"Error invoking model for prompt {prompt.get('id', '')}: {e}")
        # You might want to add the error to results as well
        gemma3_results.append({
            "prompt_id": prompt.get('id', ''),
            "error": str(e)
        })
    
    print("---")

# Now 'results' contains all the responses
print(f"Processed {len(gemma3_results)} prompts")

Processing prompt: instruction_following_multi_task_inference_0
System prompt: You are a helpful, respectful and honest assistant.
User input: List all the terms referring to Elon Musk in a given text, and then rephrase each of them, excluding pronouns and 'Elon Musk'.
Text: Elon Musk has had an eventful year. Those events could end up hurting your portfolio. The world's richest man repeatedly stoked controversy in 2023 by placing himself at the center of a number of scandals. Musk has had a penchant for involving himself in problematic issues this year: from endorsing - and then apologizing for - blatantly antisemitic tropes on X, to re-platforming conspiracy theorist Alex Jones and telling concerned advertisers (and Disney in particular) on X to go 'f**k' themselves. And he's suffered for them. X, nee Twitter, is now projected to bring in significantly less than the $4.5 billion it earned in the year before Musk took over the company.
---
Processing prompt: instruction_following_educ

In [66]:
gemma3_results

[{'prompt_id': 'instruction_following_multi_task_inference_0',
  'system_prompt': 'You are a helpful, respectful and honest assistant.',
  'input': "List all the terms referring to Elon Musk in a given text, and then rephrase each of them, excluding pronouns and 'Elon Musk'.\nText: Elon Musk has had an eventful year. Those events could end up hurting your portfolio. The world's richest man repeatedly stoked controversy in 2023 by placing himself at the center of a number of scandals. Musk has had a penchant for involving himself in problematic issues this year: from endorsing - and then apologizing for - blatantly antisemitic tropes on X, to re-platforming conspiracy theorist Alex Jones and telling concerned advertisers (and Disney in particular) on X to go 'f**k' themselves. And he's suffered for them. X, nee Twitter, is now projected to bring in significantly less than the $4.5 billion it earned in the year before Musk took over the company.",
  'response': AIMessage(content='Okay, h

### multi-model serial response

In [67]:
def process_model_prompts(model, model_name, prompts):
    """
    Process all prompts using the specified model.
    
    Args:
        model: The initialized model object
        model_name: String name of the model for recording
        prompts: List of prompt dictionaries
        
    Returns:
        List of result dictionaries
    """
    results = []

    # Process each prompt in the list
    for prompt in prompts:
        # Extract the required fields
        system_prompt = prompt.get('system_prompt', '')
        user_input = prompt.get('input', '')
        
        print(f"Processing prompt {prompt.get('id', '')} with model {model_name}")
        
        # Invoke your model with the extracted parameters
        try:
            # Invoke the model with standardized parameters
            response = model.invoke(
                user_input, 
                config={"system_prompt": system_prompt}
                # Add any other parameters your model.invoke() requires
            )
            
            # Store the result with model name
            result = {
                "id": prompt.get('id', ''),
                "model_name": model_name,
                "response": response
            }
            results.append(result)
            
        except Exception as e:
            print(f"Error invoking {model_name} for prompt {prompt.get('id', '')}: {e}")
            results.append({
                "id": prompt.get('id', ''),
                "model_name": model_name,
                "error": str(e)
            })
        
        print("---")

    print(f"Processed {len(results)} prompts with {model_name}")
    print("--------------------------------")
    return results

In [68]:
# Process prompts with each model
gemma3_results = process_model_prompts(gemma3, "gemma3", prompts)
llama3_1_results = process_model_prompts(llama3_1, "llama3_1", prompts)
deepseek_r1_results = process_model_prompts(deepseek_r1, "deepseek_r1", prompts)

Processing prompt instruction_following_multi_task_inference_0 with model gemma3
---
Processing prompt instruction_following_education_content_creation_0 with model gemma3
---
Processed 2 prompts with gemma3
Processing prompt instruction_following_multi_task_inference_0 with model llama3_1
---
Processing prompt instruction_following_education_content_creation_0 with model llama3_1
---
Processed 2 prompts with llama3_1
Processing prompt instruction_following_multi_task_inference_0 with model deepseek_r1
---
Processing prompt instruction_following_education_content_creation_0 with model deepseek_r1
---
Processed 2 prompts with deepseek_r1


In [78]:
# Combine all results if needed
all_results = gemma3_results + llama3_1_results + deepseek_r1_results
all_results

[{'id': 'instruction_following_multi_task_inference_0',
  'model_name': 'gemma3',
  'response': AIMessage(content='Okay, here\'s a list of terms referring to Elon Musk in the provided text, followed by rephrased versions excluding pronouns and "Elon Musk" itself:\n\n**Terms Referring to Elon Musk & Rephrased Versions:**\n\n1.  **Elon Musk** - *The world\'s richest man*\n2.  **Musk** - *He* (This is excluded from the rephrasing as per the instructions)\n3.  **Those events** - *His actions*\n4.  **He** - *The individual* (Excluded as per instructions)\n5.  **The individual** - *The person* (Used to avoid repetition)\n6.  **The person** - *The figure* (Used to avoid repetition)\n7.  **The figure** - *The subject* (Used to avoid repetition)\n8.  **The subject** - *The owner* (Used to avoid repetition)\n9.  **The owner** - *The leader* (Used to avoid repetition)\n\n\n\n**Note:** Some of these rephrased versions are quite general. The best choice for a specific context would depend on the su

In [79]:
# JSON 직렬화 가능한 형태로 변환
serializable_results = []
for result in all_results:
    serialized_result = {
        "id": result.get("id", ""),
        "model_name": result.get("model_name", ""),
        "response_content": result["response"].content if hasattr(result["response"], "content") else str(result["response"])
    }
    serializable_results.append(serialized_result)
    
# JSON으로 저장
with open("results.json", "w", encoding="utf-8") as f:
    json.dump(serializable_results, f, ensure_ascii=False, indent=2)