# Positional Bias vs Reasoning

In [None]:
# Import necessary libraries
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from langchain_mcp_adapters.tools import load_mcp_tools
import nest_asyncio
from langchain_anthropic import ChatAnthropic
from langchain_groq import ChatGroq

# Apply nest_asyncio to handle nested event loops
nest_asyncio.apply()

In [1]:
# Placeholder credentials for the Groq and Anthropic clients constructed further down.
# Substitute real keys locally only; prefer exporting them in the shell (or a .env file)
# before starting Jupyter so that secrets are never saved in the notebook or its outputs.
%env GROQ_API_KEY=your-groq-api-key-here
%env ANTHROPIC_API_KEY=your-anthropic-api-key-here


env: GROQ_API_KEY=your-groq-api-key-here
env: ANTHROPIC_API_KEY=your-anthropic-api-key-here


In [3]:
# Evaluation prompts for the tool-selection experiment.
# Each entry is a (user_prompt, expected_tool_name) tuple; expected_tool_name must match
# the function name of one of the tool definitions in `tool_snippets`.
tool_selection_test_data = [
    # Crypto and NFT Tool
    ('What is the price of ethereum right now?', 'crypto_and_nft_tool'),
    ('What is the price of bitcoin right now?', 'crypto_and_nft_tool'),
    ('Check the floor price of the world of women nft', 'crypto_and_nft_tool'),
    ('Give me dogecoin price', 'crypto_and_nft_tool'),

    # Google Spreadsheet Tool
    ('Add a new row and just write "To do" in it', 'google_spreadsheet_tool'),
    ('Add this data to the spreadsheet: "1, 2, 3"', 'google_spreadsheet_tool'),
    ('Add: "Sinan, Pearson" to the spreadsheet', 'google_spreadsheet_tool'),
    ('Check the calculated sum of the first column', 'google_spreadsheet_tool'),
    

    # Firecrawl Tool (prompts that name a specific URL to visit)
    ('Go to https://loopgenius.com and tell me about it', 'firecrawl_tool'),
    ('Visit https://github.com/trending and list top repositories', 'firecrawl_tool'),
    ('Scrape the main headlines from https://news.ycombinator.com', 'firecrawl_tool'),
    ('Go to https://en.wikipedia.org/wiki/Python_(programming_language) and summarize the introduction', 'firecrawl_tool'),
    ('Check the documentation at https://pytorch.org and tell me the latest version name', 'firecrawl_tool'),
    ('Visit https://openai.com and summarize the homepage', 'firecrawl_tool'),
    ('What is the headline on https://bbc.com/news', 'firecrawl_tool'),

    # SERP Tool (open-ended web search, no URL given)
    ('What are the current gas prices in Chicago?', 'serp_tool'),
    ('Find the intro.co link for Sinan Ozdemir', 'serp_tool'),
    ('What is the release date of the next Marvel movie?', 'serp_tool'),
    ('Who won the latest Super Bowl?', 'serp_tool'),
    ('What are the top restaurants in New York City?', 'serp_tool'),
    ('Get the latest stock price for Apple (AAPL)', 'serp_tool'),
    ('List the top 5 popular programming languages in 2024', 'serp_tool'),
    ('Look up the conversion from 98 degrees Fahrenheit to Celsius on the web', 'serp_tool'),

    # Python REPL Tool
    ('Convert 98 degrees Fahrenheit to Celsius using Python', 'python_repl_tool'),
    ('Write a function that yields the nth fibonacci number and use it to find the 100th fibonacci number', 'python_repl_tool'),
    ('Calculate the factorial of 10 using Python', 'python_repl_tool'),
    ('Calculate 15% of 34543.453', 'python_repl_tool'),
    ('Convert 42 kilometers to miles using a function in Python', 'python_repl_tool'),
    ('Generate a random number between 1 and 1000', 'python_repl_tool'),
    
    # CRM Contact Tool
    ('Add a new contact named John Smith with email john@example.com', 'crm_contact_tool'),
    ('Search for contacts at Google company', 'crm_contact_tool'),
    ('Update contact ID 123 to change their phone number', 'crm_contact_tool'),
    ('Delete the contact with ID 456', 'crm_contact_tool'),
    ('List all contacts in the CRM system', 'crm_contact_tool'),
    
    # eBay Price Tool
    ('Check the current eBay price for iPhone 15 Pro', 'ebay_price_tool'),
    ('What is the average price of a used MacBook Pro on eBay?', 'ebay_price_tool'),
    ('Find the eBay selling price for vintage Pokemon cards', 'ebay_price_tool'),
    ('Look up the market value of a 2020 Toyota Camry on eBay', 'ebay_price_tool'),
    ('Check eBay prices for refurbished iPad Air', 'ebay_price_tool'),
    
    # Calendar Scheduling Tool
    ('Schedule a meeting for tomorrow at 2 PM', 'calendar_scheduling_tool'),
    ('Show me my calendar for next week', 'calendar_scheduling_tool'),
    ('Update my 3 PM meeting to 4 PM', 'calendar_scheduling_tool'),
    ('Delete the event with ID 789', 'calendar_scheduling_tool'),
    ('Find available time slots for a 2-hour meeting this Friday', 'calendar_scheduling_tool'),
    
    # Social Media Tool
    ('Post "Just finished a great workout!" to Twitter', 'social_media_tool'),
    ('Schedule a LinkedIn post for next Monday about our product launch', 'social_media_tool'),
    ('Delete my Facebook post from yesterday', 'social_media_tool'),
    ('Get analytics for my Instagram post from last week', 'social_media_tool'),
    ('Post a photo to Instagram with hashtags #sunset #nature', 'social_media_tool'),
    
    # Weather Forecast Tool
    ('What\'s the weather like in Miami this weekend?', 'weather_forecast_tool'),
    ('Give me a detailed 7-day forecast for New York City', 'weather_forecast_tool'),
    ('Check the hourly weather for Chicago today', 'weather_forecast_tool'),
    ('What\'s the current temperature in Los Angeles?', 'weather_forecast_tool'),
    ('Get the weather forecast for my location for the next 3 days', 'weather_forecast_tool'),
    
    # File Storage Tool
    ('Upload my resume.pdf to the cloud storage', 'file_storage_tool'),
    ('Download the project files from /work/documents/', 'file_storage_tool'),
    ('Share the presentation.pptx file with read-only access', 'file_storage_tool'),
    ('Move the photos from /personal/ to /archive/', 'file_storage_tool'),
    ('List all files in the /projects/ folder', 'file_storage_tool'),
    
    # Database Query Tool
    ('Select all users from the customers table', 'database_query_tool'),
    ('Insert a new product record into the inventory database', 'database_query_tool'),
    ('Update the price of product ID 123 in the products table', 'database_query_tool'),
    ('Delete all orders older than 2 years from the orders table', 'database_query_tool'),
    ('Create a new table called employee_reviews', 'database_query_tool'),
    
    # Translation Tool
    ('Translate "Hello, how are you?" to Spanish', 'translation_tool'),
    ('Convert this French text to English: "Bonjour, comment allez-vous?"', 'translation_tool'),
    ('Translate the German phrase "Wie geht es dir?" to Italian', 'translation_tool'),
    ('Convert "Thank you very much" to Japanese', 'translation_tool'),
    ('Translate this business email to Portuguese', 'translation_tool'),
    
    # PDF Document Tool
    ('Extract all text from the contract.pdf file', 'pdf_document_tool'),
    ('Merge these 3 PDF files into one document', 'pdf_document_tool'),
    ('Split the large report.pdf into separate chapters', 'pdf_document_tool'),
    ('Convert this PDF to a Word document', 'pdf_document_tool'),
    ('Generate a PDF report from the quarterly data', 'pdf_document_tool'),
]


In [4]:
import os
import random

# Define tool snippets
# Source-code snippets for the fake MCP tools. Each element is the complete text of one
# `@mcp.tool()` function; generate_random_mcp_order() concatenates them (in random order)
# into a standalone FastMCP server script.
#
# The docstrings are what the model sees as tool descriptions, so editing them changes the
# prompt. Fixes applied relative to the earlier version of this list:
#   * database_query_tool formatted an undefined name (`database`) instead of `db`,
#     which raised NameError whenever the tool was actually called.
#   * python_repl_tool documented a `command` parameter although the argument is `code`.
#   * firecrawl_tool used a malformed `return:` field instead of `:return:`.
#   * google_spreadsheet_tool was annotated `-> dict` but returns a string.
# NOTE(review): rows collected before these fixes were produced with the old descriptions;
# keep that in mind when comparing against previously saved results.
tool_snippets = [
    '''
@mcp.tool()
def google_spreadsheet_tool(action: str = "append_to_sheet", **kwargs) -> str:
    """
    Executes specified actions on the Google Spreadsheet.

    :param action: The action to perform ("append_to_sheet", "search", "insert_into_cell", "get_data_in_range", "describe").
    :Additional arguments for each specific action:
        - "search": 
            "search" will return the row indices where the search_value is found in the column_name.
            Requires 'search_value' and 'column_name'. Example: {"action": "search", "search_value": "John", "column_name": "Name"}
        - "append_to_sheet":
            "append_to_sheet" will append the data to the end of the sheet.
            Requires 'data'. Example: {"action": "append_to_sheet", "data": [["John", "Doe", "john.doe@example.com"], ["Jane", "Smith", "jane.smith@example.com"]]}
        - "insert_into_cell": 
            "insert_into_cell" will insert the value into the specified cell.
            Requires 'value' and 'cell'. Example: {"action": "insert_into_cell", "value": "New Value", "cell": "A1"}
        - "get_data_in_range": 
            "get_data_in_range" will return the data in the specified range.
            Requires 'range_name'. Example: {"action": "get_data_in_range", "range_name": "Sheet1!A1:B2"} or {"action": "get_data_in_range",    "range_name": "Contacts!A12:G28"}
        - "describe": 
            "describe" will return the number of columns and rows in the sheet.
            No additional arguments. Example: {"action": "describe"}
    :return: The result of the operation.
    """
    return f"Fake response for google spreadsheet operation: {action}"
''',
    '''
@mcp.tool()
def crypto_and_nft_tool(query: str) -> str:
    """Get current cryptocurrency prices and NFT prices around the world and for a specific wallet.
    :param query: The query to search for cryptocurrency or NFT prices.
    :return: The current cryptocurrency or NFT prices.
    """
    
    return f"Fake response for crypto/NFT query: {query}"
''',
    '''
@mcp.tool()
def firecrawl_tool(website_url: str) -> str:
    """
    Crawl webpages and return a markdown version of the html on the page
    :param website_url: The URL of the website to scrape
    :return: The markdown version of the html on the page
    """
    return f"Scraped data from: {website_url}"
''',
    '''
@mcp.tool()
def serp_tool(query: str) -> str:
    """Search the web for information using the Google Search Engine
    :param query: The query to search for.
    :return: The search result.
    """
    return f"Search result for query: {query}"
''',
    # SECURITY NOTE: this stub eval()s whatever code the model supplies. That is tolerable
    # only because the experiment inspects which tool is selected and never needs to run
    # it against untrusted input; do not reuse this snippet in a real server.
    '''
@mcp.tool()
def python_repl_tool(code: str) -> str:
    """
    Execute valid python code and returns the printed values in the code
    :param code: The Python command to run. Always end with a print statement to show the output like "print(output)"
    :return: The output of the code.
    """
    try:
        result = eval(code)
        return str(result)
    except Exception as e:
        return f"Error: {e}"
''',
    '''
@mcp.tool()
def crm_contact_tool(action: str, **kwargs) -> str:
    """
    Manage contacts in CRM system - add, search, update, delete contacts
    :param action: The action to perform ("add", "search", "update", "delete", "list")
    :Additional arguments based on action:
        - "add": Requires 'name', 'email', optional 'phone', 'company'
        - "search": Requires 'query' (name, email, or company)
        - "update": Requires 'contact_id' and fields to update
        - "delete": Requires 'contact_id'
        - "list": No additional arguments
    :return: Result of the CRM operation
    """
    return f"CRM operation '{action}' completed with args: {kwargs}"
''',
    '''
@mcp.tool()
def ebay_price_tool(item_name: str, condition: str = "used") -> str:
    """
    Check current eBay prices for items, get sold listings and average prices
    :param item_name: The name or description of the item to check
    :param condition: The condition of the item ("new", "used", "refurbished")
    :return: Current eBay price information and recent sold listings
    """
    return f"eBay price check for '{item_name}' in {condition} condition: Average price $125.99"
''',
    '''
@mcp.tool()
def calendar_scheduling_tool(action: str, **kwargs) -> str:
    """
    Manage calendar events and scheduling - create, view, update, delete events
    :param action: The action to perform ("create", "view", "update", "delete", "find_available")
    :Additional arguments based on action:
        - "create": Requires 'title', 'date', 'time', optional 'duration', 'attendees'
        - "view": Requires 'date' or 'date_range'
        - "update": Requires 'event_id' and fields to update
        - "delete": Requires 'event_id'
        - "find_available": Requires 'date', 'duration'
    :return: Result of the calendar operation
    """
    return f"Calendar operation '{action}' completed with args: {kwargs}"
''',
    '''
@mcp.tool()
def social_media_tool(platform: str, action: str, content: str = "", **kwargs) -> str:
    """
    Post content to social media platforms, manage posts, and get analytics
    :param platform: The platform to use ("twitter", "facebook", "instagram", "linkedin")
    :param action: The action to perform ("post", "schedule", "delete", "analytics")
    :param content: The content to post (for post/schedule actions)
    :Additional arguments based on action:
        - "post": Requires 'content', optional 'media_urls', 'hashtags'
        - "schedule": Requires 'content', 'schedule_time'
        - "delete": Requires 'post_id'
        - "analytics": Requires 'post_id' or 'date_range'
    :return: Result of the social media operation
    """
    return f"Social media operation on {platform}: {action} completed"
''',
    '''
@mcp.tool()
def weather_forecast_tool(location: str, days: int = 1, details: str = "basic") -> str:
    """
    Get weather forecasts, current conditions, and weather alerts for locations
    :param location: The location to get weather for (city, coordinates, or address)
    :param days: Number of days to forecast (1-7)
    :param details: Level of detail ("basic", "detailed", "hourly")
    :return: Weather forecast information
    """
    return f"Weather forecast for {location} ({days} days, {details}): Sunny, 72°F"
''',
    '''
@mcp.tool()
def file_storage_tool(action: str, file_path: str = "", **kwargs) -> str:
    """
    Manage files in cloud storage - upload, download, delete, share, organize
    :param action: The action to perform ("upload", "download", "delete", "share", "list", "move")
    :param file_path: Path to the file in cloud storage
    :Additional arguments based on action:
        - "upload": Requires 'local_path', 'cloud_path'
        - "download": Requires 'cloud_path', 'local_path'
        - "share": Requires 'cloud_path', optional 'permissions'
        - "move": Requires 'old_path', 'new_path'
        - "list": Requires 'folder_path'
    :return: Result of the file operation
    """
    return f"File storage operation '{action}' completed for {file_path}"
''',
    '''
@mcp.tool()
def database_query_tool(query: str, db: str = "formula_1", **kwargs) -> str:
    """
    Execute database queries, perform CRUD operations, and manage database schemas
    :param query: The SQL query to execute
    :param db: The database name to query
    :param query_type: Type of query ("select", "insert", "update", "delete", "create")
    :param limit: Maximum number of results to return
    :return: Query results or operation confirmation
    """
    return f"Database query executed on {db}: {query[:50]}..."
''',
    '''
@mcp.tool()
def translation_tool(text: str, target_language: str, source_language: str = "auto") -> str:
    """
    Translate text between languages, detect language, and provide language information
    :param text: The text to translate
    :param target_language: The target language code (e.g., "es", "fr", "de")
    :param source_language: The source language code (auto-detect if not specified)
    :return: Translated text with confidence score
    """
    return f"Translation of '{text}' to {target_language}: [Translated text here]"
''',
    '''
@mcp.tool()
def pdf_document_tool(action: str, file_path: str, **kwargs) -> str:
    """
    Process PDF and document files - extract text, merge, split, convert, generate
    :param action: The action to perform ("extract_text", "merge", "split", "convert", "generate")
    :param file_path: Path to the PDF/document file
    :Additional arguments based on action:
        - "extract_text": Optional 'pages' range
        - "merge": Requires 'files' list
        - "split": Requires 'page_ranges' or 'pages_per_file'
        - "convert": Requires 'output_format' (docx, txt, html)
        - "generate": Requires 'content' and 'template'
    :return: Result of the document operation
    """
    return f"PDF document operation '{action}' completed for {file_path}"
'''
]

def generate_random_mcp_order(mcp_path='random_mcp_server.py', snippets=None):
    """
    Write a FastMCP server script whose tools appear in a random order.

    :param mcp_path: Where to write the generated server script.
    :param snippets: Optional list of tool source snippets (each containing one
        ``def <tool_name>(...)``). Defaults to the module-level ``tool_snippets``.
    :return: Tuple ``(absolute_path_to_script, tool_names)`` where ``tool_names`` lists
        the tool function names in the order they were written to the script.
    :raises ValueError: If a snippet contains no function definition.
    """
    import re  # local import: only this function needs it

    source_snippets = tool_snippets if snippets is None else snippets

    # Shuffle a copy so the caller's list (and the global default) keeps its order;
    # shuffling in place made the starting order depend on every previous call.
    shuffled = random.sample(list(source_snippets), k=len(source_snippets))

    # Take the name from the first real `def` line. Splitting on the bare substring
    # "def" would pick up words such as "default" inside a signature or docstring.
    tool_names = []
    for snippet in shuffled:
        match = re.search(r'^\s*(?:async\s+)?def\s+(\w+)\s*\(', snippet, flags=re.MULTILINE)
        if match is None:
            raise ValueError(f"No function definition found in tool snippet: {snippet[:60]!r}")
        tool_names.append(match.group(1))

    tool_definitions = ''.join(shuffled)

    # Create full server code. Only FastMCP is imported: the generated tools use nothing
    # else, and an unused third-party import would make the server fail to start on
    # machines where that package is not installed.
    MCP_SERVER = f'''
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("MCP Example")
{tool_definitions}

if __name__ == "__main__":
    mcp.run(transport="stdio")
    '''

    # Python reads source files as UTF-8, and the snippets contain non-ASCII text
    # (e.g. the degree sign), so do not rely on the platform's default encoding.
    with open(mcp_path, 'w', encoding='utf-8') as f:
        f.write(MCP_SERVER)

    return os.path.abspath(mcp_path), tool_names
# Generate one shuffled server up front so the output path and a sample tool order can be
# inspected below; tool_use_run() regenerates the file with a fresh order on every call.
random_mcp, random_tools = generate_random_mcp_order()

print(f"Saved to: {random_mcp}\nTool order: {random_tools}")

Saved to: /Users/sinanozdemir/Teaching/Pearson/applied-ai-book/agent_positional_bias/random_mcp_server.py
Tool order: ['crm_contact_tool', 'file_storage_tool', 'calendar_scheduling_tool', 'weather_forecast_tool', 'serp_tool', 'crypto_and_nft_tool', 'ebay_price_tool', 'google_spreadsheet_tool', 'python_repl_tool', 'database_query_tool', 'social_media_tool', 'firecrawl_tool', 'translation_tool', 'pdf_document_tool']


In [5]:
# Absolute path of the generated MCP server script.
random_mcp

'/Users/sinanozdemir/Teaching/Pearson/applied-ai-book/agent_positional_bias/random_mcp_server.py'

In [6]:
# Tool names in the order they were written to the generated server script.
random_tools

['crm_contact_tool',
 'file_storage_tool',
 'calendar_scheduling_tool',
 'weather_forecast_tool',
 'serp_tool',
 'crypto_and_nft_tool',
 'ebay_price_tool',
 'google_spreadsheet_tool',
 'python_repl_tool',
 'database_query_tool',
 'social_media_tool',
 'firecrawl_tool',
 'translation_tool',
 'pdf_document_tool']

In [7]:
import time
from langchain_core.messages.human import HumanMessage

def _extract_tool_calls(ai_message):
    """
    Collect ``(tool_name, arguments)`` pairs from a chat model response.

    Two response layouts are handled:
      * Anthropic-style: ``content`` is a list of parts and tool calls are the parts
        whose ``type`` is ``"tool_use"``; arguments come from ``input``.
      * OpenAI-style: tool calls are listed under ``additional_kwargs['tool_calls']``;
        arguments are passed through exactly as the provider returned them.

    :param ai_message: The message returned by the chat model.
    :return: List of ``(name, arguments)`` tuples in the order they were found.
    """
    tools_used = []

    content = ai_message.content
    if isinstance(content, list):
        for part in content:  # anthropic
            # Content parts are not guaranteed to be dicts; skip anything else
            # instead of failing on part['type'].
            if isinstance(part, dict) and part.get('type') == 'tool_use':
                tools_used.append((part['name'], part['input']))

    extra = getattr(ai_message, 'additional_kwargs', None) or {}
    for call in extra.get('tool_calls') or []:  # openai
        tools_used.append((call['function']['name'], call['function']['arguments']))

    return tools_used


async def tool_use_run(llm, initial_message):
    """
    Run one tool-selection trial against a freshly shuffled MCP server.

    A new server script is generated (with the tools in a new random order), started as
    a stdio subprocess, and its tools are bound to ``llm``. The model is then asked
    ``initial_message`` once; tools are never executed, only the model's choice is read.

    :param llm: A chat model exposing ``bind_tools``.
    :param initial_message: The user prompt to send.
    :return: Tuple ``(ai_message, tools_used, mcp_tool_order, seconds)`` where
        ``tools_used`` is a list of ``(tool_name, arguments)`` pairs, ``mcp_tool_order``
        is the tool order used for this trial and ``seconds`` is the model call duration.
    """
    import sys  # local import: only needed to locate the running interpreter

    random_mcp_path, mcp_tool_order = generate_random_mcp_order()
    # Launch the server with the interpreter running this notebook rather than whatever
    # "python" resolves to on PATH, which may be missing or lack the mcp package.
    server_params = StdioServerParameters(command=sys.executable, args=[random_mcp_path])

    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await load_mcp_tools(session)

            # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
            time_before = time.perf_counter()
            # Await the async API so the event loop (and the stdio session running on it)
            # is not blocked for the duration of the request.
            ai_message = await llm.bind_tools(tools).ainvoke([HumanMessage(content=initial_message)])
            time_after = time.perf_counter()

            tools_used = _extract_tool_calls(ai_message)
            return ai_message, tools_used, mcp_tool_order, time_after - time_before

In [8]:
# Accumulators used by the experiment loop: `results` collects one dict per completed
# run, `errors` collects one message string per failed run.
results = []
errors = []

In [9]:
import os
import pandas as pd

FILENAME = 'results_agent_positional_bias_reasoning.csv'

# Resume from an earlier session when a results file is already on disk; otherwise start
# from an empty list. Rows containing any missing value are discarded on load.
if not os.path.exists(FILENAME):
    results = []
else:
    results_df = pd.read_csv(FILENAME).dropna()
    results = results_df.to_dict(orient='records')
    print(f"Loaded {len(results)} previous results from {FILENAME}")

# message_llm_counts[prompt][experiment_name] -> number of runs already recorded,
# used by the experiment loop to decide how many more runs each pair still needs.
message_llm_counts = {}
for row in results:
    per_experiment = message_llm_counts.setdefault(row['initial_message'], {})
    experiment = row['experiment_name']
    per_experiment[experiment] = per_experiment.get(experiment, 0) + 1

print(len(message_llm_counts))

Loaded 10565 previous results from results_agent_positional_bias_reasoning.csv
75


In [10]:
# Registry of every experiment configuration, keyed by a human-readable experiment name.
# Each model family appears once per reasoning setting so the same prompts can be compared
# with reasoning switched on, off, or set to a given effort level.
llms = {
    # --- Qwen3 32B via Groq ---
    'reasoning qwen3 32b': ChatGroq(
        model="qwen/qwen3-32b",
        max_tokens=None,
        reasoning_format="parsed",
    ),
    'no reasoning qwen3 32b': ChatGroq(
        model="qwen/qwen3-32b",
        max_tokens=None,
        reasoning_effort="none",
    ),

    # --- Claude Sonnet 4 ---
    'reasoning anthropic claude sonnet 4': ChatAnthropic(
        model="claude-sonnet-4-0",
        max_tokens=1300,
        model_kwargs={"thinking": {"type": "enabled", "budget_tokens": 1024}},
    ),
    'no reasoning anthropic claude sonnet 4': ChatAnthropic(
        model="claude-sonnet-4-0",
        max_tokens=1300,
        model_kwargs={"thinking": {"type": "disabled"}},
    ),

    # --- Claude Opus 4.1 ---
    'reasoning anthropic claude opus 4.1': ChatAnthropic(
        model="claude-opus-4-1-20250805",
        max_tokens=1300,
        model_kwargs={"thinking": {"type": "enabled", "budget_tokens": 1024}},
    ),
    'no reasoning anthropic claude opus 4.1': ChatAnthropic(
        model="claude-opus-4-1-20250805",
        max_tokens=1300,
        model_kwargs={"thinking": {"type": "disabled"}},
    ),

    # --- gpt-oss 20B via Groq, three reasoning efforts ---
    'low reasoning gpt-oss-20b': ChatGroq(
        model="openai/gpt-oss-20b",
        reasoning_effort="low",
    ),
    'medium reasoning gpt-oss-20b': ChatGroq(
        model="openai/gpt-oss-20b",
        reasoning_effort="medium",
    ),
    'high reasoning gpt-oss-20b': ChatGroq(
        model="openai/gpt-oss-20b",
        reasoning_effort="high",
    ),

    # --- gpt-oss 120B via Groq, three reasoning efforts ---
    'low reasoning gpt-oss-120b': ChatGroq(
        model="openai/gpt-oss-120b",
        reasoning_effort="low",
    ),
    'medium reasoning gpt-oss-120b': ChatGroq(
        model="openai/gpt-oss-120b",
        reasoning_effort="medium",
    ),
    'high reasoning gpt-oss-120b': ChatGroq(
        model="openai/gpt-oss-120b",
        reasoning_effort="high",
    ),
}

In [11]:
# Smoke test: a single trial with one model, to inspect the raw response and the
# extracted tool call before launching the full sweep.
await tool_use_run(llms['low reasoning gpt-oss-120b'], 'what is 1242521423424234 * 2523345')

(AIMessage(content='', additional_kwargs={'reasoning_content': 'We need to compute multiplication. Use python tool.', 'tool_calls': [{'id': 'fc_d67da624-ef67-41f2-a1fb-259ef0317ec7', 'function': {'arguments': '{"command":"result = 1242521423424234 * 2523345\\nprint(result)"}', 'name': 'python_repl_tool'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 53, 'prompt_tokens': 1882, 'total_tokens': 1935, 'completion_time': 0.09839522, 'prompt_time': 0.070915918, 'queue_time': 0.063909344, 'total_time': 0.169311138}, 'model_name': 'openai/gpt-oss-120b', 'system_fingerprint': 'fp_e1a78f200e', 'service_tier': 'on_demand', 'reasoning_effort': 'low', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--d7839c70-d86c-4bd7-8030-25b59255ce8e-0', tool_calls=[{'name': 'python_repl_tool', 'args': {'command': 'result = 1242521423424234 * 2523345\nprint(result)'}, 'id': 'fc_d67da624-ef67-41f2-a1fb-259ef0317ec7', 'type': 'tool_call'}], usage_metadata={'input_tokens':

In [None]:
from tqdm import tqdm
from random import sample
import pandas as pd

n = len(tool_snippets)
import time

random.shuffle(tool_selection_test_data)

for initial_message, expected_tool in tqdm(tool_selection_test_data):
    for experiment_name, llm in llms.items():
        message_llm_count = message_llm_counts.get(initial_message, {}).get(experiment_name, 0)
        _times = n - message_llm_count
        if _times > 0:
            print(f"Running {initial_message} for {experiment_name} {_times} times")
        for _ in tqdm(range(_times)):
            try:
                response, tools_used, mcp_tool_order, time_taken = await tool_use_run(llm, initial_message)
                first_tool_used_by_agent = tools_used[0][0] if tools_used else None
                results.append(
                    {
                        'llm': llm.model_name if hasattr(llm, 'model_name') else llm.model,
                        'experiment_name': experiment_name,
                        'initial_message': initial_message,
                        'first_tool_used_by_agent': first_tool_used_by_agent,
                        'tools_used_by_agent': tools_used, 
                        'mcp_tool_order': mcp_tool_order, 
                        'time_taken': time_taken,
                        'expected_tool': expected_tool,
                        'correct_tool_index': mcp_tool_order.index(expected_tool),
                        'chosen_tool_index': mcp_tool_order.index(first_tool_used_by_agent) if first_tool_used_by_agent in mcp_tool_order else None
                    }
                )
            except Exception as e:
                errors.append(f'error: {e} on llm: {llm}')
            
            if len(results) % 2 == 0 and len(results) > 0:
                pd.DataFrame(results).dropna().to_csv(FILENAME, index=False)
                # print(pd.DataFrame(results)['chosen_tool_index'].mean(), pd.DataFrame(results)['correct_tool_index'].mean())

0it [00:00, ?it/s]/74 [00:00<?, ?it/s]
0it [00:00, ?it/s]


Running Create a new table called employee_reviews for reasoning anthropic claude sonnet 4 4 times




100%|██████████| 4/4 [00:33<00:00,  8.31s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Create a new table called employee_reviews for high reasoning gpt-oss-20b 13 times


100%|██████████| 13/13 [00:24<00:00,  1.85s/it]
0it [00:00, ?it/s]


Running Create a new table called employee_reviews for medium reasoning gpt-oss-120b 6 times


100%|██████████| 6/6 [00:12<00:00,  2.10s/it]


Running Create a new table called employee_reviews for high reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:51<00:00,  3.71s/it]
0it [00:00, ?it/s]/74 [02:01<2:28:20, 121.93s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Share the presentation.pptx file with read-only access for reasoning qwen3 32b 3 times


100%|██████████| 3/3 [00:08<00:00,  2.73s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Share the presentation.pptx file with read-only access for medium reasoning gpt-oss-120b 2 times


100%|██████████| 2/2 [00:03<00:00,  1.62s/it]


Running Share the presentation.pptx file with read-only access for high reasoning gpt-oss-120b 3 times


100%|██████████| 3/3 [00:08<00:00,  2.67s/it]
0it [00:00, ?it/s]/74 [02:21<45:56, 38.82s/it]   
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running List all files in the /projects/ folder for reasoning qwen3 32b 3 times


100%|██████████| 3/3 [00:06<00:00,  2.13s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
  7%|▋         | 5/74 [02:27<23:35, 20.51s/it]

Running Schedule a meeting for tomorrow at 2 PM for reasoning qwen3 32b 1 times


100%|██████████| 1/1 [00:02<00:00,  2.93s/it]
0it [00:00, ?it/s]


Running Schedule a meeting for tomorrow at 2 PM for reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:46<00:00,  7.60s/it]


Running Schedule a meeting for tomorrow at 2 PM for no reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:02<00:00,  4.49s/it]


Running Schedule a meeting for tomorrow at 2 PM for reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:53<00:00,  8.14s/it]


Running Schedule a meeting for tomorrow at 2 PM for no reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:16<00:00,  5.45s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Schedule a meeting for tomorrow at 2 PM for high reasoning gpt-oss-20b 2 times


100%|██████████| 2/2 [00:04<00:00,  2.17s/it]


Running Schedule a meeting for tomorrow at 2 PM for low reasoning gpt-oss-120b 2 times


100%|██████████| 2/2 [00:04<00:00,  2.00s/it]


Running Schedule a meeting for tomorrow at 2 PM for medium reasoning gpt-oss-120b 5 times


100%|██████████| 5/5 [00:10<00:00,  2.03s/it]


Running Schedule a meeting for tomorrow at 2 PM for high reasoning gpt-oss-120b 12 times


100%|██████████| 12/12 [00:28<00:00,  2.37s/it]
0it [00:00, ?it/s]/74 [09:17<2:21:39, 124.99s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Scrape the main headlines from https://news.ycombinator.com for high reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]0/74 [09:19<52:56, 49.64s/it]  
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Convert this French text to English: "Bonjour, comment allez-vous?" for medium reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.57s/it]


Running Convert this French text to English: "Bonjour, comment allez-vous?" for high reasoning gpt-oss-20b 5 times


100%|██████████| 5/5 [00:08<00:00,  1.72s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Convert this French text to English: "Bonjour, comment allez-vous?" for high reasoning gpt-oss-120b 5 times


100%|██████████| 5/5 [00:10<00:00,  2.06s/it]
0it [00:00, ?it/s]2/74 [09:39<39:00, 37.75s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Calculate 15% of 34543.453 for low reasoning gpt-oss-20b 10 times


100%|██████████| 10/10 [00:14<00:00,  1.47s/it]


Running Calculate 15% of 34543.453 for medium reasoning gpt-oss-20b 13 times


100%|██████████| 13/13 [00:20<00:00,  1.58s/it]


Running Calculate 15% of 34543.453 for high reasoning gpt-oss-20b 13 times


100%|██████████| 13/13 [00:24<00:00,  1.90s/it]


Running Calculate 15% of 34543.453 for low reasoning gpt-oss-120b 9 times


100%|██████████| 9/9 [00:25<00:00,  2.88s/it]


Running Calculate 15% of 34543.453 for medium reasoning gpt-oss-120b 2 times


100%|██████████| 2/2 [00:03<00:00,  1.58s/it]


Running Calculate 15% of 34543.453 for high reasoning gpt-oss-120b 3 times


100%|██████████| 3/3 [00:06<00:00,  2.06s/it]
0it [00:00, ?it/s]4/74 [11:15<40:43, 40.73s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Who won the latest Super Bowl? for low reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.65s/it]


Running Who won the latest Super Bowl? for medium reasoning gpt-oss-20b 9 times


100%|██████████| 9/9 [00:15<00:00,  1.72s/it]


Running Who won the latest Super Bowl? for high reasoning gpt-oss-20b 6 times


100%|██████████| 6/6 [00:10<00:00,  1.83s/it]
0it [00:00, ?it/s]


Running Who won the latest Super Bowl? for medium reasoning gpt-oss-120b 5 times


100%|██████████| 5/5 [00:11<00:00,  2.29s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]6/74 [11:54<33:16, 34.43s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running What are the top restaurants in New York City? for medium reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:02<00:00,  2.59s/it]


Running What are the top restaurants in New York City? for high reasoning gpt-oss-20b 4 times


100%|██████████| 4/4 [00:07<00:00,  1.78s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running What are the top restaurants in New York City? for high reasoning gpt-oss-120b 5 times


100%|██████████| 5/5 [00:21<00:00,  4.25s/it]
0it [00:00, ?it/s]7/74 [12:25<32:07, 33.82s/it]
0it [00:00, ?it/s]


Running Show me my calendar for next week for reasoning anthropic claude sonnet 4 2 times


100%|██████████| 2/2 [00:19<00:00,  9.54s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]8/74 [12:44<28:47, 30.85s/it]
0it [00:00, ?it/s]


Running Upload my resume.pdf to the cloud storage for reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:28<00:00,  6.34s/it]


Running Upload my resume.pdf to the cloud storage for no reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:41<00:00,  7.27s/it]


Running Upload my resume.pdf to the cloud storage for reasoning anthropic claude opus 4.1 12 times


100%|██████████| 12/12 [01:41<00:00,  8.46s/it]


Running Upload my resume.pdf to the cloud storage for no reasoning anthropic claude opus 4.1 11 times


100%|██████████| 11/11 [01:07<00:00,  6.17s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Upload my resume.pdf to the cloud storage for high reasoning gpt-oss-20b 4 times


100%|██████████| 4/4 [00:07<00:00,  1.90s/it]


Running Upload my resume.pdf to the cloud storage for low reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:22<00:00,  1.58s/it]


Running Upload my resume.pdf to the cloud storage for medium reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:25<00:00,  1.82s/it]


Running Upload my resume.pdf to the cloud storage for high reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:33<00:00,  2.39s/it]
0it [00:00, ?it/s]9/74 [20:13<1:53:57, 124.32s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Translate the German phrase "Wie geht es dir?" to Italian for low reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.65s/it]


Running Translate the German phrase "Wie geht es dir?" to Italian for medium reasoning gpt-oss-20b 8 times


100%|██████████| 8/8 [00:12<00:00,  1.51s/it]


Running Translate the German phrase "Wie geht es dir?" to Italian for high reasoning gpt-oss-20b 13 times


100%|██████████| 13/13 [00:22<00:00,  1.77s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Translate the German phrase "Wie geht es dir?" to Italian for high reasoning gpt-oss-120b 9 times


100%|██████████| 9/9 [00:17<00:00,  1.94s/it]
 28%|██▊       | 21/74 [21:07<1:16:18, 86.39s/it] 

Running Search for contacts at Google company for reasoning qwen3 32b 2 times


100%|██████████| 2/2 [00:04<00:00,  2.35s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]2/74 [21:12<59:27, 68.60s/it]  
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Delete the contact with ID 456 for reasoning qwen3 32b 3 times


100%|██████████| 3/3 [00:06<00:00,  2.16s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
 32%|███▏      | 24/74 [21:18<36:16, 43.53s/it]

Running Check the calculated sum of the first column for reasoning qwen3 32b 1 times


100%|██████████| 1/1 [00:03<00:00,  3.65s/it]


Running Check the calculated sum of the first column for no reasoning qwen3 32b 7 times


100%|██████████| 7/7 [00:11<00:00,  1.62s/it]


Running Check the calculated sum of the first column for reasoning anthropic claude sonnet 4 7 times


100%|██████████| 7/7 [00:43<00:00,  6.18s/it]


Running Check the calculated sum of the first column for no reasoning anthropic claude sonnet 4 3 times


100%|██████████| 3/3 [00:11<00:00,  3.98s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Check the calculated sum of the first column for high reasoning gpt-oss-20b 6 times


100%|██████████| 6/6 [00:15<00:00,  2.62s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Check the calculated sum of the first column for high reasoning gpt-oss-120b 7 times


100%|██████████| 7/7 [00:18<00:00,  2.68s/it]
0it [00:00, ?it/s]5/74 [23:03<46:17, 56.68s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Merge these 3 PDF files into one document for reasoning qwen3 32b 3 times


100%|██████████| 3/3 [00:14<00:00,  4.73s/it]
0it [00:00, ?it/s]


Running Merge these 3 PDF files into one document for reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:26<00:00,  6.16s/it]


Running Merge these 3 PDF files into one document for no reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [00:59<00:00,  4.28s/it]


Running Merge these 3 PDF files into one document for reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [02:12<00:00,  9.43s/it]


Running Merge these 3 PDF files into one document for no reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:19<00:00,  5.68s/it]
0it [00:00, ?it/s]


Running Merge these 3 PDF files into one document for medium reasoning gpt-oss-20b 9 times


100%|██████████| 9/9 [00:14<00:00,  1.56s/it]


Running Merge these 3 PDF files into one document for high reasoning gpt-oss-20b 14 times


100%|██████████| 14/14 [00:24<00:00,  1.75s/it]


Running Merge these 3 PDF files into one document for low reasoning gpt-oss-120b 10 times


100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


Running Merge these 3 PDF files into one document for medium reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:25<00:00,  1.82s/it]


Running Merge these 3 PDF files into one document for high reasoning gpt-oss-120b 13 times


100%|██████████| 13/13 [00:28<00:00,  2.18s/it]
0it [00:00, ?it/s]7/74 [31:04<1:39:08, 126.57s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Convert 42 kilometers to miles using a function in Python for high reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:03<00:00,  3.02s/it]


Running Convert 42 kilometers to miles using a function in Python for low reasoning gpt-oss-120b 5 times


100%|██████████| 5/5 [00:08<00:00,  1.72s/it]


Running Convert 42 kilometers to miles using a function in Python for medium reasoning gpt-oss-120b 1 times


100%|██████████| 1/1 [00:02<00:00,  2.17s/it]


Running Convert 42 kilometers to miles using a function in Python for high reasoning gpt-oss-120b 6 times


100%|██████████| 6/6 [00:14<00:00,  2.40s/it]
0it [00:00, ?it/s]8/74 [31:32<1:20:55, 105.55s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Convert 98 degrees Fahrenheit to Celsius using Python for medium reasoning gpt-oss-120b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.99s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]9/74 [31:34<1:00:59, 81.33s/it] 
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Add a new row and just write "To do" in it for no reasoning anthropic claude sonnet 4 12 times


100%|██████████| 12/12 [00:53<00:00,  4.47s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Add a new row and just write "To do" in it for high reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.54s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]2/74 [32:29<34:52, 49.83s/it]  
0it [00:00, ?it/s]


Running Post a photo to Instagram with hashtags #sunset #nature for reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:38<00:00,  7.06s/it]


Running Post a photo to Instagram with hashtags #sunset #nature for no reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:04<00:00,  4.62s/it]


Running Post a photo to Instagram with hashtags #sunset #nature for reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [02:09<00:00,  9.27s/it]


Running Post a photo to Instagram with hashtags #sunset #nature for no reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:13<00:00,  5.27s/it]
0it [00:00, ?it/s]


Running Post a photo to Instagram with hashtags #sunset #nature for medium reasoning gpt-oss-20b 4 times


100%|██████████| 4/4 [00:06<00:00,  1.60s/it]


Running Post a photo to Instagram with hashtags #sunset #nature for high reasoning gpt-oss-20b 11 times


100%|██████████| 11/11 [00:21<00:00,  1.94s/it]


Running Post a photo to Instagram with hashtags #sunset #nature for low reasoning gpt-oss-120b 2 times


100%|██████████| 2/2 [00:03<00:00,  1.58s/it]


Running Post a photo to Instagram with hashtags #sunset #nature for medium reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:26<00:00,  1.90s/it]


Running Post a photo to Instagram with hashtags #sunset #nature for high reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:37<00:00,  2.70s/it]
0it [00:00, ?it/s]3/74 [40:12<1:28:19, 129.25s/it]
0it [00:00, ?it/s]


Running Download the project files from /work/documents/ for reasoning anthropic claude sonnet 4 11 times


100%|██████████| 11/11 [01:19<00:00,  7.20s/it]
0it [00:00, ?it/s]


Running Download the project files from /work/documents/ for reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:57<00:00,  8.43s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Download the project files from /work/documents/ for high reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:02<00:00,  2.75s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]4/74 [43:31<1:36:19, 144.49s/it]
0it [00:00, ?it/s]


Running Find available time slots for a 2-hour meeting this Friday for reasoning anthropic claude sonnet 4 3 times


Process group termination failed for PID 89522: [Errno 1] Operation not permitted, falling back to simple terminate
100%|██████████| 3/3 [00:24<00:00,  8.21s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
 47%|████▋     | 35/74 [43:56<1:15:35, 116.29s/it]

Running Post "Just finished a great workout!" to Twitter for reasoning qwen3 32b 1 times


100%|██████████| 1/1 [00:03<00:00,  3.43s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]6/74 [44:00<55:39, 87.88s/it]   
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Calculate the factorial of 10 using Python for high reasoning gpt-oss-20b 2 times


100%|██████████| 2/2 [00:04<00:00,  2.49s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
 50%|█████     | 37/74 [44:05<40:40, 65.96s/it]

Running Get the weather forecast for my location for the next 3 days for reasoning qwen3 32b 14 times


100%|██████████| 14/14 [00:34<00:00,  2.44s/it]
0it [00:00, ?it/s]


Running Get the weather forecast for my location for the next 3 days for reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:21<00:00,  5.85s/it]


Running Get the weather forecast for my location for the next 3 days for no reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [00:57<00:00,  4.12s/it]


Running Get the weather forecast for my location for the next 3 days for reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:26<00:00,  6.18s/it]


Running Get the weather forecast for my location for the next 3 days for no reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:03<00:00,  4.57s/it]


Running Get the weather forecast for my location for the next 3 days for low reasoning gpt-oss-20b 5 times


100%|██████████| 5/5 [00:07<00:00,  1.51s/it]


Running Get the weather forecast for my location for the next 3 days for medium reasoning gpt-oss-20b 11 times


100%|██████████| 11/11 [00:18<00:00,  1.68s/it]


Running Get the weather forecast for my location for the next 3 days for high reasoning gpt-oss-20b 14 times


100%|██████████| 14/14 [00:23<00:00,  1.69s/it]


Running Get the weather forecast for my location for the next 3 days for low reasoning gpt-oss-120b 12 times


100%|██████████| 12/12 [00:22<00:00,  1.88s/it]


Running Get the weather forecast for my location for the next 3 days for medium reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:25<00:00,  1.83s/it]


Running Get the weather forecast for my location for the next 3 days for high reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:29<00:00,  2.12s/it]
0it [00:00, ?it/s]8/74 [51:36<1:43:03, 171.76s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Convert this PDF to a Word document for reasoning qwen3 32b 6 times


100%|██████████| 6/6 [00:13<00:00,  2.24s/it]


Running Convert this PDF to a Word document for no reasoning qwen3 32b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


Running Convert this PDF to a Word document for reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [01:39<00:00,  7.12s/it]


Running Convert this PDF to a Word document for no reasoning anthropic claude sonnet 4 14 times


100%|██████████| 14/14 [00:49<00:00,  3.54s/it]


Running Convert this PDF to a Word document for reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:39<00:00,  7.07s/it]


Running Convert this PDF to a Word document for no reasoning anthropic claude opus 4.1 14 times


100%|██████████| 14/14 [01:21<00:00,  5.84s/it]
0it [00:00, ?it/s]


Running Convert this PDF to a Word document for medium reasoning gpt-oss-20b 7 times


100%|██████████| 7/7 [00:13<00:00,  1.89s/it]


Running Convert this PDF to a Word document for high reasoning gpt-oss-20b 12 times


100%|██████████| 12/12 [00:22<00:00,  1.88s/it]


Running Convert this PDF to a Word document for low reasoning gpt-oss-120b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.53s/it]


Running Convert this PDF to a Word document for medium reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:24<00:00,  1.77s/it]


Running Convert this PDF to a Word document for high reasoning gpt-oss-120b 13 times


100%|██████████| 13/13 [00:30<00:00,  2.38s/it]
0it [00:00, ?it/s]1/74 [58:54<1:26:47, 157.81s/it]


Running What are the current gas prices in Chicago? for no reasoning qwen3 32b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.71s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]2/74 [58:56<1:07:07, 125.87s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Generate a random number between 1 and 1000 for low reasoning gpt-oss-20b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.73s/it]


Running Generate a random number between 1 and 1000 for medium reasoning gpt-oss-20b 2 times


100%|██████████| 2/2 [00:02<00:00,  1.45s/it]


Running Generate a random number between 1 and 1000 for high reasoning gpt-oss-20b 8 times


100%|██████████| 8/8 [00:12<00:00,  1.51s/it]


Running Generate a random number between 1 and 1000 for low reasoning gpt-oss-120b 2 times


100%|██████████| 2/2 [00:04<00:00,  2.04s/it]


Running Generate a random number between 1 and 1000 for medium reasoning gpt-oss-120b 4 times


100%|██████████| 4/4 [00:08<00:00,  2.13s/it]
0it [00:00, ?it/s]
 58%|█████▊    | 43/74 [59:25<53:44, 104.03s/it]  

Running Delete the event with ID 789 for reasoning qwen3 32b 2 times


100%|██████████| 2/2 [00:03<00:00,  1.93s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]4/74 [59:29<39:47, 79.57s/it] 
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running List the top 5 popular programming languages in 2024 for medium reasoning gpt-oss-20b 3 times


100%|██████████| 3/3 [00:05<00:00,  1.77s/it]


Running List the top 5 popular programming languages in 2024 for high reasoning gpt-oss-20b 6 times


100%|██████████| 6/6 [00:11<00:00,  1.86s/it]


Running List the top 5 popular programming languages in 2024 for low reasoning gpt-oss-120b 1 times


100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
0it [00:00, ?it/s]


Running List the top 5 popular programming languages in 2024 for high reasoning gpt-oss-120b 1 times


100%|██████████| 1/1 [00:02<00:00,  2.62s/it]
0it [00:00, ?it/s]5/74 [59:50<31:04, 64.31s/it]
0it [00:00, ?it/s]


Running Delete all orders older than 2 years from the orders table for reasoning anthropic claude sonnet 4 1 times


100%|██████████| 1/1 [00:08<00:00,  8.27s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Delete all orders older than 2 years from the orders table for high reasoning gpt-oss-20b 4 times


100%|██████████| 4/4 [00:08<00:00,  2.08s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Delete all orders older than 2 years from the orders table for high reasoning gpt-oss-120b 14 times


100%|██████████| 14/14 [00:44<00:00,  3.17s/it]
0it [00:00, ?it/s]6/74 [1:00:51<29:35, 63.42s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Running Translate this business email to Portuguese for reasoning qwen3 32b 5 times




In [None]:
# Count the entries accumulated in `errors`.
# NOTE(review): presumably these are runs that raised during the experiment loop —
# confirm against the cell that populates `errors`.
len(errors)

31

In [None]:
# Inspect the most recently appended result record to sanity-check its fields.
# Tool-related fields (e.g. `chosen_tool_index`) can be None when no tool was used.
results[-1]

{'llm': 'openai/gpt-oss-120b',
 'experiment_name': 'high reasoning gpt-oss-120b',
 'initial_message': 'Update the price of product ID 123 in the products table',
 'first_tool_used_by_agent': None,
 'tools_used_by_agent': [],
 'mcp_tool_order': ['weather_forecast_tool',
  'ebay_price_tool',
  'firecrawl_tool',
  'serp_tool',
  'google_spreadsheet_tool',
  'database_query_tool',
  'social_media_tool',
  'crypto_and_nft_tool',
  'calendar_scheduling_tool',
  'translation_tool',
  'file_storage_tool',
  'python_repl_tool',
  'pdf_document_tool',
  'crm_contact_tool'],
 'time_taken': 0.6367292404174805,
 'expected_tool': 'database_query_tool',
 'correct_tool_index': 5,
 'chosen_tool_index': None}

In [None]:
# Build a table from the per-run result dicts and keep only complete records.
# `dropna()` removes every row that has a missing value in ANY column, so runs in
# which the agent called no tool (`first_tool_used_by_agent` / `chosen_tool_index`
# are None) are excluded from the saved CSV and from all downstream analysis.
results_df = pd.DataFrame(results).dropna()

# Report how many records were discarded so the exclusion is visible, not silent.
print(f"Dropped {len(results) - len(results_df)} of {len(results)} records with missing values")

# Persist the cleaned table; the positional index carries no information, so omit it.
results_df.to_csv(FILENAME, index=False)
print(results_df.shape)

# Preview a few random rows. Capping at the row count avoids the ValueError that
# `sample(5)` raises when fewer than 5 complete records exist.
results_df.sample(min(5, len(results_df)))

(10565, 10)


Unnamed: 0,llm,experiment_name,initial_message,first_tool_used_by_agent,tools_used_by_agent,mcp_tool_order,expected_tool,correct_tool_index,chosen_tool_index,time_taken
5760,claude-sonnet-4-0,reasoning anthropic claude sonnet 4,Check eBay prices for refurbished iPad Air,ebay_price_tool,"[('ebay_price_tool', {'item_name': 'iPad Air',...","['translation_tool', 'firecrawl_tool', 'pdf_do...",ebay_price_tool,4,4.0,3.107107
7235,claude-sonnet-4-0,reasoning anthropic claude sonnet 4,Go to https://en.wikipedia.org/wiki/Python_(pr...,firecrawl_tool,"[('firecrawl_tool', {'website_url': 'https://e...","['translation_tool', 'weather_forecast_tool', ...",firecrawl_tool,2,2.0,2.642661
7717,qwen/qwen3-32b,no reasoning qwen3 32b,"Convert ""Thank you very much"" to Japanese",translation_tool,"[('translation_tool', '{""target_language"":""ja""...","['crypto_and_nft_tool', 'python_repl_tool', 'c...",translation_tool,8,8.0,0.361622
957,qwen/qwen3-32b,no reasoning qwen3 32b,What is the price of bitcoin right now?,crypto_and_nft_tool,"[('crypto_and_nft_tool', '{""query"":""bitcoin""}')]","['crm_contact_tool', 'google_spreadsheet_tool'...",crypto_and_nft_tool,2,2.0,0.497884
7629,openai/gpt-oss-20b,medium reasoning gpt-oss-20b,Visit https://openai.com and summarize the hom...,firecrawl_tool,"[('firecrawl_tool', '{""website_url"":""https://o...","['social_media_tool', 'python_repl_tool', 'eba...",firecrawl_tool,12,12.0,0.812574
