### Imports

In [1]:
# Misc imports
import os
import json
import pandas as pd
import csv
from tqdm import tqdm

# Import the Minions and Minion protocols
from minions.minions import Minions
from minions.minion import Minion

# Import Minion Clients
from minions.clients.ollama import OllamaClient
from minions.clients.tokasaurus import TokasaurusClient
from minions.clients.openai import OpenAIClient
from minions.clients.anthropic import AnthropicClient
from minions.clients.together import TogetherClient

# Import Pydantic
from pydantic import BaseModel

### Configure Clients


1. Specify structured output schema for the OllamaClient. Run this block as is! Do _NOT_ make any modifications


In [2]:
# Pydantic model passed to OllamaClient as `structured_output_schema` in the
# client-configuration cell.
# NOTE(review): documented with `#` comments rather than a docstring on purpose;
# presumably a class docstring would be picked up as the schema description sent
# to the local model. Confirm before adding one.
class StructuredLocalOutput(BaseModel):
    explanation: str  # plain string field
    citation: str | None  # string or None; no default value is declared
    answer: str | None  # string or None; no default value is declared

2. Instantiate clients: here we use Ollama as the local client and OpenAI as the remote client.


In [3]:
# Remote client: OpenAI, temperature 0.
remote_client = OpenAIClient(
    model_name="gpt-4o",
    temperature=0.0,
)

# Local client, option 1: Ollama, constrained to the StructuredLocalOutput schema.
local_client = OllamaClient(
    structured_output_schema=StructuredLocalOutput,
    model_name="qwen2.5-coder",
    temperature=0.0,
)

# Local client, option 2: Tokasaurus (uncomment to use instead of Ollama).
# Structured outputs are not yet supported with tokasaurus, so no schema is passed.
# local_client = TokasaurusClient(
#     model_name="meta-llama/Llama-3.2-1B-Instruct",
#     temperature=0.0,
#     # structured_output_schema=StructuredLocalOutput
# )



### Set-up Communication Protocol


In [4]:
# Choose the communication protocol built on the two clients above.
# To use Minions instead of Minion, swap in the commented-out line:
# protocol = Minions(local_client=local_client, remote_client=remote_client)
protocol = Minion(
    local_client=local_client,
    remote_client=remote_client,
)
    

### Configure Minion/Minions Task

- context (List[str]): context that the minions need to reason over
- doc_metadata (str): every task is parameterized by doc_metadata which describes the "type" of information contained in the context
- task (str): description of the query to be completed over the context


#### Specify swebench task


In [5]:
# Define the document metadata
doc_metadata = "python code"

# Plain string: the prompt has no placeholders, so no f-prefix is needed.
task = """\
Identify all functions that contain any errors in the code. The errors might be logical or there might be multiple errors within one function.
Your final answer should start with a list of all functions containing errors. This should then be followed by proposed code fixes (respond with code) for each erroneous function."""

# Number of data rows (after the header) to skip before processing starts.
ROWS_TO_SKIP = 15

results = []

# Each finished row is written and flushed immediately, so interrupting this
# long-running cell (or a failing protocol call) keeps every completed row on
# disk instead of discarding the whole run.
# NOTE: 'minion_results.csv' is truncated as soon as this cell starts.
# newline='' on both files follows the csv module's guidance; the fields hold
# multi-line source code. The output is written as UTF-8 because the
# evaluation cell reads it back with pd.read_csv, whose default is UTF-8.
with open('swebench_data.csv', 'r', newline='') as infile, \
        open('minion_results.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    writer.writerow(['File', 'Buggy Code', 'Errors', 'Buggy Functions', 'Model Analysis', 'Supervisor', 'Worker'])
    outfile.flush()

    next(reader, None)  # Skip header row
    # Skip the first ROWS_TO_SKIP data rows; the None default avoids a bare
    # StopIteration if the file is shorter than expected.
    for _ in range(ROWS_TO_SKIP):
        next(reader, None)

    for row in tqdm(reader):
        code_context = [row[1]]  # row[1] contains the modified/buggy code

        output = protocol(
            task=task,
            doc_metadata=doc_metadata,
            context=code_context,
            max_rounds=5,
        )

        # Input columns row[0]..row[3] are carried through unchanged, followed
        # by the protocol's final answer and both message transcripts.
        result_row = [
            row[0],
            row[1],
            row[2],
            row[3],
            output['final_answer'],
            output['supervisor_messages'],
            output['worker_messages'],
        ]
        results.append(result_row)
        writer.writerow(result_row)
        outfile.flush()

0it [00:00, ?it/s]

entering loop with max_rounds:  5
getting worker's response


1it [00:09,  9.26s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "Based on the small language model's analysis, no errors were found in the functions provided in the Python script for the matplotlib plotting library. The code appears to be well-structured and follows best practices for Python programming. Therefore, no code fixes are necessary."}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and suggest fixes."}
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and suggest fixes. Without the actual code, it is not possible to perform the task."}
getting worker's 

2it [00:24, 12.67s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed because the actual implementations of the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' are not provided. Without the code, it is impossible to identify any errors or suggest fixes."}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code implementations for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response


3it [00:37, 13.07s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed because the implementations of the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' are not provided. Without the actual code, it is impossible to identify any errors or propose fixes."}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and suggest fixes."}
getting worker's response


4it [00:47, 11.67s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed because the definitions of the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' are not provided. Without the actual code, it is impossible to identify any errors or suggest fixes for these functions."}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and suggest fixes."}
getting worker's response


5it [00:57, 11.23s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed as the necessary code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' is not provided. Without the implementations of these functions, it is impossible to identify any errors or suggest fixes. The provided code snippet only includes a module related to Motzkin numbers, which does not contain the mentioned functions."}
entering loop with max_rounds:  5
getting worker's response


6it [01:03,  9.44s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': 'No functions contain errors in the provided code snippet.'}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response


7it [01:15, 10.22s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed because the actual code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' was not provided. Without the code, it is impossible to identify errors or propose fixes."}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response


8it [01:27, 10.81s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed as the actual code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' is not provided. Without the code, it is impossible to identify errors or propose fixes. Please provide the complete code for these functions to proceed with the analysis."}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the full code implementations for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response


9it [01:41, 11.81s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed because the code implementations for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' are not provided. Without the actual code, it is impossible to identify any errors or propose fixes."}
entering loop with max_rounds:  5
getting worker's response


10it [01:46,  9.50s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': 'No functions with errors were identified in the provided code. The code appears to be well-structured and free of logical or syntactical errors.'}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response


11it [01:58, 10.55s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed as the actual implementations of the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' are not provided. Without the code, it is impossible to identify errors or propose fixes."}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response


12it [02:09, 10.68s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed because the definitions of the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' are not provided. Without the actual code, it is impossible to identify any errors or propose fixes. Additional information or the complete code is required to proceed with the task."}
entering loop with max_rounds:  5
getting worker's response


13it [02:14,  8.73s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': 'No functions contain errors.'}
entering loop with max_rounds:  5
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and suggest fixes."}
getting worker's response
Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and suggest fixes. Without the actual code, it is not possible to determine the presence of errors or propose corrections."}
getting worker's response


14it [02:30, 10.91s/it]

Supervisor_json: {'decision': 'provide_final_answer', 'answer': "The task cannot be completed because the actual code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' has not been provided. Without the code, it is impossible to identify any errors or suggest fixes. Please provide the complete code for these functions to proceed with the analysis."}
entering loop with max_rounds:  5
getting worker's response


14it [02:35, 11.12s/it]

Supervisor_json: {'decision': 'request_additional_info', 'message': "Please provide the complete code for the functions 'load_data', 'clean_data', 'analyze_data', and 'visualize_results' so that I can identify any errors and propose fixes."}
getting worker's response





KeyboardInterrupt: 

In [6]:
# Debug aid: `output` is whatever the last protocol(...) call in the loop cell
# above assigned, so this cell only works after that cell has run.
print(output)

{'final_answer': 'No functions contain errors in the provided code snippet. Therefore, no code fixes are necessary.', 'supervisor_messages': [{'role': 'user', 'content': 'We need to perform the following task.\n\n### Task\nIdentify all functions that contain any errors in the code. The errors might be logical or there might be multiple errors within one function.\nYour final answer should start with a list of all functions containing errors. This should then be followed by proposed code fixes (respond with code) for each erroneous function.\n\n### Instructions\nYou will not have direct access to the context, but can chat with a small language model which has read the entire thing.\n\nFeel free to think step-by-step, but eventually you must provide an output in the format below:\n\n<think step by step here>\n```json\n{\n    "message": "<your message to the small language model. If you are asking model to do a task, make sure it is a single task!>"\n}\n```\n'}, {'role': 'assistant', 'con

### Evaluate Performance


In [9]:
import os

# The OpenAI client used below reads its key from the OPENAI_API_KEY
# environment variable. Check it here instead of re-assigning it: formatting
# os.getenv(...) into a string turns a missing key into the literal text
# "None", which then fails later as an authentication error rather than as a
# clear "key not set" error.
if os.environ.get("OPENAI_API_KEY") in (None, "", "None"):
    # Also clears a stale "None" left in this kernel by the old version of this cell.
    os.environ.pop("OPENAI_API_KEY", None)
    print("WARNING: OPENAI_API_KEY is not set; export it before running the evaluation cells.")

# MODEL_NAME="gpt-4o"
# MODEL_NAME="o3-mini"

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import openai

def process_row(ground_truth, minions_text, model="o3-mini"):
    """
    Ask an OpenAI chat model to score one row of error-detection results.

    Args:
        ground_truth: Ground-truth list of functions containing errors; it is
            interpolated into the prompt as-is.
        minions_text: The model's analysis for the same file; it is
            interpolated into the prompt as-is.
        model: Name of the chat model used for grading. Defaults to "o3-mini".

    Returns:
        str: The model's reply with surrounding whitespace removed. The prompt
        asks for a list "[a, b, c, d, e]" of (true positives, true negatives,
        false positives, false negatives, total number of ground truth
        functions), but the reply is NOT parsed or validated here. An empty
        string is returned if the reply carries no text content.
    """

    prompt = f"""Here is the ground truth list of functions containing errors within a codebase:
    {ground_truth}.

    You are also provided a list of the functions identified by the model:
    {minions_text}
    
    Please return a list of 5 integers in the format [a, b, c, d, e] corresponding to:
    (true positives, true negatives, false positives, false negatives, total number of ground truth functions). It is extremely important that only this list is outputted and no other text.
    """

    client = openai.OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that helps evaluate error detection in code."},
            {"role": "user", "content": prompt}
        ]
    )

    # The content may be None, and replies often carry stray whitespace or a
    # trailing newline that breaks the downstream "[a,b,c,d,e]" parsing.
    response_text = response.choices[0].message.content or ""

    return response_text.strip()

def tabulate_results(df, save_file, max_workers=5):
    """
    Evaluate every row of `df` in parallel with process_row and save the results.

    Args:
        df: DataFrame with 'File', 'Buggy Functions' and 'Model Analysis' columns.
        save_file: Path of the CSV to write; it gets the columns 'File' and
            'Evaluation'.
        max_workers: Maximum number of worker threads.

    Rows whose evaluation raises are reported on stdout and left out of the
    saved file. Saved rows keep the order they have in `df`, regardless of the
    order in which the threads finish.
    """
    results_by_position = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to (position, index label, file name). Capturing the
        # file name here avoids looking the row up again later; the index label
        # from iterrows() is not a valid positional index for df.iloc unless
        # the frame happens to have a default RangeIndex.
        futures = {
            executor.submit(process_row, row['Buggy Functions'], row['Model Analysis']): (position, row_idx, row['File'])
            for position, (row_idx, row) in enumerate(df.iterrows())
        }

        # Process results as they complete
        for future in tqdm(as_completed(futures), total=len(futures)):
            position, row_idx, file_name = futures[future]
            try:
                results_by_position[position] = (file_name, future.result())
            except Exception as e:
                print(f"Error processing row {row_idx}: {e}")

    # Restore the original row order so repeated runs produce comparable files.
    results = [results_by_position[position] for position in sorted(results_by_position)]

    # Save results
    results_df = pd.DataFrame(results, columns=['File', 'Evaluation'])
    results_df.to_csv(save_file, index=False)

    print("Completed")


In [None]:
# Load the saved results and evaluate every row with 8 worker threads;
# the evaluations are written to 'minion_evaluations_og.csv'.
df = pd.read_csv('minion_results.csv')
tabulate_results(df=df, save_file='minion_evaluations_og.csv', max_workers=8)

In [None]:
# Read evaluations CSV
eval_df = pd.read_csv('minion_evaluations_og.csv')

metric_columns = ['true_positives', 'true_negatives', 'false_positives', 'false_negatives', 'total_errors']

# Turn each "[a, b, c, d, e]" string into five columns. Surrounding whitespace
# is stripped first: a value such as "[a,b,c,d,e] " would otherwise keep its
# closing bracket (strip('[]') stops at the trailing space) and fail the
# integer conversion.
parsed_metrics = (
    eval_df['Evaluation']
    .str.strip()
    .str.strip('[]')
    .str.split(',', expand=True)
)
if parsed_metrics.shape[1] != len(metric_columns):
    raise ValueError(
        f"Expected {len(metric_columns)} values per evaluation, "
        f"found up to {parsed_metrics.shape[1]}; inspect the 'Evaluation' column."
    )
eval_df[metric_columns] = parsed_metrics.astype(int)

# Drop original Evaluation column
eval_df = eval_df.drop('Evaluation', axis=1)

# Save updated dataframe
eval_df.to_csv('minion_evaluations-gpt4o.csv', index=False)

print("Split evaluation column into separate metrics columns")
eval_df.head()


In [None]:
# Micro-averaged scores: sum the per-file counts once, then derive
# precision, recall and F1 from the totals.
tp_total = eval_df['true_positives'].sum()
fp_total = eval_df['false_positives'].sum()
fn_total = eval_df['false_negatives'].sum()

precision = tp_total / (tp_total + fp_total)
recall = tp_total / (tp_total + fn_total)
f1_score = 2 * (precision * recall) / (precision + recall)

print("Minions Scores with gpt-4o (num rounds=5)")
print("\n".join([
    f"Precision: {precision:.3f}",
    f"Recall: {recall:.3f}",
    f"F1 Score: {f1_score:.3f}",
]))