In [1]:
# %pip install "langfuse<3" wikipedia openai requests  # langfuse.decorators is the v2 SDK API; requests is needed by Step 10

In [2]:
# Step 1: Load Configuration and Dependencies

import json

# Secrets (API keys, Langfuse credentials) are read from config.json
# and kept in `config`.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Non-sensitive run settings used by the later steps of the notebook.
model_name = "gpt-4.1-2025-04-14"
temperature = 0.0
verbose = True
use_langfuse = True
username = "Shreyashgupta5"
code_link = "https://huggingface.co/spaces/Shreyashgupta5/ai_agents_course"
api_base_url = "https://agents-course-unit4-scoring.hf.space"

# Echo the settings (never the secrets) so the run configuration is visible.
notebook_settings = {
    "model_name": model_name,
    "temperature": temperature,
    "verbose": verbose,
    "use_langfuse": use_langfuse,
    "username": username,
    "code_link": code_link,
    "api_base_url": api_base_url,
}
print("Config loaded. Sensitive keys available for use.")
print("Notebook variables set:")
for setting_name, setting_value in notebook_settings.items():
    print(f"  {setting_name}: {setting_value}")

Config loaded. Sensitive keys available for use.
Notebook variables set:
  model_name: gpt-4.1-2025-04-14
  temperature: 0.0
  verbose: True
  use_langfuse: True
  username: Shreyashgupta5
  code_link: https://huggingface.co/spaces/Shreyashgupta5/ai_agents_course
  api_base_url: https://agents-course-unit4-scoring.hf.space


In [3]:
# Step 2: Configure Langfuse Decorator-Based Client

from langfuse.decorators import langfuse_context

# Collect the Langfuse credentials from the loaded config, then pass them to
# the decorator context in one call.
langfuse_credentials = {
    "secret_key": config["langfuse_secret"],
    "public_key": config["langfuse_public_key"],
    "host": config["host"],
}
langfuse_context.configure(**langfuse_credentials)

In [None]:
# Step 3: Load Questions

import json

# Read the complete question set from 1_question.json
with open('1_question.json', 'r') as questions_file:
    questions = json.load(questions_file)

# Echo each task's id and text so the loaded data can be checked by eye
separator = "-" * 40
for entry in questions:
    print(f"Task ID: {entry['task_id']}\nQuestion: {entry['question']}\n{separator}")

In [5]:
# Step 4: Define Tools

import wikipedia
from langfuse.decorators import observe  # (if not already imported)

@observe()
def wikipedia_search(query, sentences=2):
    """
    Look up `query` on Wikipedia and return a short summary string.

    Args:
        query: Text passed to `wikipedia.summary` (auto-suggest and redirects enabled).
        sentences: Number of summary sentences to request.

    Returns:
        The summary text on success. On failure a descriptive string is
        returned instead of raising: the first five disambiguation options,
        a "no page found" message, or the text of any other error.
    """
    try:
        return wikipedia.summary(query, sentences=sentences, auto_suggest=True, redirect=True)
    except wikipedia.DisambiguationError as ambiguous:
        # Only the first few candidate titles are reported to keep the output short.
        top_options = ambiguous.options[:5]
        return f"Disambiguation error. Options: {top_options}"
    except wikipedia.PageError:
        return "No Wikipedia page found for the query."
    except Exception as err:
        # Any other failure is reported as text so the agent loop keeps running.
        return f"Error: {str(err)}"

In [6]:
# Step 5: Agent Planning Step (OpenAI v1.x+)

import openai
from langfuse.decorators import observe  # (if not already imported)

# Create the OpenAI client (v1.x SDK style) with the API key loaded from config.json.
# The planning and synthesis steps send their requests through this module-level `client`.
client = openai.OpenAI(api_key=config["openai_api_key"])

@observe(as_type="generation")
def get_agent_plan(question, model_name, temperature=0.0):
    """
    Ask the model for a step-by-step plan (including a tool list) for a question.

    Args:
        question: The question text to plan for.
        model_name: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The content of the first choice's message, as returned by the API.
    """
    instructions = (
        "Create a step-by-step plan to answer this question. "
        "List the tools you would use and explain briefly how you would use them."
    )
    planning_prompt = (
        "You are an AI agent. Here is a question you need to answer:\n"
        f"Question: {question}\n\n"
        f"{instructions}"
    )
    chat_messages = [{"role": "user", "content": planning_prompt}]
    completion = client.chat.completions.create(
        model=model_name,
        messages=chat_messages,
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [7]:
# Step 6: Tool Execution Step

from langfuse.decorators import observe  # (if not already imported)

@observe()
def execute_tools(plan, question):
    """
    Run the tools mentioned in the plan and collect their outputs.

    Only the Wikipedia search tool is supported: it runs when the word
    "wikipedia" appears anywhere in the plan (case-insensitive).

    Args:
        plan: Plan text produced by the planning step. May be None or a
            non-string value, which is treated as a plan that names no tools.
        question: The question text; used verbatim as the Wikipedia query.

    Returns:
        A dict mapping tool name to tool output. Empty when no supported
        tool is found in the plan.
    """
    tool_outputs = {}
    # The planning step returns the chat message content as-is, and that
    # content can be None. Guard here so such a plan is reported as
    # "no tools" instead of failing on None.lower().
    plan_text = plan if isinstance(plan, str) else ""
    if "wikipedia" in plan_text.lower():
        tool_outputs['wikipedia'] = wikipedia_search(question)
        print("Wikipedia tool executed.")
    else:
        print("No supported tools found in the plan.")
    return tool_outputs

In [8]:
# Step 7: Synthesis Step (OpenAI v1.x+)

from langfuse.decorators import observe  # (if not already imported)

@observe(as_type="generation")
def synthesize_final_answer(task_id, question, tool_outputs, gaia_doc, model_name, temperature=0.0):
    """
    Ask the model to turn the tool outputs into a final GAIA-format answer.

    Args:
        task_id: Identifier of the task. Accepted for the caller's benefit;
            it is not inserted into the prompt.
        question: The original question text.
        tool_outputs: Outputs gathered by the tool-execution step; embedded
            in the prompt via its string representation.
        gaia_doc: GAIA answer-formatting documentation, embedded verbatim.
        model_name: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The content of the first choice's message (expected to be a JSON
        object as text, but returned unparsed).
    """
    prompt_sections = [
        "You are an AI agent participating in the GAIA benchmark. ",
        "Here is the official GAIA documentation for answer formatting:\n\n",
        f"{gaia_doc}\n\n",
        f"Here is the original question:\n{question}\n\n",
        f"Here are the outputs from the tools you used:\n{tool_outputs}\n\n",
        "Using the information above, generate the final answer in the required GAIA JSON format. ",
        "Only output the JSON object, nothing else.",
    ]
    synthesis_prompt = "".join(prompt_sections)
    completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": synthesis_prompt}],
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [None]:
# Step 8: Main Agent Loop with Langfuse Traceability

from langfuse.decorators import observe, langfuse_context  # (if not already imported)

@observe()
def process_all_questions(questions, model_name, temperature, gaia_doc):
    """
    Run plan -> tools -> synthesis for every question, in order.

    Args:
        questions: Iterable of dicts with 'task_id' and 'question' keys.
        model_name: Chat model used for both planning and synthesis.
        temperature: Sampling temperature for both model calls.
        gaia_doc: GAIA formatting documentation passed to the synthesis step.

    Returns:
        A list with one raw synthesis output per question, in the same
        order as `questions`.
    """
    collected_answers = []
    for item in questions:
        task_id = item['task_id']
        print(f"Processing Task ID: {task_id}")
        question_text = item['question']
        plan = get_agent_plan(question_text, model_name, temperature)
        tool_outputs = execute_tools(plan, question_text)
        # The question's own task_id is forwarded to the synthesis step.
        collected_answers.append(
            synthesize_final_answer(
                task_id=task_id,
                question=question_text,
                tool_outputs=tool_outputs,
                gaia_doc=gaia_doc,
                model_name=model_name,
                temperature=temperature,
            )
        )
        # Show where this run can be inspected in Langfuse.
        print("Langfuse Trace URL:", langfuse_context.get_current_trace_url())
    return collected_answers

# Read the GAIA answer-format documentation that the synthesis prompt embeds
with open("documentation/GIAI-documentation.md", "r") as doc_file:
    gaia_doc = doc_file.read()

# Run the full pipeline over every loaded question
final_answers = process_all_questions(
    questions,
    model_name,
    temperature,
    gaia_doc,
)

In [None]:
# Step 9: Save Results (with cleaning and validation)

import json
import re

def clean_and_validate_answer(answer_str, required_fields=("task_id", "submitted_answer"), correct_task_id=None):
    """
    Extract a JSON object from raw model output and validate its fields.

    The text is tried in several forms, and the first one that parses to a
    JSON object wins:
      1. the text with a leading/trailing markdown fence removed,
      2. the contents of the first fenced code block found anywhere in the
         text (covers replies with prose around the fenced JSON),
      3. the span from the first '{' to the last '}'.

    Args:
        answer_str: Raw model output expected to contain a JSON object.
        required_fields: Keys that must be present in the parsed object.
        correct_task_id: When not None, overwrites the object's "task_id"
            before the required-field check.

    Returns:
        The parsed (and possibly task_id-corrected) dict.

    Raises:
        ValueError: If the input is not a string, no candidate parses as
            JSON, the parsed value is not a JSON object, or a required
            field is missing.
    """
    if not isinstance(answer_str, str):
        raise ValueError(f"Expected a JSON string, got {type(answer_str).__name__}")

    stripped = answer_str.strip()

    # Candidate 1: drop a fence marker at the very start / very end only.
    unfenced = re.sub(r"^```[a-zA-Z]*", "", stripped)
    unfenced = re.sub(r"```$", "", unfenced).strip()
    candidates = [unfenced]

    # Candidate 2: a fenced block surrounded by explanatory prose.
    fenced = re.search(r"```[a-zA-Z]*\s*(.*?)```", stripped, flags=re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1).strip())

    # Candidate 3: a bare object embedded in prose without any fence.
    first_brace, last_brace = stripped.find("{"), stripped.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(stripped[first_brace:last_brace + 1])

    answer_obj = None
    first_error = None
    non_object = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError) as e:
            # Remember the error for the primary candidate; it is the most
            # useful one to report if every candidate fails.
            if first_error is None:
                first_error = e
            continue
        if isinstance(parsed, dict):
            answer_obj = parsed
            break
        if non_object is None:
            non_object = parsed

    if answer_obj is None:
        if non_object is not None:
            # Valid JSON, but a list/string/number cannot carry the required
            # fields; report it as ValueError rather than failing on item access.
            raise ValueError(
                f"Expected a JSON object, got {type(non_object).__name__}: {non_object!r}"
            )
        raise ValueError(f"Invalid JSON: {first_error}\nRaw output: {unfenced}")

    # Overwrite task_id if provided
    if correct_task_id is not None:
        answer_obj["task_id"] = correct_task_id

    # Check required fields
    for field in required_fields:
        if field not in answer_obj:
            raise ValueError(f"Missing required field '{field}' in answer: {answer_obj}")
    return answer_obj

def save_final_answers(final_answers, questions, filename="final_answers.jsonl"):
    """
    Clean, validate and save the agent's answers as a JSON Lines file.

    Each entry of `final_answers` is paired by position with the entry of
    `questions` at the same index, and its "task_id" is overwritten with that
    question's "task_id".

    Args:
        final_answers: Iterable of answers, each either a JSON string (raw
            model output) or a dict.
        questions: Sequence of question dicts with a "task_id" key, in the
            same order as `final_answers`.
        filename: Path of the .jsonl file to write (UTF-8, one JSON object
            per line).

    Returns:
        The list of cleaned answer dicts that were written. Answers that
        fail cleaning/validation, or are neither str nor dict, are reported
        with a printed message and left out.

    Raises:
        IndexError: If there are more answers than questions.
    """
    cleaned_answers = []
    for i, answer in enumerate(final_answers):
        if i >= len(questions):
            raise IndexError(
                f"Answer {i} has no matching question (only {len(questions)} questions were given)"
            )
        # Get the correct task_id from the question
        correct_task_id = questions[i]["task_id"]
        if isinstance(answer, str):
            # Raw model output: strip markdown, parse and validate
            try:
                answer_obj = clean_and_validate_answer(answer, correct_task_id=correct_task_id)
            except Exception as e:
                print(f"Error in answer {i}: {e}")
                continue
        elif isinstance(answer, dict):
            # Work on a copy so the caller's dict is not modified as a side effect
            answer_obj = dict(answer)
            answer_obj["task_id"] = correct_task_id
        else:
            # e.g. None when the model returned no content: skip this answer
            # instead of aborting the whole save
            print(f"Error in answer {i}: expected a JSON string or dict, got {type(answer).__name__}")
            continue
        cleaned_answers.append(answer_obj)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be fixed explicitly rather than left to the platform default
    with open(filename, "w", encoding="utf-8") as f:
        for answer_obj in cleaned_answers:
            f.write(json.dumps(answer_obj, ensure_ascii=False) + "\n")
    print(f"Saved {len(cleaned_answers)} answers to {filename}")
    return cleaned_answers  # <-- Return the cleaned answers for submission

# Clean, validate and persist the raw model outputs; keep the cleaned list for submission.
final_answers_cleaned = save_final_answers(
    final_answers=final_answers,
    questions=questions,
)

In [None]:
# Step 10: Validate Answers (Check with GAIA API)

import requests

def validate_answers(final_answers, username, code_link, api_base_url, agent_code=None, timeout=120):
    """
    Submit answers to the GAIA evaluation endpoint and print the outcome.

    Args:
        final_answers: List of cleaned answer dicts to submit.
        username: Username sent with the submission.
        code_link: Link to the agent's code; also used as `agent_code` when
            the notebook file cannot be read.
        api_base_url: Base URL of the scoring API; "/submit" is appended.
        agent_code: Agent source to send. When None, the contents of
            "agent.ipynb" are used if readable, otherwise `code_link`.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout the call could block indefinitely.

    Returns:
        The parsed JSON response on success, or None when the server
        answers with a non-200 status or a body that is not valid JSON.

    Raises:
        Exceptions raised by `requests.post` (connection errors, timeouts)
        are not caught here.
    """
    # Tolerate a trailing slash on the base URL so the path is not "//submit"
    url = f"{api_base_url.rstrip('/')}/submit"
    if agent_code is None:
        # Try to read your notebook as code, or use code_link as fallback
        try:
            with open("agent.ipynb", "r", encoding="utf-8") as f:
                agent_code = f.read()
        except (OSError, UnicodeError):
            agent_code = code_link  # fallback
    payload = {
        "username": username,
        "code_link": code_link,
        "agent_code": agent_code,
        "answers": final_answers  # <-- Use the cleaned answers here!
    }
    response = requests.post(url, json=payload, timeout=timeout)
    if response.status_code != 200:
        print("Submission failed:", response.status_code, response.text)
        return None
    try:
        result = response.json()
    except ValueError as e:
        # A 200 response with a non-JSON body (e.g. an HTML error page)
        print("Submission failed: response was not valid JSON:", e)
        return None
    print("Submission successful!")
    print(f"Score: {result.get('score', 'N/A')}%")
    if "results" in result:
        print("\nDetailed Results:")
        for r in result["results"]:
            status = "✅" if r.get("correct") else "❌"
            # .get throughout: a result entry missing a key should not abort the report
            print(f"{status} Task ID: {r.get('task_id', 'N/A')} | Your Answer: {r.get('submitted_answer', 'N/A')} | Correct: {r.get('correct_answer', 'N/A')}")
    else:
        print("No detailed results returned.")
    return result

# Submit the cleaned answers for scoring and keep the server's response.
validation_result = validate_answers(
    final_answers=final_answers_cleaned,
    username=username,
    code_link=code_link,
    api_base_url=api_base_url,
)

In [None]:
# Step 11: (Optional) Save Validation Results

def save_validation_results(validation_result, filename="validation_results.json"):
    """
    Write the validation response to a JSON file.

    Args:
        validation_result: JSON-serializable result to save. When None,
            nothing is written.
        filename: Path of the JSON file to write (UTF-8, indented).

    Returns:
        None.
    """
    if validation_result is None:
        return
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is set explicitly instead of relying on the platform default
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(validation_result, f, indent=2, ensure_ascii=False)
    print(f"Validation results saved to {filename}")

# Persist the scoring response (a no-op when the submission returned None).
save_validation_results(validation_result=validation_result)

In [14]:
# Step 12: Flush Langfuse Traces

# --- Langfuse flush at the end of the notebook ---
# NOTE(review): presumably this sends any still-buffered trace events to the
# Langfuse host before the kernel stops — confirm against the SDK docs.
from langfuse.decorators import langfuse_context  # (if not already imported)
langfuse_context.flush()