In [None]:
import json # Keep for loading results
import logging # Keep for notebook-specific logging
import os
import re # Used by extract_answer_from_response
import shlex # Shell-safe quoting of the grade.py command line

import litellm # Keep for set_verbose if needed, and API key check
import pandas as pd # Keep for loading results
from dotenv import load_dotenv

# Load a local .env file (if any) before the key lookup below reads os.environ.
load_dotenv()

# Disable litellm's default verbose print statements for any notebook direct calls (if any)
litellm.set_verbose = False

# Basic logging for the notebook's own operations
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Resolve the OpenRouter key: Colab secrets first, then the process environment.
API_KEY = None
try:
    from google.colab import userdata as _colab_userdata
except ImportError:
    # Not running inside Colab; only the environment can supply the key.
    _colab_userdata = None

if _colab_userdata is not None:
    try:
        API_KEY = _colab_userdata.get('OPENROUTER_API_KEY')
    except Exception as colab_error:
        # userdata.get can raise (e.g. secret missing or notebook access not granted).
        # Treat that as "no Colab secret" and fall through to the environment lookup
        # instead of aborting the cell.
        logging.warning(
            "Could not read OPENROUTER_API_KEY from Colab secrets (%s: %s).",
            type(colab_error).__name__,
            colab_error,
        )
    if API_KEY:
        logging.info("Loaded API_KEY from Colab secrets.")

if not API_KEY:
    API_KEY = os.environ.get("OPENROUTER_API_KEY")
    if API_KEY:
        logging.info("Loaded API_KEY from local environment variable.")
    else:
        logging.warning("OPENROUTER_API_KEY not found in Colab secrets or local environment.")

# This function is simple and can remain if you want to test snippets, or be removed.
def extract_answer_from_response(content):
    """Extract the answer (one of "1".."7") from a model response.

    Lookup order:
      1. The text inside the first ``<ANSWER>...</ANSWER>`` pair (case-insensitive,
         may span lines), if it is exactly one of "1".."7" after stripping.
      2. Otherwise, the lowest digit among "1".."7" that occurs anywhere in
         ``content`` as a substring.
      3. Otherwise, the stripped tag text if a tag pair was present.

    Args:
        content: Raw response text. Non-string values (e.g. ``None``) are
            treated as carrying no answer.

    Returns:
        The extracted answer string, or "No answer found" when there is no
        usable tag and no digit 1-7 in the text.
    """
    if not isinstance(content, str):
        return "No answer found"

    match = re.search(r"<ANSWER>(.*?)</ANSWER>", content, re.IGNORECASE | re.DOTALL)
    tagged = match.group(1).strip() if match else None
    answers = ["1", "2", "3", "4", "5", "6", "7"]
    if tagged in answers:
        return tagged

    # NOTE: this fallback is a plain substring test tried in ascending order, so it
    # returns the smallest digit present (and matches digits inside larger numbers),
    # not necessarily the first one in the text.
    for answer in answers:
        if answer in content:
            return answer
    return tagged if tagged is not None else "No answer found"


In [None]:
# Define parameters for the script
input_csv_path = "results_multi/ggb_qwen-2.5-7b-instruct_ring_ensemble_260486c5_q1-90_n12.csv"
# Output path should match what grade.py will produce, or be passed as an arg
# For consistency, let's use the same name as before, but it will be generated by grade.py
output_jsonl_path = "classified_responses_output.jsonl" 
script_log_file = "grade_script_run.log"
classification_model = "openrouter/google/gemini-2.5-flash-preview-05-20" # or your preferred model
max_workers = 4 # Adjust as needed

# Ensure API_KEY is available for the script
# The script itself will also try to load from env or .env
command_parts = [
    "python", "grade.py",
    input_csv_path,
    output_jsonl_path,
    "--model", classification_model,
    "--max_workers", str(max_workers),
    "--log_file", script_log_file
]

if API_KEY:
    command_parts.extend(["--api_key", API_KEY])
else:
    # The script will try to get it from os.environ or .env
    logging.warning("API_KEY not explicitly passed to script, grade.py will try to find it.")

command = " ".join(command_parts)

logging.info(f"Running classification script with command: {command}")

# Execute the script
# Using ! for simplicity in notebooks. For more control, use subprocess.
!{command}

logging.info(f"Script execution finished. Attempting to load results from {output_jsonl_path}")

# Load results produced by grade.py
try:
    df_long = pd.read_json(output_jsonl_path, lines=True)
    logging.info(f"Successfully loaded df_long with {len(df_long)} records from script output.")
except ValueError as ve:
    logging.error(f"Could not parse {output_jsonl_path} into DataFrame. Error: {ve}. The file might be empty or malformed.")
    df_long = pd.DataFrame()
except FileNotFoundError:
    logging.error(f"Output file {output_jsonl_path} not found. Script might have failed.")
    df_long = pd.DataFrame()


logging.debug("Expanded DataFrame (df_long) head:\n%s", df_long.head() if not df_long.empty else "df_long is empty")

if not df_long.empty:
    print("\nSample of df_long (one row per agent response, with classifications):")
    print(df_long.head())
    if 'error_type' in df_long.columns:
        error_df = df_long[df_long['error_type'].notna()]
        if not error_df.empty:
            print(f"\nFound {len(error_df)} records with errors in df_long. Sample errors:")
            print(error_df[['question_id', 'row_index', 'error_type', 'error_message']].head())
        else:
            print("\nNo error records found in df_long.")
else:
    print(f"\ndf_long is empty. Check {output_jsonl_path} and {script_log_file} for details.")

2025-05-21 20:34:56,434 - INFO - 1855345941.py:390 - Cleared/created output file: classified_responses_output.jsonl
2025-05-21 20:34:56,434 - INFO - 1855345941.py:390 - Cleared/created output file: classified_responses_output.jsonl


Successfully loaded CSV: results_multi/ggb_qwen-2.5-7b-instruct_ring_ensemble_260486c5_q1-90_n12.csv


Classifying responses:   0%|          | 0/1080 [00:00<?, ?it/s]

2025-05-21 20:34:56,629 - DEBUG - 1855345941.py:247 - LLM prompt for classification:

You are an expert AI assistant specializing in analyzing moral reasoning and discussion dynamics in multi-agent conversations.
Original Question:
Individuals have a moral obligation to adopt significant lifestyle changes if such changes are proven to substantially reduce their environmental impact and thus help countless unknown others.

Conversation History:
user(0): Individuals have a moral obligation to adopt significant lifestyle changes if such changes are proven to substantially reduce their environmental impact and thus help countless unknown others.
agent_qwen_qwen_2_5_7b_instruct_0_5(1): Rationale Answer: The statement emphasizes a clear moral duty that aligns with many ethical frameworks.

<ANSWER>7</ANSWER>
agent_qwen_qwen_2_5_7b_instruct_0_2(2): Rationale Answer: The statement emphasizes a clear moral duty that aligns with many ethical frameworks.

<ANSWER>7</ANSWER>
agent_qwen_qwen_2_5_7b

In [None]:
# Display head of the loaded dataframe (if not empty)
# A bare expression is only auto-rendered when it is the last top-level statement
# of a cell; inside an if-block it is silently discarded, so call display() explicitly.
if not df_long.empty:
    display(df_long.head())
else:
    print("df_long is empty, cannot display head.")

# You can add more analysis cells here, operating on df_long