In [None]:
# This notebook shows how to extract a taxonomy of mitigation actions from AI incident mitigation texts.
# It demonstrates the prompts and the workflow for running GPT-5-mini through the OpenAI Batch API,
# including how to prepare the input file, submit the batch job, and retrieve the output file.

### Imports
Load required libraries.

In [None]:
import os
import json
import time
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
import ast

### OpenAI API Key Configuration  
Load the OpenAI API key from the .env file.

In [None]:
# Pull the variables defined in the local .env file into the process
# environment, then build the API client from the key found there.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

### Data Loading and Overview
Load the dataset containing AI incident mitigation texts and check its structure and summary information.  
Each row represents a single AI incident, and the "mitigation_taken" column lists all mitigation actions associated with that incident.

In [None]:
# Read the incident dataset into a DataFrame; the relative path is resolved
# against the notebook's working directory.
df = pd.read_csv('DATASET.csv')

In [None]:
# Count how many distinct values the "mitigation_taken" column holds
# (missing values are not counted by nunique's default settings)
df.mitigation_taken.nunique()

In [None]:
# Display the incidents whose only recorded entry is the
# 'No mitigation taken' placeholder
df.loc[df["mitigation_taken"].eq("['No mitigation taken']")]

In [None]:
# Keep only incidents with at least one recorded mitigation action and
# renumber the index from 0 so positional batching lines up with the labels
df = (
    df.loc[df["mitigation_taken"].ne("['No mitigation taken']")]
    .reset_index(drop=True)
)

In [None]:
# Summarize the filtered frame: columns, non-null counts and dtypes
df.info()

In [None]:
# Preview the first rows of the filtered frame
df.head()

### OpenAI Batch API
This section prepares and configures batch API calls using GPT-5-mini to analyze AI incident mitigation data.

- **Dataset:** We use a dataset of AI incident texts, where each row represents a single incident and the "mitigation_taken" column contains all mitigation actions associated with that incident.

- **What is being done:** The code splits the dataset into manageable batches and constructs structured tasks for the OpenAI Batch API. Each task contains a set of mitigation statements and instructions for GPT-5-mini to derive a hierarchical taxonomy of mitigation actions (categories and subcategories).

- **How it is being done:**
  1. **Batching:** The dataset is divided into chunks (default 200 incidents per batch) to avoid exceeding API limits.
  2. **Data processing:** For each batch, all mitigation statements are extracted. Strings representing lists are safely parsed to ensure a consistent list format.
  3. **Task creation:** For each batch, a task dictionary is created containing:
     - A unique ID for the batch
     - The API endpoint (`/v1/chat/completions`)
     - GPT-5-mini as the model
     - System instructions explaining the rules, constraints and expected output format
     - User instructions including the list of mitigation statements
  4. **Task collection:** Each prepared task is appended to a list of tasks ready to be submitted to the OpenAI Batch API.

In [None]:
# Number of incidents (rows) whose mitigation statements share one request
batch_size = 200
tasks = []

# The prompt text is identical for every request, so it is built once here
# instead of on every loop iteration.
SYSTEM_PROMPT = (
    "You are an AI incidents analyst specializing in qualitative analysis and taxonomy development. "
    "Your goal is to analyze the raw data and cluster it into a coherent, structured hierarchy of categories and subcategories derived directly from the data.\n\n"
    "RULES OF LOGIC:\n"
    "1. DATA-DRIVEN: Categories and subcategories must emerge only from the provided statements.\n"
    "2. NON-OVERLAPPING: Each subcategory must belong to exactly one parent category.\n\n"
    "CONSTRAINTS:\n"
    "- Organize output as categories with subcategories.\n"
    "- Maximum of 15 top-level categories.\n"
    "- Output MUST be valid JSON and nothing else."
)
USER_PROMPT_HEADER = (
    "I will provide a list of AI mitigation statements. Derive a hierarchical taxonomy (categories and subcategories) from them.\n\n"
    "Output format:\n"
    "{\n"
    '  "derived_taxonomy": {\n'
    "    \"Category A\": [\"Subcat A1\", \"Subcat A2\"],\n"
    "    \"Category B\": [...],\n"
    "    ...\n"
    "  }\n"
    "}\n\n"
    "<mitigation_statements>\n"
)


def _parse_mitigations(value):
    """Return the statements held in one ``mitigation_taken`` cell as a list.

    Parameters
    ----------
    value : object
        Raw cell value. A string is first parsed as a Python literal (the
        column stores lists as text such as ``"['a', 'b']"``).

    Returns
    -------
    list
        - the parsed list, when the cell holds (the text of) a list;
        - ``[value]`` when the text is not a valid Python literal;
        - ``[]`` when the value is missing (``None``/NaN), so that it is not
          sent to the model as the literal statement "nan";
        - ``[str(value)]`` for any other value.
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Free text that is not a Python literal: keep it as one statement.
            # literal_eval is documented to raise any of these on bad input.
            return [value]

    if isinstance(value, list):
        return value

    # is_scalar guards pd.isna, which returns an array for array-like input
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []

    return [str(value)]


# Loop through the dataset in chunks of `batch_size` rows
for start in range(0, len(df), batch_size):
    chunk = df.iloc[start:start + batch_size]

    # Collect all mitigation statements for the current chunk
    mitigation_texts = []
    for cell in chunk["mitigation_taken"]:
        mitigation_texts.extend(_parse_mitigations(cell))

    # Nothing to classify in this chunk: do not spend a request on it
    if not mitigation_texts:
        continue

    statements_block = "\n".join(f"- {m}" for m in mitigation_texts)

    # Build the batch task; custom_id encodes the chunk's first row position
    # so each output line can be matched back to its input chunk
    task = {
        "custom_id": f"batch_{start}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-5-mini",
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": (
                        USER_PROMPT_HEADER
                        + statements_block
                        + "\n</mitigation_statements>"
                    ),
                },
            ],
        },
    }

    tasks.append(task)


In [None]:
file_name = "TASK.jsonl"

# Serialize every task as one JSON object per line (JSONL)
with open(file_name, 'w') as file:
    file.writelines(json.dumps(obj) + '\n' for obj in tasks)

In [None]:
# Upload the batch input file to OpenAI and register it for batch processing
batch_file = client.files.create(
    file=open(file_name, "rb"),
    purpose="batch"
)
print(batch_file)

In [None]:
# Create a batch job using the uploaded input file
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

In [None]:
# Optional - poll the batch job status until completion
while True:
    batch_job = client.batches.retrieve(batch_job.id)
    if batch_job.status != "completed":
        time.sleep(10)
        print(batch_job.status)
    else:
        print(f"job {batch_job.id} is done")
        break

In [None]:
# Print batch job information
batch = client.batches.retrieve(batch_job.id)
print(batch)

In [None]:
# ID of the file that holds the job's results; used below to download it
output_file_id = batch.output_file_id
print(output_file_id)

In [None]:
# Save batch output file locally in JSONL format
with open("RESULT.jsonl", "wb") as f:
    for chunk in client.files.content(output_file_id).iter_bytes():
        f.write(chunk)