In [None]:
# This notebook shows how to extract AI incident mitigation actions from a dataset containing news media texts.
# It covers data overview, batch processing with GPT-5-mini and generating structured output for analysis.

### Imports
Load required libraries.

In [None]:
import os
import json
import time
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

### OpenAI API Key Configuration  
Load the OpenAI API key from the .env file.

In [None]:
# Read the .env file first so that os.getenv below can see OPENAI_API_KEY.
load_dotenv()
# os.getenv returns None when the variable is not set.
api_key = os.getenv("OPENAI_API_KEY")
# Client object used by every OpenAI API call later in the notebook.
client = OpenAI(api_key=api_key)

### Data Loading and Overview
Load the dataset containing AI incident texts and check its structure and summary information.  
Each row corresponds to a single AI incident, and the "text" column contains the news article text describing that incident.

In [None]:
# Load the incident dataset; later cells read its "text" and "new_id" columns.
df = pd.read_csv('DATASET.csv')

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
print("Dataset information:")
df.info()

In [None]:
# The (rows, columns) tuple is the cell's last expression, so Jupyter displays it.
print("Dataset shape:")
df.shape

In [None]:
# Preview of the first 10 rows, rendered by Jupyter as the cell's output.
print("First 10 rows of dataset:")
df.head(10)

### OpenAI Batch API

This section prepares tasks for the OpenAI Batch API to extract mitigation actions from individual AI incident texts in the dataset.

- **Dataset:** The dataset contains news media texts describing AI-related incidents. Each row represents one incident, with the "text" column containing the article text and "new_id" providing a unique identifier for the incident.

- **What is being done:** The code generates one task per incident, where each task instructs GPT-5-mini to extract all explicitly described post-incident mitigation or corrective actions.

- **How it is being done:**
  1. **Iterate through the dataset:** Each row is processed individually.
  2. **Extract relevant fields:** The incident text is obtained from the `text` column, and the row ID from `new_id`.
  3. **Task creation:** For each row, a task dictionary is built containing:
     - A unique `custom_id` for the task
     - The API endpoint (`/v1/chat/completions`)
     - GPT-5-mini as the model
     - System instructions defining:
       - What constitutes a mitigation or corrective action
       - Extraction rules
       - Output rules
     - User input with the incident text to be analyzed
  4. **Task collection:** Each prepared task is appended to the `tasks` list for submission to the Batch API.

In [None]:
# Model that every request in the batch is sent to.
MODEL_NAME = "gpt-5-mini"

# System prompt shared by every task. It does not depend on the row, so it is
# built once here instead of being re-assembled on every loop iteration.
# Every rule and bullet ends with "\n" so that no two instructions are fused
# onto a single line when the adjacent string literals are concatenated.
SYSTEM_PROMPT = (
    "You are an AI incident analysis expert.\n\n"
    "You will be given text from media and news reports describing AI-related incidents.\n"
    "Your task is to extract all explicit post-incident mitigation and corrective actions taken in response to the incident.\n\n"
    # \u2014 is an em dash; written as an escape so the source file stays ASCII-safe.
    "DEFINITION \u2014 MITIGATION / CORRECTIVE ACTION\n"
    "A mitigation or corrective action is any explicit step taken AFTER the incident to respond to, contain, investigate, remediate, reduce harm, acknowledge responsibility, or address consequences caused by an AI system.\n"
    "This includes but is NOT LIMITED to technical, organizational, operational, legal, policy-based, or public-response actions.\n\n"
    "EXTRACTION RULES\n"
    "1. Extract ONLY actions explicitly stated in the text.\n"
    "2. Only extract mitigations from AI-related incidents. If the text does not describe an actual incident, return [\"No mitigation taken\"].\n"
    "3. Actions must be taken AFTER the incident; ignore causes, or preventive measures taken beforehand.\n"
    "4. Do NOT infer or assume actions that are not clearly described.\n"
    "5. If multiple actions are mentioned together, split them into separate items.\n"
    "6. Rewrite each action as ONE clear, self-contained sentence.\n"
    "OUTPUT RULES (STRICT)\n"
    "- Return ONLY a valid JSON array of strings.\n"
    "- Do NOT include explanations, commentary, or extra text.\n"
    "- If the text does not describe any mitigation actions, return:\n"
    "[\"No mitigation taken\"]\n"
    "- The output MUST be in English.\n"
)


def build_task(new_id, incident_text):
    """Build one Batch API request for a single incident.

    Parameters
    ----------
    new_id
        Value of the row's "new_id" column; embedded in ``custom_id`` so the
        result can be matched back to the row.
    incident_text : str
        Article text to analyse (may be an empty string).

    Returns
    -------
    dict
        JSON-serialisable request with "custom_id", "method", "url" and "body".
    """
    return {
        "custom_id": f"row_{new_id}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"AI INCIDENT TEXT:\n{incident_text}"},
            ],
        },
    }


# custom_id is derived from new_id and is the only link between a result and
# its row, so repeated ids would make results ambiguous. Stop here rather than
# after the batch has been submitted.
duplicated_ids = df.loc[df["new_id"].duplicated(), "new_id"].unique().tolist()
if duplicated_ids:
    raise ValueError(
        f"'new_id' must be unique to build custom_id values; duplicated ids: {duplicated_ids[:10]}"
    )

tasks = []

# Generate one task per row. Iterating over the two columns directly (instead
# of iterrows) keeps each value's column dtype, so ids are never upcast.
for new_id, raw_text in zip(df["new_id"], df["text"]):
    # Missing article text is sent as an empty string.
    incident_text = str(raw_text).strip() if pd.notna(raw_text) else ""
    task = build_task(new_id, incident_text)
    tasks.append(task)

In [None]:
file_name = "TASK.jsonl"

# Serialise the tasks as JSONL: one JSON document per line.
with open(file_name, 'w') as jsonl_out:
    jsonl_out.writelines(json.dumps(task_obj) + '\n' for task_obj in tasks)

In [None]:
# Upload the batch input file to OpenAI and register it for batch processing.
# The context manager closes the local file handle as soon as the upload
# returns; a bare open(...) passed inline would stay open for the kernel's lifetime.
with open(file_name, "rb") as task_file:
    batch_file = client.files.create(
        file=task_file,
        purpose="batch"
    )

print(batch_file)

In [None]:
# Create a batch job using the uploaded input file
# `endpoint` repeats the "/v1/chat/completions" value stored in each task's "url" field.
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

In [None]:
# Optional - poll the batch job status until it reaches a terminal state.
# Besides "completed", a batch can also end as "failed", "expired" or
# "cancelled"; waiting only for "completed" would loop forever on any of those.
TERMINAL_STATUSES = {"completed", "failed", "expired", "cancelled"}
POLL_INTERVAL_SECONDS = 10

while True:
    batch_job = client.batches.retrieve(batch_job.id)
    if batch_job.status in TERMINAL_STATUSES:
        break
    # Report the freshly retrieved status before waiting for the next poll.
    print(batch_job.status)
    time.sleep(POLL_INTERVAL_SECONDS)

if batch_job.status == "completed":
    print(f"job {batch_job.id} is done")
else:
    # Surface the failure here instead of letting later cells fail on a missing output file.
    raise RuntimeError(
        f"job {batch_job.id} ended with status {batch_job.status!r}: "
        f"{getattr(batch_job, 'errors', None)}"
    )

In [None]:
# Print batch job information
# Re-fetch the job so `batch` holds its latest state; the next cell reads batch.output_file_id.
batch = client.batches.retrieve(batch_job.id)
print(batch)

In [None]:
# ID of the file holding the batch results; the download cell below passes it to client.files.content.
output_file_id = batch.output_file_id
print(output_file_id)

In [None]:
# Save batch output file locally in JSONL format.
# Without an output file ID there is nothing to download; stop before
# RESULT.jsonl is opened so an existing results file is not truncated.
if output_file_id is None:
    raise RuntimeError(
        "Batch has no output file; check the batch status, errors and error_file_id."
    )

# Request the content first so a failing download call cannot leave an empty file behind.
result_content = client.files.content(output_file_id)

with open("RESULT.jsonl", "wb") as f:
    for chunk in result_content.iter_bytes():
        f.write(chunk)