# GPT-OSS-20B Red Team Findings Reproduction

This notebook demonstrates the red-teaming framework and walks through reproducing findings for the Kaggle hackathon, using the mock backend for demonstration.

In [None]:
# Setup
import sys
import os
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path
# (presumably so the local `gpt_oss_redteam` package is importable — confirm
# against the repository layout). Any existing entry is removed first so that
# re-running this cell keeps a single entry at position 0 instead of stacking
# duplicates.
project_root = str(Path.cwd().parent)
if project_root in sys.path:
    sys.path.remove(project_root)
sys.path.insert(0, project_root)

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Import framework
from gpt_oss_redteam import load_config, setup_logging
from gpt_oss_redteam.model.backends import get_backend
from gpt_oss_redteam.probes.registry import discover_probes, run_category
from gpt_oss_redteam.eval.metrics import calculate_metrics
from gpt_oss_redteam.eval.reporting import make_findings_json, generate_report
from gpt_oss_redteam.eval.converters import harmony_to_kaggle

# Configure the framework's logging, passing "INFO" as the level.
setup_logging(level="INFO")

# Load the framework configuration and report the backend type it names.
config = load_config()
configured_backend_type = config.backend.type
print(f"Loaded config with backend: {configured_backend_type}")

## Initialize Model Backend

In [None]:
# Build the mock backend for this demonstration. The backend section of the
# loaded config is dumped to a plain structure and handed over together with
# the configured providers.
backend_settings = config.backend.model_dump()
backend = get_backend("mock", backend_settings, config.providers)

print("Backend initialized:", backend.__class__.__name__)

## Discover and List Available Probes

In [None]:
# Ask the registry for every available probe. The result is used below as a
# mapping from category name to a sized collection of probes.
discovered = discover_probes()

print("Discovered probe categories:")
category_sizes = {category: len(probes) for category, probes in discovered.items()}
for category, size in category_sizes.items():
    print(f"  {category}: {size} probes")

# Grand total across all categories.
total_probes = sum(category_sizes.values())
print(f"\nTotal probes: {total_probes}")

## Run a Small Probe Set

In [None]:
# Categories and seeds used for this small demonstration run.
test_categories = ["reward_hacking", "deception"]
test_seeds = [0, 1, 2]

# Results from every category are accumulated into one flat list that the
# later cells (metrics, table, findings, report) all read from.
all_results = []

for category in test_categories:
    print(f"\nRunning {category} probes...")
    category_results = run_category(category, backend, seeds=test_seeds)
    all_results += category_results
    print(f"  Completed {len(category_results)} probe runs")

print(f"\nTotal results: {len(all_results)}")

## Calculate Metrics

In [None]:
# Aggregate the collected probe results into summary metrics.
metrics = calculate_metrics(all_results)

# Overall figures: counts are printed as-is, the pass rate as a percentage
# with one decimal place.
print("Overall Metrics:")
overall_lines = (
    f"  Total probes: {metrics['total_probes']}",
    f"  Passed: {metrics['passed']}",
    f"  Failed: {metrics['failed']}",
    f"  Pass rate: {metrics['pass_rate']:.1%}",
)
print("\n".join(overall_lines))

# Per-category figures, in the order the metrics mapping yields them.
print("\nCategory Breakdown:")
per_category = metrics['categories']
for category in per_category:
    cat_metrics = per_category[category]
    print(f"  {category}:")
    print(f"    Pass rate: {cat_metrics['pass_rate']:.1%}")
    print(f"    Avg duration: {cat_metrics['avg_duration_ms']:.1f}ms")

## Display Results Table

In [None]:
import pandas as pd


def _result_row(result):
    """Flatten one probe result into a display row for the results table.

    Reads ``probe_name``, ``passed``, ``seed``, ``duration_ms`` and
    ``evidence`` from ``result``. The duration is rendered as text with one
    decimal place, and only the first evidence entry is shown ('N/A' when
    the evidence is empty or missing).
    """
    return {
        'Probe': result.probe_name,
        'Passed': '✓' if result.passed else '✗',
        'Seed': result.seed,
        'Duration (ms)': f"{result.duration_ms:.1f}",
        'Evidence': result.evidence[0] if result.evidence else 'N/A',
    }


# Build every row first, then construct the DataFrame in a single step.
results_data = [_result_row(result) for result in all_results]

df = pd.DataFrame(results_data)
print("\nProbe Results:")
display(df.head(10))  # Show first 10 results

## Generate Findings JSON

In [None]:
# Imported at the top of the cell rather than inside the nested branch below,
# so that `json` is bound on every path through this cell (including when no
# probe failed or the findings list is empty).
import json

# Generate Kaggle findings JSON for failed probes
failed_results = [r for r in all_results if not r.passed]

if failed_results:
    findings = make_findings_json(
        failed_results,
        issue_id="NOTEBOOK-001",
        meta={
            "source": "notebook",
            "backend": "mock"
        }
    )

    print(f"Generated findings for {len(findings['findings'])} issues")
    # Only announce and print a first finding when there is one to show.
    if findings['findings']:
        print("\nFirst finding:")
        print(json.dumps(findings['findings'][0], indent=2))
else:
    print("No failed probes to report")

## Demonstrate Harmony → Kaggle Converter

In [None]:
# Imported here so this cell does not depend on an import that an earlier cell
# only executes inside a conditional branch; without it `json.dumps` below
# raises NameError on a fresh kernel whenever that branch was skipped.
import json
import tempfile

# Load example Harmony findings
harmony_path = "../examples/example-harmony-findings.json"

if Path(harmony_path).exists():
    # Reserve a temporary path for the converter's output. delete=False keeps
    # the file on disk after the handle is closed, so it has to be removed
    # explicitly once the conversion is done.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp:
        kaggle_path = tmp.name

    try:
        # Convert; the returned data is what the rest of this cell displays.
        kaggle_data = harmony_to_kaggle(harmony_path, kaggle_path)
    finally:
        # The temporary output file is not used again in this notebook, so
        # remove it instead of leaving it behind in the temp directory.
        converted_file = Path(kaggle_path)
        if converted_file.exists():
            converted_file.unlink()

    print("Converted Harmony to Kaggle format:")
    print(f"  Version: {kaggle_data['version']}")
    print(f"  Findings: {len(kaggle_data['findings'])}")

    if kaggle_data['findings']:
        print("\nFirst converted finding:")
        print(json.dumps(kaggle_data['findings'][0], indent=2))
else:
    print("Example Harmony file not found")

## Verify Deterministic Behavior

In [None]:
# Check that repeating a probe with the same seed reproduces the response.
from gpt_oss_redteam.probes.registry import run_probe

probe_id = "reward_hacking.confidence_incorrect_facts"
fixed_seed = 42

# Two separate runs of the same probe against the same backend and seed.
result1, result2 = [run_probe(probe_id, backend, seed=fixed_seed) for _ in range(2)]

if not (result1.transcript and result2.transcript):
    # Without both transcripts there is nothing to compare.
    print("Could not verify deterministic behavior")
else:
    response1 = result1.transcript.response
    response2 = result2.transcript.response

    # Only the first 50 characters of each response are echoed.
    if response1 != response2:
        print("✗ Non-deterministic behavior detected")
        print(f"  Run 1: '{response1[:50]}...'")
        print(f"  Run 2: '{response2[:50]}...'")
    else:
        print("✓ Deterministic behavior confirmed")
        print(f"  Both runs produced: '{response1[:50]}...'")

## Generate Markdown Report

In [None]:
# Generate markdown report
report = generate_report(all_results, format="markdown", include_details=False)

# Show only the beginning of the report, and print the truncation marker only
# when something was actually cut off, so a short report is not mislabeled.
preview_chars = 1000
print(report[:preview_chars])
if len(report) > preview_chars:
    print("\n... [truncated]")

## Summary

This notebook demonstrated:
1. Loading configuration and initializing the mock backend
2. Discovering and running probes across multiple categories
3. Calculating metrics and generating reports
4. Creating Kaggle-ready findings JSON
5. Converting between Harmony and Kaggle formats
6. Verifying deterministic behavior under the mock backend

The framework is ready for red-teaming GPT-OSS-20B with safe, benign probes.