# trans-evals Quick Start Tutorial

This notebook demonstrates basic usage of the trans-evals framework for evaluating LLM bias toward trans and non-binary individuals.

## Setup

First, let's import the necessary modules and set up our environment.

In [None]:
import os
import sys
sys.path.append('..')  # Add parent directory to path

from dotenv import load_dotenv
load_dotenv()

# Import trans-evals modules
from src import (
    load_dataset,
    BiasEvaluator,
    TransSpecificTemplates,
    OpenRouterModel
)
from src.augmentation import CounterfactualAugmenter
from src.datasets.base import BiasType

## 1. Loading Datasets

Let's explore the available datasets.

In [None]:
# Fetch the TANGO dataset via the framework loader and report how many
# examples it contains.
tango = load_dataset("tango")
print(f"TANGO dataset has {len(tango)} examples")

# Preview the first three entries, numbering them from 1 for readability.
preview = tango[:3]
for position, sample in enumerate(preview, start=1):
    print(f"\nExample {position}:")
    print(f"  Text: {sample.text}")
    print(f"  Bias Type: {sample.bias_type.value}")
    print(f"  Target Identity: {sample.target_identity}")

In [None]:
# Load HolisticBias, passing the identity terms to filter on.
identity_filter = ["trans woman", "trans man", "non-binary"]
holistic = load_dataset("holistic_bias", identity_terms=identity_filter)
print(f"Filtered HolisticBias has {len(holistic)} examples")

## 2. Template-Based Test Generation

Generate custom test cases using templates.

In [None]:
# Retrieve the pronoun-consistency templates shipped with the framework.
pronoun_templates = TransSpecificTemplates.get_pronoun_templates()
template_count = len(pronoun_templates)
print(f"Number of pronoun templates: {template_count}")

# Expand only the first template into concrete examples.
first_template = pronoun_templates[0]
examples = first_template.generate_examples()
example_count = len(examples)
print(f"\nGenerated {example_count} examples from template")

# Print the text of the first three generated examples.
for generated in examples[:3]:
    print(f"\n- {generated.text}")

## 3. Counterfactual Data Augmentation

Create counterfactual examples by swapping identity terms.

In [None]:
# DatasetExample and EvaluationType are imported here and reused by the
# mock-results cell later in the notebook.
from src.datasets.base import DatasetExample, EvaluationType

# Build the augmenter with its default settings.
augmenter = CounterfactualAugmenter()

# Hand-written example to feed through the augmenter.
original_fields = dict(
    id="test_1",
    text="The trans woman presented her research at the conference.",
    bias_type=BiasType.SENTIMENT,
    evaluation_type=EvaluationType.GENERATION,
    target_identity="trans woman",
)
original = DatasetExample(**original_fields)

# Ask the augmenter for the counterfactual version of the example.
counterfactual = augmenter.create_counterfactual(original)

# Show both texts next to each other, then the recorded replacements.
print("Original:", original.text)
print("Counterfactual:", counterfactual.text)
replacements = counterfactual.metadata['replacements']
print("Replacements:", replacements)

## 4. Running Bias Evaluation

Evaluate a model on our test cases.

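**Note**: This requires an `OPENROUTER_API_KEY` entry in your `.env` file. Without it, the cell below prints setup instructions instead of running the evaluation.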

In [None]:
# The evaluation calls a model through OpenRouter, so it only runs when an
# API key is configured; otherwise setup instructions are printed instead.
api_key_present = bool(os.getenv("OPENROUTER_API_KEY"))

if not api_key_present:
    print("Please set OPENROUTER_API_KEY in your .env file to run evaluation")
    print("Get your API key from: https://openrouter.ai/keys")
    print("\nYou can test with various models:")
    for candidate in ["gpt-3.5-turbo", "claude-3-haiku", "mistral-7b", "llama-2-13b"]:
        print(f"  - {candidate}")
else:
    # Model wrapper used for generation
    model = OpenRouterModel(model_name="gpt-3.5-turbo")
    print(f"Using model: {model.model_name}")

    # Keep the demo small: only the first three TANGO examples
    test_examples = tango[:3]

    # Evaluator bound to the model, scoring toxicity and sentiment
    evaluator = BiasEvaluator(model=model)
    results = evaluator.evaluate(
        test_examples,
        metrics_to_use=["toxicity", "sentiment"],
    )

    # Summary line
    print("\nEvaluation Results:")
    print(f"Examples evaluated: {len(results.examples)}")

    # Per-example detail: truncated input/output plus every metric score
    per_example = zip(results.examples, results.predictions, results.metric_results)
    for number, (ex, pred, scores) in enumerate(per_example, start=1):
        print(f"\nExample {number}:")
        print(f"  Input: {ex.text[:50]}...")
        print(f"  Output: {pred[:50]}...")
        for metric_name, metric_result in scores.items():
            print(f"  {metric_name}: {metric_result.score:.3f}")

## 5. Analyzing Results

Convert results to a DataFrame for analysis. The cell below builds mock results for demonstration, so it runs even without an API key.

In [None]:
# Build an EvaluationResult by hand so the analysis steps can be shown
# without calling a model.
from src.evaluation.evaluator import EvaluationResult
from src.evaluation.metrics import MetricResult
from datetime import datetime

# Empty result container stamped with the current time
mock_results = EvaluationResult(
    dataset_name="demo",
    model_name="demo-model",
    timestamp=datetime.now().isoformat(),
)

# Fill it with five examples that alternate between two target identities;
# toxicity rises and sentiment falls linearly with the example index.
for idx in range(5):
    identity = "trans man" if idx % 2 else "trans woman"
    mock_example = DatasetExample(
        id=f"ex_{idx}",
        text=f"Test example {idx}",
        bias_type=BiasType.TOXICITY,
        evaluation_type=EvaluationType.GENERATION,
        target_identity=identity,
    )
    mock_scores = {
        "toxicity": MetricResult("toxicity", 0.1 + idx * 0.05),
        "sentiment": MetricResult("sentiment", 0.7 - idx * 0.1),
    }
    mock_results.examples.append(mock_example)
    mock_results.predictions.append(f"Generated text {idx}")
    mock_results.metric_results.append(mock_scores)

# Tabular view of the results
df = mock_results.to_dataframe()
print("Results DataFrame:")
print(df.head())

# Mean of each score column per target identity
score_columns = ['toxicity_score', 'sentiment_score']
print("\nAverage scores by identity:")
print(df.groupby('target_identity')[score_columns].mean())

## 6. Visualizing Bias Patterns

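Create visualizations to understand bias patterns. The scores plotted below are randomly generated sample data used only to demonstrate the plot; they are not real evaluation results.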

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style("whitegrid")

# Create sample data for visualization
import pandas as pd
import numpy as np

# Fixed seed so that Restart & Run All reproduces exactly the same sample
# scores (and therefore the same heatmap) on every run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Generate sample bias scores
identities = ['trans woman', 'trans man', 'non-binary', 'cis woman', 'cis man']
metrics = ['toxicity', 'misgendering', 'regard', 'sentiment']

# (mean, standard deviation) of the normal distribution each score is drawn from.
TRANS_TOXICITY_DIST = (0.3, 0.1)
TRANS_MISGENDERING_DIST = (0.7, 0.15)
DEFAULT_DIST = (0.85, 0.1)

data = []
for identity in identities:
    for metric in metrics:
        # Synthetic, illustrative scores -- NOT real evaluation output.
        # The substring check matches 'trans woman' and 'trans man' only;
        # every other identity/metric pair uses the default distribution.
        if 'trans' in identity and metric == 'toxicity':
            mean, std = TRANS_TOXICITY_DIST
        elif 'trans' in identity and metric == 'misgendering':
            mean, std = TRANS_MISGENDERING_DIST
        else:
            mean, std = DEFAULT_DIST
        score = rng.normal(mean, std)

        data.append({
            'identity': identity,
            'metric': metric,
            # Clamp to the [0, 1] range used by the heatmap colour scale
            'score': np.clip(score, 0, 1)
        })

# One row per (identity, metric) pair
viz_df = pd.DataFrame(data)

# Reshape the long-format scores into an identity x metric grid.
pivot_df = viz_df.pivot(index='identity', columns='metric', values='score')

# Draw the annotated heatmap on an explicit figure/axes pair, with the colour
# scale pinned to the [0, 1] range.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot_df, annot=True, fmt='.2f', cmap='RdYlGn', vmin=0, vmax=1, ax=ax)
ax.set_title('Bias Scores by Identity and Metric\n(Higher is better except for toxicity)')
fig.tight_layout()
plt.show()

## Next Steps

1. **Expand evaluation**: Test on more examples and datasets
2. **Compare models**: Evaluate multiple models to find the least biased
3. **Custom metrics**: Implement domain-specific bias metrics
4. **Mitigation**: Use results to improve model behavior

For more examples and advanced usage, see `docs/USER_GUIDE.md`.