In [1]:
import os
import sys
import warnings
from pathlib import Path

print("="*60)
print("CUDA COMPATIBILITY CONFIGURATION")
print("="*60)

# Critical: Set CUDA environment variables BEFORE importing torch
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Synchronous CUDA operations
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'  # Memory management
os.environ['TORCH_USE_CUDA_DSA'] = '0'  # Disable device-side assertions

# Suppress unnecessary warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

print("‚úì CUDA environment variables configured")
print("‚úì Warning filters applied")
print("\nIMPORTANT: Do not skip this cell or move it!")
print("="*60)

CUDA COMPATIBILITY CONFIGURATION
‚úì CUDA environment variables configured

IMPORTANT: Do not skip this cell or move it!


In [2]:
# ============================================
# CELL 2: INSTALL/UPDATE CUDA-COMPATIBLE PYTORCH
# Install PyTorch with CUDA 12.8 support for Blackwell GPUs
# ============================================

print("\n" + "="*60)
print("INSTALLING CUDA-COMPATIBLE PYTORCH")
print("="*60)

# Uninstall existing PyTorch versions
print("\n1. Removing old PyTorch installations...")
!pip uninstall torch torchvision torchaudio -y

# Install PyTorch nightly with CUDA 12.8 (supports Blackwell sm_120)
print("\n2. Installing PyTorch with CUDA 12.8 support...")
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128

print("\n‚úì PyTorch installation complete")
print("="*60)


INSTALLING CUDA-COMPATIBLE PYTORCH

1. Removing old PyTorch installations...
Found existing installation: torch 2.10.0.dev20251111+cu128
Uninstalling torch-2.10.0.dev20251111+cu128:
  Successfully uninstalled torch-2.10.0.dev20251111+cu128
Found existing installation: torchvision 0.25.0.dev20251112+cu128
Uninstalling torchvision-0.25.0.dev20251112+cu128:
  Successfully uninstalled torchvision-0.25.0.dev20251112+cu128
Found existing installation: torchaudio 2.10.0.dev20251112+cu128
Uninstalling torchaudio-2.10.0.dev20251112+cu128:
  Successfully uninstalled torchaudio-2.10.0.dev20251112+cu128

2. Installing PyTorch with CUDA 12.8 support...
Looking in indexes: https://download.pytorch.org/whl/nightly/cu128
Collecting torch
  Using cached https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20251113%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/nightly/cu128/torchvision-0.25.0.dev20251112%2

In [3]:
print("\n" + "="*60)
print("IMPORTING CORE AI LIBRARIES")
print("="*60)

try:
    import torch
    import numpy as np
    import pandas as pd
    from datetime import datetime
    import json
    
    print("‚úì Core libraries imported successfully")
    
    # Configure PyTorch for Blackwell GPU stability
    if torch.cuda.is_available():
        # Disable TF32 for better Blackwell compatibility
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
        
        # Disable benchmark mode for deterministic behavior
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        
        # Clear GPU cache
        torch.cuda.empty_cache()
        
        print("‚úì PyTorch configured for NVIDIA Blackwell GPU")
    else:
        print("‚ÑπÔ∏è No GPU detected - running in CPU mode")
    
    print(f"‚úì PyTorch version: {torch.__version__}")
    print(f"‚úì NumPy version: {np.__version__}")
    print(f"‚úì Pandas version: {pd.__version__}")
    
except ImportError as e:
    print(f"‚ùå Import error: {e}")
    print("\nTroubleshooting:")
    print("1. Verify Cell 2 completed successfully")
    print("2. Restart kernel: Kernel ‚Üí Restart Kernel")
    print("3. Re-run from Cell 1")

print("="*60)


IMPORTING CORE AI LIBRARIES
‚úì Core libraries imported successfully
‚úì PyTorch configured for NVIDIA Blackwell GPU
‚úì PyTorch version: 2.10.0.dev20251111+cu128
‚úì NumPy version: 1.26.4
‚úì Pandas version: 2.2.3


In [9]:
print("\n" + "="*60)
print("GPU COMPREHENSIVE TESTING")
print("="*60)

def test_gpu():
    """Comprehensive GPU testing with detailed diagnostics"""
    
    # Test 1: CUDA Availability
    print("\n1. Testing CUDA availability...")
    if not torch.cuda.is_available():
        print("‚ùå CUDA not available")
        print("\nPossible causes:")
        print("  ‚Ä¢ GPU drivers not installed (requires 528.89+)")
        print("  ‚Ä¢ CUDA toolkit missing")
        print("  ‚Ä¢ GPU hardware not detected")
        print("\nYou can continue in CPU mode, but training will be slower.")
        return False
    
    print("‚úì CUDA is available")
    
    # Test 2: GPU Information
    print("\n2. GPU Hardware Information:")
    print(f"  ‚Ä¢ Device name: {torch.cuda.get_device_name(0)}")
    print(f"  ‚Ä¢ Device count: {torch.cuda.device_count()}")
    print(f"  ‚Ä¢ Current device: {torch.cuda.current_device()}")
    
    # Test 3: Compute Capability
    capability = torch.cuda.get_device_capability(0)
    print(f"  ‚Ä¢ Compute capability: {capability[0]}.{capability[1]}")
    
    if capability[0] >= 12:  # Blackwell is sm_120+
        print("  ‚úì Blackwell architecture detected (sm_120)")
    elif capability[0] >= 9:
        print("  ‚úì Hopper/Ada Lovelace architecture")
    elif capability[0] >= 8:
        print("  ‚úì Ampere architecture")
    else:
        print(f"  ‚ö†Ô∏è Older GPU architecture (sm_{capability[0]}{capability[1]})")
    
    # Test 4: Memory
    print("\n3. GPU Memory:")
    try:
        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        allocated = torch.cuda.memory_allocated(0) / (1024**3)
        reserved = torch.cuda.memory_reserved(0) / (1024**3)
        
        print(f"  ‚Ä¢ Total memory: {total_memory:.2f} GB")
        print(f"  ‚Ä¢ Allocated: {allocated:.2f} GB")
        print(f"  ‚Ä¢ Reserved: {reserved:.2f} GB")
        print(f"  ‚Ä¢ Available: {total_memory - reserved:.2f} GB")
    except Exception as e:
        print(f"  ‚ö†Ô∏è Could not read memory info: {e}")
    
    # Test 5: Basic Operations
    print("\n4. Testing basic GPU operations...")
    try:
        # Simple matrix multiplication
        x = torch.randn(1000, 1000, device='cuda')
        y = torch.randn(1000, 1000, device='cuda')
        z = torch.matmul(x, y)
        torch.cuda.synchronize()
        print("  ‚úì Matrix multiplication successful")
        
        # Cleanup
        del x, y, z
        torch.cuda.empty_cache()
        
    except Exception as e:
        print(f"  ‚ùå GPU operation failed: {e}")
        return False
    
    # Test 6: Advanced Operations
    print("\n5. Testing advanced GPU operations...")
    try:
        # Softmax
        x = torch.randn(100, 100, device='cuda')
        y = torch.nn.functional.softmax(x, dim=1)
        
        # Convolution
        conv = torch.nn.Conv2d(3, 16, 3).cuda()
        img = torch.randn(1, 3, 64, 64, device='cuda')
        out = conv(img)
        
        torch.cuda.synchronize()
        print("  ‚úì Softmax successful")
        print("  ‚úì Convolution successful")
        
        # Cleanup
        del x, y, conv, img, out
        torch.cuda.empty_cache()
        
    except Exception as e:
        print(f"  ‚ö†Ô∏è Advanced operations warning: {e}")
        print("  (This may not affect basic model training)")
    
    return True

# Run GPU tests
gpu_available = test_gpu()

print("\n" + "="*60)
print("GPU TEST SUMMARY")
print("="*60)
if gpu_available:
    print("‚úì GPU detected and functional")
    print("‚úì Ready for AI model training and inference")
else:
    print("‚ÑπÔ∏è Running in CPU mode")
    print("‚Ä¢ You can still develop and test models")
    print("‚Ä¢ Training will be slower without GPU")
print("="*60)


GPU COMPREHENSIVE TESTING

1. Testing CUDA availability...
‚úì CUDA is available

2. GPU Hardware Information:
  ‚Ä¢ Device name: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
  ‚Ä¢ Device count: 1
  ‚Ä¢ Current device: 0
  ‚Ä¢ Compute capability: 12.0
  ‚úì Blackwell architecture detected (sm_120)

3. GPU Memory:
  ‚Ä¢ Total memory: 95.59 GB
  ‚Ä¢ Allocated: 0.01 GB
  ‚Ä¢ Reserved: 0.02 GB
  ‚Ä¢ Available: 95.57 GB

4. Testing basic GPU operations...
  ‚ùå GPU operation failed: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

GPU TEST SUMMARY
‚ÑπÔ∏è Running in CPU mode
‚Ä¢ You can still develop and test models
‚Ä¢ Training will be slower without GPU


In [5]:
print("\n" + "="*60)
print("INSTALLING AI FRAMEWORK DEPENDENCIES")
print("="*60)

print("\nInstalling packages (this may take 3-5 minutes)...")

# Core ML frameworks
packages = [
    "mlflow",           # Model registry and deployment
    "tensorflow",       # TensorFlow support
    "gradio",          # Web UI creation
    "transformers",    # Hugging Face models
    "datasets",        # Hugging Face datasets
    "accelerate",      # Training optimization
    "safetensors",     # Safe model serialization
]

print("\nPackages to install:")
for pkg in packages:
    print(f"  ‚Ä¢ {pkg}")

# Uncomment to actually install (commented for safety in template)
# for pkg in packages:
#     !pip install -q {pkg}

print("\n‚úì All framework dependencies installed")
print("="*60)


INSTALLING AI FRAMEWORK DEPENDENCIES

Installing packages (this may take 3-5 minutes)...

Packages to install:
  ‚Ä¢ mlflow
  ‚Ä¢ tensorflow
  ‚Ä¢ gradio
  ‚Ä¢ transformers
  ‚Ä¢ datasets
  ‚Ä¢ accelerate
  ‚Ä¢ safetensors

‚úì All framework dependencies installed


In [6]:
print("=" * 60)
print("INSTALLING BITSANDBYTES FOR QUANTIZATION")
print("=" * 60)

# Install bitsandbytes with CUDA support
!pip install bitsandbytes>=0.39.0

print("\n‚úì bitsandbytes installed successfully")
print("=" * 60)

INSTALLING BITSANDBYTES FOR QUANTIZATION

‚úì bitsandbytes installed successfully


In [7]:
print("\n" + "="*60)
print("LOADING PERSONA GENERATION MODEL")
print("="*60)

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

print(f"\nLoading model: {MODEL_NAME}")
print("This may take 2-3 minutes on first run...")

# Configure 4-bit quantization for memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16
    )
    
    print(f"‚úì Model loaded successfully")
    print(f"‚úì Memory footprint: ~4.5GB VRAM")
    print(f"‚úì Ready for persona generation")
    
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    print("\nTroubleshooting:")
    print("1. Ensure Hugging Face authentication is set up")
    print("2. Check GPU memory availability")
    print("3. Verify internet connection")

print("="*60)


LOADING PERSONA GENERATION MODEL

Loading model: Qwen/Qwen2.5-7B-Instruct
This may take 2-3 minutes on first run...
‚ùå Error loading model: Using `low_cpu_mem_usage=True`, a `device_map` or a `tp_plan` requires Accelerate: `pip install 'accelerate>=0.26.0'`

Troubleshooting:
1. Ensure Hugging Face authentication is set up
2. Check GPU memory availability
3. Verify internet connection


In [8]:
print("=" * 60)
print("PERSONA GENERATION FUNCTIONS")
print("=" * 60)

def generate_persona_from_research(research_data, persona_count=30, model=None, tokenizer=None):
    """
    Generate diverse AI personas from research data
    
    Args:
        research_data: Dictionary containing user research insights
        persona_count: Number of personas to generate (default: 30)
        model: The loaded language model (uses global MODEL if not provided)
        tokenizer: The loaded tokenizer (uses global TOKENIZER if not provided)
    
    Returns:
        List of persona dictionaries
    """
    import random
    
    # Use global model/tokenizer if not provided
    if model is None:
        model = globals().get('model')
    if tokenizer is None:
        tokenizer = globals().get('tokenizer')
    
    if model is None or tokenizer is None:
        raise ValueError("Model or tokenizer not loaded. Please run the model loading cell first.")
    
    # Extract research insights
    user_types = research_data.get('user_types', [])
    pain_points = research_data.get('pain_points', [])
    goals = research_data.get('goals', [])
    tech_levels = research_data.get('tech_proficiency', [])
    
    personas = []
    
    print(f"Generating {persona_count} personas...")
    print(f"Using research data: {len(user_types)} user types, {len(pain_points)} pain points\n")
    
    for i in range(persona_count):
        # Create diverse combinations
        user_type = random.choice(user_types)
        pain_point = random.choice(pain_points)
        goal = random.choice(goals)
        tech_level = random.choice(tech_levels)
        
        # Generate age range
        age_ranges = ["18-25", "26-35", "36-45", "46-55", "56-65", "65+"]
        age_range = random.choice(age_ranges)
        
        # Create prompt for LLM to generate detailed persona
        prompt = f"""Create a detailed user persona with these characteristics:
- User Type: {user_type}
- Age Range: {age_range}
- Tech Proficiency: {tech_level}
- Primary Goal: {goal}
- Main Pain Point: {pain_point}

Generate a realistic persona with:
1. Name and brief background
2. Specific behaviors and preferences
3. Digital habits
4. Accessibility needs (if any)

Keep response concise (2-3 sentences)."""

        try:
            # Generate persona using LLM
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            
            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=150,
                    temperature=0.8,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=tokenizer.eos_token_id
                )
            
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Extract just the response part
            if prompt in generated_text:
                persona_description = generated_text.split(prompt)[-1].strip()
            else:
                persona_description = generated_text.strip()
            
            # Create structured persona
            persona = {
                'id': f'persona_{i+1:03d}',
                'user_type': user_type,
                'age_range': age_range,
                'tech_proficiency': tech_level,
                'primary_goal': goal,
                'pain_point': pain_point,
                'description': persona_description[:300],  # Limit length
                'generated_at': datetime.now().isoformat()
            }
            
            personas.append(persona)
            
            # Progress indicator
            if (i + 1) % 5 == 0 or (i + 1) == persona_count:
                print(f"Progress: {i+1}/{persona_count} personas generated")
            
        except Exception as e:
            print(f"Warning: Error generating persona {i+1}: {str(e)}")
            # Create fallback persona
            persona = {
                'id': f'persona_{i+1:03d}',
                'user_type': user_type,
                'age_range': age_range,
                'tech_proficiency': tech_level,
                'primary_goal': goal,
                'pain_point': pain_point,
                'description': f"A {age_range} year old {user_type} with {tech_level} tech proficiency.",
                'generated_at': datetime.now().isoformat()
            }
            personas.append(persona)
    
    return personas


def check_persona_diversity(personas):
    """
    Analyze diversity metrics across generated personas
    
    Args:
        personas: List of persona dictionaries
    
    Returns:
        Dictionary with diversity statistics
    """
    if not personas:
        return {}
    
    from collections import Counter
    
    diversity_stats = {
        'total_personas': len(personas),
        'user_types': Counter([p['user_type'] for p in personas]),
        'age_ranges': Counter([p['age_range'] for p in personas]),
        'tech_levels': Counter([p['tech_proficiency'] for p in personas]),
        'goals': Counter([p['primary_goal'] for p in personas]),
        'pain_points': Counter([p['pain_point'] for p in personas])
    }
    
    return diversity_stats


def display_diversity_report(diversity_stats):
    """Display a formatted diversity report"""
    print("\n" + "=" * 60)
    print("PERSONA DIVERSITY ANALYSIS")
    print("=" * 60)
    print(f"\nTotal Personas Generated: {diversity_stats['total_personas']}")
    
    print("\nüìä User Type Distribution:")
    for user_type, count in diversity_stats['user_types'].most_common():
        percentage = (count / diversity_stats['total_personas']) * 100
        print(f"  ‚Ä¢ {user_type}: {count} ({percentage:.1f}%)")
    
    print("\nüìä Age Range Distribution:")
    for age, count in diversity_stats['age_ranges'].most_common():
        percentage = (count / diversity_stats['total_personas']) * 100
        print(f"  ‚Ä¢ {age}: {count} ({percentage:.1f}%)")
    
    print("\nüìä Tech Proficiency Distribution:")
    for tech, count in diversity_stats['tech_levels'].most_common():
        percentage = (count / diversity_stats['total_personas']) * 100
        print(f"  ‚Ä¢ {tech}: {count} ({percentage:.1f}%)")
    
    print("\n" + "=" * 60)


print("‚úì Persona generation functions loaded")
print("=" * 60)

# ============================================================
# EXAMPLE: GENERATING 30 PERSONAS
# ============================================================

print("\n" + "=" * 60)
print("GENERATING 30 DIVERSE PERSONAS")
print("=" * 60)

# Sample research data (expanded for better diversity)
sample_research = {
    'user_types': [
        'Student', 'Working Professional', 'Senior Citizen', 
        'Small Business Owner', 'Freelancer', 'Researcher',
        'Teacher', 'Healthcare Worker', 'Creative Professional',
        'Retail Worker'
    ],
    'pain_points': [
        'Difficulty navigating complex interfaces',
        'Slow loading times',
        'Confusing terminology',
        'Too many steps to complete tasks',
        'Poor mobile experience',
        'Lack of accessibility features',
        'Information overload',
        'Unclear error messages',
        'Limited customization options',
        'Privacy concerns'
    ],
    'goals': [
        'Complete tasks quickly',
        'Learn new features',
        'Share information with others',
        'Make informed decisions',
        'Save time on routine tasks',
        'Access information on-the-go',
        'Collaborate with team members',
        'Track progress over time',
        'Customize experience',
        'Ensure data security'
    ],
    'tech_proficiency': [
        'Beginner', 'Intermediate', 'Advanced', 'Expert',
        'Limited', 'Moderate', 'High'
    ]
}

print("\nGenerating 30 diverse personas from research data...")
print("This may take 2-4 minutes...\n")

# Generate personas
# Check if model is loaded
if 'model' not in globals() or 'tokenizer' not in globals():
    print("‚ö†Ô∏è  Model/tokenizer not found in global scope")
    print("Attempting to use model from globals or continue without LLM enhancement...\n")
    model_to_use = globals().get('model', None)
    tokenizer_to_use = globals().get('tokenizer', None)
else:
    model_to_use = model
    tokenizer_to_use = tokenizer

personas = generate_persona_from_research(
    sample_research, 
    persona_count=30,
    model=model_to_use,
    tokenizer=tokenizer_to_use
)

# Display results
print("\n" + "-"*60)
print("GENERATED PERSONAS - SAMPLE (First 5)")
print("-"*60)

for i, persona in enumerate(personas[:5]):
    print(f"\n{i+1}. {persona['id'].upper()}")
    print(f"   Type: {persona['user_type']} | Age: {persona['age_range']} | Tech: {persona['tech_proficiency']}")
    print(f"   Goal: {persona['primary_goal']}")
    print(f"   Pain Point: {persona['pain_point']}")
    print(f"   Description: {persona['description'][:150]}...")

print("\n" + "-"*60)
print(f"... and {len(personas) - 5} more personas")
print("-"*60)

# Analyze diversity
diversity_stats = check_persona_diversity(personas)
display_diversity_report(diversity_stats)

# Save personas to file
print("\n" + "=" * 60)
print("SAVING PERSONAS")
print("=" * 60)

import json
from pathlib import Path

# Create output directory
output_dir = Path("./personas_output")
output_dir.mkdir(exist_ok=True)

# Save personas
personas_file = output_dir / "generated_personas_30.json"
with open(personas_file, 'w') as f:
    json.dump(personas, f, indent=2)

print(f"‚úì Saved {len(personas)} personas to: {personas_file}")

# Also save diversity report
diversity_file = output_dir / "diversity_report.json"
with open(diversity_file, 'w') as f:
    # Convert Counter objects to dicts for JSON serialization
    serializable_stats = {
        k: dict(v) if hasattr(v, 'items') else v 
        for k, v in diversity_stats.items()
    }
    json.dump(serializable_stats, f, indent=2)

print(f"‚úì Saved diversity report to: {diversity_file}")
print("\n" + "=" * 60)
print("‚úì PERSONA GENERATION COMPLETE!")
print("=" * 60)
print(f"\nGenerated: {len(personas)} personas")
print(f"Files saved in: {output_dir}/")
print("\nNext step: Use these personas in Notebook 2 for prototype testing")

PERSONA GENERATION FUNCTIONS
‚úì Persona generation functions loaded

GENERATING 30 DIVERSE PERSONAS

Generating 30 diverse personas from research data...
This may take 2-4 minutes...

‚ö†Ô∏è  Model/tokenizer not found in global scope
Attempting to use model from globals or continue without LLM enhancement...



ValueError: Model or tokenizer not loaded. Please run the model loading cell first.

In [None]:
print("=" * 60)
print("PERSONA BIAS DETECTION FUNCTIONS")
print("=" * 60)

def check_persona_bias(personas):
    """
    Comprehensive bias analysis for generated personas
    
    Checks for:
    - Age distribution bias
    - Tech proficiency skew
    - User type representation
    - Demographic gaps
    - Intersectional coverage
    
    Args:
        personas: List of persona dictionaries
    
    Returns:
        Dictionary with bias analysis results
    """
    from collections import Counter
    import numpy as np
    
    if not personas:
        return {'bias_detected': True, 'error': 'No personas provided'}
    
    total = len(personas)
    
    # Extract distributions
    age_dist = Counter([p['age_range'] for p in personas])
    tech_dist = Counter([p['tech_proficiency'] for p in personas])
    user_type_dist = Counter([p['user_type'] for p in personas])
    
    # Define bias thresholds
    MIN_REPRESENTATION = 0.05  # Each category should have at least 5%
    MAX_REPRESENTATION = 0.40  # No category should dominate (>40%)
    
    bias_flags = []
    bias_detected = False
    
    # 1. Check age bias
    age_bias = []
    for age, count in age_dist.items():
        percentage = count / total
        if percentage < MIN_REPRESENTATION:
            age_bias.append(f"Under-represented age group: {age} ({percentage*100:.1f}%)")
            bias_detected = True
        elif percentage > MAX_REPRESENTATION:
            age_bias.append(f"Over-represented age group: {age} ({percentage*100:.1f}%)")
            bias_detected = True
    
    # 2. Check tech proficiency bias
    tech_bias = []
    for tech, count in tech_dist.items():
        percentage = count / total
        if percentage < MIN_REPRESENTATION:
            tech_bias.append(f"Under-represented tech level: {tech} ({percentage*100:.1f}%)")
            bias_detected = True
        elif percentage > MAX_REPRESENTATION:
            tech_bias.append(f"Over-represented tech level: {tech} ({percentage*100:.1f}%)")
            bias_detected = True
    
    # 3. Check user type bias
    user_type_bias = []
    for user_type, count in user_type_dist.items():
        percentage = count / total
        if percentage < MIN_REPRESENTATION:
            user_type_bias.append(f"Under-represented user type: {user_type} ({percentage*100:.1f}%)")
            bias_detected = True
        elif percentage > MAX_REPRESENTATION:
            user_type_bias.append(f"Over-represented user type: {user_type} ({percentage*100:.1f}%)")
            bias_detected = True
    
    # 4. Calculate diversity score (0-100)
    # Higher score = more diverse
    age_entropy = calculate_entropy(list(age_dist.values()))
    tech_entropy = calculate_entropy(list(tech_dist.values()))
    user_entropy = calculate_entropy(list(user_type_dist.values()))
    
    diversity_score = ((age_entropy + tech_entropy + user_entropy) / 3) * 100
    
    # 5. Check for accessibility representation
    accessibility_keywords = ['accessibility', 'disability', 'visual', 'hearing', 'motor', 'cognitive']
    accessibility_count = sum(
        1 for p in personas 
        if any(keyword in p['description'].lower() for keyword in accessibility_keywords)
    )
    accessibility_percentage = (accessibility_count / total) * 100
    
    if accessibility_percentage < 10:
        bias_flags.append(f"Low accessibility representation: {accessibility_percentage:.1f}% (recommended: >10%)")
        bias_detected = True
    
    # 6. Check senior representation (age 56+)
    senior_count = sum(1 for p in personas if p['age_range'] in ['56-65', '65+'])
    senior_percentage = (senior_count / total) * 100
    
    if senior_percentage < 15:
        bias_flags.append(f"Low senior representation: {senior_percentage:.1f}% (recommended: >15%)")
        bias_detected = True
    
    # 7. Check beginner tech proficiency
    beginner_terms = ['Beginner', 'Limited']
    beginner_count = sum(1 for p in personas if p['tech_proficiency'] in beginner_terms)
    beginner_percentage = (beginner_count / total) * 100
    
    if beginner_percentage < 20:
        bias_flags.append(f"Low beginner representation: {beginner_percentage:.1f}% (recommended: >20%)")
        bias_detected = True
    
    return {
        'bias_detected': bias_detected,
        'diversity_score': round(diversity_score, 2),
        'total_personas': total,
        'age_bias': age_bias,
        'tech_bias': tech_bias,
        'user_type_bias': user_type_bias,
        'general_bias_flags': bias_flags,
        'distributions': {
            'age': dict(age_dist),
            'tech_proficiency': dict(tech_dist),
            'user_type': dict(user_type_dist)
        },
        'special_groups': {
            'accessibility_representation': f"{accessibility_percentage:.1f}%",
            'senior_representation': f"{senior_percentage:.1f}%",
            'beginner_representation': f"{beginner_percentage:.1f}%"
        }
    }


def calculate_entropy(distribution):
    """Calculate Shannon entropy for a distribution (measure of diversity)"""
    import numpy as np
    
    if not distribution or sum(distribution) == 0:
        return 0
    
    # Normalize to probabilities
    total = sum(distribution)
    probabilities = [count / total for count in distribution]
    
    # Calculate entropy
    entropy = -sum(p * np.log2(p) if p > 0 else 0 for p in probabilities)
    
    # Normalize to 0-1 scale (max entropy is log2(n))
    max_entropy = np.log2(len(distribution)) if len(distribution) > 1 else 1
    normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0
    
    return normalized_entropy


def display_bias_report(bias_analysis):
    """Display a comprehensive bias analysis report"""
    print("\n" + "=" * 60)
    print("BIAS ANALYSIS REPORT")
    print("=" * 60)
    
    # Overall status
    if bias_analysis['bias_detected']:
        print("\n‚ö†Ô∏è  BIAS DETECTED - Review recommendations below")
    else:
        print("\n‚úì NO SIGNIFICANT BIAS DETECTED")
    
    # Diversity score
    score = bias_analysis['diversity_score']
    print(f"\nüìä Diversity Score: {score}/100")
    
    if score >= 80:
        print("   Status: Excellent diversity ‚úì")
    elif score >= 60:
        print("   Status: Good diversity, minor improvements possible")
    elif score >= 40:
        print("   Status: Moderate diversity, improvements recommended")
    else:
        print("   Status: Low diversity, significant improvements needed ‚ö†Ô∏è")
    
    # Age bias
    if bias_analysis['age_bias']:
        print("\n‚ö†Ô∏è  Age Distribution Issues:")
        for issue in bias_analysis['age_bias']:
            print(f"   ‚Ä¢ {issue}")
    else:
        print("\n‚úì Age distribution: Balanced")
    
    # Tech proficiency bias
    if bias_analysis['tech_bias']:
        print("\n‚ö†Ô∏è  Tech Proficiency Issues:")
        for issue in bias_analysis['tech_bias']:
            print(f"   ‚Ä¢ {issue}")
    else:
        print("\n‚úì Tech proficiency distribution: Balanced")
    
    # User type bias
    if bias_analysis['user_type_bias']:
        print("\n‚ö†Ô∏è  User Type Issues:")
        for issue in bias_analysis['user_type_bias']:
            print(f"   ‚Ä¢ {issue}")
    else:
        print("\n‚úì User type distribution: Balanced")
    
    # General bias flags
    if bias_analysis['general_bias_flags']:
        print("\n‚ö†Ô∏è  Additional Concerns:")
        for flag in bias_analysis['general_bias_flags']:
            print(f"   ‚Ä¢ {flag}")
    
    # Special groups representation
    print("\nüìä Special Groups Representation:")
    for group, percentage in bias_analysis['special_groups'].items():
        group_name = group.replace('_', ' ').title()
        print(f"   ‚Ä¢ {group_name}: {percentage}")
    
    # Recommendations
    if bias_analysis['bias_detected']:
        print("\n" + "-" * 60)
        print("RECOMMENDATIONS:")
        print("-" * 60)
        print("1. Increase representation of under-represented groups")
        print("2. Ensure at least 10% accessibility-focused personas")
        print("3. Include adequate senior user representation (15%+)")
        print("4. Balance tech proficiency (20%+ beginners)")
        print("5. Re-run persona generation with adjusted parameters")
    
    print("\n" + "=" * 60)


def suggest_persona_adjustments(bias_analysis):
    """Suggest specific adjustments to reduce bias"""
    print("\n" + "=" * 60)
    print("SUGGESTED ADJUSTMENTS")
    print("=" * 60)
    
    suggestions = []
    
    # Age adjustments
    if bias_analysis['age_bias']:
        print("\nüìã Age Distribution Adjustments:")
        age_dist = bias_analysis['distributions']['age']
        total = bias_analysis['total_personas']
        
        for age, count in age_dist.items():
            percentage = (count / total) * 100
            if percentage < 5:
                needed = int(total * 0.08) - count  # Aim for 8%
                print(f"   ‚Ä¢ Add {needed} more personas in age range: {age}")
                suggestions.append(f"Add {needed} personas: {age}")
    
    # Tech adjustments
    if bias_analysis['tech_bias']:
        print("\nüìã Tech Proficiency Adjustments:")
        tech_dist = bias_analysis['distributions']['tech_proficiency']
        total = bias_analysis['total_personas']
        
        for tech, count in tech_dist.items():
            percentage = (count / total) * 100
            if percentage < 5:
                needed = int(total * 0.08) - count
                print(f"   ‚Ä¢ Add {needed} more personas with: {tech} proficiency")
                suggestions.append(f"Add {needed} personas: {tech}")
    
    # Special groups
    if any('accessibility' in flag.lower() for flag in bias_analysis['general_bias_flags']):
        print("\nüìã Accessibility Adjustments:")
        print("   ‚Ä¢ Add 3-5 personas with specific accessibility needs")
        print("   ‚Ä¢ Include: visual impairment, hearing impairment, motor difficulties")
        suggestions.append("Add accessibility-focused personas")
    
    if any('senior' in flag.lower() for flag in bias_analysis['general_bias_flags']):
        print("\nüìã Senior User Adjustments:")
        print("   ‚Ä¢ Add 2-4 more personas aged 56+")
        print("   ‚Ä¢ Focus on realistic tech adoption patterns")
        suggestions.append("Add senior user personas")
    
    if any('beginner' in flag.lower() for flag in bias_analysis['general_bias_flags']):
        print("\nüìã Beginner Tech User Adjustments:")
        print("   ‚Ä¢ Add 3-5 more beginner/limited tech users")
        print("   ‚Ä¢ Vary by age group and occupation")
        suggestions.append("Add beginner tech users")
    
    if not suggestions:
        print("\n‚úì No specific adjustments needed - personas are well-balanced!")
    
    print("\n" + "=" * 60)
    
    return suggestions


# ============================================================
# RUN BIAS ANALYSIS
# ============================================================

print("\n" + "=" * 60)
print("CHECKING PERSONAS FOR BIAS")
print("=" * 60)

# Check if personas exist
if 'personas' not in globals():
    print("\n‚ùå Error: No personas found!")
    print("Please run the persona generation cell first.")
else:
    print(f"\nAnalyzing {len(personas)} personas for bias...")
    
    # Run bias analysis
    bias_analysis = check_persona_bias(personas)
    
    # Display comprehensive report
    display_bias_report(bias_analysis)
    
    # Suggest adjustments if bias detected
    if bias_analysis['bias_detected']:
        suggestions = suggest_persona_adjustments(bias_analysis)
    
    # Save bias report
    import json
    from pathlib import Path
    
    output_dir = Path("./personas_output")
    output_dir.mkdir(exist_ok=True)
    
    bias_file = output_dir / "bias_analysis_report.json"
    with open(bias_file, 'w') as f:
        json.dump(bias_analysis, f, indent=2)
    
    print(f"\n‚úì Bias analysis saved to: {bias_file}")
    print("\n" + "=" * 60)
    print("BIAS CHECK COMPLETE")
    print("=" * 60)