In [4]:
# Notebook title framed by a 60-character rule above and below.
print("=" * 60, "NOTEBOOK 4: AI-POWERED USABILITY TESTING", "=" * 60, sep="\n")

NOTEBOOK 4: AI-POWERED USABILITY TESTING


In [8]:
# ============================================================
# CELL 1: IMPORT DEPENDENCIES
# ============================================================
# Standard library
import json
import random
from collections import Counter
from datetime import datetime
from pathlib import Path

# Third-party
import pandas as pd
# NOTE: this import used to be fused onto the end of the banner comment above,
# which turned it into comment text, so `torch` was never actually imported.
import torch

print("‚úì Dependencies loaded")

‚úì Dependencies loaded


In [10]:
# ============================================================
# CELL 2: LOAD PERSONAS, SCENARIOS, AND PROTOTYPE
# ============================================================

def load_json_file(path):
    """
    Read a JSON document from disk.

    Args:
        path: Location of the JSON file (str or Path)

    Returns:
        The parsed JSON content, or None when the file does not exist
        (a file whose content is the JSON literal ``null`` also yields None)
    """
    path = Path(path)
    if not path.exists():
        return None
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which open() uses by default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


print("\n" + "=" * 60)
print("LOADING TEST DATA")
print("=" * 60)

# Load test scenarios from Notebook 3
scenarios_file = Path("./test_scenarios.json")
test_scenarios = load_json_file(scenarios_file)
if test_scenarios is not None:
    print(f"‚úì Loaded {len(test_scenarios)} test scenarios")
else:
    print("‚ùå Test scenarios not found. Run Notebook 3 first!")
    test_scenarios = []

# Load personas from Notebook 1
personas_file = Path("./personas_output/generated_personas_30.json")
personas = load_json_file(personas_file)
if personas is not None:
    print(f"‚úì Loaded {len(personas)} personas")
else:
    print("‚ùå Personas not found. Run Notebook 1 first!")
    # Show the resolved location to make a wrong working directory obvious
    print(f"   Looking for: {personas_file.absolute()}")
    personas = []

# Load prototype from Notebook 2
prototype_file = Path("prototype_data.json")
prototype_data = load_json_file(prototype_file)
if prototype_data is not None:
    print(f"‚úì Loaded prototype with {prototype_data.get('screen_count', 0)} screens")
else:
    print("‚ùå Prototype not found. Run Notebook 2 first!")

print("=" * 60)


LOADING TEST DATA
‚úì Loaded 150 test scenarios
‚úì Loaded 30 personas
‚úì Loaded prototype with 1 screens


In [7]:
# ============================================================
# CELL 3: LOAD AI MODEL FOR TESTING
# ============================================================

print("\n" + "=" * 60)
print("LOADING AI MODEL FOR TESTING")
print("=" * 60)

# Use the Mistral model that's already working from Notebook 1
# Avoid Qwen2VL due to PyTorch version incompatibility

print("\nüìå Using Mistral-7B model (from Notebook 1)")
print("This model is already tested and working in your environment")

# Reuse a model/tokenizer pair left in the kernel by an earlier cell or run.
# Check the values, not just the names: a failed load below sets both to None,
# and a re-run must then retry instead of reporting "already loaded".
if globals().get('model') is not None and globals().get('tokenizer') is not None:
    print("\n‚úì Model already loaded in memory")
    print("‚úì Using existing model and tokenizer")
else:
    print("\nLoading Mistral-7B-Instruct-v0.3...")
    print("This may take 1-2 minutes...\n")

    try:
        # Imported here so a missing/broken install falls through to the
        # rule-based fallback instead of aborting the cell.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

        MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

        # Configure quantization for memory efficiency
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        # Load tokenizer; fall back to EOS as the padding token when none is set
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model.
        # trust_remote_code is intentionally not enabled: the Mistral
        # architecture ships with transformers, so there is no need to allow
        # execution of code downloaded from the model repository.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16
        )

        print("‚úì Model loaded successfully")

    except Exception as e:
        # Best effort: later cells treat model=None as "rule-based only"
        print(f"‚ö†Ô∏è Error loading model: {e}")
        print("\nContinuing with rule-based testing approach...")
        model = None
        tokenizer = None

print("=" * 60)


LOADING AI MODEL FOR TESTING

üìå Using Mistral-7B model (from Notebook 1)
This model is already tested and working in your environment

‚úì Model already loaded in memory
‚úì Using existing model and tokenizer


In [11]:
# ============================================================
# CELL 4: AI TESTING SIMULATION FUNCTIONS
# ============================================================

# Section banner for the cell that defines the simulation helpers.
print("\n" + "=" * 60, "AI TESTING SIMULATION FUNCTIONS", "=" * 60, sep="\n")

def _generate_llm_insights(model, tokenizer, prompt, max_chars=200):
    """
    Ask the language model for usability issues and return its reply text.

    Args:
        model: Object exposing a Hugging Face style ``generate(**kwargs)`` method
        tokenizer: Matching tokenizer (callable, with ``decode`` and ``eos_token_id``)
        prompt: Prompt text sent to the model
        max_chars: Maximum number of characters of the reply to keep

    Returns:
        The generated continuation (prompt excluded), stripped and truncated
    """
    import torch  # local import: only the LLM path needs it

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding=True
    )

    # Move the inputs to wherever the model lives; fall back to the plain
    # CUDA check when the model object does not expose a device.
    device = getattr(model, 'device', None)
    if device is None and torch.cuda.is_available():
        device = 'cuda'
    if device is not None:
        inputs = {k: v.to(device) for k, v in inputs.items()}

    prompt_length = inputs['input_ids'].shape[-1]

    with torch.no_grad():
        # `inputs` already carries the tokenizer's attention_mask. Passing it a
        # second time as an explicit keyword raises TypeError (duplicate
        # keyword argument), which previously disabled this path silently.
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens; slice them off
    # instead of string-matching the prompt in the decoded text.
    generated_tokens = outputs[0][prompt_length:]
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return text[:max_chars]


def simulate_user_interaction(persona, scenario, prototype, model=None, tokenizer=None):
    """
    Simulate a persona interacting with the prototype for a scenario

    The outcome is rule-based: a success probability is derived from the
    persona's tech proficiency and the scenario's focus area, then sampled
    with the ``random`` module. When both ``model`` and ``tokenizer`` are
    given, a short free-text insight is additionally generated (best effort).

    Args:
        persona: Persona dictionary (reads 'id', 'user_type',
            'tech_proficiency', 'pain_point')
        scenario: Test scenario dictionary (reads 'scenario_id',
            'task_description' and optional 'focus_area')
        prototype: Prototype data (currently not used by the simulation)
        model: AI model for simulation (optional)
        tokenizer: Tokenizer (optional)

    Returns:
        Test result dictionary
    """
    from datetime import datetime
    import warnings

    beginner_levels = ('Beginner', 'Limited')
    intermediate_levels = ('Intermediate', 'Moderate')
    advanced_levels = ('Advanced', 'Expert', 'High')

    # Extract key information
    persona_id = persona['id']
    scenario_id = scenario['scenario_id']
    task = scenario['task_description']

    # User context
    user_type = persona['user_type']
    tech_level = persona['tech_proficiency']
    pain_point = persona['pain_point']

    # Initialize result
    result = {
        'test_id': f"test_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{random.randint(1000,9999)}",
        'persona_id': persona_id,
        'scenario_id': scenario_id,
        'task': task,
        'user_context': {
            'user_type': user_type,
            'tech_proficiency': tech_level,
            'pain_point': pain_point
        },
        'timestamp': datetime.now().isoformat()
    }

    # Success probability by proficiency; unknown levels keep the default
    if tech_level in beginner_levels:
        base_success_rate = 0.65
    elif tech_level in intermediate_levels:
        base_success_rate = 0.80
    elif tech_level in advanced_levels:
        base_success_rate = 0.95
    else:
        base_success_rate = 0.85

    # Adjust for scenario difficulty
    focus_area = scenario.get('focus_area', '')
    if focus_area in ['Error handling', 'Secondary workflows']:
        base_success_rate -= 0.1

    # Simulate outcome
    task_completed = random.random() < base_success_rate

    # Generate realistic issues based on user characteristics
    issues = []
    confusion_points = []

    # Failed runs always report issues; successful runs do so 30% of the time
    if not task_completed or random.random() < 0.3:
        if tech_level in beginner_levels:
            issues.extend([
                f"Struggled to find {focus_area.lower()} elements",
                f"Unclear about terminology related to {task[:30]}...",
                "Needed multiple attempts to complete action"
            ])

        # Guard against an empty pain point, which is a substring of any task
        if pain_point and pain_point in task:
            issues.append(f"Encountered expected pain point: {pain_point}")

        if focus_area == 'Error handling':
            issues.append("Error message was confusing or unhelpful")

        confusion_points.extend([
            "Navigation path not intuitive",
            "Button labels unclear",
            "Too many options presented at once"
        ])

    # Use LLM for enhanced analysis if available
    llm_insights = None
    if model is not None and tokenizer is not None:
        prompt = f"""A {user_type} with {tech_level} tech proficiency is trying to: {task}
Their main concern is: {pain_point}

Describe 2-3 specific usability issues they might encounter. Be concise."""
        try:
            llm_insights = _generate_llm_insights(model, tokenizer, prompt)
        except Exception as e:
            # Best effort: keep the rule-based result, but make the failure
            # visible instead of swallowing it.
            warnings.warn(f"LLM insight generation failed: {e}", RuntimeWarning)
            llm_insights = None

    # Calculate time taken (realistic estimates)
    base_time = 300  # 5 minutes base
    if tech_level in beginner_levels:
        time_multiplier = random.uniform(1.5, 2.5)
    elif tech_level in intermediate_levels:
        time_multiplier = random.uniform(1.0, 1.5)
    else:
        time_multiplier = random.uniform(0.7, 1.2)

    time_seconds = int(base_time * time_multiplier)

    # Compile result. Severity and success criteria are judged on the full
    # issue list; the reported lists are random samples of at most 3 / 2 items.
    result.update({
        'task_completed': task_completed,
        'success_rate': round(base_success_rate, 2),
        'time_seconds': time_seconds,
        'time_readable': f"{time_seconds // 60}m {time_seconds % 60}s",
        'issues_encountered': random.sample(issues, min(len(issues), 3)) if issues else [],
        'confusion_points': random.sample(confusion_points, min(len(confusion_points), 2)) if confusion_points else [],
        'llm_insights': llm_insights,
        'meets_success_criteria': task_completed and len(issues) <= 1,
        'severity': 'low' if task_completed and len(issues) == 0 else ('medium' if task_completed else 'high')
    })

    return result


def run_batch_tests(scenarios, personas, prototype, model=None, tokenizer=None, max_tests=50):
    """
    Run a batch of AI-powered usability tests

    Each selected scenario is paired with the persona whose 'id' equals the
    scenario's 'persona_id'. Scenarios without a matching persona are skipped
    and counted in the final report.

    Args:
        scenarios: List of test scenarios
        personas: List of personas
        prototype: Prototype data
        model: AI model
        tokenizer: Tokenizer
        max_tests: Maximum number of tests to run (None runs every scenario;
            negative values are treated as 0)

    Returns:
        List of test results
    """
    # Select scenarios to test
    if max_tests is None:
        selected = list(scenarios)
    else:
        selected = scenarios[:max(max_tests, 0)]
    total = len(selected)

    print(f"\nRunning batch of {total} tests...")
    print("This may take 3-5 minutes...\n")

    # Index personas once instead of scanning the list for every scenario.
    # setdefault keeps the first persona for a duplicated id, matching a
    # first-match linear search. Assumes persona ids are hashable.
    personas_by_id = {}
    for candidate in personas:
        personas_by_id.setdefault(candidate['id'], candidate)

    results = []
    skipped = 0

    for position, scenario in enumerate(selected, start=1):
        persona = personas_by_id.get(scenario.get('persona_id'))

        if persona is None:
            skipped += 1
        else:
            # Run simulation
            results.append(
                simulate_user_interaction(
                    persona,
                    scenario,
                    prototype,
                    model=model,
                    tokenizer=tokenizer
                )
            )

        # Progress indicator (also emitted when the current scenario was skipped)
        if position % 10 == 0 or position == total:
            print(f"Progress: {position}/{total} tests completed")

    print(f"\n‚úì Completed {len(results)} tests")
    if skipped:
        print(f"  Skipped {skipped} scenarios with no matching persona")

    return results


# Confirm that the definitions above were executed, then close the section.
print("‚úì Testing simulation functions loaded", "=" * 60, sep="\n")


AI TESTING SIMULATION FUNCTIONS
‚úì Testing simulation functions loaded


In [12]:
# ============================================================
# CELL 5: RUN AI-POWERED TESTS
# ============================================================

MAX_TESTS = 50     # size of the demo batch
RANDOM_SEED = 42   # fixed seed so the rule-based simulation is repeatable

print("\n" + "=" * 60)
print("RUNNING AI-POWERED USABILITY TESTS")
print("=" * 60)

if not test_scenarios or not personas:
    print("\n‚ùå Cannot run tests - missing scenarios or personas")
    print("Please run Notebooks 1 and 3 first")
    # Do not let results from an earlier run survive into the analysis cells
    test_results = []
else:
    # Run tests on high-priority scenarios first: put them at the front of the
    # batch (original order preserved within each group) so the MAX_TESTS
    # cut-off keeps them.
    high_priority = [s for s in test_scenarios if s.get('priority') == 'high']
    other_priority = [s for s in test_scenarios if s.get('priority') != 'high']
    prioritized_scenarios = high_priority + other_priority

    print(f"\nTest Configuration:")
    print(f"  ‚Ä¢ Total scenarios available: {len(test_scenarios)}")
    print(f"  ‚Ä¢ High priority scenarios: {len(high_priority)}")
    print(f"  ‚Ä¢ Personas available: {len(personas)}")
    print(f"  ‚Ä¢ Running first {MAX_TESTS} tests for this demo\n")

    # Seeds Python's `random` module only; LLM sampling (if a model is
    # loaded) is not covered by this seed.
    random.seed(RANDOM_SEED)

    # Run batch tests
    test_results = run_batch_tests(
        scenarios=prioritized_scenarios,
        personas=personas,
        prototype=prototype_data,
        model=globals().get('model'),
        tokenizer=globals().get('tokenizer'),
        max_tests=MAX_TESTS
    )

    print("\n" + "=" * 60)
    print("TEST EXECUTION COMPLETE")
    print("=" * 60)


RUNNING AI-POWERED USABILITY TESTS

Test Configuration:
  ‚Ä¢ Total scenarios available: 150
  ‚Ä¢ High priority scenarios: 20
  ‚Ä¢ Personas available: 30
  ‚Ä¢ Running first 50 tests for this demo


Running batch of 50 tests...
This may take 3-5 minutes...

Progress: 10/50 tests completed
Progress: 20/50 tests completed
Progress: 30/50 tests completed
Progress: 40/50 tests completed
Progress: 50/50 tests completed

‚úì Completed 50 tests

TEST EXECUTION COMPLETE


In [13]:
# ============================================================
# CELL 6: ANALYZE TEST RESULTS
# ============================================================

def truncate_text(text, limit=60):
    """
    Shorten text for display.

    Args:
        text: String to display
        limit: Maximum number of characters to keep

    Returns:
        The text unchanged when it fits within limit, otherwise its first
        limit characters followed by '...'
    """
    return text if len(text) <= limit else text[:limit] + "..."


print("\n" + "=" * 60)
print("ANALYZING TEST RESULTS")
print("=" * 60)

# Falsy both when the variable was never created and when the batch is empty
if globals().get('test_results'):

    # Calculate statistics
    total_tests = len(test_results)
    successful = sum(1 for r in test_results if r['task_completed'])
    failed = total_tests - successful
    success_rate = (successful / total_tests) * 100

    # Count issues across all tests
    all_issues = [
        issue
        for r in test_results
        for issue in r.get('issues_encountered', [])
    ]
    issue_counts = Counter(all_issues)

    # Severity breakdown
    severity_counts = Counter(r['severity'] for r in test_results)

    # Average time
    avg_time = sum(r['time_seconds'] for r in test_results) / total_tests

    print(f"\nüìä Overall Statistics:")
    print(f"  ‚Ä¢ Total tests run: {total_tests}")
    print(f"  ‚Ä¢ Successful: {successful} ({success_rate:.1f}%)")
    print(f"  ‚Ä¢ Failed: {failed} ({100-success_rate:.1f}%)")
    print(f"  ‚Ä¢ Average completion time: {int(avg_time//60)}m {int(avg_time%60)}s")

    print(f"\n‚ö†Ô∏è  Severity Breakdown:")
    print(f"  ‚Ä¢ High severity: {severity_counts.get('high', 0)}")
    print(f"  ‚Ä¢ Medium severity: {severity_counts.get('medium', 0)}")
    print(f"  ‚Ä¢ Low severity: {severity_counts.get('low', 0)}")

    print(f"\nüîç Top 5 Most Common Issues:")
    for issue, count in issue_counts.most_common(5):
        percentage = (count / total_tests) * 100
        # The ellipsis is only shown when the issue text was actually cut
        print(f"  ‚Ä¢ {truncate_text(issue)} ({count} tests, {percentage:.1f}%)")

    # Show sample results
    print("\n" + "-" * 60)
    print("SAMPLE TEST RESULTS (First 3):")
    print("-" * 60)

    for i, result in enumerate(test_results[:3]):
        print(f"\n{i+1}. Test ID: {result['test_id']}")
        print(f"   Persona: {result['user_context']['user_type']} ({result['user_context']['tech_proficiency']})")
        print(f"   Task: {truncate_text(result['task'])}")
        print(f"   Completed: {'‚úì Yes' if result['task_completed'] else '‚úó No'}")
        print(f"   Time: {result['time_readable']}")
        print(f"   Severity: {result['severity'].upper()}")
        if result.get('issues_encountered'):
            print(f"   Issues: {', '.join(result['issues_encountered'][:2])}")

    print("\n" + "=" * 60)

else:
    print("\n‚ùå No test results available")


ANALYZING TEST RESULTS

üìä Overall Statistics:
  ‚Ä¢ Total tests run: 50
  ‚Ä¢ Successful: 40 (80.0%)
  ‚Ä¢ Failed: 10 (20.0%)
  ‚Ä¢ Average completion time: 6m 1s

‚ö†Ô∏è  Severity Breakdown:
  ‚Ä¢ High severity: 10
  ‚Ä¢ Medium severity: 3
  ‚Ä¢ Low severity: 37

üîç Top 5 Most Common Issues:
  ‚Ä¢ Needed multiple attempts to complete action... (6 tests, 12.0%)
  ‚Ä¢ Error message was confusing or unhelpful... (5 tests, 10.0%)
  ‚Ä¢ Encountered expected pain point: Limited customization optio... (2 tests, 4.0%)
  ‚Ä¢ Encountered expected pain point: Confusing terminology... (2 tests, 4.0%)
  ‚Ä¢ Struggled to find primary workflow completion elements... (2 tests, 4.0%)

------------------------------------------------------------
SAMPLE TEST RESULTS (First 3):
------------------------------------------------------------

1. Test ID: test_20251113_130334_5462
   Persona: Creative Professional (Expert)
   Task: Navigate to main feature and complete tasks quickly...
   Completed: ‚úì

In [14]:
# ============================================================
# CELL 7: SAVE TEST RESULTS
# ============================================================

def build_test_summary(results, top_n=10):
    """
    Aggregate individual test results into a summary report.

    The statistics are computed here from the results themselves, so this
    cell does not depend on variables left behind by the analysis cell.

    Args:
        results: List of test result dictionaries (reads 'task_completed',
            'time_seconds', 'severity' and optional 'issues_encountered')
        top_n: Number of most frequent issues to include

    Returns:
        Summary dictionary (JSON-serializable)
    """
    total = len(results)
    completed = sum(1 for r in results if r['task_completed'])
    # Guard the divisions so an empty list yields a zeroed summary
    success_pct = (completed / total) * 100 if total else 0.0
    avg_seconds = sum(r['time_seconds'] for r in results) / total if total else 0

    severities = Counter(r['severity'] for r in results)
    issues = Counter(
        issue
        for r in results
        for issue in r.get('issues_encountered', [])
    )

    return {
        'test_date': datetime.now().isoformat(),
        'total_tests': total,
        'success_rate': f"{success_pct:.1f}%",
        'avg_completion_time_seconds': int(avg_seconds),
        'severity_breakdown': dict(severities),
        'top_issues': [
            {'issue': issue, 'count': count, 'percentage': f"{(count/total)*100:.1f}%"}
            for issue, count in issues.most_common(top_n)
        ]
    }


print("\n" + "=" * 60)
print("SAVING TEST RESULTS")
print("=" * 60)

# Falsy both when the variable was never created and when the batch is empty
if globals().get('test_results'):

    # Create output directory
    output_dir = Path("./test_results_output")
    output_dir.mkdir(exist_ok=True)

    # Save full results
    results_file = output_dir / "usability_test_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(test_results, f, indent=2)

    print(f"‚úì Saved {len(test_results)} test results to: {results_file}")

    # Create summary report
    summary = build_test_summary(test_results)

    summary_file = output_dir / "test_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f"‚úì Saved summary report to: {summary_file}")

    print("\n" + "=" * 60)
    print("‚úì TESTING COMPLETE!")
    print("=" * 60)
    print(f"\nResults saved in: {output_dir}/")
    print("\nNext step: Proceed to Notebook 5 for comprehensive analysis")
    print("=" * 60)

else:
    print("\n‚ö†Ô∏è No results to save")


SAVING TEST RESULTS
‚úì Saved 50 test results to: test_results_output/usability_test_results.json
‚úì Saved summary report to: test_results_output/test_summary.json

‚úì TESTING COMPLETE!

Results saved in: test_results_output/

Next step: Proceed to Notebook 5 for comprehensive analysis
