In [1]:
import asyncio
import os
import random
import socket
import time
import uuid
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

import aiohttp
import pandas as pd
import sqlalchemy
from sqlalchemy import text

print('All imports are ready in one place.')

All imports are ready in one place.


In [2]:
# Database connection settings.
# Each value can be overridden through an environment variable so real
# credentials never have to be committed with the notebook; the defaults are
# the values this notebook has always used.
DB_CONFIG = {
    'user': os.environ.get('CLOUDMART_DB_USER', 'cloudmart_user'),
    'password': os.environ.get('CLOUDMART_DB_PASSWORD', 'cloudmart_pass'),
    'host': os.environ.get('CLOUDMART_DB_HOST', 'localhost'),
    'port': int(os.environ.get('CLOUDMART_DB_PORT', '3309')),
    'database': os.environ.get('CLOUDMART_DB_NAME', 'cloudmart_saga')
}

# User and password are URL-quoted so characters such as '@', ':' or '/' in a
# credential cannot corrupt the connection URL.
CONN_STR = (
    f"mysql+pymysql://{quote_plus(DB_CONFIG['user'])}:{quote_plus(DB_CONFIG['password'])}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Health check endpoints: (pattern, service name, health URL)
HEALTH_ENDPOINTS = [
    ("single_pessimistic", "order-service", 'http://localhost:8000/health'),
]

# Order-creation endpoint per pattern
ENDPOINTS = {
    'single_pessimistic': 'http://localhost:8000/orders'
}


# Step 1: Pre-check and Environment Setup

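The next two cells define a small TCP port-check helper (`is_port_open`) and run a quick HTTP health check against each configured service. This is a safety check to catch an unreachable service before any test traffic is sent. Note that Docker / `docker-compose` detection and the check for conflicting containers are not performed by these cells, so the services must already be running.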

In [3]:
# Utility functions for service connectivity
def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool:
    """Report whether a TCP connection to ``host:port`` can be established.

    Args:
        host: Host name or address to connect to.
        port: TCP port number.
        timeout: Seconds to wait for the connection attempt.

    Returns:
        True when the connection succeeds, False when the attempt raises
        any exception (refused, timed out, invalid port, ...).
    """
    try:
        probe = socket.create_connection((host, port), timeout=timeout)
        # The connection is only a probe; release it immediately.
        probe.close()
    except Exception:
        return False
    return True

In [4]:
# Quick health check for services
print("=== Service Health Check ===")
async def quick_health_check():
    async with aiohttp.ClientSession() as session:
        for pattern, service_name, health_url in HEALTH_ENDPOINTS:
            try:
                async with session.get(health_url, timeout=aiohttp.ClientTimeout(total=5)) as response:
                    status = "✓" if response.status == 200 else f"✗ {response.status}"
                    print(f"{pattern:20} | {service_name:15} | {status}")
            except Exception as e:
                print(f"{pattern:20} | {service_name:15} | ✗ {str(e)[:30]}...")

await quick_health_check()
print("="*60)

=== Service Health Check ===
single_pessimistic   | order-service   | ✓


In [None]:
# Improved utility function to generate test data with better failure injection
def generate_test_payload_improved(scenario='success', pattern='single_pessimistic'):
    """Build an order payload for the requested test scenario.

    Args:
        scenario: 'stock_failure' requests 1000 units of a low-stock book,
            'payment_failure' requests one copy of the expensive book, and
            any other value requests one copy of a cheap book.
        pattern: Accepted for call compatibility; not used when building
            the payload.

    Returns:
        dict with a random 'customer_id' and a single-entry 'items' list.
    """
    customers = ["customer-001", "customer-002", "customer-003", "customer-004", "customer-005"]

    # scenario -> (candidate books, quantity to order)
    failure_specs = {
        # 1000 units exceeds any stock level
        'stock_failure': (["book-101", "book-202"], 1000),
        # book-456 costs 8000 yen, above the 5000 payment limit
        'payment_failure': (["book-456"], 1),
    }
    # Anything else is treated as a success case using the restocked cheap books
    success_spec = (["book-123", "book-789", "book-202"], 1)

    # The customer is drawn before the book, matching the order of random calls
    # the rest of the notebook relies on.
    customer_id = random.choice(customers)
    candidates, quantity = failure_specs.get(scenario, success_spec)
    book_id = random.choice(candidates)

    return {
        "customer_id": customer_id,
        "items": [{"book_id": book_id, "quantity": quantity}]
    }


# Expose the generator under the name the test cells call
# (the function object itself is assigned, it is not called here).
generate_test_payload = generate_test_payload_improved

In [6]:
# Async HTTP client utilities
async def make_request(session, url, payload, pattern, scenario, load_phase='single'):
    """POST ``payload`` to ``url`` and return a flat record describing the outcome.

    Args:
        session: Object whose ``post(url, json=..., timeout=...)`` returns an
            async context manager yielding the response (an aiohttp session
            in this notebook).
        url: Endpoint to call.
        payload: JSON-serialisable request body.
        pattern: Pattern label copied into the record.
        scenario: Scenario label copied into the record.
        load_phase: Value stored under 'load_phase'; defaults to 'single'.

    Returns:
        dict with keys pattern, scenario, order_id, request_id, status_code,
        response_time, timestamp, result and load_phase. ``status_code`` is
        the HTTP status, or the string 'ERROR' when the request raised.
        ``response_time`` is the number of seconds until the response object
        became available (or until the exception was raised). ``result`` is
        the first 200 characters of the body (or of the exception text).
        This function never raises for request failures.
    """
    request_id = uuid.uuid4().hex[:8]
    # perf_counter is monotonic, so the measured latency cannot be distorted
    # by wall-clock adjustments the way time.time() differences can.
    started = time.perf_counter()

    def build_record(order_id, status_code, elapsed, body):
        # Single place that defines the record schema for both outcomes.
        return {
            'pattern': pattern,
            'scenario': scenario,
            'order_id': order_id,
            'request_id': request_id,
            'status_code': status_code,
            'response_time': elapsed,
            'timestamp': datetime.now().isoformat(),
            'result': str(body)[:200],
            'load_phase': load_phase
        }

    try:
        async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=30)) as response:
            elapsed = time.perf_counter() - started
            # Fall back to the locally generated id unless the service returns one.
            order_id = request_id
            if response.status in (200, 201):
                try:
                    body = await response.json()
                    order_id = body.get('order_id', body.get('id', request_id))
                except Exception:
                    # Body is not JSON, or not a JSON object: keep the raw text.
                    order_id = request_id
                    body = await response.text()
            else:
                body = await response.text()
            return build_record(order_id, response.status, elapsed, body)
    except Exception as e:
        # Connection errors and timeouts are recorded, not raised, so one
        # failing request cannot abort a whole test run.
        return build_record(request_id, 'ERROR', time.perf_counter() - started, e)

In [7]:
# Clear previous test results to start fresh.
# Every later cell appends to this list, so re-running this cell discards
# all records collected so far.
test_results = list()
print("Previous test results cleared. Ready for new tests.")

Previous test results cleared. Ready for new tests.


In [None]:
# Restock helper to ensure tests are deterministic.
# Sets known stock levels for the books used by the success and
# payment-failure scenarios. Errors are printed rather than raised so the
# notebook can continue when the database is unavailable.
print("=== Restocking inventory for tests ===")
engine = None
try:
    engine = sqlalchemy.create_engine(CONN_STR)
    # engine.begin() commits on success and rolls back if either UPDATE fails
    with engine.begin() as conn:
        # Set ample stock for success tests
        conn.execute(sqlalchemy.text("UPDATE inventory SET available_stock=:n WHERE book_id IN ('book-123','book-789','book-202')"), {"n": 100})
        # Ensure payment-failure book has stock so it passes inventory and fails at payment
        conn.execute(sqlalchemy.text("UPDATE inventory SET available_stock=:n WHERE book_id='book-456'"), {"n": 5})
    print("✓ Restocked: book-123, book-789, book-202 -> 100; book-456 -> 5")
except Exception as e:
    print(f"Restock error: {e}")
finally:
    # Release the pooled connections; this engine is only needed for the restock.
    if engine is not None:
        engine.dispose()

In [None]:
# Test the updated failure injection logic with a few manual tests
async def test_failure_injection():
    """Quick test of the updated failure injection logic"""
    print("=== Testing Updated Failure Injection ===")

    async with aiohttp.ClientSession() as session:
        # Test 1: Success case
        print("\n1. Testing success case:")
        # Prefer books we just restocked
        payload = {
            "customer_id": random.choice(["customer-001","customer-002","customer-003","customer-004","customer-005"]),
            "items": [{"book_id": random.choice(["book-123","book-789","book-202"]), "quantity": 1}]
        }
        print(f"   Payload: {payload}")
        result = await make_request(session, ENDPOINTS['single_pessimistic'], payload, 'single_pessimistic', 'success')
        print(f"   Result: {result['status_code']} - {result['response_time']:.3f}s")

        # Test 2: Stock failure case
        print("\n2. Testing stock failure (high quantity):")
        payload = {
            "customer_id": random.choice(["customer-001","customer-002","customer-003","customer-004","customer-005"]),
            "items": [{"book_id": random.choice(["book-101","book-202"]), "quantity": 1000}]  # exceed any stock
        }
        print(f"   Payload: {payload}")
        result = await make_request(session, ENDPOINTS['single_pessimistic'], payload, 'single_pessimistic', 'stock_failure')
        print(f"   Result: {result['status_code']} - {result['response_time']:.3f}s")

        # Test 3: Payment failure case
        print("\n3. Testing payment failure (high amount):")
        # Use in-stock expensive combination to exceed 5000
        # Either one expensive book-456 (8000) or multiple cheap ones
        payload = {
            "customer_id": random.choice(["customer-001","customer-002","customer-003","customer-004","customer-005"]),
            "items": [{"book_id": "book-456", "quantity": 1}]  # 8000 > 5000
        }
        print(f"   Payload: {payload}")
        result = await make_request(session, ENDPOINTS['single_pessimistic'], payload, 'single_pessimistic', 'payment_failure')
        print(f"   Result: {result['status_code']} - {result['response_time']:.3f}s")

await test_failure_injection()

=== Testing Updated Failure Injection ===

1. Testing success case:
   Payload: {'customer_id': 'customer-005', 'items': [{'book_id': 'book-123', 'quantity': 1}]}
   Result: 400 - 0.026s

2. Testing stock failure (high quantity):
   Payload: {'customer_id': 'customer-002', 'items': [{'book_id': 'book-202', 'quantity': 60}]}
   Result: 400 - 0.004s

3. Testing payment failure (high amount):
   Payload: {'customer_id': 'customer-005', 'items': [{'book_id': 'book-456', 'quantity': 1}]}
   Result: 400 - 0.002s


In [None]:
# Single-shot and abnormal case tests
async def run_single_tests():
    """Send the single-shot functional test requests one at a time.

    Runs 5 success requests, then 10 stock-failure and 10 payment-failure
    requests against the single_pessimistic endpoint. Every result record is
    appended to the global ``test_results`` and one line per request is
    printed.
    """
    print("=== Running Single-shot Tests ===")

    # (pattern, scenario, repetitions): normal orders first, then each failure type
    test_cases = (
        ('single_pessimistic', 'success', 5),
        ('single_pessimistic', 'stock_failure', 10),
        ('single_pessimistic', 'payment_failure', 10),
    )
    ok_codes = (200, 201)

    async with aiohttp.ClientSession() as session:
        for pattern, scenario, count in test_cases:
            print(f"\nTesting {pattern} - {scenario} ({count} times)")
            url = ENDPOINTS[pattern]

            for i in range(count):
                result = await make_request(
                    session, url, generate_test_payload(scenario, pattern), pattern, scenario
                )
                test_results.append(result)

                if result['status_code'] in ok_codes:
                    status_symbol = "✓"
                else:
                    status_symbol = "✗"
                print(f"  {i+1:2d}. {status_symbol} {result['status_code']} - {result['response_time']:.3f}s - {result['order_id']}")

                # Short pause so consecutive requests do not overlap
                await asyncio.sleep(0.1)

# Run single tests
await run_single_tests()

print(f"\nSingle tests completed. Total results: {len(test_results)}")

# Show summary
df_single = pd.DataFrame(test_results)
if not df_single.empty:
    summary = df_single.groupby(['pattern', 'scenario']).agg({
        'response_time': ['count', 'mean', 'std'],
        'status_code': lambda x: (x.isin([200, 201])).sum()
    }).round(3)
    print("\nSingle Test Summary:")
    print(summary)

In [None]:
# Overall view of everything collected in test_results so far
print("=== Test Results Summary ===")
print(f"Total test results: {len(test_results)}")

if test_results:
    df = pd.DataFrame(test_results)
    status_codes = df['status_code']
    # 200/201 count as success; 'ERROR' marks requests that raised client-side
    success_count = status_codes.isin([200, 201]).sum()
    error_count = (status_codes == 'ERROR').sum()
    print(f"Successful requests: {success_count}")
    print(f"Error requests: {error_count}")
    print(f"Success rate: {success_count / len(test_results) * 100:.1f}%")

    if error_count > 0:
        # Show at most three client-side errors to hint at the cause
        print("\nSample error details:")
        error_results = df[status_codes == 'ERROR'].head(3)
        for _, row in error_results.iterrows():
            print(f"  {row['pattern']} - {row['scenario']}: {row['result'][:100]}...")

    print("\nStatus code distribution:")
    print(status_codes.value_counts())

In [None]:
# Load test with failure injection
async def run_load_test(duration_seconds=180, virtual_users=100, load_phase=None):
    """Run a load test with failure injection against the order endpoint.

    ``virtual_users`` worker coroutines each send one request, pause for a
    random 1.5-2.5 s, and repeat until ``duration_seconds`` have elapsed.
    Roughly 8% of requests are stock failures, 3% payment failures and the
    rest success cases. All records are appended to the global
    ``test_results`` and a short summary is printed.

    Args:
        duration_seconds: How long workers keep issuing requests.
        virtual_users: Number of concurrent workers.
        load_phase: Optional tag written to each record's 'load_phase'
            field. When None the value set by make_request is left as is.

    Returns:
        list of the result records produced by this run.
    """
    print("=== Running Load Test ===")
    print(f"Duration: {duration_seconds}s, Virtual Users: {virtual_users}")
    print(f"Expected requests: ~{duration_seconds * virtual_users // 2} (assuming 0.5 req/s per VU)")

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    started = time.monotonic()
    deadline = started + duration_seconds

    async def worker(session, worker_id):
        """Individual worker generating load until the deadline passes."""
        worker_results = []

        while time.monotonic() < deadline:
            # Determine pattern (only single_pessimistic)
            pattern = 'single_pessimistic'

            # Failure injection logic
            rand_val = random.random()
            if rand_val < 0.08:  # 8% stock failure
                scenario = 'stock_failure'
            elif rand_val < 0.11:  # 3% payment failure (8% + 3%)
                scenario = 'payment_failure'
            else:
                scenario = 'success'

            url = ENDPOINTS[pattern]
            payload = generate_test_payload(scenario, pattern)

            result = await make_request(session, url, payload, pattern, f"load_{scenario}")
            if load_phase is not None:
                result['load_phase'] = load_phase
            worker_results.append(result)

            # Control request rate (roughly 0.5 requests per second per worker).
            # The pause is clamped to the time left so the run ends at the
            # requested duration instead of up to 2.5 s later.
            pause = random.uniform(1.5, 2.5)
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            await asyncio.sleep(min(pause, remaining))

        print(f"Worker {worker_id:3d} completed {len(worker_results)} requests")
        return worker_results

    load_results = []

    # NOTE(review): every request targets the same host, so limit_per_host
    # caps concurrent connections at half the worker count; time spent waiting
    # for a free connection is included in the measured response time.
    # Confirm this throttle is intended.
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=virtual_users, limit_per_host=virtual_users//2)
    ) as session:

        tasks = [worker(session, i) for i in range(virtual_users)]
        # return_exceptions=True: one crashing worker must not cancel the others
        worker_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Flatten results
        for worker_result in worker_results:
            if isinstance(worker_result, list):
                load_results.extend(worker_result)
            else:
                print(f"Worker error: {worker_result}")

        test_results.extend(load_results)

    actual_duration = time.monotonic() - started
    print(f"\nLoad test completed in {actual_duration:.1f}s")
    print(f"Total requests generated: {len(load_results)}")

    # Quick analysis
    if load_results:
        df_load = pd.DataFrame(load_results)
        success_rate = (df_load['status_code'].isin([200, 201])).mean() * 100
        avg_response_time = df_load['response_time'].mean()
        p95_response_time = df_load['response_time'].quantile(0.95)

        scenario_counts = df_load['scenario'].value_counts()

        print("Load Test Summary:")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Average response time: {avg_response_time:.3f}s")
        print(f"P95 response time: {p95_response_time:.3f}s")
        print("Scenario distribution:")
        for scenario, count in scenario_counts.items():
            print(f"  {scenario}: {count} ({count/len(load_results)*100:.1f}%)")

    return load_results

# Run multi-phase load tests (WARNING: This will take ~6 minutes total)
print("=== Starting Multi-Phase Load Tests ===")
print("This will take approximately 6 minutes total (3 phases).")

# Phase 1: Light load (Warm-up)
print("\n--- Phase 1: Light Load (Warm-up) ---")
print("Duration: 90s, Virtual Users: 30")
light_start = len(test_results)
await run_load_test(duration_seconds=90, virtual_users=30)
light_end = len(test_results)

# Add phase identifier to light load results
for i in range(light_start, light_end):
    if i < len(test_results):
        test_results[i]['load_phase'] = 'light'

# Brief pause between phases
print("Pausing 10 seconds between phases...")
await asyncio.sleep(10)

# Phase 2: Medium load (Standard)
print("\n--- Phase 2: Medium Load (Standard) ---")
print("Duration: 120s, Virtual Users: 80")
medium_start = len(test_results)
await run_load_test(duration_seconds=120, virtual_users=80)
medium_end = len(test_results)

# Add phase identifier to medium load results
for i in range(medium_start, medium_end):
    if i < len(test_results):
        test_results[i]['load_phase'] = 'medium'

# Brief pause between phases
print("Pausing 10 seconds between phases...")
await asyncio.sleep(10)

# Phase 3: Heavy load (Stress test)
print("\n--- Phase 3: Heavy Load (Stress Test) ---")
print("Duration: 90s, Virtual Users: 150")
heavy_start = len(test_results)
await run_load_test(duration_seconds=90, virtual_users=150)
heavy_end = len(test_results)

# Add phase identifier to heavy load results
for i in range(heavy_start, heavy_end):
    if i < len(test_results):
        test_results[i]['load_phase'] = 'heavy'

print("\n=== Multi-Phase Load Test Summary ===")
print(f"Light phase: {light_end - light_start} requests")
print(f"Medium phase: {medium_end - medium_start} requests")
print(f"Heavy phase: {len(test_results) - heavy_start} requests")
print(f"Total load test requests: {len(test_results) - light_start}")
print(f"All tests completed. Total results collected: {len(test_results)}")

In [None]:
# Database aggregation and CSV export
def export_performance_csvs():
    """Export performance data to 3 CSV files under data/single_pessimistic_pattern.

    Files written (relative to the current working directory):
      * e2e_latency.csv - the in-memory ``test_results`` when available,
        otherwise per-order timings queried from the ``orders`` table.
      * convergence_events.csv - every row of the ``events`` table that has a
        ``created_at`` value.
      * saga_steps.csv - per-event step durations derived from ``events``.

    The output directory is created if it does not exist. A short summary of
    each data set is printed.

    Returns:
        (df_e2e, df_conv, df_saga) on success, or (None, None, None) when any
        step raises; the error is printed, not re-raised.
    """
    print("=== Exporting Performance Data to CSV ===")

    engine = None
    try:
        # DataFrame.to_csv does not create missing parent directories, so make
        # sure the target folder exists before the first export.
        out_dir = Path.cwd() / 'data' / 'single_pessimistic_pattern'
        out_dir.mkdir(parents=True, exist_ok=True)

        engine = sqlalchemy.create_engine(CONN_STR)

        # 1. E2E latency CSV (Raw test response times)
        print("Exporting E2E latency data...")
        csv_path_e2e = out_dir / 'e2e_latency.csv'
        if 'test_results' in globals() and test_results:
            df_raw = pd.DataFrame(test_results)
            df_raw[['pattern','scenario','status_code','response_time','timestamp','load_phase']].to_csv(csv_path_e2e, index=False)
            print(f"✓ E2E latency data exported: single_pessimistic_pattern/e2e_latency.csv ({len(df_raw)} rows)")
            # Create df_e2e for summary
            df_e2e = df_raw.copy()
            df_e2e['e2e_ms'] = df_e2e['response_time'] * 1000  # seconds to ms
        else:
            # Fallback to database query
            q_e2e = sqlalchemy.text("""
            SELECT
              'single_pessimistic' AS pattern,
              CASE WHEN o.status IN ('CANCELLED','FAILED') THEN 'failure' ELSE 'success' END AS scenario,
              o.order_id,
              o.created_at,
              COALESCE(o.confirmed_at,o.cancelled_at,o.updated_at) AS finished_at,
              TIMESTAMPDIFF(MICROSECOND,o.created_at,COALESCE(o.confirmed_at,o.cancelled_at,o.updated_at)) / 1000 AS e2e_ms,
              NULL AS http_response_time_s
            FROM orders o
            WHERE o.created_at IS NOT NULL
              AND COALESCE(o.confirmed_at,o.cancelled_at,o.updated_at) IS NOT NULL
            ORDER BY o.created_at DESC;
            """)
            df_e2e = pd.read_sql_query(q_e2e, engine, parse_dates=['created_at','finished_at'])
            df_e2e.to_csv(csv_path_e2e, index=False)
            print(f"✓ E2E latency data exported: single_pessimistic_pattern/e2e_latency.csv ({len(df_e2e)} rows)")

        # 2. Convergence Events CSV
        print("Exporting convergence events data...")
        q_conv = sqlalchemy.text("""
        SELECT aggregate_id,event_type,created_at as processed_at
        FROM events
        WHERE created_at IS NOT NULL
        ORDER BY aggregate_id,created_at;
        """)
        df_conv = pd.read_sql_query(q_conv, engine, parse_dates=['processed_at'])
        csv_path_conv = out_dir / 'convergence_events.csv'
        df_conv.to_csv(csv_path_conv, index=False)
        print(f"✓ Convergence events exported: single_pessimistic_pattern/convergence_events.csv ({len(df_conv)} rows)")

        # 3. Saga Steps CSV
        # Each event is treated as one step; its duration is the gap to the
        # previous event of the same aggregate (0 for the first event).
        print("Exporting saga steps data...")
        q_saga = sqlalchemy.text("""
        WITH step_durations AS (
          SELECT aggregate_id,event_type,created_at as processed_at,
            LAG(created_at,1,created_at) OVER (PARTITION BY aggregate_id ORDER BY created_at) as prev_processed_at
          FROM events WHERE created_at IS NOT NULL
        )
        SELECT
          s.aggregate_id AS saga_id,
          s.aggregate_id AS order_id,
          ROW_NUMBER() OVER (PARTITION BY s.aggregate_id ORDER BY s.processed_at) AS step_number,
          s.event_type AS step_name,
          CASE WHEN s.event_type LIKE :cancel OR s.event_type LIKE :fail THEN 'compensation' ELSE 'forward' END AS command_type,
          'completed' AS status,
          s.prev_processed_at AS started_at,
          s.processed_at AS completed_at,
          TIMESTAMPDIFF(MICROSECOND,s.prev_processed_at,s.processed_at)/1000 AS duration_ms
        FROM step_durations s ORDER BY s.aggregate_id,s.processed_at;
        """)
        df_saga = pd.read_sql_query(q_saga, engine, params={'cancel':'%Cancel%','fail':'%Fail%'}, parse_dates=['started_at','completed_at'])
        csv_path_saga = out_dir / 'saga_steps.csv'
        df_saga.to_csv(csv_path_saga, index=False)
        print(f"✓ Saga steps exported: single_pessimistic_pattern/saga_steps.csv ({len(df_saga)} rows)")

        # Summary
        print("\n=== Export Summary ===")
        print(f"E2E latency records: {len(df_e2e)}")
        print(f"Event records: {len(df_conv)}")
        print(f"Saga step records: {len(df_saga)}")
        if len(df_e2e)>0:
            print("\nE2E Latency Summary:")
            print(f"  p50: {df_e2e['e2e_ms'].quantile(0.5):.1f}ms")
            print(f"  p95: {df_e2e['e2e_ms'].quantile(0.95):.1f}ms")
            print(f"  p99: {df_e2e['e2e_ms'].quantile(0.99):.1f}ms")
        if len(df_conv)>0:
            # Convergence time = span between first and last event of an aggregate
            convergence_summary = df_conv.groupby('aggregate_id').agg({'processed_at':['min','max','count']}).reset_index()
            convergence_summary.columns=['aggregate_id','first_event','last_event','event_count']
            convergence_summary['convergence_s']=(convergence_summary['last_event']-convergence_summary['first_event']).dt.total_seconds()
            print("\nConvergence Time Summary:")
            print(f"  Average: {convergence_summary['convergence_s'].mean():.2f}s")
            print(f"  p95: {convergence_summary['convergence_s'].quantile(0.95):.2f}s")
        if len(df_saga)>0:
            compensation_count=df_saga['command_type'].str.contains('compensation').sum()
            print("\nSaga Steps Summary:")
            print(f"  Total steps: {len(df_saga)}")
            print(f"  Compensation steps: {compensation_count}")
            print(f"  Compensation rate: {compensation_count/len(df_saga)*100:.1f}%")
        return df_e2e, df_conv, df_saga
    except Exception as e:
        print(f"Error during CSV export: {e}")
        print("Note: Ensure database services are running and tables exist.")
        return None, None, None
    finally:
        # Release pooled connections whether or not the export succeeded.
        if engine is not None:
            engine.dispose()

# Export CSV files
df_e2e, df_conv, df_saga = export_performance_csvs()

In [None]:
# Export load phase results to CSV
def summarize_load_phases(load_df):
    """Aggregate load-test records per (load_phase, scenario).

    Args:
        load_df: DataFrame with at least the columns 'load_phase',
            'scenario', 'response_time' (seconds) and 'status_code'.

    Returns:
        DataFrame with one row per (load_phase, scenario) and the columns
        count, mean_rt, std_rt, min_rt, max_rt, p95_rt, p99_rt (seconds,
        rounded to 3 decimals), success_count (200/201 responses) and
        success_rate (percent, 1 decimal).
    """
    summary = load_df.groupby(['load_phase', 'scenario']).agg({
        'response_time': ['count', 'mean', 'std', 'min', 'max', lambda x: x.quantile(0.95), lambda x: x.quantile(0.99)],
        'status_code': lambda x: (x.isin([200, 201])).sum()
    }).round(3)

    # Flatten column names
    summary.columns = ['count', 'mean_rt', 'std_rt', 'min_rt', 'max_rt', 'p95_rt', 'p99_rt', 'success_count']
    summary = summary.reset_index()

    # Calculate success rate
    summary['success_rate'] = (summary['success_count'] / summary['count'] * 100).round(1)
    return summary


def phase_overview(load_df, phases=('light', 'medium', 'heavy')):
    """Compute per-phase totals directly from the raw load-test records.

    Success rate and P95 are computed over all requests of a phase rather
    than by averaging per-scenario values, so scenarios with few requests do
    not skew the phase figures.

    Args:
        load_df: DataFrame with 'load_phase', 'status_code' and
            'response_time' (seconds) columns.
        phases: Phase tags to report, in output order.

    Returns:
        list of (phase, total_requests, success_rate_percent, p95_ms)
        tuples for the phases that have at least one record.
    """
    rows = []
    for phase in phases:
        phase_df = load_df[load_df['load_phase'] == phase]
        if phase_df.empty:
            continue
        success_rate = phase_df['status_code'].isin([200, 201]).mean() * 100
        # response_time is recorded in seconds; convert for the ms label
        p95_ms = phase_df['response_time'].quantile(0.95) * 1000
        rows.append((phase, len(phase_df), float(success_rate), float(p95_ms)))
    return rows


print("=== Exporting Load Phase Results to CSV ===")

try:
    if 'test_results' in globals() and test_results:
        df_load_phase = pd.DataFrame(test_results)

        # Filter only load test results (exclude single tests)
        load_results = df_load_phase[df_load_phase['load_phase'] != 'single']

        if not load_results.empty:
            load_phase_summary = summarize_load_phases(load_results)

            # Export to CSV (to_csv does not create missing directories)
            out_dir = Path.cwd() / 'data' / 'single_pessimistic_pattern'
            out_dir.mkdir(parents=True, exist_ok=True)
            csv_path_load = out_dir / 'load_phase_results.csv'
            load_phase_summary.to_csv(csv_path_load, index=False)
            print(f"✓ Load phase results exported: single_pessimistic_pattern/load_phase_results.csv ({len(load_phase_summary)} rows)")

            # Display summary
            print("\nLoad Phase Summary:")
            for phase, total_requests, phase_success_rate, phase_p95_ms in phase_overview(load_results):
                print(f"  {phase.capitalize()} phase: {total_requests} requests, Success rate: {phase_success_rate:.1f}%, P95 RT: {phase_p95_ms:.1f}ms")
        else:
            print("No load test results found in test_results.")
    else:
        print("test_results not found or empty.")

except Exception as e:
    print(f"Error during load phase CSV export: {e}")