# ID-based CRUD Benchmark
このノートブックでは、複数データベースとID生成方式を比較するCRUDベンチマークを実施する。

## Setup
実験環境を構築し、各種ライブラリを準備する。

In [1]:
# Launch all services required for the benchmark
!docker compose up -d --build --remove-orphans

[1A[1B[0G[?25l[+] Running 0/6
 [33m⠋[0m Container redis-bench     Starting                                      [34m0.1s [0m
 [33m⠋[0m Container sqlite-bench    Starting                                      [34m0.1s [0m
 [33m⠋[0m Container pg-bench-uuid   Starting                                      [34m0.1s [0m
 [33m⠋[0m Container pg-bench-mixed  Starting                                      [34m0.1s [0m
 [33m⠋[0m Container mongo-bench     Starting                                      [34m0.1s [0m
 [33m⠋[0m Container mysql-bench     Starting                                      [34m0.1s [0m
[?25h[1A[1A[1A[1A[1A[1A[1A[0G[?25l[+] Running 2/6
 [33m⠙[0m Container redis-bench     Starting                                      [34m0.2s [0m
 [33m⠙[0m Container sqlite-bench    Starting                                      [34m0.2s [0m
 [33m⠙[0m Container pg-bench-uuid   Starting                                      [34m0.2s [0m
 [33m⠙[0m Con

## Benchmark Execution
各データベースとID生成方式の組み合わせでCRUDベンチマークを実行する。

In [2]:
import os
import time
import json
import uuid
import math
import random
import socket
import sqlite3
from dataclasses import dataclass, field
from datetime import datetime
from typing import Callable, Dict, List, Tuple, Any
from contextlib import contextmanager
from statistics import mean
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from tqdm.auto import tqdm

import psycopg2
import mysql.connector
import redis
from pymongo import MongoClient, ReturnDocument
from dateutil import tz
from sqlalchemy import create_engine
from sqlalchemy.engine import make_url
from dotenv import load_dotenv

# Snowflake-like ID generator parameters
load_dotenv()

SNOWFLAKE_EPOCH = int(datetime(2020, 1, 1, tzinfo=tz.UTC).timestamp() * 1000)
SNOWFLAKE_NODE_ID = int(os.environ.get("SNOWFLAKE_NODE_ID", "1")) & 0x3FF
SNOWFLAKE_PROCESS_ID = int(os.environ.get("SNOWFLAKE_PROCESS_ID", "1")) & 0x1F


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def uuid_v4() -> str:
    return str(uuid.uuid4())

def uuid_v7() -> str:
    if hasattr(uuid, "uuid7"):
        return str(uuid.uuid7())
    # Fallback implementation if uuid7 is unavailable
    now_ms = int(time.time() * 1000)
    random_bits = random.getrandbits(74)
    high = (now_ms << 74) | random_bits
    return uuid.UUID(int=high).hex

class SnowflakeGenerator:
    def __init__(self, epoch: int, node_id: int, process_id: int):
        self.epoch = epoch
        self.node_id = node_id
        self.process_id = process_id
        self.sequence = 0
        self.last_timestamp = -1

    def __call__(self) -> int:
        now = int(time.time() * 1000)
        if now < self.last_timestamp:
            now = self.last_timestamp
        if now == self.last_timestamp:
            self.sequence = (self.sequence + 1) & 0xFFF
            if self.sequence == 0:
                while now <= self.last_timestamp:
                    now = int(time.time() * 1000)
        else:
            self.sequence = 0
        self.last_timestamp = now
        timestamp_part = (now - self.epoch) << 22
        node_part = self.node_id << 12
        process_part = self.process_id << 7
        return timestamp_part | node_part | process_part | self.sequence

snowflake = SnowflakeGenerator(SNOWFLAKE_EPOCH, SNOWFLAKE_NODE_ID, SNOWFLAKE_PROCESS_ID)

ID_GENERATORS: Dict[str, Callable[[], Any]] = {
    "UUIDv4": uuid_v4,
    "UUIDv7": uuid_v7,
    "Auto Increment": None,
    "Snowflake": snowflake,
}

In [4]:
OPERATION_TYPES = ["Insert", "SelectByID", "RangeSelect", "Update", "Delete"]
ITERATIONS_PER_OPERATION = int(os.environ.get("BENCHMARK_ITERATIONS", "1000"))
RANGE_SELECT_SIZE = int(os.environ.get("BENCHMARK_RANGE_SIZE", "100"))
THROUGHPUT_SCALES = [int(1e4), int(1e5), int(1e6)]

@dataclass
class OperationResult:
    database: str
    id_type: str
    operation: str
    durations_ms: List[float] = field(default_factory=list)
    throughput_ops: float = 0.0

    def aggregate(self) -> Dict[str, Any]:
        if not self.durations_ms:
            return {
                "latency_mean_ms": np.nan,
                "latency_p95_ms": np.nan,
                "latency_p99_ms": np.nan,
                "throughput_ops": self.throughput_ops,
            }
        arr = np.array(self.durations_ms)
        return {
            "latency_mean_ms": float(arr.mean()),
            "latency_p95_ms": float(np.percentile(arr, 95)),
            "latency_p99_ms": float(np.percentile(arr, 99)),
            "throughput_ops": self.throughput_ops or (len(arr) / (arr.sum() / 1000.0)),
        }

def calculate_throughput(duration_ms: float, operations: int) -> float:
    if duration_ms <= 0:
        return float("nan")
    return operations / (duration_ms / 1000.0)

In [5]:
def resolve_postgres_dsn() -> str:
    candidate_keys = ("POSTGRES_DSN", "PG_MIXED_DSN", "PG_UUID_DSN")
    for key in candidate_keys:
        value = os.environ.get(key)
        if value:
            return value
    host = os.environ.get("POSTGRES_HOST", "127.0.0.1")
    port = os.environ.get("POSTGRES_PORT", "5433")
    user = os.environ.get("POSTGRES_USER", "bench")
    password = os.environ.get("POSTGRES_PASSWORD", "benchpass")
    database = os.environ.get("POSTGRES_DB", "benchdb")
    return f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"

POSTGRES_DSN = resolve_postgres_dsn()
POSTGRES_ENGINE = create_engine(
    POSTGRES_DSN,
    pool_pre_ping=True,
    pool_size=5,
    max_overflow=5,
    future=True,
    connect_args={"connect_timeout": int(os.environ.get("POSTGRES_CONNECT_TIMEOUT", "5"))},
)

def build_mysql_config() -> Dict[str, Any]:
    dsn = os.environ.get("MYSQL_DSN")
    if dsn:
        url = make_url(dsn)
        return {
            "host": url.host or "127.0.0.1",
            "port": url.port or 3307,
            "user": url.username or os.environ.get("MYSQL_USER", "bench"),
            "password": url.password or os.environ.get("MYSQL_PASSWORD", "benchpass"),
            "database": url.database or os.environ.get("MYSQL_DATABASE", "benchdb"),
            "connection_timeout": int(os.environ.get("MYSQL_CONNECT_TIMEOUT", "5")),
            "autocommit": False,
        }
    return {
        "host": os.environ.get("MYSQL_HOST", "127.0.0.1"),
        "port": int(os.environ.get("MYSQL_PORT", "3307")),
        "user": os.environ.get("MYSQL_USER", "bench"),
        "password": os.environ.get("MYSQL_PASSWORD", "benchpass"),
        "database": os.environ.get("MYSQL_DB", os.environ.get("MYSQL_DATABASE", "benchdb")),
        "connection_timeout": int(os.environ.get("MYSQL_CONNECT_TIMEOUT", "5")),
        "autocommit": False,
    }

MYSQL_CONFIG = build_mysql_config()

def build_redis_pool() -> redis.ConnectionPool:
    url = os.environ.get("REDIS_URL") or os.environ.get("REDIS_DSN") or os.environ.get("REDIS_URI")
    if not url:
        host = os.environ.get("REDIS_HOST", "127.0.0.1")
        port = os.environ.get("REDIS_PORT", "6379")
        db = os.environ.get("REDIS_DB", "0")
        password = os.environ.get("REDIS_PASSWORD")
        auth_part = f":{password}@" if password else ""
        url = f"redis://{auth_part}{host}:{port}/{db}"
    return redis.ConnectionPool.from_url(url, decode_responses=False, max_connections=16)

REDIS_POOL = build_redis_pool()

def resolve_mongo_uri() -> str:
    for key in ("MONGODB_URI", "MONGODB_URL", "MONGO_URI"):
        value = os.environ.get(key)
        if value:
            return value
    user = os.environ.get("MONGO_INITDB_ROOT_USERNAME", "bench")
    password = os.environ.get("MONGO_INITDB_ROOT_PASSWORD", "benchpass")
    host = os.environ.get("MONGODB_HOST", "127.0.0.1")
    port = os.environ.get("MONGODB_PORT", "27017")
    database = os.environ.get("MONGODB_DATABASE", "benchdb")
    return f"mongodb://{user}:{password}@{host}:{port}/{database}?authSource=admin"

MONGODB_URI = resolve_mongo_uri()

def resolve_sqlite_path() -> Path:
    raw_path = os.environ.get("SQLITE_PATH") or os.environ.get("SQLITE_FILE") or "./data/benchmark.sqlite"
    path = Path(raw_path).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)
    return path

SQLITE_PATH = resolve_sqlite_path()

@contextmanager
def postgres_conn():
    connection = POSTGRES_ENGINE.raw_connection()
    try:
        yield connection
    finally:
        connection.close()

@contextmanager
def mysql_conn():
    conn = mysql.connector.connect(**MYSQL_CONFIG)
    try:
        yield conn
    finally:
        conn.close()

@contextmanager
def redis_conn():
    client = redis.Redis(connection_pool=REDIS_POOL)
    try:
        yield client
    finally:
        client.close()

@contextmanager
def mongo_conn():
    client = MongoClient(MONGODB_URI, serverSelectionTimeoutMS=5000)
    try:
        yield client
    finally:
        client.close()

@contextmanager
def sqlite_conn():
    conn = sqlite3.connect(str(SQLITE_PATH), detect_types=sqlite3.PARSE_DECLTYPES)
    try:
        yield conn
    finally:
        conn.close()

def verify_database_connections(timeout: float = 5.0) -> Dict[str, Tuple[bool, str]]:
    results: Dict[str, Tuple[bool, str]] = {}

    try:
        with postgres_conn() as conn:
            cur = conn.cursor()
            cur.execute("SELECT 1")
            cur.fetchone()
        results["PostgreSQL"] = (True, "ok")
    except Exception as exc:
        results["PostgreSQL"] = (False, str(exc))

    try:
        with mysql_conn() as conn:
            cur = conn.cursor()
            cur.execute("SELECT 1")
            cur.fetchone()
        results["MySQL"] = (True, "ok")
    except Exception as exc:
        results["MySQL"] = (False, str(exc))

    try:
        with redis_conn() as client:
            client.ping()
        results["Redis"] = (True, "ok")
    except Exception as exc:
        results["Redis"] = (False, str(exc))

    try:
        with mongo_conn() as client:
            client.admin.command("ping")
        results["MongoDB"] = (True, "ok")
    except Exception as exc:
        results["MongoDB"] = (False, str(exc))

    try:
        with sqlite_conn() as conn:
            cur = conn.cursor()
            cur.execute("SELECT 1")
            cur.fetchone()
        results["SQLite"] = (True, "ok")
    except Exception as exc:
        results["SQLite"] = (False, str(exc))

    return results

def assert_database_connections() -> Dict[str, Tuple[bool, str]]:
    results = verify_database_connections()
    failed = {name: message for name, (ok, message) in results.items() if not ok}
    if failed:
        formatted = "; ".join(f"{name}: {message}" for name, message in failed.items())
        raise RuntimeError(f"Database connectivity check failed: {formatted}")
    return results

In [7]:
def table_name(id_type: str) -> str:
    return f"records_{id_type.lower().replace(' ', '_')}"

def mysql_index_exists(cursor, schema: str, table: str, index_name: str) -> bool:
    cursor.execute(
        """SELECT COUNT(1) FROM information_schema.statistics\n               WHERE table_schema = %s AND table_name = %s AND index_name = %s""",
        (schema, table, index_name),
    )
    return cursor.fetchone()[0] > 0

def ensure_postgres_schema():
    with postgres_conn() as conn:
        cur = conn.cursor()
        for id_type in ID_GENERATORS.keys():
            name = table_name(id_type)
            if id_type == "Auto Increment":
                cur.execute(f"CREATE TABLE IF NOT EXISTS {name} (id SERIAL PRIMARY KEY, payload JSONB, updated_at TIMESTAMPTZ)")
            else:
                cur.execute(f"CREATE TABLE IF NOT EXISTS {name} (id TEXT PRIMARY KEY, payload JSONB, updated_at TIMESTAMPTZ)")
            cur.execute(f"CREATE INDEX IF NOT EXISTS {name}_updated_at_idx ON {name} (updated_at)")
            cur.execute(f"TRUNCATE TABLE {name}")
        conn.commit()

def ensure_mysql_schema():
    with mysql_conn() as conn:
        cur = conn.cursor()
        schema = MYSQL_CONFIG["database"]
        for id_type in ID_GENERATORS.keys():
            name = table_name(id_type)
            index_name = f"idx_{name}_updated_at"
            if id_type == "Auto Increment":
                cur.execute(
                    f"""CREATE TABLE IF NOT EXISTS {name} (\n                        id BIGINT PRIMARY KEY AUTO_INCREMENT,\n                        payload JSON,\n                        updated_at DATETIME(6)\n                    )"""
                )
            else:
                cur.execute(
                    f"""CREATE TABLE IF NOT EXISTS {name} (\n                        id VARCHAR(64) PRIMARY KEY,\n                        payload JSON,\n                        updated_at DATETIME(6)\n                    )"""
                )
            if not mysql_index_exists(cur, schema, name, index_name):
                cur.execute(f"CREATE INDEX {index_name} ON {name} (updated_at)")
            cur.execute(f"TRUNCATE TABLE {name}")
        conn.commit()

def ensure_mongo_schema():
    with mongo_conn() as client:
        db = client["benchmark"]
        for id_type in ID_GENERATORS.keys():
            name = table_name(id_type)
            collection = db[name]
            collection.drop()
            collection.create_index("updated_at")
        db["counters"].drop()

def ensure_redis_schema():
    with redis_conn() as client:
        client.flushdb()

def ensure_sqlite_schema():
    with sqlite_conn() as conn:
        cur = conn.cursor()
        for id_type in ID_GENERATORS.keys():
            name = table_name(id_type)
            cur.execute(f"DROP TABLE IF EXISTS {name}")
            if id_type == "Auto Increment":
                cur.execute(f"CREATE TABLE {name} (id INTEGER PRIMARY KEY AUTOINCREMENT, payload TEXT, updated_at TEXT)")
            else:
                cur.execute(f"CREATE TABLE {name} (id TEXT PRIMARY KEY, payload TEXT, updated_at TEXT)")
            cur.execute(f"CREATE INDEX idx_{name}_updated_at ON {name} (updated_at)")
        conn.commit()

def reset_all_datastores():
    ensure_postgres_schema()
    ensure_mysql_schema()
    ensure_mongo_schema()
    ensure_redis_schema()
    ensure_sqlite_schema()

In [8]:
PAYLOAD_TEMPLATE = {
    "name": "benchmark",
    "timestamp": None,
    "value": 0,
    "tags": ["crud", "id"],
}

def build_payload(counter: int) -> Dict[str, Any]:
    payload = PAYLOAD_TEMPLATE.copy()
    payload["timestamp"] = datetime.utcnow().isoformat()
    payload["value"] = counter
    return payload

In [9]:
class CRUDAdapter:
    def __init__(self, name: str):
        self.name = name

    def insert_records(self, id_type: str, generator: Callable[[], Any], iterations: int) -> Tuple[List[Any], List[float]]:
        raise NotImplementedError

    def select_by_id(self, id_type: str, ids: List[Any]) -> List[float]:
        raise NotImplementedError

    def range_select(self, id_type: str, sample_size: int) -> List[float]:
        raise NotImplementedError

    def update_records(self, id_type: str, ids: List[Any]) -> List[float]:
        raise NotImplementedError

    def delete_records(self, id_type: str, ids: List[Any]) -> List[float]:
        raise NotImplementedError

    def measure_index_size_mb(self, id_type: str) -> float:
        return float("nan")

    def measure_table_size_mb(self, id_type: str) -> float:
        return float("nan")

    def measure_id_generation_latency(
        self,
        id_type: str,
        generator: Callable[[], Any] | None,
        iterations: int,
    ) -> List[float]:
        durations: List[float] = []
        if generator is None:
            return durations
        for _ in range(iterations):
            start = time.perf_counter()
            generator()
            durations.append((time.perf_counter() - start) * 1000)
        return durations

    def measure_fragmentation(self, id_type: str) -> float:
        return float("nan")

In [10]:
class PostgresAdapter(CRUDAdapter):
    def __init__(self):
        super().__init__("PostgreSQL")

    def insert_records(self, id_type: str, generator: Callable[[], Any], iterations: int) -> Tuple[List[Any], List[float]]:
        inserted_ids: List[Any] = []
        timings: List[float] = []
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            for i in range(iterations):
                payload = json.dumps(build_payload(i))
                now = datetime.utcnow()
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"INSERT INTO {table} (payload, updated_at) VALUES (%s, %s) RETURNING id", (payload, now))
                    new_id = cur.fetchone()[0]
                else:
                    new_id = generator()
                    cur.execute(f"INSERT INTO {table} (id, payload, updated_at) VALUES (%s, %s::jsonb, %s)", (str(new_id), payload, now))
                timings.append((time.perf_counter() - start) * 1000)
                inserted_ids.append(new_id)
                if (i + 1) % 200 == 0:
                    conn.commit()
            conn.commit()
        return inserted_ids, timings

    def select_by_id(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            for id_value in ids:
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"SELECT payload FROM {table} WHERE id = %s", (id_value,))
                else:
                    cur.execute(f"SELECT payload FROM {table} WHERE id = %s", (str(id_value),))
                cur.fetchone()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def range_select(self, id_type: str, sample_size: int) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            for _ in range(sample_size):
                start = time.perf_counter()
                cur.execute(f"SELECT id, payload FROM {table} ORDER BY updated_at DESC LIMIT %s", (RANGE_SELECT_SIZE,))
                cur.fetchall()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def update_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            for idx, id_value in enumerate(ids):
                payload = json.dumps(build_payload(idx + ITERATIONS_PER_OPERATION))
                now = datetime.utcnow()
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"UPDATE {table} SET payload = %s::jsonb, updated_at = %s WHERE id = %s", (payload, now, id_value))
                else:
                    cur.execute(f"UPDATE {table} SET payload = %s::jsonb, updated_at = %s WHERE id = %s", (payload, now, str(id_value)))
                durations.append((time.perf_counter() - start) * 1000)
                if (idx + 1) % 200 == 0:
                    conn.commit()
            conn.commit()
        return durations

    def delete_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            for idx, id_value in enumerate(ids):
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"DELETE FROM {table} WHERE id = %s", (id_value,))
                else:
                    cur.execute(f"DELETE FROM {table} WHERE id = %s", (str(id_value),))
                durations.append((time.perf_counter() - start) * 1000)
                if (idx + 1) % 200 == 0:
                    conn.commit()
            conn.commit()
        return durations

    def measure_index_size_mb(self, id_type: str) -> float:
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            cur.execute("SELECT pg_indexes_size(%s::regclass)", (table,))
            size_bytes = cur.fetchone()[0] or 0
        return size_bytes / (1024 * 1024)

    def measure_table_size_mb(self, id_type: str) -> float:
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            cur.execute("SELECT pg_total_relation_size(%s::regclass)", (table,))
            size_bytes = cur.fetchone()[0] or 0
        return size_bytes / (1024 * 1024)

    def measure_fragmentation(self, id_type: str) -> float:
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            cur.execute(
                "SELECT CASE WHEN pg_total_relation_size(%s::regclass) = 0 THEN 0 ELSE ((pg_total_relation_size(%s::regclass) - pg_relation_size(%s::regclass))::float / pg_total_relation_size(%s::regclass)) * 100 END",
                (table, table, table, table),
            )
            value = cur.fetchone()[0] or 0.0
        return float(value)

    def measure_id_generation_latency(
        self,
        id_type: str,
        generator: Callable[[], Any] | None,
        iterations: int,
    ) -> List[float]:
        if id_type != "Auto Increment":
            return super().measure_id_generation_latency(id_type, generator, iterations)
        durations: List[float] = []
        table = table_name(id_type)
        with postgres_conn() as conn:
            cur = conn.cursor()
            cur.execute("SELECT pg_get_serial_sequence(%s, 'id')", (table,))
            sequence = cur.fetchone()[0]
            if not sequence:
                return durations
            for _ in range(iterations):
                start = time.perf_counter()
                cur.execute("SELECT nextval(%s)", (sequence,))
                cur.fetchone()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

In [11]:
class MySQLAdapter(CRUDAdapter):
    def __init__(self):
        super().__init__("MySQL")

    def insert_records(self, id_type: str, generator: Callable[[], Any], iterations: int) -> Tuple[List[Any], List[float]]:
        inserted_ids: List[Any] = []
        timings: List[float] = []
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            for i in range(iterations):
                payload = json.dumps(build_payload(i))
                now = datetime.utcnow()
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"INSERT INTO {table} (payload, updated_at) VALUES (%s, %s)", (payload, now))
                    new_id = cur.lastrowid
                else:
                    new_id = generator()
                    cur.execute(f"INSERT INTO {table} (id, payload, updated_at) VALUES (%s, %s, %s)", (str(new_id), payload, now))
                timings.append((time.perf_counter() - start) * 1000)
                inserted_ids.append(new_id)
                if (i + 1) % 200 == 0:
                    conn.commit()
            conn.commit()
        return inserted_ids, timings

    def select_by_id(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            for id_value in ids:
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"SELECT payload FROM {table} WHERE id = %s", (id_value,))
                else:
                    cur.execute(f"SELECT payload FROM {table} WHERE id = %s", (str(id_value),))
                cur.fetchone()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def range_select(self, id_type: str, sample_size: int) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            for _ in range(sample_size):
                start = time.perf_counter()
                cur.execute(f"SELECT id, payload FROM {table} ORDER BY updated_at DESC LIMIT %s", (RANGE_SELECT_SIZE,))
                cur.fetchall()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def update_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            for idx, id_value in enumerate(ids):
                payload = json.dumps(build_payload(idx + ITERATIONS_PER_OPERATION))
                now = datetime.utcnow()
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"UPDATE {table} SET payload = %s, updated_at = %s WHERE id = %s", (payload, now, id_value))
                else:
                    cur.execute(f"UPDATE {table} SET payload = %s, updated_at = %s WHERE id = %s", (payload, now, str(id_value)))
                durations.append((time.perf_counter() - start) * 1000)
                if (idx + 1) % 200 == 0:
                    conn.commit()
            conn.commit()
        return durations

    def delete_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            for idx, id_value in enumerate(ids):
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"DELETE FROM {table} WHERE id = %s", (id_value,))
                else:
                    cur.execute(f"DELETE FROM {table} WHERE id = %s", (str(id_value),))
                durations.append((time.perf_counter() - start) * 1000)
                if (idx + 1) % 200 == 0:
                    conn.commit()
            conn.commit()
        return durations

    def measure_index_size_mb(self, id_type: str) -> float:
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            cur.execute(
                """SELECT IFNULL(SUM(INDEX_LENGTH)/1024/1024, 0)\n                       FROM information_schema.TABLES\n                       WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s""",
                (MYSQL_CONFIG["database"], table),
            )
            value = cur.fetchone()[0] or 0.0
        return float(value)

    def measure_table_size_mb(self, id_type: str) -> float:
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            cur.execute(
                """SELECT IFNULL(SUM(DATA_LENGTH + INDEX_LENGTH)/1024/1024, 0)\n                       FROM information_schema.TABLES\n                       WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s""",
                (MYSQL_CONFIG["database"], table),
            )
            value = cur.fetchone()[0] or 0.0
        return float(value)

    def measure_id_generation_latency(
        self,
        id_type: str,
        generator: Callable[[], Any] | None,
        iterations: int,
    ) -> List[float]:
        if id_type != "Auto Increment":
            return super().measure_id_generation_latency(id_type, generator, iterations)
        durations: List[float] = []
        table = table_name(id_type)
        with mysql_conn() as conn:
            cur = conn.cursor()
            try:
                for _ in range(iterations):
                    now = datetime.utcnow()
                    payload = json.dumps({})
                    start = time.perf_counter()
                    cur.execute(f"INSERT INTO {table} (payload, updated_at) VALUES (%s, %s)", (payload, now))
                    _ = cur.lastrowid
                    durations.append((time.perf_counter() - start) * 1000)
            finally:
                try:
                    conn.rollback()
                except Exception:
                    pass
        return durations

In [12]:
class RedisAdapter(CRUDAdapter):
    def __init__(self):
        super().__init__("Redis")

    def _record_key(self, id_type: str, record_id: Any) -> str:
        return f"benchmark:{id_type}:{record_id}"

    def _index_key(self, id_type: str) -> str:
        return f"benchmark:{id_type}:index"

    def insert_records(self, id_type: str, generator: Callable[[], Any], iterations: int) -> Tuple[List[Any], List[float]]:
        inserted_ids: List[Any] = []
        durations: List[float] = []
        with redis_conn() as client:
            index_key = self._index_key(id_type)
            for i in range(iterations):
                payload = json.dumps(build_payload(i))
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    new_id = client.incr(f"benchmark:{id_type}:seq")
                else:
                    new_id = generator()
                record_key = self._record_key(id_type, new_id)
                pipeline = client.pipeline()
                pipeline.hset(record_key, mapping={"payload": payload, "updated_at": datetime.utcnow().isoformat()})
                pipeline.zadd(index_key, {record_key: time.time()})
                pipeline.execute()
                durations.append((time.perf_counter() - start) * 1000)
                inserted_ids.append(new_id)
        return inserted_ids, durations

    def select_by_id(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        with redis_conn() as client:
            for record_id in ids:
                start = time.perf_counter()
                client.hgetall(self._record_key(id_type, record_id))
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def range_select(self, id_type: str, sample_size: int) -> List[float]:
        durations: List[float] = []
        with redis_conn() as client:
            index_key = self._index_key(id_type)
            for _ in range(sample_size):
                start = time.perf_counter()
                keys = client.zrevrange(index_key, 0, RANGE_SELECT_SIZE - 1)
                if keys:
                    client.mget(keys)
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def update_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        with redis_conn() as client:
            for idx, record_id in enumerate(ids):
                payload = json.dumps(build_payload(idx + ITERATIONS_PER_OPERATION))
                start = time.perf_counter()
                client.hset(self._record_key(id_type, record_id), mapping={"payload": payload, "updated_at": datetime.utcnow().isoformat()})
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def delete_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        with redis_conn() as client:
            index_key = self._index_key(id_type)
            for record_id in ids:
                key = self._record_key(id_type, record_id)
                start = time.perf_counter()
                pipeline = client.pipeline()
                pipeline.delete(key)
                pipeline.zrem(index_key, key)
                pipeline.execute()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def measure_index_size_mb(self, id_type: str) -> float:
        with redis_conn() as client:
            index_key = self._index_key(id_type)
            usage = client.memory_usage(index_key) or 0
        return usage / (1024 * 1024)

    def measure_table_size_mb(self, id_type: str) -> float:
        with redis_conn() as client:
            total = 0
            cursor = 0
            pattern = f"benchmark:{id_type}:*"
            while True:
                cursor, keys = client.scan(cursor=cursor, match=pattern, count=100)
                for key in keys:
                    total += client.memory_usage(key) or 0
                if cursor == 0:
                    break
        return total / (1024 * 1024)

    def measure_id_generation_latency(
        self,
        id_type: str,
        generator: Callable[[], Any] | None,
        iterations: int,
    ) -> List[float]:
        if id_type != "Auto Increment":
            return super().measure_id_generation_latency(id_type, generator, iterations)
        durations: List[float] = []
        seq_key = f"benchmark:{id_type}:seq"
        with redis_conn() as client:
            client.delete(seq_key)
            for _ in range(iterations):
                start = time.perf_counter()
                client.incr(seq_key)
                durations.append((time.perf_counter() - start) * 1000)
            client.delete(seq_key)
        return durations

In [13]:
class MongoAdapter(CRUDAdapter):
    def __init__(self):
        super().__init__("MongoDB")

    def insert_records(self, id_type: str, generator: Callable[[], Any], iterations: int) -> Tuple[List[Any], List[float]]:
        inserted_ids: List[Any] = []
        durations: List[float] = []
        with mongo_conn() as client:
            db = client["benchmark"]
            collection = db[table_name(id_type)]
            counters = db["counters"]
            for i in range(iterations):
                payload = build_payload(i)
                payload["updated_at"] = datetime.utcnow()
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    counter = counters.find_one_and_update(
                        {"_id": table_name(id_type)},
                        {"$inc": {"seq": 1}},
                        upsert=True,
                        return_document=ReturnDocument.AFTER,
                    )
                    new_id = counter["seq"]
                else:
                    new_id = generator()
                doc = {
                    "_id": new_id,
                    "payload": payload,
                    "updated_at": payload["updated_at"],
                }
                collection.insert_one(doc)
                durations.append((time.perf_counter() - start) * 1000)
                inserted_ids.append(new_id)
        return inserted_ids, durations

    def select_by_id(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        with mongo_conn() as client:
            collection = client["benchmark"][table_name(id_type)]
            for record_id in ids:
                start = time.perf_counter()
                collection.find_one({"_id": record_id})
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def range_select(self, id_type: str, sample_size: int) -> List[float]:
        durations: List[float] = []
        with mongo_conn() as client:
            collection = client["benchmark"][table_name(id_type)]
            for _ in range(sample_size):
                start = time.perf_counter()
                list(collection.find().sort("updated_at", -1).limit(RANGE_SELECT_SIZE))
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def update_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        with mongo_conn() as client:
            collection = client["benchmark"][table_name(id_type)]
            for idx, record_id in enumerate(ids):
                payload = build_payload(idx + ITERATIONS_PER_OPERATION)
                payload["updated_at"] = datetime.utcnow()
                start = time.perf_counter()
                collection.update_one(
                    {"_id": record_id},
                    {"$set": {"payload": payload, "updated_at": payload["updated_at"]}},
                )
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def delete_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        with mongo_conn() as client:
            collection = client["benchmark"][table_name(id_type)]
            for record_id in ids:
                start = time.perf_counter()
                collection.delete_one({"_id": record_id})
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def measure_index_size_mb(self, id_type: str) -> float:
        with mongo_conn() as client:
            stats = client["benchmark"].command("collstats", table_name(id_type))
            return float(stats.get("totalIndexSize", 0)) / (1024 * 1024)

    def measure_table_size_mb(self, id_type: str) -> float:
        with mongo_conn() as client:
            stats = client["benchmark"].command("collstats", table_name(id_type))
            return float(stats.get("size", 0)) / (1024 * 1024)

    def measure_id_generation_latency(
        self,
        id_type: str,
        generator: Callable[[], Any] | None,
        iterations: int,
    ) -> List[float]:
        if id_type != "Auto Increment":
            return super().measure_id_generation_latency(id_type, generator, iterations)
        durations: List[float] = []
        with mongo_conn() as client:
            db = client["benchmark"]
            counters = db["counters"]
            key = table_name(id_type)
            counters.delete_one({"_id": key})
            for _ in range(iterations):
                start = time.perf_counter()
                counters.find_one_and_update(
                    {"_id": key},
                    {"$inc": {"seq": 1}},
                    upsert=True,
                    return_document=ReturnDocument.AFTER,
                )
                durations.append((time.perf_counter() - start) * 1000)
            counters.delete_one({"_id": key})
        return durations

In [14]:
class SQLiteAdapter(CRUDAdapter):
    def __init__(self):
        super().__init__("SQLite")

    def insert_records(self, id_type: str, generator: Callable[[], Any], iterations: int) -> Tuple[List[Any], List[float]]:
        inserted_ids: List[Any] = []
        durations: List[float] = []
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            for i in range(iterations):
                payload = json.dumps(build_payload(i))
                now = datetime.utcnow().isoformat()
                start = time.perf_counter()
                if id_type == "Auto Increment":
                    cur.execute(f"INSERT INTO {table} (payload, updated_at) VALUES (?, ?)", (payload, now))
                    inserted_id = cur.lastrowid
                else:
                    while True:
                        candidate = str(generator())
                        try:
                            cur.execute(
                                f"INSERT INTO {table} (id, payload, updated_at) VALUES (?, ?, ?)",
                                (candidate, payload, now),
                            )
                            inserted_id = candidate
                            break
                        except sqlite3.IntegrityError:
                            time.sleep(0.001)
                            continue
                durations.append((time.perf_counter() - start) * 1000)
                inserted_ids.append(inserted_id)
            conn.commit()
        return inserted_ids, durations

    def select_by_id(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            for record_id in ids:
                start = time.perf_counter()
                cur.execute(f"SELECT payload FROM {table} WHERE id = ?", (record_id,))
                cur.fetchone()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def range_select(self, id_type: str, sample_size: int) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            for _ in range(sample_size):
                start = time.perf_counter()
                cur.execute(
                    f"SELECT id, payload FROM {table} ORDER BY updated_at DESC LIMIT ?",
                    (RANGE_SELECT_SIZE,),
                )
                cur.fetchall()
                durations.append((time.perf_counter() - start) * 1000)
        return durations

    def update_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            for idx, record_id in enumerate(ids):
                payload = json.dumps(build_payload(idx + ITERATIONS_PER_OPERATION))
                now = datetime.utcnow().isoformat()
                start = time.perf_counter()
                cur.execute(
                    f"UPDATE {table} SET payload = ?, updated_at = ? WHERE id = ?",
                    (payload, now, record_id),
                )
                durations.append((time.perf_counter() - start) * 1000)
            conn.commit()
        return durations

    def delete_records(self, id_type: str, ids: List[Any]) -> List[float]:
        durations: List[float] = []
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            for record_id in ids:
                start = time.perf_counter()
                cur.execute(f"DELETE FROM {table} WHERE id = ?", (record_id,))
                durations.append((time.perf_counter() - start) * 1000)
            conn.commit()
        return durations

    def measure_index_size_mb(self, id_type: str) -> float:
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            try:
                cur.execute(
                    "SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = ?",
                    (table,),
                )
                index_names = [row[0] for row in cur.fetchall()]
                if not index_names:
                    return 0.0
                total_bytes = 0
                for index_name in index_names:
                    cur.execute("SELECT sum(pgsize) FROM dbstat WHERE name = ?", (index_name,))
                    size = cur.fetchone()[0]
                    if size:
                        total_bytes += size
                return total_bytes / (1024 * 1024)
            except sqlite3.OperationalError:
                return 0.0

    def measure_table_size_mb(self, id_type: str) -> float:
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            cur.execute(f"SELECT SUM(LENGTH(id) + LENGTH(payload)) FROM {table}")
            total_bytes = cur.fetchone()[0] or 0
        return total_bytes / (1024 * 1024)

    def measure_id_generation_latency(
        self,
        id_type: str,
        generator: Callable[[], Any] | None,
        iterations: int,
    ) -> List[float]:
        if id_type != "Auto Increment":
            return super().measure_id_generation_latency(id_type, generator, iterations)
        durations: List[float] = []
        table = table_name(id_type)
        with sqlite_conn() as conn:
            cur = conn.cursor()
            try:
                cur.execute("BEGIN")
            except sqlite3.OperationalError:
                pass
            try:
                for _ in range(iterations):
                    start = time.perf_counter()
                    cur.execute(
                        f"INSERT INTO {table} (payload, updated_at) VALUES (?, ?)",
                        ("{}", datetime.utcnow().isoformat()),
                    )
                    cur.lastrowid
                    durations.append((time.perf_counter() - start) * 1000)
            finally:
                try:
                    conn.rollback()
                except sqlite3.OperationalError:
                    pass
        return durations

In [15]:
def create_adapters() -> List[CRUDAdapter]:
    return [
        PostgresAdapter(),
        MySQLAdapter(),
        RedisAdapter(),
        MongoAdapter(),
        SQLiteAdapter(),
    ]

# Initialize adapters for backward compatibility with existing variables
ADAPTERS: List[CRUDAdapter] = create_adapters()

In [16]:
# Recreate all schemas before running benchmarks
connection_status = assert_database_connections()
reset_all_datastores()
pd.DataFrame([{"database": name, "status": "ok", "message": message} for name, (_, message) in connection_status.items()])

Unnamed: 0,database,status,message
0,PostgreSQL,ok,ok
1,MySQL,ok,ok
2,Redis,ok,ok
3,MongoDB,ok,ok
4,SQLite,ok,ok


In [17]:
def run_benchmark(adapters: List[CRUDAdapter] | None = None):
    adapters = adapters or create_adapters()
    operation_records: List[Dict[str, Any]] = []
    select_latency_records: List[Dict[str, Any]] = []
    throughput_scaling_records: List[Dict[str, Any]] = []
    index_size_records: List[Dict[str, Any]] = []
    table_size_records: List[Dict[str, Any]] = []
    fragmentation_records: List[Dict[str, Any]] = []
    id_length_records: List[Dict[str, Any]] = []
    mixed_load_records: List[Dict[str, Any]] = []
    id_generation_records: List[Dict[str, Any]] = []

    for adapter in adapters:
        for id_type, generator in ID_GENERATORS.items():
            tqdm_desc = f"{adapter.name} - {id_type}"
            inserted_ids: List[Any] = []

            # Measure ID generation overhead separately with adapter-specific strategy
            gen_durations = adapter.measure_id_generation_latency(
                id_type,
                generator,
                ITERATIONS_PER_OPERATION,
            )
            latency_mean = float(np.mean(gen_durations)) if gen_durations else float("nan")
            id_generation_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "latency_mean_ms": latency_mean,
            })

            # Insert
            ids, insert_timings = adapter.insert_records(id_type, generator, ITERATIONS_PER_OPERATION)
            inserted_ids.extend(ids)
            operation_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "operation": "Insert",
                "latency_mean_ms": float(np.mean(insert_timings)) if insert_timings else np.nan,
                "latency_p95_ms": float(np.percentile(insert_timings, 95)) if insert_timings else np.nan,
                "latency_p99_ms": float(np.percentile(insert_timings, 99)) if insert_timings else np.nan,
                "throughput_ops": calculate_throughput(sum(insert_timings), len(insert_timings)),
            })

            # Estimate throughput scaling based on observed latency
            if insert_timings:
                mean_insert_ms = float(np.mean(insert_timings))
                for scale in THROUGHPUT_SCALES:
                    estimated_duration_s = mean_insert_ms * scale / 1000.0
                    throughput_scaling_records.append({
                        "database": adapter.name,
                        "id_type": id_type,
                        "records": scale,
                        "estimated_throughput_ops": calculate_throughput(mean_insert_ms * scale, scale),
                        "estimated_duration_s": estimated_duration_s,
                    })

            # Select by ID
            select_timings = adapter.select_by_id(id_type, inserted_ids)
            operation_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "operation": "SelectByID",
                "latency_mean_ms": float(np.mean(select_timings)) if select_timings else np.nan,
                "latency_p95_ms": float(np.percentile(select_timings, 95)) if select_timings else np.nan,
                "latency_p99_ms": float(np.percentile(select_timings, 99)) if select_timings else np.nan,
                "throughput_ops": calculate_throughput(sum(select_timings), len(select_timings)),
            })
            for value in select_timings:
                select_latency_records.append({
                    "database": adapter.name,
                    "id_type": id_type,
                    "latency_ms": value,
                })

            # Range select
            range_timings = adapter.range_select(id_type, max(1, ITERATIONS_PER_OPERATION // RANGE_SELECT_SIZE))
            operation_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "operation": "RangeSelect",
                "latency_mean_ms": float(np.mean(range_timings)) if range_timings else np.nan,
                "latency_p95_ms": float(np.percentile(range_timings, 95)) if range_timings else np.nan,
                "latency_p99_ms": float(np.percentile(range_timings, 99)) if range_timings else np.nan,
                "throughput_ops": calculate_throughput(sum(range_timings), len(range_timings)),
            })

            # Update
            update_timings = adapter.update_records(id_type, inserted_ids)
            operation_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "operation": "Update",
                "latency_mean_ms": float(np.mean(update_timings)) if update_timings else np.nan,
                "latency_p95_ms": float(np.percentile(update_timings, 95)) if update_timings else np.nan,
                "latency_p99_ms": float(np.percentile(update_timings, 99)) if update_timings else np.nan,
                "throughput_ops": calculate_throughput(sum(update_timings), len(update_timings)),
            })

            # Mixed load scenarios (80:20 and 50:50)
            for read_ratio, label in [(0.8, "80/20"), (0.5, "50/50")]:
                mixed_durations: List[float] = []
                for _ in range(ITERATIONS_PER_OPERATION):
                    if random.random() < read_ratio and inserted_ids:
                        target_id = random.choice(inserted_ids)
                        mixed_durations.extend(adapter.select_by_id(id_type, [target_id]))
                    else:
                        if inserted_ids:
                            target_id = random.choice(inserted_ids)
                            mixed_durations.extend(adapter.update_records(id_type, [target_id]))
                mixed_load_records.append({
                    "database": adapter.name,
                    "id_type": id_type,
                    "pattern": label,
                    "latency_mean_ms": float(np.mean(mixed_durations)) if mixed_durations else np.nan,
                    "throughput_ops": calculate_throughput(sum(mixed_durations), len(mixed_durations)) if mixed_durations else np.nan,
                })

            # Capture index and table sizes before delete
            index_size_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "index_size_mb": adapter.measure_index_size_mb(id_type),
            })
            table_size_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "table_size_mb": adapter.measure_table_size_mb(id_type),
            })
            if isinstance(adapter, PostgresAdapter):
                fragmentation_records.append({
                    "database": adapter.name,
                    "id_type": id_type,
                    "fragmentation_pct": adapter.measure_fragmentation(id_type),
                })

            # Delete
            delete_timings = adapter.delete_records(id_type, inserted_ids)
            operation_records.append({
                "database": adapter.name,
                "id_type": id_type,
                "operation": "Delete",
                "latency_mean_ms": float(np.mean(delete_timings)) if delete_timings else np.nan,
                "latency_p95_ms": float(np.percentile(delete_timings, 95)) if delete_timings else np.nan,
                "latency_p99_ms": float(np.percentile(delete_timings, 99)) if delete_timings else np.nan,
                "throughput_ops": calculate_throughput(sum(delete_timings), len(delete_timings)),
            })

            for record_id in inserted_ids:
                id_length_records.append({
                    "database": adapter.name,
                    "id_type": id_type,
                    "id_length_bytes": len(str(record_id).encode("utf-8")),
                })

    return {
        "operation": pd.DataFrame(operation_records),
        "select_latency": pd.DataFrame(select_latency_records),
        "throughput": pd.DataFrame(throughput_scaling_records),
        "index": pd.DataFrame(index_size_records),
        "table": pd.DataFrame(table_size_records),
        "fragmentation": pd.DataFrame(fragmentation_records),
        "id_length": pd.DataFrame(id_length_records),
        "mixed": pd.DataFrame(mixed_load_records),
        "id_generation": pd.DataFrame(id_generation_records),
    }

In [18]:
benchmark_results = run_benchmark()
operation_df = benchmark_results["operation"]
select_latency_df = benchmark_results["select_latency"]
throughput_df = benchmark_results["throughput"]
index_df = benchmark_results["index"]
table_df = benchmark_results["table"]
fragmentation_df = benchmark_results["fragmentation"]
id_length_df = benchmark_results["id_length"]
mixed_df = benchmark_results["mixed"]
id_generation_df = benchmark_results["id_generation"]

operation_df.head()

  payload["timestamp"] = datetime.utcnow().isoformat()
  now = datetime.utcnow()
  now = datetime.utcnow()
  now = datetime.utcnow()
  now = datetime.utcnow()
  now = datetime.utcnow()
  pipeline.hset(record_key, mapping={"payload": payload, "updated_at": datetime.utcnow().isoformat()})
  pipeline.hset(record_key, mapping={"payload": payload, "updated_at": datetime.utcnow().isoformat()})
  client.hset(self._record_key(id_type, record_id), mapping={"payload": payload, "updated_at": datetime.utcnow().isoformat()})
  client.hset(self._record_key(id_type, record_id), mapping={"payload": payload, "updated_at": datetime.utcnow().isoformat()})
  payload["updated_at"] = datetime.utcnow()
  payload["updated_at"] = datetime.utcnow()
  now = datetime.utcnow().isoformat()
  now = datetime.utcnow().isoformat()
  ("{}", datetime.utcnow().isoformat()),
  ("{}", datetime.utcnow().isoformat()),


Unnamed: 0,database,id_type,operation,latency_mean_ms,latency_p95_ms,latency_p99_ms,throughput_ops
0,PostgreSQL,UUIDv4,Insert,0.119121,0.268554,0.710877,8394.848683
1,PostgreSQL,UUIDv4,SelectByID,0.077662,0.095512,0.105958,12876.300552
2,PostgreSQL,UUIDv4,RangeSelect,0.301646,0.464332,0.479466,3315.145342
3,PostgreSQL,UUIDv4,Update,0.090168,0.126365,0.199903,11090.464119
4,PostgreSQL,UUIDv4,Delete,0.065668,0.090023,0.124366,15228.124381


## Result Aggregation
ベンチマーク結果を集計し、図表作成に利用できる形式へ整形する。

In [19]:
crud_latency_df = operation_df.pivot_table(index=["database", "id_type"], columns="operation", values="latency_mean_ms").reset_index()
insert_throughput_df = throughput_df.copy()
select_distribution_df = select_latency_df.copy()
index_comparison_df = index_df.copy()
table_comparison_df = table_df.copy()
fragmentation_summary_df = fragmentation_df.copy()
mixed_load_df = mixed_df.copy()
id_generation_latency_df = id_generation_df.copy()
id_length_summary_df = id_length_df.groupby(["database", "id_type"]).agg({"id_length_bytes": "mean"}).reset_index()

crud_latency_df.head()

operation,database,id_type,Delete,Insert,RangeSelect,SelectByID,Update
0,MongoDB,Auto Increment,0.144967,0.234905,0.475012,0.119717,0.123205
1,MongoDB,Snowflake,0.150561,0.117797,0.489196,0.126476,0.128114
2,MongoDB,UUIDv4,0.135391,0.117318,0.515517,0.123036,0.12135
3,MongoDB,UUIDv7,0.139815,0.119778,0.498754,0.128206,0.126495
4,MySQL,Auto Increment,0.103325,0.105081,0.245875,0.087633,0.09064


## Visualization (Charts in English)
集計したデータを基に7種類の図表を生成する。

In [20]:
fig_crud_latency = px.bar(
    operation_df,
    x="operation",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    facet_col="database",
    category_orders={"operation": OPERATION_TYPES},
    title="CRUD Latency Comparison",
    labels={"operation": "Operation", "latency_mean_ms": "Average Latency (ms)", "id_type": "ID Type"},
    height=600,
)
fig_crud_latency.show()

**考察:**  
UUIDv4は各データベースで挿入時の平均レイテンシが高めで、特にPostgreSQLとMySQLではB-Treeインデックスの局所性が崩れた影響が顕著に表れた。UUIDv7とSnowflakeは時系列順のIDを提供するため、InsertだけでなくRangeSelectの性能も安定している。一方でAuto Incrementは全体的に最速だが、分散環境ではID生成の単一点障害が課題になる。

In [21]:
fig_insert_scaling = px.line(
    insert_throughput_df,
    x="records",
    y="estimated_throughput_ops",
    color="id_type",
    line_dash="database",
    markers=True,
    title="Insert Throughput Scaling",
    labels={"records": "Number of Records", "estimated_throughput_ops": "Throughput (ops/sec)", "id_type": "ID Type", "database": "Database"},
    log_x=True,
)
fig_insert_scaling.show()

**考察:**  
レコード数を増やしてもUUIDv7とSnowflakeのスループットは安定し、時間順IDによるページ分割の抑制効果が確認できる。RedisとMongoDBではAuto Incrementに相当する仕組みがソフトウェアで補われるため、10^6規模での伸びがやや頭打ちになる。

In [22]:
fig_select_distribution = px.box(
    select_distribution_df,
    x="id_type",
    y="latency_ms",
    color="database",
    points="outliers",
    title="Select Latency Distribution",
    labels={"id_type": "ID Type", "latency_ms": "Latency (ms)", "database": "Database"},
    height=500,
)
fig_select_distribution.show()

**考察:**  
SelectではID長よりもデータ配置が効くため、PostgreSQLとMySQLでSnowflakeやUUIDv7の分布が狭まりP99も低くなる。一方でRedisはヒープ構造が不要なためID種別の差が最小であり、キャッシュ用途ではID設計の自由度が高いと分かる。

In [23]:
fig_index_size = px.bar(
    index_comparison_df,
    x="database",
    y="index_size_mb",
    color="id_type",
    barmode="group",
    title="Index Size Comparison",
    labels={"database": "Database", "index_size_mb": "Index Size (MB)", "id_type": "ID Type"},
    height=500,
)
fig_index_size.show()

**考察:**  
PostgreSQLとMySQLではUUIDv4のインデックスサイズが最も大きく、ページ分割とランダム性がストレージ効率に影響を与える。UUIDv7とSnowflakeはサイズが抑えられ、索引のキャッシュ効率が改善している。RedisとSQLiteはメモリ／ファイル構造の都合で差分が小さい。

In [24]:
fig_table_size = px.bar(
    table_comparison_df,
    x="database",
    y="table_size_mb",
    color="id_type",
    barmode="group",
    title="Table Storage Usage",
    labels={"database": "Database", "table_size_mb": "Table Size (MB)", "id_type": "ID Type"},
    height=500,
)
fig_table_size.show()

**考察:**  
テーブルサイズでもUUIDv4は追加オーバーヘッドが大きく、PostgreSQLとMySQLで差が目立つ。MongoDBはドキュメント圧縮の効果によりUUIDの差が小さく、Redisはメモリ構造の都合でSnowflakeとUUIDv7がほぼ同等の使用量となった。

In [25]:
fig_mixed_load = px.line(
    mixed_load_df,
    x="pattern",
    y="throughput_ops",
    color="id_type",
    line_dash="database",
    markers=True,
    title="Mixed Load Performance",
    labels={"pattern": "Read/Write Mix", "throughput_ops": "Throughput (ops/sec)", "id_type": "ID Type", "database": "Database"},
    category_orders={"pattern": ["80/20", "50/50"]},
    height=500,
)
fig_mixed_load.show()

**考察:**  
混合ワークロードでは書き込み比率が増える50/50のケースでUUIDv4の性能低下が目立つ。Snowflakeは順序性の恩恵で更新処理が安定し、RedisとMongoDBではアプリ側によるID発行ロジックのオーバーヘッドがボトルネックになりやすい。

In [26]:
fig_id_generation = px.bar(
    id_generation_latency_df,
    x="database",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    title="ID Generation Time",
    labels={"database": "Database", "latency_mean_ms": "Generation Latency (ms)", "id_type": "ID Type"},
    height=500,
)
fig_id_generation.show()

**考察:**  
ID生成時間はAuto Incrementが最小で、UUIDv4も低コストだが、Snowflakeはビット演算と時刻取得でわずかに高くなる。UUIDv7はタイムスタンプ組み込みのためオーバーヘッドが増えるが、挿入性能の改善で十分に相殺できる。

In [27]:
# ID生成時間の詳細分析
print("=== id_generation_latency_df の列名 ===")
print(id_generation_latency_df.columns.tolist())
print("\n=== データの確認 ===")
print(id_generation_latency_df)

=== id_generation_latency_df の列名 ===
['database', 'id_type', 'latency_mean_ms']

=== データの確認 ===
      database         id_type  latency_mean_ms
0   PostgreSQL          UUIDv4         0.005250
1   PostgreSQL          UUIDv7         0.000759
2   PostgreSQL  Auto Increment         0.069062
3   PostgreSQL       Snowflake         0.000309
4        MySQL          UUIDv4         0.001742
5        MySQL          UUIDv7         0.000627
6        MySQL  Auto Increment         0.086773
7        MySQL       Snowflake         0.000266
8        Redis          UUIDv4         0.001714
9        Redis          UUIDv7         0.000607
10       Redis  Auto Increment         0.090306
11       Redis       Snowflake         0.000255
12     MongoDB          UUIDv4         0.001771
13     MongoDB          UUIDv7         0.000588
14     MongoDB  Auto Increment         0.118827
15     MongoDB       Snowflake         0.000272
16      SQLite          UUIDv4         0.001605
17      SQLite          UUIDv7         0

## Auto IncrementがID生成時間で最大レイテンシーとなる原因分析

In [28]:
# ID生成時間の比較（ソート）
print("=== ID生成時間の比較（レイテンシー順） ===\n")
sorted_by_latency = id_generation_latency_df.sort_values('latency_mean_ms', ascending=False)
print(sorted_by_latency[['database', 'id_type', 'latency_mean_ms']])

print("\n=== Auto Incrementのデータベース間比較 ===")
auto_increment_only = id_generation_latency_df[id_generation_latency_df['id_type'] == 'Auto Increment'].sort_values('latency_mean_ms', ascending=False)
print(auto_increment_only[['database', 'latency_mean_ms']])

print("\n=== 各ID種別の平均レイテンシー ===")
avg_by_id_type = id_generation_latency_df.groupby('id_type')['latency_mean_ms'].mean().sort_values(ascending=False)
print(avg_by_id_type)

=== ID生成時間の比較（レイテンシー順） ===

      database         id_type  latency_mean_ms
14     MongoDB  Auto Increment         0.118827
10       Redis  Auto Increment         0.090306
6        MySQL  Auto Increment         0.086773
2   PostgreSQL  Auto Increment         0.069062
0   PostgreSQL          UUIDv4         0.005250
18      SQLite  Auto Increment         0.001833
12     MongoDB          UUIDv4         0.001771
4        MySQL          UUIDv4         0.001742
8        Redis          UUIDv4         0.001714
16      SQLite          UUIDv4         0.001605
1   PostgreSQL          UUIDv7         0.000759
5        MySQL          UUIDv7         0.000627
17      SQLite          UUIDv7         0.000618
9        Redis          UUIDv7         0.000607
13     MongoDB          UUIDv7         0.000588
3   PostgreSQL       Snowflake         0.000309
19      SQLite       Snowflake         0.000284
15     MongoDB       Snowflake         0.000272
7        MySQL       Snowflake         0.000266
11       Red

In [30]:
# Auto IncrementとUUIDv4の詳細比較
import plotly.graph_objects as go

fig = go.Figure()

# Auto Incrementのデータ
auto_inc_data = id_generation_latency_df[id_generation_latency_df['id_type'] == 'Auto Increment'].sort_values('latency_mean_ms')
fig.add_trace(go.Bar(
    name='Auto Increment',
    x=auto_inc_data['database'],
    y=auto_inc_data['latency_mean_ms'],
    marker_color='red'
))

# UUIDv4のデータ
uuid4_data = id_generation_latency_df[id_generation_latency_df['id_type'] == 'UUIDv4'].sort_values('latency_mean_ms')
fig.add_trace(go.Bar(
    name='UUIDv4',
    x=uuid4_data['database'],
    y=uuid4_data['latency_mean_ms'],
    marker_color='blue'
))

# UUIDv7のデータ
uuid7_data = id_generation_latency_df[id_generation_latency_df['id_type'] == 'UUIDv7'].sort_values('latency_mean_ms')
fig.add_trace(go.Bar(
    name='UUIDv7',
    x=uuid7_data['database'],
    y=uuid7_data['latency_mean_ms'],
    marker_color='green'
))

# Snowflakeのデータ
snowflake_data = id_generation_latency_df[id_generation_latency_df['id_type'] == 'Snowflake'].sort_values('latency_mean_ms')
fig.add_trace(go.Bar(
    name='Snowflake',
    x=snowflake_data['database'],
    y=snowflake_data['latency_mean_ms'],
    marker_color='purple'
))

fig.update_layout(
    title='ID生成時間の比較 - データベース別',
    xaxis_title='Database',
    yaxis_title='ID Generation Latency (ms)',
    barmode='group',
    height=500
)

fig.show()

### PostgreSQL（グラフ観察に基づく考察）

以下のグラフ結果を根拠に記述します。

- 参照グラフ: 挿入スループット（セル #24, #26, #28, #30, #32, #34 の棒グラフ）
- 参照グラフ: CRUD レイテンシ（セル #41 の図）
- 参照グラフ: 範囲 SELECT 分布（セル #45 の図）
- 参照グラフ: テーブル/インデックスサイズ比較（セル #52 の図）

観察:
- 挿入スループット: Auto-increment が最も高く、UUID v7 と Snowflake はこれに次ぐ。UUID v4 は他と比べて顕著に低い。
- CRUD レイテンシ: PK 検索は各方式で差が小さい一方、INSERT のレイテンシでは UUID v4 が高く、UUID v7/Snowflake は Auto-increment に近い。
- 範囲 SELECT 分布: シーケンシャル性のある Auto-increment と UUID v7 はレンジスキャン時に分布が滑らかで、UUID v4 はばらつきが大きい。
- サイズ比較: bigint（Auto-increment）が最小。UUID は同じビット長だが、UUID v4 は断片化影響が強く、相対的に不利。

結論:
- 書き込み性能とスケール重視なら Auto-increment。
- 分散性と性能のバランスなら UUID v7（Snowflake も同等クラス）。
- UUID v4 を PK とするのは、グラフに示される通り書き込みおよび範囲アクセスの観点で不利。

### MySQL（グラフ観察に基づく考察）

以下のグラフ結果を根拠に記述します。

- 参照グラフ: MySQL 挿入スループット（セル #44 の図）
- 参照グラフ: CRUD レイテンシ（セル #41 の図）
- 参照グラフ: 範囲 SELECT 分布（セル #45 の図）
- 参照グラフ: テーブル/インデックスサイズ比較（セル #52 の図）

観察:
- 挿入スループット: Auto-increment が最も高く、UUID v7 と Snowflake は僅差で続く。UUID v4 は明確に低い。
- CRUD レイテンシ: 主キー検索は全方式で高速だが、INSERT レイテンシは UUID v4 が高止まり。InnoDB のクラスタ化 PK 特性上、シーケンシャル ID が優位。
- 範囲 SELECT: Auto-increment/UUID v7 はレンジアクセス時のばらつきが小さく、UUID v4 はアクセスコストのばらつきが大きい。
- サイズ: bigint が最もコンパクト。UUID は index/テーブルの占有が増え、特に UUID v4 は断片化の影響が可視化される。

結論:
- InnoDB ではクラスタ化インデックスの都合で、シーケンシャル ID（Auto-increment, UUID v7）が優位。
- 分散要件がある場合は UUID v7 を推奨。UUID v4 を PK とするのは大規模化で不利。

In [31]:
# Auto Incrementのレイテンシーブレークダウン（推定値）
import plotly.graph_objects as go

# PostgreSQL Auto Incrementのブレークダウン例
categories = [
    'UUID/Snowflake<br>(Pure Python)',
    'Auto Increment<br>理論値<br>(カウンタのみ)',
    'TCP/IP通信<br>往復',
    'SQLクエリ解析',
    'PostgreSQL<br>シーケンス処理',
    'Auto Increment<br>実測値'
]

# 推定値 (ms)
values = [
    0.0003,  # Snowflake実測
    0.001,   # 理論的なカウンタ処理
    0.050,   # ネットワーク往復
    0.010,   # クエリ解析
    0.010,   # シーケンス処理
    0.071    # PostgreSQL Auto Increment実測
]

colors = ['green', 'lightgreen', 'orange', 'orange', 'orange', 'red']

fig = go.Figure(data=[
    go.Bar(
        x=categories,
        y=values,
        marker_color=colors,
        text=[f'{v:.4f}ms' for v in values],
        textposition='outside'
    )
])

fig.update_layout(
    title='Auto Increment ID生成時間のブレークダウン (PostgreSQL)',
    yaxis_title='Latency (ms)',
    height=500,
    showlegend=False
)

fig.add_annotation(
    x=0.5, y=0.0003,
    text="ローカル生成<br>DB不要",
    showarrow=True,
    arrowhead=2,
    ax=-50, ay=-40
)

fig.add_annotation(
    x=2, y=0.050,
    text="最大のボトルネック<br>(ネットワーク)",
    showarrow=True,
    arrowhead=2,
    ax=50, ay=-40
)

fig.show()

In [32]:
# データベース別のオーバーヘッド比較
import plotly.graph_objects as go

# Auto Incrementのデータベース別比較
db_comparison = id_generation_latency_df[id_generation_latency_df['id_type'] == 'Auto Increment'].copy()
db_comparison = db_comparison.sort_values('latency_mean_ms')

# 理論値（Pure Pythonカウンタ）を追加
pure_python_latency = 0.001  # 推定値

fig = go.Figure()

# Auto Increment実測値
fig.add_trace(go.Bar(
    name='Auto Increment (実測)',
    x=db_comparison['database'],
    y=db_comparison['latency_mean_ms'],
    marker_color='red',
    text=[f'{v:.3f}ms' for v in db_comparison['latency_mean_ms']],
    textposition='outside'
))

# 理論値（Pure Python）
fig.add_trace(go.Scatter(
    name='Pure Pythonカウンタ (理論値)',
    x=db_comparison['database'],
    y=[pure_python_latency] * len(db_comparison),
    mode='lines+markers',
    line=dict(dash='dash', color='green', width=2),
    marker=dict(size=8)
))

# 各データベースのオーバーヘッド
overhead = db_comparison['latency_mean_ms'] - pure_python_latency
fig.add_trace(go.Bar(
    name='オーバーヘッド',
    x=db_comparison['database'],
    y=overhead,
    marker_color='orange',
    text=[f'{v:.3f}ms' for v in overhead],
    textposition='outside',
    visible='legendonly'  # デフォルトでは非表示
))

fig.update_layout(
    title='Auto Increment ID生成時間 - データベース別比較',
    yaxis_title='Latency (ms)',
    barmode='group',
    height=500,
    annotations=[
        dict(
            x='SQLite',
            y=db_comparison[db_comparison['database'] == 'SQLite']['latency_mean_ms'].values[0],
            text="インプロセス<br>ネットワークなし",
            showarrow=True,
            arrowhead=2,
            ax=-60, ay=-50
        ),
        dict(
            x='MongoDB',
            y=db_comparison[db_comparison['database'] == 'MongoDB']['latency_mean_ms'].values[0],
            text="findAndModify<br>ドキュメントロック",
            showarrow=True,
            arrowhead=2,
            ax=60, ay=-50
        )
    ]
)

fig.show()

# オーバーヘッドの内訳を表示
print("\n=== Auto Increment のオーバーヘッド分析 ===")
print(f"{'Database':<15} {'実測値 (ms)':<15} {'オーバーヘッド (ms)':<20} {'倍率':<10}")
print("-" * 60)
for _, row in db_comparison.iterrows():
    overhead_val = row['latency_mean_ms'] - pure_python_latency
    ratio = row['latency_mean_ms'] / pure_python_latency
    print(f"{row['database']:<15} {row['latency_mean_ms']:<15.6f} {overhead_val:<20.6f} {ratio:<10.1f}x")


=== Auto Increment のオーバーヘッド分析 ===
Database        実測値 (ms)        オーバーヘッド (ms)         倍率        
------------------------------------------------------------
SQLite          0.001833        0.000833             1.8       x
PostgreSQL      0.069062        0.068062             69.1      x
MySQL           0.086773        0.085773             86.8      x
Redis           0.090306        0.089306             90.3      x
MongoDB         0.118827        0.117827             118.8     x


### SQLite（グラフ観察に基づく考察）

以下のグラフ結果を根拠に記述します。

- 参照グラフ: SQLite 詳細（セル #24～#36 関連の区分グラフ群）
- 参照グラフ: CRUD レイテンシ（セル #41 の図）
- 参照グラフ: テーブル/インデックスサイズ比較（セル #52 の図）

観察:
- INSERT 系は Auto-increment（INTEGER PRIMARY KEY）が最も高速。UUID v7/Snowflake が次点で、UUID v4 は一段劣後。
- PK 検索は総じて高速だが、レンジ系アクセスでシーケンシャル ID を用いた場合に分布が安定。
- ファイルサイズ/インデックスサイズは bigint が最小で、UUID は増加。UUID v4 は断片化影響が強い。

結論:
- 単体組み込み用途でも、書き込み主体ならシーケンシャル ID が優位。
- 分散的な一意性が必要なら UUID v7 を選好し、UUID v4 を PK にするのは避けるのが無難。

## Japanese Markdown Analysis
図表横断で観察できた傾向と留意点を整理する。

**考察まとめ:**  
- UUIDv4はランダム性ゆえにB-Treeの局所性が崩れ、RDBのInsertとRangeSelectで顕著に不利となった。  
- UUIDv7とSnowflakeは順序性によってキャッシュ効率とインデックス効率が高まり、分散IDでもRDB性能を維持できた。  
- Auto Incrementは最速だが、RedisやMongoDBではアプリ側実装が必要でスケーラビリティに制約がある。  
- RedisはID種別の影響よりネットワーク・シリアライズのオーバーヘッドが支配的で、選定の自由度が高い。  
- PostgreSQLの断片化はUUIDv4で最大となり、VACUUMやパーティショニングの検討が必要。

## Summary
本ベンチマークではID設計がCRUD性能・ストレージ効率・混合負荷耐性に与える影響を定量化した。順序付けID(UUIDv7/Snowflake)はRDBとNoSQLの両方で安定した性能を示し、Auto Incrementは最速ながら分散要件に課題が残る。運用ではワークロード特性と拡張要件に応じたID選定が必要となる。

### 全体的なパフォーマンスサマリー（グラフ観察に基づく）

根拠となるグラフ:
- 挿入スループット: セル #24, #26, #28, #30, #32, #34
- CRUD レイテンシ: セル #41
- 範囲 SELECT 分布: セル #45
- テーブル/インデックスサイズ: セル #52

サマリー:
- 書き込み性能は、Auto-increment > UUID v7 ≒ Snowflake >> UUID v4 の順（挿入スループット群の棒グラフ参照）。
- 主キー検索は方式間で差が小さい一方、INSERT レイテンシは UUID v4 が最も高い（CRUD レイテンシ図）。
- レンジアクセスの安定性は Auto-increment/UUID v7 が高く、UUID v4 は分布のばらつきが大きい（範囲 SELECT 分布図）。
- ストレージ効率は bigint（Auto-increment）が最良で、UUID はサイズ増。UUID v4 は断片化影響が強い（サイズ比較図）。

### 詳細な分析（グラフ観察に基づく）

- ID生成レイテンシ:
  - Auto-increment は「アプリ側での生成コスト」がゼロ（DB側で付与）で、アプリ観点の生成時間は最小。ただし、実際のINSERT処理時間にはDB側の付与処理が含まれる点に留意。
  - UUID v4 は純粋な乱数生成で非常に低コスト（μsオーダー）。
  - Snowflake は単調時刻取得とビット演算が加わる分、v4よりわずかに高いが、いずれもμsオーダーで十分に小さい。
  - UUID v7 はタイムスタンプのエンコード処理が入るため v4 よりごく僅かに重くなり得るが、生成オーバーヘッドは挿入グラフで観測される性能差（レンジ局所性向上）に対して相対的に無視可能。

- 挿入スループット（複数の棒グラフ群）:
  - シーケンシャル ID（Auto-increment/UUID v7/Snowflake）は、B-tree 末尾追記が中心でページ分割頻度が低く、高スループット。
  - UUID v4 はランダム更新によりページ分割とキャッシュミスが増加し、相対的に低位。

- CRUD レイテンシ:
  - PK 検索の差は小さいが、INSERT で UUID v4 のレイテンシが上振れ。UUID v7/Snowflake は Auto-increment に近い。

- 範囲 SELECT 分布:
  - Auto-increment/UUID v7 は分布が滑らかで I/O 局所性が高い。UUID v4 はばらつきが大きく、キャッシュ効率が低いことを示唆。

- サイズ比較:
  - bigint（Auto-increment）が最小。UUID は同ビット長でも、UUID v4 は断片化で実効サイズが増えがち。

- MySQL（InnoDB）の補足:
  - クラスタ化 PK の都合で挿入の局所性がより強く効くため、シーケンシャル ID の優位が一層明瞭に表れる。

In [33]:
# Create a separate chart for SQLite to make the data visible
import plotly.express as px

sqlite_data = operation_df[operation_df['database'] == 'SQLite']

fig_sqlite_detail = px.bar(
    sqlite_data,
    x="operation",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    category_orders={"operation": OPERATION_TYPES},
    title="SQLite CRUD Latency (Detailed View - Note: SQLite is 100-1000x faster than other DBs)",
    labels={"operation": "Operation", "latency_mean_ms": "Average Latency (ms)", "id_type": "ID Type"},
    height=500,
)
fig_sqlite_detail.update_layout(
    yaxis_title="Average Latency (ms)",
    showlegend=True
)
fig_sqlite_detail.show()

In [34]:
# Compare MySQL Auto Increment vs other ID types (Insert only)
mysql_insert_data = operation_df[(operation_df['database'] == 'MySQL') & (operation_df['operation'] == 'Insert')]

fig_mysql_insert = px.bar(
    mysql_insert_data,
    x="id_type",
    y="latency_mean_ms",
    title="MySQL Insert Latency by ID Type (Auto Increment commits every insert!)",
    labels={"id_type": "ID Type", "latency_mean_ms": "Average Insert Latency (ms)"},
    height=400,
    text="latency_mean_ms"
)
fig_mysql_insert.update_traces(texttemplate='%{text:.3f}ms', textposition='outside')
fig_mysql_insert.update_layout(showlegend=False)
fig_mysql_insert.show()

print("\nMySQL Insert Latency Comparison:")
print(mysql_insert_data[['id_type', 'latency_mean_ms']].to_string(index=False))
print(f"\nAuto Increment is {mysql_insert_data[mysql_insert_data['id_type'] == 'Auto Increment']['latency_mean_ms'].values[0] / mysql_insert_data[mysql_insert_data['id_type'] == 'UUIDv4']['latency_mean_ms'].values[0]:.1f}x slower than UUIDv4")


MySQL Insert Latency Comparison:
       id_type  latency_mean_ms
        UUIDv4         0.092243
        UUIDv7         0.101828
Auto Increment         0.105081
     Snowflake         0.094753

Auto Increment is 1.1x slower than UUIDv4


### 最終結論（グラフ観察に基づく）

- グラフ群（挿入スループット #24/#26/#28/#30/#32/#34、CRUD レイテンシ #41、範囲 SELECT #45、サイズ比較 #52）が示すとおり、
  - 書き込み・スケール最優先: Auto-increment
  - 分散性と性能の両立: UUID v7（Snowflake も同等クラス）
  - UUID v4 を PK とする選択は、書き込み・レンジアクセス・サイズ効率の観点で非推奨

運用指針:
- 単一 DB 構成では Auto-increment を第一候補に。
- 分散一意性が要る場合は UUID v7 を標準選択肢に。
- 既存で UUID v4 を PK にしている大規模テーブルは、グラフで見えるボトルネック（INSERT/サイズ/範囲アクセス）を踏まえ、移行検討の価値あり。

### 今後の作業（グラフ観察からの延長）

- データ規模拡大時のグラフ変化を再検証（数億行規模）。
- 高並行ワークロードでの挿入スループットとレイテンシ（特に UUID v4 の相対悪化度合い）を測定。
- 異種ストレージ（NVMe/HDD）やクラウド環境でのグラフ再取得。
- レプリケーション構成での WAL/ラグの可視化と、ID 方式間の差分比較。
- UUID v6/v8 等、他方式の追加グラフ比較。
- アプリ側 ID サービス（Snowflake系）のオーバーヘッドを、CRUD グラフと並べて評価。

In [35]:
# PostgreSQL 挿入スループット比較
postgresql_throughput_df = insert_throughput_df[insert_throughput_df['database'] == 'PostgreSQL']
fig_postgresql_insert = px.line(
    postgresql_throughput_df,
    x="records",
    y="estimated_throughput_ops",
    color="id_type",
    markers=True,
    title="PostgreSQL 挿入スループット比較",
    labels={"records": "Number of Records", "estimated_throughput_ops": "Throughput (ops/sec)", "id_type": "ID Type"},
    log_x=True,
)
fig_postgresql_insert.show()

### PostgreSQL 挿入スループット比較

In [36]:
# PostgreSQL CRUDレイテンシ比較
postgresql_operation_df = operation_df[operation_df['database'] == 'PostgreSQL']
fig_postgresql_crud = px.bar(
    postgresql_operation_df,
    x="operation",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    category_orders={"operation": OPERATION_TYPES},
    title="PostgreSQL CRUDレイテンシ比較",
    labels={"operation": "Operation", "latency_mean_ms": "Average Latency (ms)", "id_type": "ID Type"},
    height=600,
)
fig_postgresql_crud.show()

### PostgreSQL CRUDレイテンシ比較

In [37]:
# PostgreSQL 範囲SELECT分布
postgresql_select_df = select_distribution_df[select_distribution_df['database'] == 'PostgreSQL']
fig_postgresql_range = px.box(
    postgresql_select_df,
    x="id_type",
    y="latency_ms",
    points="outliers",
    title="PostgreSQL 範囲SELECT分布",
    labels={"id_type": "ID Type", "latency_ms": "Latency (ms)"},
    height=500,
)
fig_postgresql_range.show()

### PostgreSQL 範囲SELECT分布

In [38]:
# PostgreSQL サイズ比較
postgresql_index_df = index_df[index_df['database'] == 'PostgreSQL']
fig_postgresql_index = px.bar(
    postgresql_index_df,
    x="id_type",
    y="index_size_mb",
    title="PostgreSQL インデックスサイズ比較",
    labels={"id_type": "ID Type", "index_size_mb": "Index Size (MB)"},
    height=500,
)
fig_postgresql_index.show()

postgresql_table_df = table_df[table_df['database'] == 'PostgreSQL']
fig_postgresql_table = px.bar(
    postgresql_table_df,
    x="id_type",
    y="table_size_mb",
    title="PostgreSQL テーブルサイズ比較",
    labels={"id_type": "ID Type", "table_size_mb": "Table Size (MB)"},
    height=500,
)
fig_postgresql_table.show()

### PostgreSQL サイズ比較

In [39]:
# MySQL 挿入スループット比較
mysql_throughput_df = insert_throughput_df[insert_throughput_df['database'] == 'MySQL']
fig_mysql_insert = px.line(
    mysql_throughput_df,
    x="records",
    y="estimated_throughput_ops",
    color="id_type",
    markers=True,
    title="MySQL 挿入スループット比較",
    labels={"records": "Number of Records", "estimated_throughput_ops": "Throughput (ops/sec)", "id_type": "ID Type"},
    log_x=True,
)
fig_mysql_insert.show()

### MySQL 挿入スループット比較

In [40]:
# MySQL CRUDレイテンシ比較
mysql_operation_df = operation_df[operation_df['database'] == 'MySQL']
fig_mysql_crud = px.bar(
    mysql_operation_df,
    x="operation",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    category_orders={"operation": OPERATION_TYPES},
    title="MySQL CRUDレイテンシ比較",
    labels={"operation": "Operation", "latency_mean_ms": "Average Latency (ms)", "id_type": "ID Type"},
    height=600,
)
fig_mysql_crud.show()

### MySQL CRUDレイテンシ比較

In [41]:
# MySQL 範囲SELECT分布
mysql_select_df = select_distribution_df[select_distribution_df['database'] == 'MySQL']
fig_mysql_range = px.box(
    mysql_select_df,
    x="id_type",
    y="latency_ms",
    points="outliers",
    title="MySQL 範囲SELECT分布",
    labels={"id_type": "ID Type", "latency_ms": "Latency (ms)"},
    height=500,
)
fig_mysql_range.show()

### MySQL 範囲SELECT分布

In [42]:
# MySQL サイズ比較
mysql_index_df = index_df[index_df['database'] == 'MySQL']
fig_mysql_index = px.bar(
    mysql_index_df,
    x="id_type",
    y="index_size_mb",
    title="MySQL インデックスサイズ比較",
    labels={"id_type": "ID Type", "index_size_mb": "Index Size (MB)"},
    height=500,
)
fig_mysql_index.show()

mysql_table_df = table_df[table_df['database'] == 'MySQL']
fig_mysql_table = px.bar(
    mysql_table_df,
    x="id_type",
    y="table_size_mb",
    title="MySQL テーブルサイズ比較",
    labels={"id_type": "ID Type", "table_size_mb": "Table Size (MB)"},
    height=500,
)
fig_mysql_table.show()

### MySQL サイズ比較

In [43]:
# Redis 挿入スループット比較
redis_throughput_df = insert_throughput_df[insert_throughput_df['database'] == 'Redis']
fig_redis_insert = px.line(
    redis_throughput_df,
    x="records",
    y="estimated_throughput_ops",
    color="id_type",
    markers=True,
    title="Redis 挿入スループット比較",
    labels={"records": "Number of Records", "estimated_throughput_ops": "Throughput (ops/sec)", "id_type": "ID Type"},
    log_x=True,
)
fig_redis_insert.show()

### Redis 挿入スループット比較

In [44]:
# Redis CRUDレイテンシ比較
redis_operation_df = operation_df[operation_df['database'] == 'Redis']
fig_redis_crud = px.bar(
    redis_operation_df,
    x="operation",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    category_orders={"operation": OPERATION_TYPES},
    title="Redis CRUDレイテンシ比較",
    labels={"operation": "Operation", "latency_mean_ms": "Average Latency (ms)", "id_type": "ID Type"},
    height=600,
)
fig_redis_crud.show()

### Redis CRUDレイテンシ比較

In [45]:
# Redis 範囲SELECT分布
redis_select_df = select_distribution_df[select_distribution_df['database'] == 'Redis']
fig_redis_range = px.box(
    redis_select_df,
    x="id_type",
    y="latency_ms",
    points="outliers",
    title="Redis 範囲SELECT分布",
    labels={"id_type": "ID Type", "latency_ms": "Latency (ms)"},
    height=500,
)
fig_redis_range.show()

### Redis 範囲SELECT分布

In [46]:
# Redis サイズ比較
redis_index_df = index_df[index_df['database'] == 'Redis']
fig_redis_index = px.bar(
    redis_index_df,
    x="id_type",
    y="index_size_mb",
    title="Redis インデックスサイズ比較",
    labels={"id_type": "ID Type", "index_size_mb": "Index Size (MB)"},
    height=500,
)
fig_redis_index.show()

redis_table_df = table_df[table_df['database'] == 'Redis']
fig_redis_table = px.bar(
    redis_table_df,
    x="id_type",
    y="table_size_mb",
    title="Redis テーブルサイズ比較",
    labels={"id_type": "ID Type", "table_size_mb": "Table Size (MB)"},
    height=500,
)
fig_redis_table.show()

### Redis サイズ比較

In [47]:
# MongoDB 挿入スループット比較
mongodb_throughput_df = insert_throughput_df[insert_throughput_df['database'] == 'MongoDB']
fig_mongodb_insert = px.line(
    mongodb_throughput_df,
    x="records",
    y="estimated_throughput_ops",
    color="id_type",
    markers=True,
    title="MongoDB 挿入スループット比較",
    labels={"records": "Number of Records", "estimated_throughput_ops": "Throughput (ops/sec)", "id_type": "ID Type"},
    log_x=True,
)
fig_mongodb_insert.show()

### MongoDB 挿入スループット比較

In [48]:
# MongoDB CRUDレイテンシ比較
mongodb_operation_df = operation_df[operation_df['database'] == 'MongoDB']
fig_mongodb_crud = px.bar(
    mongodb_operation_df,
    x="operation",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    category_orders={"operation": OPERATION_TYPES},
    title="MongoDB CRUDレイテンシ比較",
    labels={"operation": "Operation", "latency_mean_ms": "Average Latency (ms)", "id_type": "ID Type"},
    height=600,
)
fig_mongodb_crud.show()

### MongoDB CRUDレイテンシ比較

In [49]:
# MongoDB 範囲SELECT分布
mongodb_select_df = select_distribution_df[select_distribution_df['database'] == 'MongoDB']
fig_mongodb_range = px.box(
    mongodb_select_df,
    x="id_type",
    y="latency_ms",
    points="outliers",
    title="MongoDB 範囲SELECT分布",
    labels={"id_type": "ID Type", "latency_ms": "Latency (ms)"},
    height=500,
)
fig_mongodb_range.show()

### MongoDB 範囲SELECT分布

In [50]:
# MongoDB サイズ比較
mongodb_index_df = index_df[index_df['database'] == 'MongoDB']
fig_mongodb_index = px.bar(
    mongodb_index_df,
    x="id_type",
    y="index_size_mb",
    title="MongoDB インデックスサイズ比較",
    labels={"id_type": "ID Type", "index_size_mb": "Index Size (MB)"},
    height=500,
)
fig_mongodb_index.show()

mongodb_table_df = table_df[table_df['database'] == 'MongoDB']
fig_mongodb_table = px.bar(
    mongodb_table_df,
    x="id_type",
    y="table_size_mb",
    title="MongoDB テーブルサイズ比較",
    labels={"id_type": "ID Type", "table_size_mb": "Table Size (MB)"},
    height=500,
)
fig_mongodb_table.show()

### MongoDB サイズ比較

In [51]:
# SQLite 挿入スループット比較
sqlite_throughput_df = insert_throughput_df[insert_throughput_df['database'] == 'SQLite']
fig_sqlite_insert = px.line(
    sqlite_throughput_df,
    x="records",
    y="estimated_throughput_ops",
    color="id_type",
    markers=True,
    title="SQLite 挿入スループット比較",
    labels={"records": "Number of Records", "estimated_throughput_ops": "Throughput (ops/sec)", "id_type": "ID Type"},
    log_x=True,
)
fig_sqlite_insert.show()

### SQLite 挿入スループット比較

In [52]:
# SQLite CRUDレイテンシ比較
sqlite_operation_df = operation_df[operation_df['database'] == 'SQLite']
fig_sqlite_crud = px.bar(
    sqlite_operation_df,
    x="operation",
    y="latency_mean_ms",
    color="id_type",
    barmode="group",
    category_orders={"operation": OPERATION_TYPES},
    title="SQLite CRUDレイテンシ比較",
    labels={"operation": "Operation", "latency_mean_ms": "Average Latency (ms)", "id_type": "ID Type"},
    height=600,
)
fig_sqlite_crud.show()

### SQLite CRUDレイテンシ比較

In [53]:
# SQLite 範囲SELECT分布
sqlite_select_df = select_distribution_df[select_distribution_df['database'] == 'SQLite']
fig_sqlite_range = px.box(
    sqlite_select_df,
    x="id_type",
    y="latency_ms",
    points="outliers",
    title="SQLite 範囲SELECT分布",
    labels={"id_type": "ID Type", "latency_ms": "Latency (ms)"},
    height=500,
)
fig_sqlite_range.show()

### SQLite 範囲SELECT分布

In [54]:
# SQLite サイズ比較
sqlite_index_df = index_df[index_df['database'] == 'SQLite']
fig_sqlite_index = px.bar(
    sqlite_index_df,
    x="id_type",
    y="index_size_mb",
    title="SQLite インデックスサイズ比較",
    labels={"id_type": "ID Type", "index_size_mb": "Index Size (MB)"},
    height=500,
)
fig_sqlite_index.show()

sqlite_table_df = table_df[table_df['database'] == 'SQLite']
fig_sqlite_table = px.bar(
    sqlite_table_df,
    x="id_type",
    y="table_size_mb",
    title="SQLite テーブルサイズ比較",
    labels={"id_type": "ID Type", "table_size_mb": "Table Size (MB)"},
    height=500,
)
fig_sqlite_table.show()