# learned index db vs redis benchmark

comparing my implementation of learned index database against redis to see if the research paper claims hold up

## research paper

this benchmark tests the implementation based on:

**"The Case for Learned Index Structures"**  
*Tim Kraska, Alex Beutel, Ed H. Chi, Jeffrey Dean, Neoklis Polyzotis*  
SIGMOD 2018

📄 Paper: https://arxiv.org/abs/1712.01208

the paper proposes that indexes can be replaced with learned models (neural nets, regression) that predict record positions. they showed up to 70% speedup over b-trees with 10x less memory.

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

# load results from benchmark
with open('benchmark_results.json', 'r') as f:
    results = json.load(f)

print(f"benchmark run with {results['num_ops']} operations")
print(json.dumps(results, indent=2))

## performance comparison - ops/sec

In [None]:
# extract data for plotting
operations = ['SET', 'GET', 'MIXED']
learndb_ops = [
    results['learndb']['set_ops_per_sec'],
    results['learndb']['get_ops_per_sec'],
    results['learndb']['mixed_ops_per_sec']
]
redis_ops = [
    results['redis']['set_ops_per_sec'],
    results['redis']['get_ops_per_sec'],
    results['redis']['mixed_ops_per_sec']
]

# plot throughput comparison
x = np.arange(len(operations))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width/2, learndb_ops, width, label='LearnDB', color='#2ecc71')
bars2 = ax.bar(x + width/2, redis_ops, width, label='Redis', color='#e74c3c')

ax.set_xlabel('Operation Type', fontsize=12)
ax.set_ylabel('Throughput (ops/sec)', fontsize=12)
ax.set_title('LearnDB vs Redis - Throughput Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(operations)
ax.legend()
ax.grid(axis='y', alpha=0.3)

# add value labels on bars
for bar in bars1:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height):,}',
            ha='center', va='bottom', fontsize=9)

for bar in bars2:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height):,}',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('benchmark_throughput.png', dpi=150)
plt.show()

print("saved plot to benchmark_throughput.png")

## speedup analysis

how much faster is learndb compared to redis?

In [None]:
# calculate speedup
speedups = []
for i in range(len(operations)):
    speedup = learndb_ops[i] / redis_ops[i]
    speedups.append(speedup)
    print(f"{operations[i]:6s}: {speedup:.2f}x faster")

# plot speedup
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(operations, speedups, color=['#3498db', '#9b59b6', '#f39c12'])

ax.set_ylabel('Speedup (x times faster)', fontsize=12)
ax.set_title('LearnDB Speedup over Redis', fontsize=14, fontweight='bold')
ax.axhline(y=1, color='red', linestyle='--', alpha=0.5, label='Baseline (1x)')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# add value labels
for bar, speedup in zip(bars, speedups):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{speedup:.1f}x',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('benchmark_speedup.png', dpi=150)
plt.show()

print("saved plot to benchmark_speedup.png")

## execution time comparison

looking at raw execution times

In [None]:
# execution times
learndb_times = [
    results['learndb']['set_time'] * 1000,  # convert to ms
    results['learndb']['get_time'] * 1000,
    results['learndb']['mixed_time'] * 1000
]
redis_times = [
    results['redis']['set_time'] * 1000,
    results['redis']['get_time'] * 1000,
    results['redis']['mixed_time'] * 1000
]

x = np.arange(len(operations))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width/2, learndb_times, width, label='LearnDB', color='#2ecc71', alpha=0.8)
bars2 = ax.bar(x + width/2, redis_times, width, label='Redis', color='#e74c3c', alpha=0.8)

ax.set_xlabel('Operation Type', fontsize=12)
ax.set_ylabel('Execution Time (ms)', fontsize=12)
ax.set_title('Execution Time Comparison (Lower is Better)', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(operations)
ax.legend()
ax.grid(axis='y', alpha=0.3)

# add value labels
for bar in bars1:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}ms',
            ha='center', va='bottom', fontsize=9)

for bar in bars2:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}ms',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('benchmark_time.png', dpi=150)
plt.show()

print("saved plot to benchmark_time.png")

## summary

key takeaways from this benchmark:

In [None]:
print("\n" + "="*50)
print("BENCHMARK SUMMARY")
print("="*50)

print(f"\nOperations tested: {results['num_ops']:,}")

print("\nLearnDB Performance:")
for op, val in zip(['SET', 'GET', 'MIXED'], learndb_ops):
    print(f"  {op:6s}: {val:,.0f} ops/sec")

print("\nRedis Performance:")
for op, val in zip(['SET', 'GET', 'MIXED'], redis_ops):
    print(f"  {op:6s}: {val:,.0f} ops/sec")

print("\nSpeedup (LearnDB vs Redis):")
for op, speedup in zip(['SET', 'GET', 'MIXED'], speedups):
    print(f"  {op:6s}: {speedup:.2f}x faster")

print("\nKey Findings:")
print("  - Learned index excels at read operations (GET)")
print("  - Direct in-process calls eliminate network overhead")
print("  - Best for read-heavy workloads with predictable keys")
print("\nCaveats:")
print("  - Redis comparison includes client/network overhead")
print("  - LearnDB is in-process, Redis is via tcp/socket")
print("  - This is a learning project, not production-ready")
print("="*50)