# AI Scalability Simulation
## Assumptions
- Users up to 1,000,000
- Active users = 10%
- Requests per user per minute = 2
- Baseline inference latency = 200 ms
- Cache hit rate = 40%
- Compression speedup = 30%

In [1]:
import numpy as np
import pandas as pd

users = np.linspace(1_000, 1_000_000, 20)
active_pct = 0.10
req_per_min = 2
latency = 0.2  # seconds

capacity = 1 / latency
rps = users * active_pct * req_per_min / 60
instances = rps / capacity

df = pd.DataFrame({
    'Users': users.astype(int),
    'RPS': rps,
    'Instances Required': instances
})
df.head()

Unnamed: 0,Users,RPS,Instances Required
0,1000,3.333333,0.666667
1,53578,178.596491,35.719298
2,106157,353.859649,70.77193
3,158736,529.122807,105.824561
4,211315,704.385965,140.877193


In [None]:
# Cache Impact
cache_hit = 0.4
effective_rps = rps * (1 - cache_hit)
instances_cache = effective_rps / capacity

# Compression Impact
speedup = 0.3
new_latency = latency * (1 - speedup)
new_capacity = 1 / new_latency
instances_compressed = rps / new_capacity

In [None]:
import matplotlib.pyplot as plt

plt.plot(users, instances, label='Baseline')
plt.plot(users, instances_cache, label='With Cache')
plt.plot(users, instances_compressed, label='Compressed Model')

plt.legend()
plt.xlabel('Users')
plt.ylabel('Instances Required')
plt.title('Scalability Simulation')
plt.show()