In [1]:
# Configure matplotlib for inline plotting in VS Code/Jupyter
# MUST come before importing matplotlib
%matplotlib inline

In [2]:
import time
from pathlib import Path

import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np
from jax import grad, jit, vmap

from nlsq import CurveFit, curve_fit


# Define the exponential model
def exponential_model(x, a, b):
    return a * jnp.exp(-b * x)

# Generate batch data for demonstrations
np.random.seed(42)
n_datasets = 1000
n_points = 100
x_batch_data = np.linspace(0, 5, n_points)
y_batch_data = np.array([
    3.0 * np.exp(-0.5 * x_batch_data) + np.random.normal(0, 0.1, n_points)
    for _ in range(n_datasets)
])

print(f"JAX backend: {jax.devices()[0].platform}")
print(f"Generated {n_datasets} synthetic datasets with {n_points} points each")

JAX backend: gpu
Generated 1000 synthetic datasets with 100 points each


In [3]:
# Measure sequential fitting time (for comparison)
print("Timing sequential fits (100 datasets for estimate)...")

def fit_one_sequential(y_data):
    params = jnp.array([3.0, 0.5])
    def loss(p):
        return jnp.sum((y_data - exponential_model(x_batch_data, *p)) ** 2)
    for _ in range(20):
        g = jax.grad(loss)(params)
        params = params - 0.05 * g
    return params

# Time 100 sequential fits
start = time.time()
for i in range(100):
    _ = fit_one_sequential(y_batch_data[i])
time_sequential = time.time() - start

print(f"Time for 100 sequential fits: {time_sequential:.3f}s")
print(f"Average time per fit: {time_sequential / 100 * 1000:.3f}ms")
print()


Timing sequential fits (100 datasets for estimate)...


Time for 100 sequential fits: 11.509s
Average time per fit: 115.087ms



In [4]:
def fit_one_dataset(y_single):
    """Fit a single dataset using gradient descent."""
    params = jnp.array([3.0, 0.5])
    def loss(p):
        return jnp.sum((y_single - exponential_model(x_batch_data, *p)) ** 2)
    for _ in range(20):
        g = jax.grad(loss)(params)
        params = params - 0.05 * g
    return params

fit_batch = jit(vmap(fit_one_dataset))
_ = fit_batch(y_batch_data[:10])
start = time.time()
results_batch = fit_batch(y_batch_data)
results_batch[0].block_until_ready()
time_batch = time.time() - start
print(
    f"  Time for {n_datasets} datasets: {time_batch * 1000:.0f} ms ({time_batch * 1000 / n_datasets:.3f} ms/fit)"
)
print(f"  Throughput: {n_datasets / time_batch:.0f} fits/second")
print()
estimated_sequential_time = time_sequential * n_datasets / 100
speedup = estimated_sequential_time / time_batch
print(f"Speedup: {speedup:.0f}x faster with vmap + JIT ✓")
print()
print("Key insight: vmap parallelizes across datasets, JIT compiles once")


  Time for 1000 datasets: 470 ms (0.470 ms/fit)
  Throughput: 2127 fits/second

Speedup: 245x faster with vmap + JIT ✓

Key insight: vmap parallelizes across datasets, JIT compiles once


Part 4: Memory Optimization

Avoiding out-of-memory (OOM) errors with large datasets.


In [5]:
print("Memory Optimization Strategies:")
print("=" * 60)
print()
print("1. Use float32 instead of float64:")
x_f64 = jnp.array([1.0, 2.0, 3.0], dtype=jnp.float64)
x_f32 = jnp.array([1.0, 2.0, 3.0], dtype=jnp.float32)
print(f"   float64 memory: {x_f64.nbytes} bytes per element")
print(f"   float32 memory: {x_f32.nbytes} bytes per element")
print(f"   Savings: {(1 - x_f32.nbytes / x_f64.nbytes) * 100:.0f}%")
print("   → Use float32 unless high precision is critical\n")
print("2. Process data in chunks (streaming):")
print("   # For very large datasets (millions of points)")
print("   chunk_size = 100000")
print("   for i in range(0, len(data), chunk_size):")
print("       chunk = data[i:i+chunk_size]")
print("       result = fit(chunk)")
print("       results.append(result)\n")
print("3. Clear JAX cache if needed:")
print("   from jax import clear_caches")
print("   clear_caches()  # Frees compilation cache\n")
print("4. Monitor memory usage:")
def get_array_memory_mb(arr):
    return arr.nbytes / (1024**2)
large_array = jnp.ones((10000, 1000), dtype=jnp.float32)
print(
    f"   Example: {large_array.shape} array uses {get_array_memory_mb(large_array):.1f} MB"
)
print()
print("5. Typical memory requirements:")
print("   10K points:     ~0.1 MB (negligible)")
print("   1M points:      ~10 MB (easy)")
print("   100M points:    ~1 GB (manageable)")
print("   1B points:      ~10 GB (need chunking or distributed)")
print()
print("→ For datasets >100M points, use chunked processing or streaming")


Memory Optimization Strategies:

1. Use float32 instead of float64:
   float64 memory: 24 bytes per element
   float32 memory: 12 bytes per element
   Savings: 50%
   → Use float32 unless high precision is critical

2. Process data in chunks (streaming):
   # For very large datasets (millions of points)
   chunk_size = 100000
   for i in range(0, len(data), chunk_size):
       chunk = data[i:i+chunk_size]
       result = fit(chunk)
       results.append(result)

3. Clear JAX cache if needed:
   from jax import clear_caches
   clear_caches()  # Frees compilation cache

4. Monitor memory usage:
   Example: (10000, 1000) array uses 38.1 MB

5. Typical memory requirements:
   10K points:     ~0.1 MB (negligible)
   1M points:      ~10 MB (easy)
   100M points:    ~1 GB (manageable)
   1B points:      ~10 GB (need chunking or distributed)

→ For datasets >100M points, use chunked processing or streaming


Part 5: Performance Benchmarking

Systematic performance measurement and optimization.


In [6]:
def benchmark_nlsq(n_points_list, n_params=2, n_runs=5):
    """Benchmark NLSQ across different problem sizes.
    Parameters
    ----------
    n_points_list : list
        List of dataset sizes to test
    n_params : int
        Number of parameters to fit
    n_runs : int
        Number of runs to average
    Returns
    -------
    results : dict
        Benchmark results
    """
    results = {"n_points": [], "mean_time_ms": [], "std_time_ms": []}
    cf_bench = CurveFit()
    for n_points in n_points_list:
        x = jnp.linspace(0, 5, n_points)
        y = 3.0 * jnp.exp(-0.5 * x) + np.random.normal(0, 0.1, n_points)
        _ = cf_bench.curve_fit(exponential_model, x, y, p0=[2.0, 0.3], maxiter=20)
        times = []
        for _ in range(n_runs):
            start = time.time()
            popt, _ = cf_bench.curve_fit(
                exponential_model, x, y, p0=[2.0, 0.3], maxiter=20
            )
            times.append((time.time() - start) * 1000)
        results["n_points"].append(n_points)
        results["mean_time_ms"].append(np.mean(times))
        results["std_time_ms"].append(np.std(times))
    return results
print("Running comprehensive benchmark...")
print("(This may take 30-60 seconds)")
print()
sizes = [100, 500, 1000, 5000, 10000]
bench_results = benchmark_nlsq(sizes, n_runs=5)
print("Benchmark Results:")
print("=" * 60)
print(f"{'N Points':<12} {'Mean Time (ms)':<20} {'Throughput (fits/s)'}")
print("-" * 60)
for i, n in enumerate(bench_results["n_points"]):
    mean_t = bench_results["mean_time_ms"][i]
    std_t = bench_results["std_time_ms"][i]
    throughput = 1000 / mean_t
    print(f"{n:<12} {mean_t:>8.2f} ± {std_t:<8.2f} {throughput:>12.1f}")
print()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.errorbar(
    bench_results["n_points"],
    bench_results["mean_time_ms"],
    yerr=bench_results["std_time_ms"],
    marker="o",
    capsize=5,
    label="NLSQ",
)
ax1.set_xlabel("Number of Data Points")
ax1.set_ylabel("Time (ms)")
ax1.set_title("Performance Scaling")
ax1.legend()
ax1.grid(alpha=0.3)
ax2.loglog(bench_results["n_points"], bench_results["mean_time_ms"], "o-", label="NLSQ")
ax2.set_xlabel("Number of Data Points")
ax2.set_ylabel("Time (ms)")
ax2.set_title("Scaling Behavior (log-log)")
ax2.legend()
ax2.grid(alpha=0.3, which="both")
plt.tight_layout()
# Create figures directory relative to notebook location
fig_dir = Path("figures") / "gpu_optimization_deep_dive"
fig_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(fig_dir / "fig_01.png", dpi=300, bbox_inches="tight")
plt.close()
print("Interpretation:")
print("  - Nearly flat scaling: Well-optimized (GPU benefits)")
print("  - Linear scaling: Expected for iterative optimization")
print("  - Superlinear scaling: May indicate memory issues or poor caching")

Running comprehensive benchmark...
(This may take 30-60 seconds)



INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 100, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 100, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=5.805608e+00 | ‖∇f‖=1.034625e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.022348e-01 | ‖∇f‖=6.292718e+00 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.903545e-01 | ‖∇f‖=3.541027e-02 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.903525e-01 | ‖∇f‖=1.962208e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 1.009894s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.903525e-01 | time=1.010s | final_gradient_norm=2.730702952549735e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 1.328691s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 100, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 100, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=5.805608e+00 | ‖∇f‖=1.034625e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.022348e-01 | ‖∇f‖=6.292718e+00 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.903545e-01 | ‖∇f‖=3.541027e-02 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.903525e-01 | ‖∇f‖=1.962208e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.016258s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.903525e-01 | time=0.016s | final_gradient_norm=2.730702952549735e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.061507s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 100, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 100, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=5.805608e+00 | ‖∇f‖=1.034625e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.022348e-01 | ‖∇f‖=6.292718e+00 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.903545e-01 | ‖∇f‖=3.541027e-02 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.903525e-01 | ‖∇f‖=1.962208e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.018770s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.903525e-01 | time=0.019s | final_gradient_norm=2.730702952549735e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.068877s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 100, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 100, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=5.805608e+00 | ‖∇f‖=1.034625e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.022348e-01 | ‖∇f‖=6.292718e+00 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.903545e-01 | ‖∇f‖=3.541027e-02 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.903525e-01 | ‖∇f‖=1.962208e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.016250s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.903525e-01 | time=0.016s | final_gradient_norm=2.730702952549735e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.059244s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 100, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 100, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=5.805608e+00 | ‖∇f‖=1.034625e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.022348e-01 | ‖∇f‖=6.292718e+00 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.903545e-01 | ‖∇f‖=3.541027e-02 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.903525e-01 | ‖∇f‖=1.962208e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.019804s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.903525e-01 | time=0.020s | final_gradient_norm=2.730702952549735e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.070618s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 100, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 100, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=5.805608e+00 | ‖∇f‖=1.034625e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.022348e-01 | ‖∇f‖=6.292718e+00 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.903545e-01 | ‖∇f‖=3.541027e-02 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.903525e-01 | ‖∇f‖=1.962208e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.017952s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.903525e-01 | time=0.018s | final_gradient_norm=2.730702952549735e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.061305s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 500, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 500, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.928937e+01 | ‖∇f‖=5.095375e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.107576e+00 | ‖∇f‖=3.419677e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.494183e+00 | ‖∇f‖=3.509595e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.494144e+00 | ‖∇f‖=3.227774e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.284884s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.494144e+00 | time=0.285s | final_gradient_norm=3.690756956720165e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.666882s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 500, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 500, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.928937e+01 | ‖∇f‖=5.095375e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.107576e+00 | ‖∇f‖=3.419677e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.494183e+00 | ‖∇f‖=3.509595e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.494144e+00 | ‖∇f‖=3.227774e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.015880s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.494144e+00 | time=0.016s | final_gradient_norm=3.690756956720165e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.066285s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 500, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 500, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.928937e+01 | ‖∇f‖=5.095375e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.107576e+00 | ‖∇f‖=3.419677e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.494183e+00 | ‖∇f‖=3.509595e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.494144e+00 | ‖∇f‖=3.227774e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.018966s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.494144e+00 | time=0.019s | final_gradient_norm=3.690756956720165e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.063919s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 500, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 500, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.928937e+01 | ‖∇f‖=5.095375e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.107576e+00 | ‖∇f‖=3.419677e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.494183e+00 | ‖∇f‖=3.509595e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.494144e+00 | ‖∇f‖=3.227774e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.024776s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.494144e+00 | time=0.025s | final_gradient_norm=3.690756956720165e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.074691s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 500, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 500, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.928937e+01 | ‖∇f‖=5.095375e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.107576e+00 | ‖∇f‖=3.419677e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.494183e+00 | ‖∇f‖=3.509595e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.494144e+00 | ‖∇f‖=3.227774e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.015670s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.494144e+00 | time=0.016s | final_gradient_norm=3.690756956720165e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.057999s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 500, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 500, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.928937e+01 | ‖∇f‖=5.095375e+01 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.107576e+00 | ‖∇f‖=3.419677e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.494183e+00 | ‖∇f‖=3.509595e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.494144e+00 | ‖∇f‖=3.227774e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.027979s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.494144e+00 | time=0.028s | final_gradient_norm=3.690756956720165e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.086226s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 1000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 1000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.015983e+01 | ‖∇f‖=1.040831e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.147707e+00 | ‖∇f‖=7.150963e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.829473e+00 | ‖∇f‖=8.479726e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.829359e+00 | ‖∇f‖=1.358676e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.259629s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.829359e+00 | time=0.260s | final_gradient_norm=1.0085184021235705e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.503264s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 1000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 1000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.015983e+01 | ‖∇f‖=1.040831e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.147707e+00 | ‖∇f‖=7.150963e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.829473e+00 | ‖∇f‖=8.479726e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.829359e+00 | ‖∇f‖=1.358676e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.019694s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.829359e+00 | time=0.020s | final_gradient_norm=1.0085184021235705e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.071443s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 1000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 1000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.015983e+01 | ‖∇f‖=1.040831e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.147707e+00 | ‖∇f‖=7.150963e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.829473e+00 | ‖∇f‖=8.479726e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.829359e+00 | ‖∇f‖=1.358676e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.022838s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.829359e+00 | time=0.023s | final_gradient_norm=1.0085184021235705e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.073066s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 1000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 1000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.015983e+01 | ‖∇f‖=1.040831e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.147707e+00 | ‖∇f‖=7.150963e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.829473e+00 | ‖∇f‖=8.479726e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.829359e+00 | ‖∇f‖=1.358676e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.018795s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.829359e+00 | time=0.019s | final_gradient_norm=1.0085184021235705e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.068928s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 1000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 1000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.015983e+01 | ‖∇f‖=1.040831e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.147707e+00 | ‖∇f‖=7.150963e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.829473e+00 | ‖∇f‖=8.479726e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.829359e+00 | ‖∇f‖=1.358676e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.017784s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.829359e+00 | time=0.018s | final_gradient_norm=1.0085184021235705e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.072267s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 1000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 1000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.015983e+01 | ‖∇f‖=1.040831e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.147707e+00 | ‖∇f‖=7.150963e+01 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.829473e+00 | ‖∇f‖=8.479726e-01 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.829359e+00 | ‖∇f‖=1.358676e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.031145s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.829359e+00 | time=0.031s | final_gradient_norm=1.0085184021235705e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.082370s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 5000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 5000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.979595e+02 | ‖∇f‖=5.122920e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.112033e+01 | ‖∇f‖=3.511319e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.466691e+01 | ‖∇f‖=3.846845e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.466644e+01 | ‖∇f‖=7.435896e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.354890s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.466644e+01 | time=0.355s | final_gradient_norm=7.24148135899938e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.608353s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 5000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 5000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.979595e+02 | ‖∇f‖=5.122920e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.112033e+01 | ‖∇f‖=3.511319e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.466691e+01 | ‖∇f‖=3.846845e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.466644e+01 | ‖∇f‖=7.435896e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.022094s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.466644e+01 | time=0.022s | final_gradient_norm=7.24148135899938e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.086247s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 5000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 5000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.979595e+02 | ‖∇f‖=5.122920e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.112033e+01 | ‖∇f‖=3.511319e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.466691e+01 | ‖∇f‖=3.846845e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.466644e+01 | ‖∇f‖=7.435896e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.022371s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.466644e+01 | time=0.022s | final_gradient_norm=7.24148135899938e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.076715s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 5000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 5000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.979595e+02 | ‖∇f‖=5.122920e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.112033e+01 | ‖∇f‖=3.511319e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.466691e+01 | ‖∇f‖=3.846845e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.466644e+01 | ‖∇f‖=7.435896e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.020028s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.466644e+01 | time=0.020s | final_gradient_norm=7.24148135899938e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.073086s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 5000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 5000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.979595e+02 | ‖∇f‖=5.122920e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.112033e+01 | ‖∇f‖=3.511319e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.466691e+01 | ‖∇f‖=3.846845e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.466644e+01 | ‖∇f‖=7.435896e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.016891s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.466644e+01 | time=0.017s | final_gradient_norm=7.24148135899938e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.063846s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 5000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 5000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=2.979595e+02 | ‖∇f‖=5.122920e+02 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=3.112033e+01 | ‖∇f‖=3.511319e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=2.466691e+01 | ‖∇f‖=3.846845e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=2.466644e+01 | ‖∇f‖=7.435896e-04 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.034964s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=2.466644e+01 | time=0.035s | final_gradient_norm=7.24148135899938e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.095718s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 10000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 10000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.080605e+02 | ‖∇f‖=1.040739e+03 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.341674e+01 | ‖∇f‖=7.165313e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.997985e+01 | ‖∇f‖=7.950365e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.997884e+01 | ‖∇f‖=6.061372e-03 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.320077s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.997884e+01 | time=0.320s | final_gradient_norm=4.54963454621371e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.623932s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 10000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 10000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.080605e+02 | ‖∇f‖=1.040739e+03 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.341674e+01 | ‖∇f‖=7.165313e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.997985e+01 | ‖∇f‖=7.950365e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.997884e+01 | ‖∇f‖=6.061372e-03 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.023012s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.997884e+01 | time=0.023s | final_gradient_norm=4.54963454621371e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.069189s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 10000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 10000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.080605e+02 | ‖∇f‖=1.040739e+03 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.341674e+01 | ‖∇f‖=7.165313e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.997985e+01 | ‖∇f‖=7.950365e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.997884e+01 | ‖∇f‖=6.061372e-03 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.025696s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.997884e+01 | time=0.026s | final_gradient_norm=4.54963454621371e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.072534s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 10000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 10000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.080605e+02 | ‖∇f‖=1.040739e+03 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.341674e+01 | ‖∇f‖=7.165313e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.997985e+01 | ‖∇f‖=7.950365e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.997884e+01 | ‖∇f‖=6.061372e-03 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.021711s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.997884e+01 | time=0.022s | final_gradient_norm=4.54963454621371e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.080078s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 10000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 10000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.080605e+02 | ‖∇f‖=1.040739e+03 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.341674e+01 | ‖∇f‖=7.165313e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.997985e+01 | ‖∇f‖=7.950365e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.997884e+01 | ‖∇f‖=6.061372e-03 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.020436s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.997884e+01 | time=0.020s | final_gradient_norm=4.54963454621371e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.067936s




INFO:nlsq.curve_fit:Starting curve fit | {'n_params': 2, 'n_data_points': 10000, 'method': 'trf', 'solver': 'auto', 'batch_size': None, 'has_bounds': False, 'dynamic_sizing': False}


INFO:nlsq.least_squares:Starting least squares optimization | {'method': 'trf', 'n_params': 2, 'loss': 'linear', 'ftol': 1e-08, 'xtol': 1e-08, 'gtol': 1e-08}


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) | {'n_params': 2, 'n_residuals': 10000, 'max_nfev': None}


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=0 | cost=6.080605e+02 | ‖∇f‖=1.040739e+03 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=1 | cost=6.341674e+01 | ‖∇f‖=7.165313e+02 | step=2.022375e+00 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=2 | cost=4.997985e+01 | ‖∇f‖=7.950365e+00 | step=2.022375e+00 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Optimization: iter=3 | cost=4.997884e+01 | ‖∇f‖=6.061372e-03 | step=2.022375e+00 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization took 0.028685s


INFO:nlsq.least_squares:Convergence: reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.997884e+01 | time=0.029s | final_gradient_norm=4.54963454621371e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit took 0.077764s




Benchmark Results:
N Points     Mean Time (ms)       Throughput (fits/s)
------------------------------------------------------------
100             67.18 ± 4.42             14.9
500             72.56 ± 9.81             13.8
1000            76.54 ± 4.60             13.1
5000            82.30 ± 10.78            12.2
10000           76.81 ± 4.87             13.0



Interpretation:
  - Nearly flat scaling: Well-optimized (GPU benefits)
  - Linear scaling: Expected for iterative optimization
  - Superlinear scaling: May indicate memory issues or poor caching


Summary and Best Practices

Performance Optimization Checklist

**For Maximum Speed:**

1. ✅ **Use GPU** if available (5-50x speedup for large problems)
2. ✅ **Keep array shapes consistent** to avoid recompilation
3. ✅ **Use float32** unless high precision is needed (2x memory savings)
4. ✅ **Batch process** with `vmap` for multiple datasets (10-100x faster)
5. ✅ **Warm up JIT** with small dataset before benchmarking
6. ✅ **Use `block_until_ready()`** when timing (JAX is async)

**For Large Datasets:**

1. ✅ **Chunk data** if >100M points
2. ✅ **Monitor memory** usage
3. ✅ **Consider downsampling** for smooth, oversampled data
4. ✅ **Use streaming** for datasets that don't fit in memory

Performance Expectations

| **Scenario** | **Typical Time** | **Optimization** |
|--------------|------------------|------------------|
| First call (cold start) | 0.5-2 seconds | Expected (JIT compilation) |
| Subsequent calls (warm) | 1-50 ms | Cached compilation |
| Large dataset (10K points) | 5-100 ms | Use GPU if available |
| Batch (1000 fits) | 100-5000 ms | Use vmap for parallelization |
| Huge dataset (1M points) | 50-500 ms | GPU + chunking |

Troubleshooting Performance Issues

**Problem**: First call is slow (>5 seconds)
- **Solution**: Normal for JIT. Subsequent calls will be fast.

**Problem**: All calls are slow (>1 second for small data)
- **Solution**: Check if recompiling each time (varying shapes/dtypes)

**Problem**: Out of memory errors
- **Solution**: Use float32, chunk data, or downsample

**Problem**: GPU not being used
- **Solution**: Check `jax.devices()`, install jax[cuda] or jax[rocm]

**Problem**: Batch processing not faster than sequential
- **Solution**: Problem may be too small, try larger batches or datasets

Advanced Profiling

For detailed profiling:

```python
JAX profiling (requires jax[profiling])
import jax.profiler

Profile a code block
with jax.profiler.trace("/tmp/jax-trace", create_perfetto_link=True):
Your NLSQ code here
popt, pcov = cf.curve_fit(model, x, y, p0=...)

Opens profiling UI in browser
```

Production Recommendations

```python
Example: Optimized production setup
import jax
import jax.numpy as jnp
from nlsq import CurveFit

Configure JAX for production
jax.config.update('jax_enable_x64', False)  # Use float32

Pre-warm JIT cache at startup
cf = CurveFit()
x_dummy = jnp.linspace(0, 1, 100)
y_dummy = jnp.ones(100)
_ = cf.curve_fit(model, x_dummy, y_dummy, p0=initial_guess)

Now ready for fast production fitting
```

Next Steps

- **Scale up**: Try batch processing 10,000+ datasets with vmap
- **Optimize models**: Simplify model functions for faster evaluation
- **Profile**: Use JAX profiler to identify bottlenecks
- **Distribute**: For massive scale, consider JAX's `pmap` for multi-GPU

References

1. **JAX Performance**: https://jax.readthedocs.io/en/latest/notebooks/thinking_in_jax.html
2. **JAX Profiling**: https://jax.readthedocs.io/en/latest/profiling.html
3. **GPU Acceleration**: https://jax.readthedocs.io/en/latest/gpu_performance_tips.html
4. **Related examples**:
- `custom_algorithms_advanced.ipynb` - vmap for batch fitting
- `troubleshooting_guide.ipynb` - Performance debugging

---

**Remember**: Premature optimization is the root of all evil. Profile first, optimize what matters!
