# Lennard-Jones Potential CPU vs. GPU
---

Author: Stefan Bringuier

Email: <a href=mailto:stefanbringuier@gmail.com>stefanbringuier@gmail.com</a>


In [2]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import time

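Compute the [Lennard-Jones potential](https://wikipedia.org/wiki/Lennard-Jones_potential) energy of every atom on a single CPU and on a single GPU. The potential energy of atom $i$ is the sum of its pair interactions with all other atoms, $E_i = \sum_{j \neq i} 4\epsilon\left[(\sigma/r_{ij})^{12} - (\sigma/r_{ij})^{6}\right]$, so no cutoff or neighbor list is used. The GPU implementation uses PyCUDA to compile a CUDA C kernel and launch it from Python.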

In [3]:
# CPU implementation
def lennard_jones_cpu(positions, epsilon, sigma):
    """Return the Lennard-Jones energy of each atom from a plain double loop.

    Every ordered pair (a, b) with a != b is visited, so each atom's energy is
    the full sum over all other atoms; no cutoff or neighbor list is applied.

    Parameters
    ----------
    positions : numpy.ndarray
        Array whose first axis indexes atoms; each row is one atom's
        coordinate vector (the notebook passes shape ``(num_atoms, 3)``).
    epsilon : float
        Well-depth prefactor of the potential.
    sigma : float
        Length parameter of the potential.

    Returns
    -------
    numpy.ndarray
        float32 array with one entry per atom holding
        ``sum_b 4 * epsilon * (sigma**12 / r**12 - sigma**6 / r**6)``.
    """
    n_sites = positions.shape[0]
    energies = np.zeros(n_sites, dtype=np.float32)

    for a in range(n_sites):
        for b in range(n_sites):
            # An atom does not interact with itself.
            if a == b:
                continue
            delta = positions[a] - positions[b]
            dist_sq = np.dot(delta, delta)      # r^2
            dist_6 = dist_sq * dist_sq * dist_sq  # r^6
            dist_12 = dist_6 * dist_6             # r^12
            repulsive = sigma**12 / dist_12
            attractive = sigma**6 / dist_6
            energies[a] += 4 * epsilon * (repulsive - attractive)

    return energies

In [13]:
# GPU implementation
#
# The string below is CUDA C source for one kernel, `lennard_jones_gpu`,
# which is handed to pycuda's SourceModule when this cell runs.
#
# Kernel outline:
#   * Each thread takes one atom index i = threadIdx.x + blockDim.x * blockIdx.x;
#     threads with i >= num_atoms do nothing, so the launch grid may be
#     rounded up past num_atoms.
#   * `positions` is read as a flat float array with atom i's x, y, z stored
#     at 3*i, 3*i+1 and 3*i+2.
#   * sigma^6 and sigma^12 are computed once per thread, before the pair loop.
#   * The loop visits every j != i and accumulates
#     4 * epsilon * (sigma12 / r12 - sigma6 / r6), where r2 is the squared
#     separation, r6 = r2^3 and r12 = r6^2 -- the same all-pairs sum that
#     lennard_jones_cpu evaluates.
#   * All kernel variables are `float`; the per-atom total is written to
#     potential[i].
#
# NOTE(review): there is no guard for two atoms at identical coordinates
# (r2 == 0), so such a pair leads to a division by zero inside the kernel.
cuda_module = SourceModule("""
__global__ void lennard_jones_gpu(float *potential, float *positions, int num_atoms, float epsilon, float sigma)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < num_atoms) {
        float total_potential = 0.0f;
        float3 pos_i = make_float3(positions[3*i], positions[3*i+1], positions[3*i+2]);
        float sigma6 = powf(sigma, 6);
        float sigma12 = sigma6 * sigma6;
        for (int j = 0; j < num_atoms; j++) {
            if (i != j) {
                float3 pos_j = make_float3(positions[3*j], positions[3*j+1], positions[3*j+2]);
                float3 r = make_float3(pos_i.x - pos_j.x, pos_i.y - pos_j.y, pos_i.z - pos_j.z);
                float r2 = r.x*r.x + r.y*r.y + r.z*r.z;
                float r6 = r2*r2*r2;
                float r12 = r6*r6;

                total_potential += 4.0f * epsilon * ((sigma12/r12) - (sigma6/r6));
            }
        }
        potential[i] = total_potential;
    }
}
""")
# Python-side handle to the kernel; it is called with the arguments in the
# kernel's parameter order plus `block=` and `grid=` launch dimensions.
lennard_jones_gpu = cuda_module.get_function("lennard_jones_gpu")

In [14]:
# Benchmarking function
def benchmark(num_atoms, epsilon, sigma, seed=None, block_size=256):
    """Time the CPU and GPU Lennard-Jones implementations on one random system.

    Random float32 positions are drawn in the unit cube, the per-atom
    energies are computed with ``lennard_jones_cpu`` and with the
    ``lennard_jones_gpu`` kernel, and the two results are checked against
    each other before the timings are returned.

    Parameters
    ----------
    num_atoms : int
        Number of atoms in the random configuration.
    epsilon, sigma : float
        Lennard-Jones parameters forwarded to both implementations.
    seed : int or None, optional
        When given, positions come from ``np.random.default_rng(seed)`` so the
        configuration is reproducible. When ``None`` (default) the global
        ``np.random`` stream is used, exactly as before, so an earlier
        ``np.random.seed()`` call is still honoured.
    block_size : int, optional
        Threads per block for the kernel launch (default 256); the grid is
        sized so that ``num_blocks * block_size >= num_atoms``.

    Returns
    -------
    (cpu_time, gpu_time) : tuple of float
        Elapsed seconds for the CPU call and for the GPU call. The GPU
        interval spans the whole launch through ``drv.In`` / ``drv.Out``
        up to the context synchronize, so it includes the host/device
        copies those wrappers perform, not only kernel execution.

    Raises
    ------
    AssertionError
        If the CPU and GPU energies differ beyond rtol=1e-5 / atol=1e-5.
    """
    if seed is None:
        positions = np.random.rand(num_atoms, 3).astype(np.float32)
    else:
        rng = np.random.default_rng(seed)
        positions = rng.random((num_atoms, 3)).astype(np.float32)

    # perf_counter() is the monotonic, highest-resolution clock; time.time()
    # can be too coarse for the sub-millisecond GPU intervals measured here.
    # CPU benchmark
    start_time = time.perf_counter()
    cpu_potential = lennard_jones_cpu(positions, epsilon, sigma)
    cpu_time = time.perf_counter() - start_time

    # GPU benchmark
    gpu_potential = np.zeros(num_atoms, dtype=np.float32)
    # Round the grid up so every atom index is covered by some thread; the
    # kernel itself ignores indices >= num_atoms.
    num_blocks = (num_atoms + block_size - 1) // block_size
    start_time = time.perf_counter()
    lennard_jones_gpu(
        drv.Out(gpu_potential), drv.In(positions.ravel()),
        np.int32(num_atoms), np.float32(epsilon), np.float32(sigma),
        block=(block_size, 1, 1), grid=(num_blocks, 1)
    )
    # Wait for the GPU to finish before stopping the clock.
    drv.Context.synchronize()
    gpu_time = time.perf_counter() - start_time

    # Make sure arrays for CPU and GPU are within some floating point precision
    np.testing.assert_allclose(cpu_potential, gpu_potential, rtol=1e-5, atol=1e-5)

    return cpu_time, gpu_time

In [15]:
# Run benchmarks for different system sizes
system_sizes = [100, 500, 1000, 2000, 5000]
epsilon = 1.0
sigma = 1.0

# benchmark() draws its positions from NumPy's global random stream; seeding
# it here makes every run of this cell time the same atomic configurations.
np.random.seed(0)

print("Number of Atoms | CPU Time (s) | GPU Time (s) | Speedup")
print("----------------------------------------------------")

for num_atoms in system_sizes:
    cpu_time, gpu_time = benchmark(num_atoms, epsilon, sigma)
    # A coarse timer can report an elapsed GPU time of exactly 0.0; report an
    # infinite speedup in that case instead of raising ZeroDivisionError.
    speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
    print(f"{num_atoms:14d} | {cpu_time:11.4f} | {gpu_time:11.4f} | {speedup:7.2f}x")

Number of Atoms | CPU Time (s) | GPU Time (s) | Speedup
----------------------------------------------------
           100 |      0.1490 |      0.0008 |  183.40x
           500 |      2.7130 |      0.0009 | 3140.79x
          1000 |     10.8806 |      0.0009 | 11457.80x
          2000 |     43.7833 |      0.0127 | 3441.92x
          5000 |    274.5926 |      0.0135 | 20356.05x
