This example shows how to use CuPy to run mdopt on a GPU. It consists of two code cells, meant to be run in Colab, which time the same MPS-MPO contraction benchmark on the CPU and on the GPU so that their performance can be compared.

In [None]:
!pip -q install "git+https://github.com/quicophy/mdopt.git"

import time, numpy as np
from tqdm import tqdm
from mdopt.mps.utils import create_simple_product_state
from mdopt.utils.utils import create_random_mpo, mpo_to_matrix
from mdopt.contractor.contractor import mps_mpo_contract
from mdopt.backend import array as A

print("Backend GPU flag:", A.GPU)  # should be False

def bench(run_label="CPU", num_sites=48, phys_dim=2, mpo_len=32, chi=256, reps=5):
    """Time repeated MPS-MPO contractions and report the mean time per run.

    A product-state MPS and a random MPO are built once; one untimed
    warm-up contraction is performed, then ``reps`` contractions are timed
    back to back. Each timed iteration includes the cost of ``mps.copy()``.

    Args:
        run_label: Prefix of the printed result line.
        num_sites: Number of sites passed to the product-state MPS builder.
        phys_dim: Physical dimension passed to both the MPS and MPO builders.
        mpo_len: Number of sites passed to the random MPO builder.
        chi: Value used for each of the ``mpo_len - 1`` MPO bond dimensions.
        reps: Number of timed contractions to average over.

    Returns:
        Mean wall-clock time of one timed iteration, in seconds.
    """
    mps = create_simple_product_state(num_sites=num_sites, which="0", phys_dim=phys_dim)
    mpo = create_random_mpo(
        num_sites=mpo_len,
        bond_dimensions=[chi] * (mpo_len - 1),
        phys_dim=phys_dim,
        which="uniform",
    )
    start_site = 0

    # Warm-up: keep one-off start-up costs out of the timed region.
    _ = mps_mpo_contract(mps.copy(), mpo, start_site=start_site, renormalise=False)
    # NOTE(review): presumably waits for any queued backend work so that the
    # timer below starts from an idle state -- confirm against mdopt.backend.
    A.synchronize()

    t0 = time.perf_counter()
    for _ in tqdm(range(reps)):
        _ = mps_mpo_contract(mps.copy(), mpo, start_site=start_site, renormalise=False)
    # Synchronise again before stopping the clock (see note above).
    A.synchronize()
    t1 = time.perf_counter()

    seconds_per_run = (t1 - t0) / reps
    print(f"{run_label}: {seconds_per_run:.4f} s per run")
    # Return the measurement so callers that store the result get a number
    # instead of None.
    return seconds_per_run

# Run the benchmark labelled "CPU" with 10 timed repetitions.
cpu_time = bench(
    "CPU",
    num_sites=48,
    mpo_len=32,
    chi=256,
    reps=10,
)

In [None]:
# In Colab, switch the runtime type to GPU before running this cell.

!nvidia-smi
!pip -q install cupy-cuda12x
!pip -q install "git+https://github.com/quicophy/mdopt.git"

%env MDOPT_BACKEND=cupy

import time, numpy as np
from tqdm import tqdm
from mdopt.mps.utils import create_simple_product_state
from mdopt.utils.utils import create_random_mpo
from mdopt.contractor.contractor import mps_mpo_contract
from mdopt.backend import array as A

print("Backend GPU flag:", A.GPU)  # should be True

def bench(run_label="GPU", num_sites=48, phys_dim=2, mpo_len=32, chi=256, reps=5):
    """Time repeated MPS-MPO contractions and report the mean time per run.

    A product-state MPS and a random MPO are built once; one untimed
    warm-up contraction is performed, then ``reps`` contractions are timed
    back to back. Each timed iteration includes the cost of ``mps.copy()``.

    Args:
        run_label: Prefix of the printed result line.
        num_sites: Number of sites passed to the product-state MPS builder.
        phys_dim: Physical dimension passed to both the MPS and MPO builders.
        mpo_len: Number of sites passed to the random MPO builder.
        chi: Value used for each of the ``mpo_len - 1`` MPO bond dimensions.
        reps: Number of timed contractions to average over.

    Returns:
        Mean wall-clock time of one timed iteration, in seconds.
    """
    mps = create_simple_product_state(num_sites=num_sites, which="0", phys_dim=phys_dim)
    mpo = create_random_mpo(
        num_sites=mpo_len,
        bond_dimensions=[chi] * (mpo_len - 1),
        phys_dim=phys_dim,
        which="uniform",
    )
    start_site = 0

    # Warm-up: keep one-off start-up costs out of the timed region.
    _ = mps_mpo_contract(mps.copy(), mpo, start_site=start_site, renormalise=False)
    # NOTE(review): presumably waits for any queued backend work so that the
    # timer below starts from an idle state -- confirm against mdopt.backend.
    A.synchronize()

    t0 = time.perf_counter()
    for _ in tqdm(range(reps)):
        _ = mps_mpo_contract(mps.copy(), mpo, start_site=start_site, renormalise=False)
    # Synchronise again before stopping the clock (see note above).
    A.synchronize()
    t1 = time.perf_counter()

    seconds_per_run = (t1 - t0) / reps
    print(f"{run_label}: {seconds_per_run:.4f} s per run")
    # Return the measurement so callers that store the result get a number
    # instead of None.
    return seconds_per_run

# Store the result of the "GPU" run under its own name; the original reused
# the name `cpu_time`, which mislabelled the GPU measurement.
gpu_time = bench("GPU", num_sites=48, mpo_len=32, chi=256, reps=10)