# Parallelization

This is a notebook that showcases the effects of parallelization when performing the self-consistency cycle. For brevity, we will define the pre-processing steps as a single function and re-call it with different atom configurations whenever necessary. For the config we choose 40x40x40 $k$-points for the single atom and 40x40 $k$-points for the slab. In all cases, we will do 5 reps without caring for convergence. We will count two types of times: [1] CPU time (i.e. processing time) and [2] Real time (i.e. wall time). At the end of the day, what we care about is wall time, however processing time is also valuable to understand the effects of parallelization.

In [1]:
import numpy as np
import scipy as sp

# To have access to tsc module
import sys
import os

# Get the parent directory of the current working directory
gparent_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# Append the parent directory to the PYTHONPATH
sys.path.append(gparent_dir)

# ---------------------------------------------------------------------------------

import config

from tsc.utilities import get_RPTS, get_nndists, get_connections, fermi, get_KPTS
from tsc.basis_atoms import single_atom, extract_atom_vectors, slab
from tsc.hamiltonian import hopping_consts, hopping_elements, get_exponentials
from tsc.hamiltonian import prep_N_hamiltonian, get_N_hamiltonian

import time

# Get config variables
globals().update({k: v for k, v in vars(config.Config).items() if not k.startswith("__")})

In [2]:
def do_stuff(atoms, a, Nkz):
    
    # Construct the lattice sites matrix
    RPTS = get_RPTS(a_1, a_2, a, NCELLS)

    N_k = Nkz*N_x*N_y

    # Extract the vectors from the called function
    TPTS, atom_types, E_0, U, n_bar, B, Λ = extract_atom_vectors(atoms)
    
    # Write down the number of basis atoms
    N_b: int = TPTS.shape[0]
    
    num_neighbs = get_nndists(RPTS, TPTS, R_max)
    
    # Get the maximum number of neighbours
    max_neighb = num_neighbs.max()
    
    # Get the atom_IJ and Rvec_ij matrices
    atom_IJ, Rvec_ij = get_connections(max_neighb, RPTS, TPTS, R_max)
    
    # Get the number of different atom types based on the basis_atoms configuration
    # Also re-encode the atom_types as 0, 1, 2, etc.
    unique_atoms, atom_types = np.unique(atom_types, return_inverse=True)
    N_unique: int = unique_atoms.shape[0]
    
    # Construct a completely symmetrical case
    t_0 = np.ones((N_unique, N_unique))
    
    # Get the hopping elements
    t = hopping_elements(atom_IJ, num_neighbs, Rvec_ij, atom_types, t_0, R_0)

    # Get a k-mesh with resolution that depends on N_x, N_y, N_z
    KPTS = get_KPTS(a_1, a_2, a, N_x, N_y, Nkz)

    # Get the exponentials
    fourier = get_exponentials(Rvec_ij, KPTS)

    μ = μ_0
    n = np.ones((N_b))*n_0

    return N_b, E_0, μ, U, n, n_bar, B, atom_IJ, num_neighbs, fourier, t, KPTS, N_k

## Case 1: Single Atom - No Optimization

In [3]:
atoms = single_atom()

a = np.array([0.0,0.0,1.0])
Nkz = 40

N_b, E_0, μ, U, n, n_bar, B, atom_IJ, num_neighbs, fourier, t, KPTS, N_k = do_stuff(atoms, a, Nkz)

In [4]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    for k in range(N_k):
        # Get a deepcopy of H_prep so that we don't have to re-generate it
        H_copied = np.copy(H_prep)
        # Get a new H(k) for every k
        H = get_N_hamiltonian(k, H_copied, atom_IJ, num_neighbs, fourier, t)

        # Diagonalize H(k)
        w, v = sp.linalg.eigh(H, driver="ev")

        # Store results
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    end1 = time.process_time()
    end2 = time.time()
    
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 1.8281.	 Real time = 1.8449
Processing time = 1.6562.	 Real time = 1.6604
Processing time = 1.8750.	 Real time = 1.8768
Processing time = 1.6406.	 Real time = 1.6566
Processing time = 2.6250.	 Real time = 2.7513


## Case 2: Single Atom - Optimized

In [5]:
from joblib import Parallel, delayed
import functools

def k_loop(k, H_prep, atom_IJ, num_neighbs, fourier, t, N_b):
    H_copied = np.copy(H_prep)
    H = get_N_hamiltonian(k, H_copied, atom_IJ, num_neighbs, fourier, t)
    w, v = sp.linalg.eigh(H, driver="ev")
    return w, v, k

In [6]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    
    partial_compute_for_k = functools.partial(k_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k) for k in range(N_k))

    for w, v, k in results:
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 2.0625.	 Real time = 3.9224
Processing time = 2.0312.	 Real time = 2.1145
Processing time = 1.8594.	 Real time = 1.9604
Processing time = 1.9062.	 Real time = 2.0379
Processing time = 1.9219.	 Real time = 1.9734


While the processing time is slightly better than in the non-optimized case, it appears that all times are pretty much comparable. The reason for this is that the parallelized task is the diagonalization of a 2x2 matrix, which is somewhat trivial.

## Case 3: Single Atom - Batch Optimized

In [7]:
def k_batch_loop(start, end, H_prep, atom_IJ, num_neighbs, fourier, t, N_b):
    
    batch_w = np.zeros((end - start, 2*N_b))
    batch_v = np.zeros((end - start, 2*N_b, 2*N_b), dtype=np.complex128)
    for i, k in enumerate(range(start, end)):
        H_copied = np.copy(H_prep)
        H = get_N_hamiltonian(k, H_copied, atom_IJ, num_neighbs, fourier, t)
        w, v = sp.linalg.eigh(H, driver="ev")
        batch_w[i, :] = w
        batch_v[i, :, :] = v
        
    return batch_w, batch_v, start, end

In [8]:
from joblib import cpu_count

# Batch the parallelism
n_cores = cpu_count()
batch_size = N_k // n_cores

for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    
    partial_compute_for_k = functools.partial(k_batch_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k, min(k+batch_size, N_k)) for k in range(0, N_k, batch_size))

    for batch_w, batch_v, start, end in results:
        E_vals[start:end, :] = batch_w
        E_vecs[start:end, :, :] = batch_v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------


Processing time = 0.0938.	 Real time = 0.4586
Processing time = 0.0625.	 Real time = 0.4987
Processing time = 0.0156.	 Real time = 0.4540
Processing time = 0.0312.	 Real time = 0.4599
Processing time = 0.0625.	 Real time = 0.4412


This is where things get really interesting. Since the diagonalization of a 2x2 matrix is a rather trivial proceedure, we evenly distributed the job to multiple cores, so that each core has many such diagonalizations to perform, in batches. Let's see what happens when we move on to another type of problem, i.e. the slab, where the matrix to be diagonalized is higher dimensional (e.g. 200x200).

## Case 4: Slab - No Optimization

In [9]:
atoms = slab(slab_length=100)

a = np.array([0.0,0.0,200.0])
Nkz = 1

N_b, E_0, μ, U, n, n_bar, B, atom_IJ, num_neighbs, fourier, t, KPTS, N_k = do_stuff(atoms, a, Nkz)

In [10]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    for k in range(N_k):
        # Get a deepcopy of H_prep so that we don't have to re-generate it
        H_copied = np.copy(H_prep)
        # Get a new H(k) for every k
        H = get_N_hamiltonian(k, H_copied, atom_IJ, num_neighbs, fourier, t)

        # Diagonalize H(k)
        w, v = sp.linalg.eigh(H, driver="ev")

        # Store results
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    end1 = time.process_time()
    end2 = time.time()
    
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 59.4531.	 Real time = 7.6581
Processing time = 58.1406.	 Real time = 7.4427
Processing time = 65.4531.	 Real time = 8.4214
Processing time = 65.9688.	 Real time = 8.5020
Processing time = 63.6094.	 Real time = 8.1470


## Case 5: Slab - Optimized

In [11]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    
    partial_compute_for_k = functools.partial(k_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k) for k in range(N_k))

    for w, v, k in results:
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 3.3594.	 Real time = 2.5927
Processing time = 2.5312.	 Real time = 2.6041
Processing time = 2.4375.	 Real time = 2.6218
Processing time = 2.5469.	 Real time = 2.6724
Processing time = 2.5312.	 Real time = 2.6497


It is evident that in this case parallelization does wonders even without batching, as the processing time is reduced by more than 20 times, while the real time is also reduced by more than 2.5 times.

## Case 6: Slab - Batch Optimized

In [12]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    
    partial_compute_for_k = functools.partial(k_batch_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k, min(k+batch_size, N_k)) for k in range(0, N_k, batch_size))

    for batch_w, batch_v, start, end in results:
        E_vals[start:end, :] = batch_w
        E_vecs[start:end, :, :] = batch_v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------


Processing time = 1.0938.	 Real time = 10.4319
Processing time = 1.0781.	 Real time = 9.6360
Processing time = 1.0781.	 Real time = 9.6607
Processing time = 1.0781.	 Real time = 9.6097
Processing time = 1.0938.	 Real time = 9.6054


It appears that now where the problem is not as trivial as this of the 2x2 matrix, batch parallelization induces overheard that leads to higher wall times even when compared to the non-parallelized case.

Let's see what happens if we experiment with the batch size manually.

In [13]:
batch_size = 20

for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    
    partial_compute_for_k = functools.partial(k_batch_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k, min(k+batch_size, N_k)) for k in range(0, N_k, batch_size))

    for batch_w, batch_v, start, end in results:
        E_vals[start:end, :] = batch_w
        E_vecs[start:end, :, :] = batch_v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------


Processing time = 1.5938.	 Real time = 1.9932
Processing time = 1.5781.	 Real time = 2.7021
Processing time = 1.6250.	 Real time = 1.9998
Processing time = 1.6719.	 Real time = 2.0416
Processing time = 1.6719.	 Real time = 2.1051


It becomes evident that choosing smaller batch sizes allows us to achieve better than non-batched/non-parallel times.

Note that if we set the batch size equal to 1, we get a situation which is equivalent to this of the non-batched parallelization.

In [14]:
batch_size = 1

for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # prepare the Hamiltonian
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    start1 = time.process_time()
    start2 = time.time()
    
    partial_compute_for_k = functools.partial(k_batch_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k, min(k+batch_size, N_k)) for k in range(0, N_k, batch_size))

    for batch_w, batch_v, start, end in results:
        E_vals[start:end, :] = batch_w
        E_vecs[start:end, :, :] = batch_v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------


Processing time = 2.4062.	 Real time = 2.4958
Processing time = 2.6094.	 Real time = 2.5806
Processing time = 2.3750.	 Real time = 2.6047
Processing time = 2.5625.	 Real time = 2.5665
Processing time = 2.3438.	 Real time = 2.5798


## Alternative Approach

In what follows, we will perform the exact same processes, only this time constructing the Hamiltonian matrix in a different way, i.e. it is constructed for all $k$ points and then each $k$-slice is diagonalized.

In [15]:
from tsc.hamiltonian import prep_N_hamiltonian_vectorized, get_N_hamiltonian_vectorized

### Case 1 Revisited

In [16]:
atoms = single_atom()

a = np.array([0.0,0.0,1.0])
Nkz = 40

N_b, E_0, μ, U, n, n_bar, B, atom_IJ, num_neighbs, fourier, t, KPTS, N_k = do_stuff(atoms, a, Nkz)

In [17]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # Get the full-k prepared Hamiltonian
    H = prep_N_hamiltonian_vectorized(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3, N_k)

    start1 = time.process_time()
    start2 = time.time()

    # Get the full-k actual Hamiltonian
    H = get_N_hamiltonian_vectorized(H, atom_IJ, num_neighbs, fourier, t)
    
    for k in range(N_k):
        # Diagonalize H(k)
        w, v = sp.linalg.eigh(H[k,:,:], driver="ev")

        # Store results
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    end1 = time.process_time()
    end2 = time.time()
    
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 1.4062.	 Real time = 1.4027
Processing time = 1.3906.	 Real time = 1.3884
Processing time = 1.4062.	 Real time = 1.4104
Processing time = 1.3906.	 Real time = 1.3952
Processing time = 1.5156.	 Real time = 1.5124


### Case 2 Revisited

In [18]:
def k_loop_2(k, H):
    w, v = sp.linalg.eigh(H[k,:,:], driver="ev")
    return w, v, k

In [19]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # Get the full-k prepared Hamiltonian
    H = prep_N_hamiltonian_vectorized(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3, N_k)

    start1 = time.process_time()
    start2 = time.time()

    # Get the full-k actual Hamiltonian
    H = get_N_hamiltonian_vectorized(H, atom_IJ, num_neighbs, fourier, t)
    
    partial_compute_for_k = functools.partial(k_loop_2, H=H)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k) for k in range(N_k))

    for w, v, k in results:
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 1.8125.	 Real time = 1.9269
Processing time = 1.6562.	 Real time = 1.8337
Processing time = 1.7500.	 Real time = 1.8316
Processing time = 1.6875.	 Real time = 1.8651
Processing time = 1.6719.	 Real time = 1.8061


### Case 3 Revisited

In [20]:
def k_batch_loop_2(start, end, H):
    batch_w = np.zeros((end - start, H.shape[1]))
    batch_v = np.zeros((end - start, H.shape[1], H.shape[2]), dtype=np.complex128)
    for i, k in enumerate(range(start, end)):
        w, v = sp.linalg.eigh(H[k,:,:], driver="ev")
        batch_w[i, :] = w
        batch_v[i, :, :] = v
    return batch_w, batch_v, start, end

In [21]:
batch_size = N_k // n_cores

for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # Get the full-k prepared Hamiltonian
    H = prep_N_hamiltonian_vectorized(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3, N_k)

    start1 = time.process_time()
    start2 = time.time()

    # Get the full-k actual Hamiltonian
    H = get_N_hamiltonian_vectorized(H, atom_IJ, num_neighbs, fourier, t)
    
    # Run the jobs in parallel and gather all results in a results list
    results = Parallel(n_jobs=-1)(delayed(functools.partial(k_batch_loop_2, H=H))(k, min(k+batch_size, N_k)) for k in range(0, N_k, batch_size))
    
    for batch_w, batch_v, start, end in results:
        E_vals[start:end, :] = batch_w
        E_vecs[start:end, :, :] = batch_v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------


Processing time = 0.1094.	 Real time = 0.4598
Processing time = 0.0625.	 Real time = 0.4621
Processing time = 0.0781.	 Real time = 0.4666
Processing time = 0.0781.	 Real time = 0.4545
Processing time = 0.0938.	 Real time = 0.5299


### Case 4 Revisited

In [22]:
atoms = slab(slab_length=100)

a = np.array([0.0,0.0,200.0])
Nkz = 1

N_b, E_0, μ, U, n, n_bar, B, atom_IJ, num_neighbs, fourier, t, KPTS, N_k = do_stuff(atoms, a, Nkz)

In [23]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # Get the full-k prepared Hamiltonian
    H = prep_N_hamiltonian_vectorized(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3, N_k)

    start1 = time.process_time()
    start2 = time.time()

    # Get the full-k actual Hamiltonian
    H = get_N_hamiltonian_vectorized(H, atom_IJ, num_neighbs, fourier, t)
    
    for k in range(N_k):
        # Diagonalize H(k)
        w, v = sp.linalg.eigh(H[k,:,:], driver="ev")

        # Store results
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    end1 = time.process_time()
    end2 = time.time()
    
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 56.0625.	 Real time = 7.2817
Processing time = 55.6875.	 Real time = 7.2271
Processing time = 59.1719.	 Real time = 7.7047
Processing time = 58.5625.	 Real time = 7.6334
Processing time = 57.2344.	 Real time = 7.4497


### Case 5 Revisited

In [24]:
for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # Get the full-k prepared Hamiltonian
    H = prep_N_hamiltonian_vectorized(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3, N_k)

    start1 = time.process_time()
    start2 = time.time()

    # Get the full-k actual Hamiltonian
    H = get_N_hamiltonian_vectorized(H, atom_IJ, num_neighbs, fourier, t)
    
    partial_compute_for_k = functools.partial(k_loop_2, H=H)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k) for k in range(N_k))

    for w, v, k in results:
        E_vals[k, :] = w
        E_vecs[k, :, :] = v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 2.6406.	 Real time = 2.8899
Processing time = 2.9062.	 Real time = 3.0370
Processing time = 2.8438.	 Real time = 2.9906
Processing time = 2.8281.	 Real time = 3.0223
Processing time = 2.8750.	 Real time = 3.3856


### Case 6 Revisited

In [25]:
batch_size = N_k // n_cores

for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # Get the full-k prepared Hamiltonian
    H = prep_N_hamiltonian_vectorized(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3, N_k)

    start1 = time.process_time()
    start2 = time.time()

    # Get the full-k actual Hamiltonian
    H = get_N_hamiltonian_vectorized(H, atom_IJ, num_neighbs, fourier, t)
    
    # Run the jobs in parallel and gather all results in a results list
    results = Parallel(n_jobs=-1)(delayed(functools.partial(k_batch_loop_2, H=H))(k, min(k+batch_size, N_k)) for k in range(0, N_k, batch_size))
    
    for batch_w, batch_v, start, end in results:
        E_vals[start:end, :] = batch_w
        E_vecs[start:end, :, :] = batch_v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------


Processing time = 1.7031.	 Real time = 3.0864
Processing time = 1.6562.	 Real time = 3.0784
Processing time = 1.7656.	 Real time = 3.2413
Processing time = 1.6094.	 Real time = 2.9889
Processing time = 1.7031.	 Real time = 3.1486


And also trying for another batch size:

In [26]:
batch_size = 20

for rep in range(5):
    
    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((N_k, 2*N_b)), np.zeros((N_k, 2*N_b, 2*N_b), dtype=np.complex128)

    # Get the full-k prepared Hamiltonian
    H = prep_N_hamiltonian_vectorized(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3, N_k)

    start1 = time.process_time()
    start2 = time.time()

    # Get the full-k actual Hamiltonian
    H = get_N_hamiltonian_vectorized(H, atom_IJ, num_neighbs, fourier, t)
    
    # Run the jobs in parallel and gather all results in a results list
    results = Parallel(n_jobs=-1)(delayed(functools.partial(k_batch_loop_2, H=H))(k, min(k+batch_size, N_k)) for k in range(0, N_k, batch_size))
    
    for batch_w, batch_v, start, end in results:
        E_vals[start:end, :] = batch_w
        E_vecs[start:end, :, :] = batch_v
    
    end1 = time.process_time()
    end2 = time.time()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------


Processing time = 2.0938.	 Real time = 2.6775
Processing time = 2.1406.	 Real time = 2.7522
Processing time = 2.1406.	 Real time = 2.6077
Processing time = 2.1719.	 Real time = 2.5490
Processing time = 2.2188.	 Real time = 2.5122
