# Parallelization

This notebook showcases the effects of parallelization on the self-consistency cycle. For brevity, we wrap the pre-processing steps in a single function and call it again with a different atom configuration whenever necessary. For the configuration we choose 40x40x40 $k$-points for the single atom and 40x40 $k$-points for the slab. In all cases, we run 5 repetitions without checking for convergence. We record two kinds of time: [1] CPU time (i.e. processing time) and [2] real time (i.e. wall time). Ultimately, what we care about is wall time; however, processing time is also valuable for understanding the effects of parallelization.

In [1]:
import numpy as np
import scipy as sp

import copy

# To have access to tsc module
import sys
import os

# Get the grandparent directory (two levels up) of the current working directory
gparent_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# Append that directory to sys.path so that the `config` and `tsc` imports below
# can be resolved from it (presumably the repository root -- verify when moving the notebook)
sys.path.append(gparent_dir)

# ---------------------------------------------------------------------------------

import config

from tsc.utilities import get_RPTS, get_nndists, get_connections, fermi, get_KPTS
from tsc.basis_atoms import single_atom, extract_atom_vectors, slab
from tsc.hamiltonian import hopping_consts, hopping_elements, get_exponentials
from tsc.hamiltonian import prep_N_hamiltonian, get_N_hamiltonian, prep_SC_hamiltonian, get_SC_hamiltonian

import time

# Get config variables: copy every non-dunder attribute of config.Config into the
# global namespace of this notebook. Names that are used below without being defined
# here (e.g. a_1, a_2, NCELLS, N_x, N_y, R_max, R_0, μ_0, n_0, s_0..s_3) presumably
# come from this call -- check config.Config for their values.
globals().update({k: v for k, v in vars(config.Config).items() if not k.startswith("__")})

In [2]:
def do_stuff(atoms, a, Nkz):
    """Run all pre-processing steps needed before the self-consistency cycle.

    Relies on configuration values living in the global namespace
    (a_1, a_2, NCELLS, N_x, N_y, R_max, R_0, μ_0, n_0).

    Parameters
    ----------
    atoms
        Basis-atom configuration, forwarded to ``extract_atom_vectors``.
    a
        Third vector passed to ``get_RPTS`` / ``get_KPTS`` next to ``a_1``
        and ``a_2``.
    Nkz
        Third factor of the k-mesh size (the total is ``Nkz*N_x*N_y``).

    Returns
    -------
    tuple
        ``(N_b, E_0, μ, U, n, n_bar, B, atom_IJ, num_neighbs, fourier, t,
        KPTS, N_k)`` in exactly this order.
    """
    # Lattice sites
    lattice_pts = get_RPTS(a_1, a_2, a, NCELLS)

    # Per-atom quantities of the basis; the last returned entry is not used here
    basis_pts, raw_types, E_0, U, n_bar, B, _Λ = extract_atom_vectors(atoms)

    # Number of basis atoms
    N_b: int = basis_pts.shape[0]

    # Neighbour bookkeeping within the cut-off R_max
    num_neighbs = get_nndists(lattice_pts, basis_pts, R_max)
    atom_IJ, Rvec_ij = get_connections(
        num_neighbs.max(), lattice_pts, basis_pts, R_max
    )

    # Re-encode the atom types as 0, 1, 2, ... and count the distinct ones
    distinct_types, type_codes = np.unique(raw_types, return_inverse=True)
    n_types: int = distinct_types.shape[0]

    # Completely symmetrical case: the same hopping weight for every pair of types
    symmetric_hops = np.ones((n_types, n_types))

    # Hopping constants, then the hopping elements themselves
    t_0 = hopping_consts(symmetric_hops, type_codes, basis_pts, lattice_pts, R_0)
    t = hopping_elements(atom_IJ, num_neighbs, Rvec_ij, type_codes, t_0, R_0)

    # k-mesh whose resolution depends on N_x, N_y and Nkz, and the matching exponentials
    KPTS = get_KPTS(a_1, a_2, a, N_x, N_y, Nkz)
    fourier = get_exponentials(Rvec_ij, KPTS)

    # Total number of k-points
    N_k = Nkz * N_x * N_y

    # Starting values taken from the config: μ_0 for μ and a uniform n_0 for n
    μ = μ_0
    n = n_0 * np.ones(N_b)

    return N_b, E_0, μ, U, n, n_bar, B, atom_IJ, num_neighbs, fourier, t, KPTS, N_k

## Case 1: Single Atom - No Optimization

In [3]:
# Basis: a single atom
atoms = single_atom()

# Third vector handed to do_stuff, and the third factor of the k-mesh size
a = np.array([0.0, 0.0, 1.0])
Nkz = 40

# Run the shared pre-processing for this configuration
(
    N_b, E_0, μ, U, n, n_bar, B,
    atom_IJ, num_neighbs, fourier, t, KPTS, N_k,
) = do_stuff(atoms, a, Nkz)

In [4]:
for rep in range(5):

    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((2*N_b*N_k)), np.zeros((2*N_b, 2*N_b*N_k), dtype=np.complex128)

    # Prepare the Hamiltonian once per repetition; each k below works on a copy of it
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    # start1/end1: CPU time of this process. start2/end2: wall time.
    # perf_counter() is used for the wall clock because it is monotonic and has the
    # highest available resolution; time.time() can jump if the system clock is adjusted.
    start1 = time.process_time()
    start2 = time.perf_counter()
    for k in range(N_k):
        # Get a deepcopy of H_prep so that we don't have to re-generate it
        H_copied = copy.deepcopy(H_prep)
        # Get a new H(k) for every k
        H = get_N_hamiltonian(k, H_copied, atom_IJ, num_neighbs, fourier, t)

        # Diagonalize H(k)
        w, v = sp.linalg.eigh(H, driver="ev")

        # Store results: the 2*N_b eigenpairs of k-point k go into columns [ini, fin)
        ini: int = k*2*N_b
        fin: int = (k+1)*2*N_b
        E_vals[ini:fin] = w
        E_vecs[:, ini:fin] = v
    end1 = time.process_time()
    end2 = time.perf_counter()

    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 1.9375.	 Real time = 1.9663
Processing time = 1.8281.	 Real time = 1.8150
Processing time = 1.8125.	 Real time = 1.8150
Processing time = 1.8281.	 Real time = 1.8230
Processing time = 1.8125.	 Real time = 1.8161


## Case 2: Single Atom - Optimized

In [5]:
from joblib import Parallel, delayed
import functools

def k_loop(k, H_prep, atom_IJ, num_neighbs, fourier, t, N_b):
    """Build and diagonalize H(k) for one k-point index.

    Parameters
    ----------
    k
        Index of the k-point.
    H_prep
        Prepared Hamiltonian; it is deep-copied before being handed to
        ``get_N_hamiltonian`` so the caller's object is left untouched.
    atom_IJ, num_neighbs, fourier, t
        Forwarded unchanged to ``get_N_hamiltonian``.
    N_b
        Number of basis atoms; each k-point owns a slice of ``2*N_b`` entries.

    Returns
    -------
    tuple
        ``(w, v, ini, fin)``: the eigenvalues and eigenvectors from
        ``scipy.linalg.eigh`` and the bounds ``[ini, fin)`` of the slice
        that belongs to this k-point in the result arrays.
    """
    H_k = get_N_hamiltonian(
        k, copy.deepcopy(H_prep), atom_IJ, num_neighbs, fourier, t
    )
    eigvals, eigvecs = sp.linalg.eigh(H_k, driver="ev")

    # Every k-point occupies a contiguous slice of width 2*N_b
    width = 2 * N_b
    return eigvals, eigvecs, k * width, (k + 1) * width

In [6]:
for rep in range(5):

    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((2*N_b*N_k)), np.zeros((2*N_b, 2*N_b*N_k), dtype=np.complex128)

    # Prepare the Hamiltonian once per repetition; k_loop works on a copy of it
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    # start1/end1: CPU time of THIS process only. process_time() does not include
    # CPU time spent in other processes, so work done by joblib worker processes
    # is not part of the reported "Processing time".
    # start2/end2: wall time, taken with perf_counter() because it is monotonic and
    # has the highest available resolution (time.time() can jump with the system clock).
    start1 = time.process_time()
    start2 = time.perf_counter()

    # Bind everything except k, then fan the k-points out over all available CPUs
    partial_compute_for_k = functools.partial(k_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k) for k in range(N_k))

    # Gather: each result carries the slice [ini, fin) it belongs to
    for w, v, ini, fin in results:
        E_vals[ini:fin] = w
        E_vecs[:, ini:fin] = v

    end1 = time.process_time()
    end2 = time.perf_counter()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 2.0469.	 Real time = 3.5760
Processing time = 1.7500.	 Real time = 1.9217
Processing time = 1.7656.	 Real time = 1.8720
Processing time = 1.7500.	 Real time = 1.8500
Processing time = 1.7031.	 Real time = 1.7838


While the processing time is slightly lower than in the non-optimized case, all times are essentially comparable. The reason is that the parallelized task is the diagonalization of a 2x2 matrix, which is trivial, so the overhead of dispatching the work is comparable to the work itself. Let's see what happens when we move on to another type of problem, i.e. the slab, where the matrix to be diagonalized is much larger (e.g. 200x200).

## Case 3: Slab - No Optimization

In [7]:
# Basis: a slab built with slab_length=100
atoms = slab(slab_length=100)

# Third vector handed to do_stuff, and the third factor of the k-mesh size
# (Nkz = 1, so the mesh size is just N_x*N_y)
a = np.array([0.0, 0.0, 200.0])
Nkz = 1

# Run the shared pre-processing for this configuration
(
    N_b, E_0, μ, U, n, n_bar, B,
    atom_IJ, num_neighbs, fourier, t, KPTS, N_k,
) = do_stuff(atoms, a, Nkz)

In [8]:
for rep in range(5):

    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((2*N_b*N_k)), np.zeros((2*N_b, 2*N_b*N_k), dtype=np.complex128)

    # Prepare the Hamiltonian once per repetition; each k below works on a copy of it
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    # start1/end1: CPU time of this process. start2/end2: wall time.
    # perf_counter() is used for the wall clock because it is monotonic and has the
    # highest available resolution; time.time() can jump if the system clock is adjusted.
    start1 = time.process_time()
    start2 = time.perf_counter()
    for k in range(N_k):
        # Get a deepcopy of H_prep so that we don't have to re-generate it
        H_copied = copy.deepcopy(H_prep)
        # Get a new H(k) for every k
        H = get_N_hamiltonian(k, H_copied, atom_IJ, num_neighbs, fourier, t)

        # Diagonalize H(k)
        w, v = sp.linalg.eigh(H, driver="ev")

        # Store results: the 2*N_b eigenpairs of k-point k go into columns [ini, fin)
        ini: int = k*2*N_b
        fin: int = (k+1)*2*N_b
        E_vals[ini:fin] = w
        E_vecs[:, ini:fin] = v
    end1 = time.process_time()
    end2 = time.perf_counter()

    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 49.9688.	 Real time = 6.2560
Processing time = 50.5469.	 Real time = 6.3250
Processing time = 50.8281.	 Real time = 6.3628
Processing time = 49.4844.	 Real time = 6.2045
Processing time = 49.4688.	 Real time = 6.1895


## Case 4: Slab - Optimized

In [9]:
for rep in range(5):

    # Create placeholder arrays for the eigenvalues and eigenvectors
    E_vals, E_vecs = np.zeros((2*N_b*N_k)), np.zeros((2*N_b, 2*N_b*N_k), dtype=np.complex128)

    # Prepare the Hamiltonian once per repetition; k_loop works on a copy of it
    H_prep = prep_N_hamiltonian(E_0, μ, U, n, n_bar, B, s_0, s_1, s_2, s_3)

    # start1/end1: CPU time of THIS process only. process_time() does not include
    # CPU time spent in other processes, so work done by joblib worker processes
    # is not part of the reported "Processing time".
    # start2/end2: wall time, taken with perf_counter() because it is monotonic and
    # has the highest available resolution (time.time() can jump with the system clock).
    start1 = time.process_time()
    start2 = time.perf_counter()

    # Bind everything except k, then fan the k-points out over all available CPUs
    partial_compute_for_k = functools.partial(k_loop, H_prep=H_prep, atom_IJ=atom_IJ, num_neighbs=num_neighbs, fourier=fourier, t=t, N_b=N_b)
    results = Parallel(n_jobs=-1)(delayed(partial_compute_for_k)(k) for k in range(N_k))

    # Gather: each result carries the slice [ini, fin) it belongs to
    for w, v, ini, fin in results:
        E_vals[ini:fin] = w
        E_vecs[:, ini:fin] = v

    end1 = time.process_time()
    end2 = time.perf_counter()
    print(f"Processing time = {end1-start1:.4f}.\t Real time = {end2-start2:.4f}")

    # ---------------------------------------------------------
    # We do not care about the rest stuff
    # ---------------------------------------------------------

Processing time = 3.7500.	 Real time = 2.6056
Processing time = 2.6719.	 Real time = 2.6031
Processing time = 2.5156.	 Real time = 2.5940
Processing time = 2.5625.	 Real time = 2.5853
Processing time = 2.6094.	 Real time = 2.6321


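It is evident that in this case parallelization does wonders: the reported processing time is reduced by a factor of roughly 20, while the real time is reduced by a factor of roughly 2.4. Note, however, that `time.process_time()` only counts the CPU time of the current process; with joblib's default process-based backend the work done inside the worker processes is not included, so the processing times of the optimized and non-optimized cases are not directly comparable. The wall time is the reliable figure of merit here.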