# Peformance comparison of different algorithms

In this notebook we analyze the performance of various algorithms, including MLPs, dense dynamical systems, and various sorts of sparse dynamical systems.

In [None]:
import torch
import warnings
import time
warnings.filterwarnings("ignore")
# TODO Add these imports and test MaskeLinear and SparseLinear
from iterativennsimple.MaskedLinear import MaskedLinear
from iterativennsimple.SparseLinear import SparseLinear

In [16]:
def generate(size, entries, device="cuda"):
    """create a variety of sparse matrices 

    Args:
        size (int, optional): Size of the square matrix. Defaults to 1000.
        entries (int, optional): Total number of non-zero entries. Note this is an upper bound, but should be close to the actual size. Defaults to 23*1000.
        device (str, optional): "cuda" or "cpu". Defaults to "cuda".

    Returns:
        dict: Dictionary of the sparse matrices
    """ 
    output = {}
    # We first create a COO tensor since that is easier to create
    indices = torch.randint(0, size, (entries,2))
    vals = torch.randn(entries)
    coo = torch.sparse_coo_tensor(indices.t(), vals, (size, size), device=device)
    coo = coo.coalesce()

    # Then we convert it to CSC, CSR and dense
    dense = coo.to_dense()
    csc = coo.to_sparse_csc()
    csr = coo.to_sparse_csr()
    
    # We also create the MaskedLinear and SparseLinear objects
    class moduleWrapper(object):
        def __init__(self, module, device=device):
            self.module = module
            self.device = device
        def __matmul__(self, x):
            return self.module(x.T).T
        
    maskedLinear = moduleWrapper(MaskedLinear.from_coo(coo).to(device), device)
    sparseLinear = moduleWrapper(SparseLinear.from_coo(coo).to(device), device)

    output["dense"] = dense
    output["coo"] = coo
    output["csc"] = csc
    output["csr"] = csr
    output["maskedLinear"] = maskedLinear
    output["sparseLinear"] = sparseLinear
    return output

In [23]:
## check that all of the matrices are the same operator
def matrix_check(size, entries, device):
    # Generate the matrices
    matrices = generate(size, entries, device=device)
    # Size of the RHS
    x_cols = 100
    x = torch.randn(size, x_cols, device=device)

    print('Matrix-matrix multiplication check')
    y_true = None
    base_name = None

    for matrix, matrix_name in zip(matrices.values(), matrices.keys()):
        y = matrix @ x
        if y_true is None:
            y_true = y
            base_name = matrix_name
        else:
            print(f"{matrix_name} == {base_name}: {torch.allclose(y, y_true)}")
            

In [24]:
matrix_check(1000, 23*1000, "cuda")

Matrix-matrix multiplication check
coo == dense: False
csc == dense: False
csr == dense: False
maskedLinear == dense: False


RuntimeError: The size of tensor a (1000) must match the size of tensor b (100) at non-singleton dimension 1

In [17]:
def run_timing(size, entries, device, syncgpu):
    # test if cuda is available
    if device == "cuda":
        if not torch.cuda.is_available():
            print("CUDA is not available, using CPU instead")
            device = "cpu"

    print(f"Running on {device}")
    print(f"Size: {size}")
    print(f"Entries: {entries}")
    print(f"Synchronize GPU: {syncgpu}")
    
    # Generate the matrices
    matrices = generate(size, entries, device=device)

    # Size of the RHS
    x_cols = 100

    # Compute the timings
    print('Matrix-matrix multiplication timings:')
    for matrix, matrix_name in zip(matrices.values(), matrices.keys()):
        # first, do a few runs to warm up the cache
        for i in range(2):
            x = torch.randn(size, x_cols, device=device)
            y = matrix @ x
        # now do the timings
        matrix_times = []
        for i in range(5):
            x = torch.randn(size, x_cols, device=device)    
            if syncgpu:
                torch.cuda.synchronize()
            start = time.perf_counter()
            y = matrix @ x
            if syncgpu:
                torch.cuda.synchronize()
            matrix_time = time.perf_counter()-start
            matrix_times.append(matrix_time)
        avg_matrix_time = sum(matrix_times)/len(matrix_times)
        min_matrix_time = min(matrix_times)
        max_matrix_time = max(matrix_times)

        print(f'{matrix_name} is on {matrix.device}')
        print(f"{matrix_name} avg time:", avg_matrix_time)
        print(f"{matrix_name} min time:", min_matrix_time)
        print(f"{matrix_name} max time:", max_matrix_time)

In [19]:
run_timing(10000, 23*10000, "cuda", True)

Running on cuda
Size: 10000
Entries: 230000
Synchronize GPU: True


Matrix-matrix multiplication timings:
dense is on cuda:0
dense avg time: 0.0006227235309779644
dense min time: 0.0006165849044919014
dense max time: 0.0006317049264907837
coo is on cuda:0
coo avg time: 0.00011655818670988082
coo min time: 0.00011586863547563553
coo max time: 0.00011704117059707642
csc is on cuda:0
csc avg time: 0.00027557080611586573
csc min time: 0.0002740276977419853
csc max time: 0.0002782456576824188
csr is on cuda:0
csr avg time: 5.301395431160927e-05
csr min time: 5.24190254509449e-05
csr max time: 5.364092066884041e-05
maskedLinear is on cuda
maskedLinear avg time: 0.0031874104402959346
maskedLinear min time: 0.003185573033988476
maskedLinear max time: 0.0031895912252366543
sparseLinear is on cuda
sparseLinear avg time: 0.0004885297268629075
sparseLinear min time: 0.00048765214160084724
sparseLinear max time: 0.0004896358586847782
