# Performance comparison of different algorithms

In this notebook we analyze the performance of various algorithms, including MLPs, dense dynamical systems, and various sorts of sparse dynamical systems.

In [1]:
import torch
import warnings
import time
warnings.filterwarnings("ignore")
# TODO: test MaskedLinear and SparseLinear (imported below)
from iterativennsimple.MaskedLinear import MaskedLinear
from iterativennsimple.SparseLinear import SparseLinear

In [2]:
def generate(size=1000, entries=23*1000, device="cuda"):
    """Create one random sparse square matrix in several representations.

    Every returned object supports ``obj @ x``, so callers can compare or
    time the different storage formats through a single code path.

    Args:
        size (int, optional): Size of the square matrix. Defaults to 1000.
        entries (int, optional): Number of random (row, column) positions to
            draw. This is an upper bound on the number of non-zero entries,
            because positions drawn more than once are merged by
            ``coalesce()``; it should be close to the actual count.
            Defaults to 23*1000.
        device (str, optional): "cuda" or "cpu". Defaults to "cuda".

    Returns:
        dict: Maps a representation name to the matrix, in insertion order
            "dense", "coo", "csc", "csr", "maskedLinear", "sparseLinear".
            The last two are wrapper objects around the corresponding
            modules rather than tensors.
    """
    output = {}
    # We first create a COO tensor since that is easier to create:
    # one random (row, col) pair and one random value per requested entry.
    indices = torch.randint(0, size, (entries, 2))
    vals = torch.randn(entries)
    coo = torch.sparse_coo_tensor(indices.t(), vals, (size, size), device=device)
    # Merge duplicate coordinates so each position is stored once.
    coo = coo.coalesce()

    # Then we convert it to CSC, CSR and dense
    dense = coo.to_dense()
    csc = coo.to_sparse_csc()
    csr = coo.to_sparse_csr()

    # We also create the MaskedLinear and SparseLinear objects
    class moduleWrapper(object):
        """Adapter that lets a module be used on the left of ``@``."""

        def __init__(self, module, device=device):
            self.module = module
            self.device = device

        def __matmul__(self, x):
            # The module is applied to x transposed and the result is
            # transposed back.
            # NOTE(review): assumes the module acts on rows the way
            # nn.Linear does, so this equals `matrix @ x` -- confirm
            # against MaskedLinear / SparseLinear.
            return self.module(x.T).T

    maskedLinear = moduleWrapper(MaskedLinear.from_coo(coo).to(device), device)
    sparseLinear = moduleWrapper(SparseLinear.from_coo(coo).to(device), device)

    output["dense"] = dense
    output["coo"] = coo
    output["csc"] = csc
    output["csr"] = csr
    output["maskedLinear"] = maskedLinear
    output["sparseLinear"] = sparseLinear
    return output

In [3]:
## check that all of the matrices are the same operator
def matrix_check(size, entries, device):
    """Print how far each representation's product is from the first one's.

    Generates the matrices with ``generate``, multiplies each of them by the
    same random right-hand side, and prints the Frobenius norm of the
    difference between every product and the product of the first
    representation in the dictionary. Nothing is asserted or returned; the
    printed norms are meant to be inspected by the reader.

    Args:
        size (int): Size of the square matrix.
        entries (int): Upper bound on the number of non-zero entries.
        device (str): "cuda" or "cpu". A CUDA device is replaced by "cpu"
            when CUDA is not available, as in ``run_timing``.
    """
    # Fall back to the CPU instead of crashing on machines without a GPU.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available, using CPU instead")
        device = "cpu"

    # Generate the matrices
    matrices = generate(size, entries, device=device)
    # Size of the RHS
    x_cols = 100
    x = torch.randn(size, x_cols, device=device)

    print('Matrix-matrix multiplication check')
    y_true = None
    base_name = None

    for matrix_name, matrix in matrices.items():
        y = matrix @ x
        if y_true is None:
            # The first representation serves as the reference result.
            y_true = y
            base_name = matrix_name
        else:
            # print the frobenius norm of the difference
            print(f"||{matrix_name} - {base_name}||_F: {torch.norm(y-y_true)}")
            

In [4]:
# Sanity check on a small 1000 x 1000 problem (at most 23 non-zeros per row on
# average) before running the timings.
matrix_check(1000, 23*1000, "cuda")

Matrix-matrix multiplication check
||coo - dense||_F: 0.00014353354345075786
||csc - dense||_F: 0.00014352313883136958
||csr - dense||_F: 0.00014348502736538649
||maskedLinear - dense||_F: 0.00013009528629481792
||sparseLinear - dense||_F: 0.0001639742695260793


In [5]:
def run_timing(size, entries, device, syncgpu):
    """Time ``matrix @ x`` for every representation returned by ``generate``.

    For each representation the product is run twice untimed as a warm-up and
    then timed five times, each time with a fresh random right-hand side of
    100 columns. The average, minimum and maximum times are printed together
    with the speedup relative to the first representation.

    Args:
        size (int): Size of the square matrix.
        entries (int): Upper bound on the number of non-zero entries.
        device (str): "cuda" or "cpu". A CUDA device is replaced by "cpu"
            when CUDA is not available.
        syncgpu (bool): If True, call ``torch.cuda.synchronize()`` right
            before starting and right after finishing each timed product.
            It is ignored when the benchmark runs on the CPU, where there is
            nothing to synchronize.

    Returns:
        dict: Maps each representation name to its average time in seconds.
    """
    # test if cuda is available
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available, using CPU instead")
        device = "cpu"

    print(f"Running on {device}")
    print(f"Size: {size}")
    print(f"Entries: {entries}")
    print(f"Synchronize GPU: {syncgpu}")

    # Only synchronize when the work really runs on a CUDA device; calling
    # torch.cuda.synchronize() after the CPU fallback would raise.
    sync = bool(syncgpu) and torch.device(device).type == "cuda"

    # Generate the matrices
    matrices = generate(size, entries, device=device)

    # Size of the RHS
    x_cols = 100
    warmup_runs = 2
    timed_runs = 5

    # Compute the timings
    print('Matrix-matrix multiplication timings:')
    base_time = None
    base_name = None
    all_times = {}
    for matrix_name, matrix in matrices.items():
        # first, do a few runs to warm up the cache
        for _ in range(warmup_runs):
            x = torch.randn(size, x_cols, device=device)
            matrix @ x
        # now do the timings
        matrix_times = []
        for _ in range(timed_runs):
            # The right-hand side is created outside the timed region.
            x = torch.randn(size, x_cols, device=device)
            if sync:
                torch.cuda.synchronize()
            start = time.perf_counter()
            matrix @ x
            if sync:
                torch.cuda.synchronize()
            matrix_times.append(time.perf_counter() - start)
        avg_matrix_time = sum(matrix_times)/len(matrix_times)
        min_matrix_time = min(matrix_times)
        max_matrix_time = max(matrix_times)
        if base_time is None:
            # The first representation is the baseline for the speedups.
            base_time = avg_matrix_time
            base_name = matrix_name

        all_times[matrix_name] = avg_matrix_time

        print(f"{matrix_name} avg time:", avg_matrix_time)
        print(f"{matrix_name} min time:", min_matrix_time)
        print(f"{matrix_name} max time:", max_matrix_time)
        print(f"{matrix_name} speedup over {base_name}:", base_time/avg_matrix_time)

    return all_times

In [6]:
# Benchmark a 20000 x 20000 matrix with up to 100*20000 non-zero entries on the
# GPU, synchronizing around every timed product.
# NOTE(review): the dense copy alone holds 20000*20000 values (about 1.49 GiB
# if float32, which matches the failed allocation recorded below) -- confirm
# the GPU has enough free memory before running this cell.
all_times = run_timing(20000, 100*20000, "cuda", True)

Running on cuda
Size: 20000
Entries: 2000000
Synchronize GPU: True


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.49 GiB. GPU 0 has a total capacity of 23.64 GiB of which 556.62 MiB is free. Process 2099963 has 20.99 GiB memory in use. Including non-PyTorch memory, this process has 2.05 GiB memory in use. Of the allocated memory 1.57 GiB is allocated by PyTorch, and 33.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import pandas as pd
import plotly.express as px

# Relative timing table for the heatmap: the entry at (row, column) is
# time[row] / time[column], so a value below 1 means the row's representation
# was faster than the column's. The frame is built in one step with a float
# dtype rather than filled cell by cell into an empty (object-dtype) frame.
names = list(all_times)
df = pd.DataFrame(
    [[all_times[row] / all_times[col] for col in names] for row in names],
    index=names,
    columns=names,
    dtype=float,
)
fig = px.imshow(df, text_auto=True, color_continuous_scale='RdBu')
fig.show()

NameError: name 'all_times' is not defined