# Boilerplate code

## Let called libraries be reloaded in the notebook

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

import IPython.core
IPython.core.page = print

## The default the libraries are loaded we need for the code.

In [2]:
import torch

import numpy as np

import time

import logging
logging.basicConfig(level = logging.DEBUG)

from iterativenn.nn_modules.MaskedLinear import MaskedLinear
from iterativenn.nn_modules.SparseLinear import SparseLinear 


# Setup models

In [3]:
def model_factory(cfg, device):
    model = MaskedLinear.from_description(
                                    cfg['row_sizes'],
                                    cfg['col_sizes'],
                                    cfg['block_types'],
                                    cfg['initialization_types'],
                                    cfg['trainable'],
                                    device=device
                                    )
    return model


def sparse_model_factory(model, device):
    return SparseLinear.from_MaskedLinearExact(model, transpose=False, device=device)

def sparse_model_optimized_factory(model, device):
    return SparseLinear.from_MaskedLinearExact(model, transpose=False, device=device, optimized_implementation=True)

model_setups = []
model_setups.append({'name': 'MaskedLinear', 'factory': model_factory})
model_setups.append({'name': 'SparseLinear', 'factory': sparse_model_factory})
model_setups.append({'name': 'SparseLinear_optimized', 'factory': sparse_model_optimized_factory})


# Parameters

In [4]:
use_cuda=True

In [5]:
# We detect if we are running on a GPU or not.  If we are, we will use it.
cuda_params = {}

if torch.cuda.is_available() and use_cuda:
    cuda_params['available'] = True
    cuda_params['current_device'] = torch.cuda.current_device()
    cuda_params['device'] = torch.cuda.device(0)
    cuda_params['device_count'] = torch.cuda.device_count()
    cuda_params['device_name'] = torch.cuda.get_device_name(0)
    devices = ['cuda:0', 'cpu']
else:
    cuda_params['available'] = False
    cuda_params['current_device'] = None
    cuda_params['device'] = 'cpu'
    cuda_params['device_count'] = None
    cuda_params['device_name'] = None
    devices = ['cpu']
logging.info(cuda_params)
logging.info(devices)

INFO:root:{'available': True, 'current_device': 0, 'device': <torch.cuda.device object at 0x7fb903d634c0>, 'device_count': 2, 'device_name': 'NVIDIA GeForce RTX 4090'}
INFO:root:['cuda:0', 'cpu']


# Timing comparison of MaskedLinear and SparseLinear

In [6]:
experiments = [
    # The 2x2 case tickles some bug in CUDA...
    # {'row_sizes': [2],
    # 'col_sizes': [2],
    # 'block_types': [['D']],
    # 'initialization_types': [['G']], 
    # 'trainable': [['non-zero']]},

    # {'row_sizes': [3],
    # 'col_sizes': [4],
    # 'block_types': [['W']],
    # 'initialization_types': [['G']], 
    # 'trainable': [[1]]},

    # {'row_sizes': [3],
    # 'col_sizes': [3],
    # 'block_types': [['D']],
    # 'initialization_types': [['G']], 
    # 'trainable': [['non-zero']]},

    # {'row_sizes': [3],
    # 'col_sizes': [3],
    # 'block_types': [['D']],
    # 'initialization_types': [['G']], 
    # 'trainable': [[0]]},

    # {'row_sizes': [3],
    # 'col_sizes': [3],
    # 'block_types': [['D']],
    # 'initialization_types': [['G']], 
    # 'trainable': [[1]]},

    {'row_sizes': [1000],
    'col_sizes': [1000],
    'block_types': [['R=0.0001']],
    'initialization_types': [['G']], 
    'trainable': [['non-zero']]},

    {'row_sizes': [50, 70, 90],
    'col_sizes': [60, 80, 100],
    'block_types': [[0,       'W',     'D'],
                    ['R=0.5', 'S=5',   'Row=3'],
                    ['S=2',   'R=0.9', 'Row=1']],
    'initialization_types': [[0,            torch.ones((50, 80)), 'C=1.0'],
                             ['G',          'G=0.2,0.7',          'U'],
                             ['U=-0.5,0.5', 1,                    torch.randn(size=(90, 100))]],
    'trainable': [[0,          'non-zero', True      ],
                  ['non-zero', 'non-zero', 'non-zero'],
                  ['non-zero', 'non-zero', 'non-zero']]},

    {'row_sizes': [50, 70, 90],
    'col_sizes': [60, 80, 100],
    'block_types': [[0, 'W', 'D'],
                    ['R=0.5', 'S=5', 'Row=3'],
                    ['S=2', 'R=0.9', 'Row=1']],
    'initialization_types': [[0, torch.ones((50, 80)), 'C=1.0'],
                            ['G', 'G=0.2,0.7', 'U'],
                            ['U=-0.5,0.5', 1, torch.randn(size=(90, 100))]],
    'trainable': [[0, 'non-zero', 'non-zero'],
                ['non-zero', 'non-zero', 'non-zero'],
                ['non-zero', 'non-zero', 'non-zero']]},

    {'row_sizes': [50, 70, 90],
    'col_sizes': [60, 80, 100],
    'block_types': [[0, 'W', 'D'],
                    ['R=0.5', 'S=5', 'Row=3'],
                    ['S=2', 'R=0.9', 'Row=1']],
    'initialization_types': [[0, torch.ones((50, 80)), 'C=1.0'],
                            ['G', 'G=0.2,0.7', 'U'],
                            ['U=-0.5,0.5', 1, torch.randn(size=(90, 100))]],
    'trainable': [[0, 'non-zero', 0],
                ['non-zero', 'non-zero', 'non-zero'],
                ['non-zero', 'non-zero', 'non-zero']]},
]

In [7]:
def runTest(model_setups, cfg, device):
    logging.info(f'device: {device}')
    x = torch.randn(128, np.sum(cfg['col_sizes']), device=device)
    y = torch.randn(128, np.sum(cfg['row_sizes']), device=device)

    # Create the models
    for i, model_setup in enumerate(model_setups):
        # Create the model
        if i==0:
            model = model_setup['factory'](cfg, device)
            model_0 = model
        else:
            model = model_setup['factory'](model_0, device)
        model_setup['model'] = model

    # Fill in the data for each run into the model
    for i, model_setup in enumerate(model_setups):
        # Run the model and time it
        start_time = time.perf_counter()
        model_setup['output'] = model_setup['model'](x)
        model_setup['forward_time'] = time.perf_counter()-start_time

        # Compute the gradient and take a step
        optimizer = torch.optim.SGD(model_setup['model'].parameters(), lr=0.01)
        loss = torch.norm(y-model_setup['output'])

        start_time = time.perf_counter()
        loss.backward()
        model_setup['backward_time'] = time.perf_counter()-start_time

        start_time = time.perf_counter()
        optimizer.step()
        model_setup['step_time'] = time.perf_counter()-start_time

        # Now the model has been updated, and let's compare the updated models
        model_setup['after_step_output'] = model_setup['model'](x)

    # Compare the results
    logging.info('--------forward time comparison--------')
    for i, model_setup in enumerate(model_setups):
        if i==0:
            name_0 = model_setup['name']
            forward_time_0 = model_setup['forward_time']
        else:    
            name = model_setup['name']
            forward_time = model_setup['forward_time']

            logging.info(f"{name_0} forward time {forward_time_0:e}")
            logging.info(f"{name} forward time {forward_time:e}")
            logging.info(f"{name} forward time ratio {forward_time/forward_time_0:e}")

    logging.info('--------backward time comparison--------')
    for i, model_setup in enumerate(model_setups):
        if i==0:
            name_0 = model_setup['name']
            backward_time_0 = model_setup['backward_time']
        else:    
            name = model_setup['name']
            backward_time = model_setup['backward_time']

            logging.info(f"{name_0} backward time {backward_time_0:e}")
            logging.info(f"{name} backward time {backward_time:e}")
            logging.info(f"{name} backward time ratio {backward_time/backward_time_0:e}")

    logging.info('--------step time comparison--------')
    for i, model_setup in enumerate(model_setups):
        if i==0:
            name_0 = model_setup['name']
            step_time_0 = model_setup['step_time']
        else:    
            name = model_setup['name']
            step_time = model_setup['step_time']

            logging.info(f"{name_0} step time {step_time_0:e}")
            logging.info(f"{name} step time {step_time:e}")
            logging.info(f"{name} step time ratio {step_time/step_time_0:e}")

    logging.info('--------output value comparison--------')
    for i, model_setup in enumerate(model_setups):
        if i==0:
            name_0 = model_setup['name']
            output_0 = model_setup['output']
        else:    
            name = model_setup['name']
            output = model_setup['output']

            logging.info(f"{name_0} output_0 {output_0.norm():f}")
            logging.info(f"{name} output {output.norm():f}")
            logging.info(f"{name} output diff {(output_0-output).norm()/output_0.norm():f}")

    logging.info('--------after step value comparison--------')
    for i, model_setup in enumerate(model_setups):
        if i==0:
            name_0 = model_setup['name']
            after_step_output_0 = model_setup['after_step_output']
        else:    
            name = model_setup['name']
            after_step_output = model_setup['after_step_output']

            logging.info(f"{name_0} after step output_0 {after_step_output_0.norm():f}")
            logging.info(f"{name} after step output {after_step_output.norm():f}")
            logging.info(f"{name} after step output diff {(after_step_output_0-after_step_output).norm()/after_step_output_0.norm():f}")


In [8]:
for cfg in experiments:
    for device in devices:
        runTest(model_setups, cfg, device)

INFO:root:device: cuda:0
DEBUG:iterativenn.nn_modules.SparseLinear:len(sparse_trainable.values())=87
DEBUG:iterativenn.nn_modules.SparseLinear:len(sparse_not_trainable.values())=0
  self.sparse_trainable_values = torch.nn.parameter.Parameter(torch.tensor(sparse_trainable.values(), **factory_kwargs),
DEBUG:iterativenn.nn_modules.SparseLinear:len(sparse_trainable.values())=87
DEBUG:iterativenn.nn_modules.SparseLinear:len(sparse_not_trainable.values())=0
DEBUG:iterativenn.nn_modules.MaskedLinear:forward time 4.289063e-01
DEBUG:iterativenn.nn_modules.MaskedLinear:forward time 1.825229e-04
DEBUG:iterativenn.nn_modules.SparseLinear:sparse mm 4.250000e-04
DEBUG:iterativenn.nn_modules.SparseLinear:sparse not trainable 2.665222e-04
DEBUG:iterativenn.nn_modules.SparseLinear:bias 2.163269e-04
DEBUG:iterativenn.nn_modules.SparseLinear:sparse mm 7.785601e-05
DEBUG:iterativenn.nn_modules.SparseLinear:sparse not trainable 2.043040e-04
DEBUG:iterativenn.nn_modules.SparseLinear:bias 9.824499e-05
DEBUG: