In [2]:
%load_ext autoreload
%autoreload 2
# %env CUDA_VISIBLE_DEVICES=3

In [5]:
import pathlib
# BASE_DIR will be like '/home/jovyan/DemoExample/'
BASE_DIR = pathlib.Path().absolute().parents[0]
print(f"Working dir: {BASE_DIR}")

Working dir: /notebook/GreenAl


In [13]:
pathlib.Path().absolute().parents[1]

PosixPath('/notebook')

In [20]:
str(BASE_DIR)

'/notebook/GreenAl'

In [17]:
import sys
sys.path.append(str(BASE_DIR))

In [18]:
import torch as t
import numpy as np
from tqdm.auto import tqdm
from matplotlib import pyplot as plt

import src.ttm_linear.ttm_linear
from src.ttm_linear.ttm import TTM, einsum_forward, by_hands_forward, with_self_checkpoint, forward_backward_module, full_matrix_backward, full_einsum_backward, super_full_einsum_backward
#import ttm_linear.old.linear
from old.linear import TTMLinear

In [21]:
DEVICE = t.device("cuda:2" if t.cuda.is_available() else "cpu") 

In [28]:
N_CORES = 11
# DIMS = [[2, 2]] * N_CORES
DIMS = [(8, 8), (8, 8), (12, 12), (1, 3)]
N_CORES = len(DIMS)
RANK = 16
BS = 16 * 1024

# Memory

In [26]:
def analitical_memory():
    parameters = (
        sum(dims[0] * dims[1] * RANK for dims in DIMS[:1] + DIMS[-1:]) + 
        sum(dims[0] * dims[1] * RANK**2 for dims in DIMS[1:-1]) 
    )
    
    dims = np.array(DIMS)
    
    forward_tensors = [BS * np.prod(dims[:, 0])]
    for i in range(N_CORES - 1):
        forward_tensors.append(BS * np.prod(dims[i + 1:, 0]) * np.prod(dims[:i + 1, 1]) * RANK)
    forward_tensors.append(BS * np.prod(dims[:, 1]))

    print(f'parameters: {parameters / 2**20}')
    print(f'forward tensors: {sum(forward_tensors) / 2**20}')
    for forward_tensor in forward_tensors:
        print(f'\t{forward_tensor / 2**20}')
    print(f'total number of parameters: {(parameters + sum(forward_tensors)) / 2**20}')

    
def cuda_memory(offset: int = 0):
    return (t.cuda.memory_allocated(DEVICE)) / 2**20
    

def test(forward_backward_module):
    #print(f'memory before start: {cuda_memory()}')
    ttm = TTM(DIMS, RANK, forward_backward_module).to(DEVICE)
    parameters = cuda_memory()
    print(f'memory after layer initializing: {cuda_memory()}')

    x = t.randn(BS, ttm.dim_in, requires_grad=True, device=DEVICE)
    print(f'size of input tensor: {cuda_memory() - parameters}')
    y = ttm(x)
    
    print(f'total memory after forward: {cuda_memory() - parameters}')
    
    y.mean().backward()
    
    print(f'total memory after backward: {cuda_memory() - parameters}')
    print()
    
    


## Compare different einsum strategies in TTM 

In [29]:
analitical_memory()

for forward_module in [einsum_forward, by_hands_forward]:
    for backward_module in [None, full_matrix_backward]:
        print('FORWARD and BACKWARD modules: ', forward_module.__name__, None if backward_module is None else backward_module.__name__)

        if backward_module:
            module = forward_backward_module(forward_module, backward_module(forward_module))
        else:
            module = forward_module
            
        test(module)

parameters: 0.0518035888671875
forward tensors: 624.0
	12.0
	192.0
	192.0
	192.0
	36.0
total number of parameters: 624.0518035888672
FORWARD and BACKWARD modules:  einsum_forward None
memory after layer initializing: 72.52685546875
size of input tensor: 48.0
total memory after forward: 1008.2763671875
total memory after backward: 240.20751953125

FORWARD and BACKWARD modules:  einsum_forward full_matrix_backward
memory after layer initializing: 72.52685546875
size of input tensor: 48.0
total memory after forward: 192.0
total memory after backward: 240.20751953125

FORWARD and BACKWARD modules:  by_hands_forward None
memory after layer initializing: 72.52685546875
size of input tensor: 48.0
total memory after forward: 2544.06689453125
total memory after backward: 240.20751953125

FORWARD and BACKWARD modules:  by_hands_forward full_matrix_backward
memory after layer initializing: 72.52685546875
size of input tensor: 48.0
total memory after forward: 192.0
total memory after backward: 240

## Comapare OLD and NEW versions of TTM layers

In [27]:
for RANK in [16, 32, 64, 128]:
    print ("TTM NEW, rank", RANK)
    parameters = cuda_memory()
    print(f'memory after layer initializing: {cuda_memory()}')
    
    ttm = TTM(DIMS, RANK, forward_backward_module(einsum_forward, full_matrix_backward(einsum_forward))).to(DEVICE)

    x = t.randn(BS, ttm.dim_in, requires_grad=True, device=DEVICE)
    print(f'size of input tensor: {cuda_memory() - parameters}')
    y = ttm(x)
    
    print(f'total memory after forward: {cuda_memory()}')
    
    y.mean().backward()
    
    print(f'total memory after backward: {cuda_memory()}')
    
    print ("TTM OLD, rank", RANK)
    
    
    ttm_old = TTMLinear(768, 768 * 4, RANK).to(DEVICE)
    parameters = cuda_memory()
    print(f'memory after layer initializing: {cuda_memory()}')

    x = t.randn(BS, ttm.dim_in, requires_grad=True, device=DEVICE)
    #print(f'size of input tensor: {cuda_memory() - parameters}')
    y = ttm_old(x)
    
    print(f'total memory after forward: {cuda_memory()}')
    
    y.mean().backward()
    
    print(f'total memory after backward: {cuda_memory()}')
    print("\n\n\n\n")

TTM NEW, rank 16
memory after layer initializing: 342.8193359375
size of input tensor: -19.32666015625
total memory after forward: 44.49267578125
total memory after backward: 47.7001953125
TTM OLD, rank 16
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
memory after layer initializing: 15.46533203125
total memory after forward: 67.76611328125
total memory after backward: 19.515625





TTM NEW, rank 32
memory after layer initializing: 19.515625
size of input tensor: 3.40576171875
total memory after forward: 12.92138671875
total memory after backward: 16.7421875
TTM OLD, rank 32
[(8, 8), (8, 8), (12, 8), (1, 6)]
memory after layer initializing: 17.275390625
total memory after forward: 29.845703125
total memory after backward: 20.9091796875





TTM NEW, rank 64
memory after layer initializing: 20.9091796875
size of input tensor: 5.375
total memory after forward: 17.2841796875
total memory after backward: 24.30078125
TTM OLD, rank 64
[(8, 8), (8, 8), (12, 8), (1, 6)]
memory after layer initial

# Speed

In [7]:
for module_name, module in {
    'einsum': einsum_forward,
    'self-checkpoint einsum': with_self_checkpoint(einsum_forward),
    'full matrix einsum': forward_backward_module(einsum_forward, full_matrix_backward(einsum_forward)),
    'full einsum einsum': forward_backward_module(einsum_forward, full_einsum_backward(einsum_forward)),
    'super full einsum einsum': forward_backward_module(einsum_forward, super_full_einsum_backward),
}.items():
    print(module_name)

    ttm = TTM(DIMS, RANK, module).to(device=DEVICE)
    x = t.randn(BS, ttm.dim_in, requires_grad=True, device=DEVICE)

    def forward():
        (ttm(x)**2).mean()
        t.cuda.synchronize()

    def forward_backward():
        (ttm(x)**2).mean().backward()
        t.cuda.synchronize()

    %timeit forward()
    %timeit forward_backward()
    print()

einsum
1.16 ms ± 372 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
18.3 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

self-checkpoint einsum
7.63 ms ± 1.05 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
25.5 ms ± 973 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

full matrix einsum
7.65 ms ± 1.27 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
18.5 ms ± 3.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

full einsum einsum
7.63 ms ± 447 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
45.7 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

super full einsum einsum
7.65 ms ± 1.18 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
42.5 ms ± 1.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

