In [1]:
import torch
import sigkernel2
import sigkernel
import csv
import timeit
import math
# Device descriptor for the GPU. `torch.cuda.device(...)` is a context manager
# that switches the selected device, not a device object, so a plain
# `torch.device` is built here instead.
device = torch.device('cuda')
import pandas as pd

In [2]:
def bench_memory(func):
    """Call ``func`` once and report the peak CUDA memory allocated, in MiB.

    Before the call the CUDA cache is emptied and the peak-memory statistics
    are reset. ``func`` is invoked without arguments and its return value is
    discarded.

    Args:
        func: Zero-argument callable to measure.

    Returns:
        ``torch.cuda.max_memory_allocated()`` divided by ``1024 ** 2``.
    """
    # Clear cached blocks and the previous peak so earlier cells do not
    # leak into this measurement.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    func()

    peak_bytes = torch.cuda.max_memory_allocated()
    bytes_per_mib = 1024 ** 2
    return peak_bytes / bytes_per_mib

In [3]:
def generate(batch_size, length, dimension, device = torch.device('cpu')):
    """Sample a batch of random walks as a double-precision tensor.

    Standard-normal increments of shape ``(batch_size, length, dimension)``
    are drawn on ``device`` and accumulated along dimension 1.

    Args:
        batch_size: Size of the first tensor dimension.
        length: Size of the second tensor dimension (the one that is summed over).
        dimension: Size of the third tensor dimension.
        device: Device the tensor is created on (CPU by default).

    Returns:
        The cumulative sum of the increments, same shape and dtype.
    """
    increments = torch.randn(batch_size, length, dimension, dtype = torch.double, device = device)
    return increments.cumsum(dim=1)

In [4]:
# Kernel objects used by the warm-up cell below. Each one wraps a
# LinearKernel built with 0.05 and receives 0 as its second constructor
# argument (the benchmark cells pass that argument to both `SigKernel`
# classes by keyword as `dyadic_order`).
#   sig  - sigkernel2.SigKernel  (labelled "Improved" in the benchmark cells)
#   sig1 - sigkernel.SigKernel   (labelled "Baseline" in the benchmark cells)
#   sig3 - sigkernel2.SigKernel2 (the object the `gram` calls are made on)
sig = sigkernel2.SigKernel(sigkernel2.LinearKernel(0.05), 0)
sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), 0)
sig3 = sigkernel2.SigKernel2(sigkernel2.LinearKernel(0.05), 0)

In [5]:
# Warm-up pass to ensure JIT compilation: every code path that is benchmarked
# later is called once on a tiny batch (2 paths, length 128, dimension 7).
X = generate(2, 128, 7, device = torch.device('cuda:0'))

# All four strided / alt_scheme combinations of the improved kernel.
for alt_scheme in (False, True):
    for strided in (False, True):
        sig.compute_kernel(X, X, strided=strided, alt_scheme=alt_scheme)

# The lean variants, without and with striding.
for lean_kwargs in ({"lean": True}, {"strided": True, "lean": True}):
    sig.compute_kernel(X, X, **lean_kwargs)

# Baseline pairwise kernel.
sig1.compute_kernel(X, X)

# Gram-matrix entry points; the final expression is left bare so the
# notebook displays its value.
sig3.gram(X)
sig3.gram(X, X)
sig1.compute_Gram(X, X, sym=True)



tensor([[2.4854e+149, -5.2223e+90],
        [-5.2223e+90, 2.5568e+152]], device='cuda:0', dtype=torch.float64)

In [42]:
# Runtime benchmark: `compute_kernel` of the baseline and improved
# implementations on the GPU for increasing path lengths (dyadic order 0).
# Each CSV row holds the mean time per call of one timeit repetition.
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

dyadic_order = 0

# Build both kernels from the same `dyadic_order` that is written to the CSV,
# so the recorded order cannot drift from the one actually benchmarked.
sig = sigkernel2.SigKernel(sigkernel2.LinearKernel(0.05), dyadic_order = dyadic_order)
sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), dyadic_order = dyadic_order)

lengths = [16 * 2**i for i in range(6)]
lengths.append(1023)

execs = 100  # calls per timeit repetition
reps = 10    # timeit repetitions per implementation

with open("../speciale/rplots/bench_gpu_low.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Run", "Result"])

    for l in lengths:
        print(f"\rLength: {l}", end="")

        x = generate(100, l, 7, device = torch.device('cuda:0'))
        # Finish the data generation before any timer starts.
        torch.cuda.synchronize()

        impl = {
            "Baseline": lambda: sig1.compute_kernel(x, x),
            "Improved": lambda: sig.compute_kernel(x, x, lean=True)
        }

        for name, func in impl.items():
            def timed_call(func=func):
                # CUDA work is queued asynchronously; wait for it so the
                # wall-clock time covers the computation, not just the launch.
                func()
                torch.cuda.synchronize()

            timing_results = timeit.repeat(timed_call, number=execs, repeat=reps)

            # Save the per-call mean of every repetition to the CSV.
            for run, result in enumerate(timing_results, start=1):
                writer.writerow([name, l, dyadic_order, run, result / execs])

Length: 1023

In [43]:
# Runtime benchmark: `compute_kernel` of the baseline and improved
# implementations on the GPU for increasing path lengths (dyadic order 1).
# Each CSV row holds the mean time per call of one timeit repetition.
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

dyadic_order = 1

sig = sigkernel2.SigKernel(sigkernel2.LinearKernel(0.05), dyadic_order = dyadic_order)
sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), dyadic_order = dyadic_order)

lengths = [16 * 2**i for i in range(5)]
lengths.append(511)

execs = 100  # calls per timeit repetition
reps = 10    # timeit repetitions per implementation

with open("../speciale/rplots/bench_gpu_low_1.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Run", "Result"])

    for l in lengths:
        print(f"\rLength: {l}", end="")

        x = generate(100, l, 7, device = torch.device('cuda:0'))
        # Finish the data generation before any timer starts.
        torch.cuda.synchronize()

        impl = {
            "Baseline": lambda: sig1.compute_kernel(x, x),
            "Improved": lambda: sig.compute_kernel(x, x, lean=True)
        }

        for name, func in impl.items():
            def timed_call(func=func):
                # CUDA work is queued asynchronously; wait for it so the
                # wall-clock time covers the computation, not just the launch.
                func()
                torch.cuda.synchronize()

            timing_results = timeit.repeat(timed_call, number=execs, repeat=reps)

            # Save the per-call mean of every repetition to the CSV.
            for run, result in enumerate(timing_results, start=1):
                writer.writerow([name, l, dyadic_order, run, result / execs])

Length: 16



Length: 1024

In [53]:
# Runtime benchmark: the baseline `compute_kernel` on CPU versus GPU for
# increasing path lengths. The CPU run uses a host copy of the same data.
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

dyadic_order = 0

sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), dyadic_order = dyadic_order)

lengths = [16 * 2**i for i in range(6)]
lengths.append(1023)

execs = 100  # calls per timeit repetition
reps = 10    # timeit repetitions per implementation

with open("../speciale/rplots/bench_gpu_cpu.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Run", "Result"])

    for l in lengths:
        print(f"\rLength: {l}", end="")

        x = generate(10, l, 7, device = torch.device('cuda:0'))
        x_h = x.cpu()
        # Finish generation and the host copy before any timer starts.
        torch.cuda.synchronize()

        impl = {
            "Baseline - CPU": lambda: sig1.compute_kernel(x_h, x_h),
            "Baseline - GPU": lambda: sig1.compute_kernel(x, x)
        }

        for name, func in impl.items():
            def timed_call(func=func):
                # CUDA work is queued asynchronously; wait for it so the GPU
                # timing covers the computation. For the CPU variant nothing
                # is pending on the device at this point.
                func()
                torch.cuda.synchronize()

            timing_results = timeit.repeat(timed_call, number=execs, repeat=reps)

            # Save the per-call mean of every repetition to the CSV.
            for run, result in enumerate(timing_results, start=1):
                writer.writerow([name, l, dyadic_order, run, result / execs])

Length: 16



Length: 1023

In [46]:
# Peak-memory benchmark: `compute_kernel` of both implementations on one
# fixed batch while the dyadic order grows.
orders = [0, 1, 2, 3, 4]
# Single source of truth for the path length: it is used both to generate the
# data and for the "Length" column, so the CSV cannot disagree with the data.
# NOTE(review): the previous version generated length 64 but wrote 128 to the
# CSV - confirm that 64 is the intended length.
length = 64

with open("../speciale/rplots/bench_gpu_mem.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Result"])

    x = generate(100, length, 7, device = torch.device('cuda:0'))

    # `order` instead of `ord`, which would shadow the builtin.
    for order in orders:
        print(f"\rOrder: {order}", end="")
        sig = sigkernel2.SigKernel(sigkernel2.LinearKernel(0.05), order)
        sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), order)

        impl = {
            "Baseline": lambda: sig1.compute_kernel(x, x),
            "Improved": lambda: sig.compute_kernel(x, x, lean=True)
        }

        for name, func in impl.items():
            # Peak CUDA memory (MiB) of one call, as reported by bench_memory.
            result = bench_memory(func)

            writer.writerow([name, length, order, result])

Order: 4

In [9]:
# Peak-memory benchmark: `compute_kernel` of both implementations at dyadic
# order 0 for increasing path lengths (batch of 100 paths, dimension 7).
lengths = [16 * 2**i for i in range(6)] + [1023]

with open("../speciale/rplots/bench_gpu_mem_len.csv", "w", newline="") as file:
    writer = csv.writer(file)
    header = ["Implementation", "Length", "Dyadic Order", "Result"]
    writer.writerow(header)

    for l in lengths:
        print(f"\rLength: {l}", end="")

        # Fresh data and fresh kernel objects for every length.
        x = generate(100, l, 7, device = torch.device('cuda:0'))
        sig = sigkernel2.SigKernel(sigkernel2.LinearKernel(0.05), 0)
        sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), 0)

        impl = {
            "Baseline": lambda: sig1.compute_kernel(x, x),
            "Improved": lambda: sig.compute_kernel(x, x, lean=True)
        }

        # One row per implementation; the last column is the peak CUDA memory
        # (MiB) returned by bench_memory.
        writer.writerows(
            [name, l, 0, bench_memory(func)] for name, func in impl.items()
        )

Length: 1023



In [10]:
# Runtime benchmark: Gram-matrix computation of the baseline and improved
# implementations on the GPU for a varying number of observations.
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

dyadic_order = 0

sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), dyadic_order)
sig3 = sigkernel2.SigKernel2(sigkernel2.LinearKernel(0.05), dyadic_order)

obs = [512, 256, 128, 64, 32, 16, 8]
l = 100

# Calls per timeit repetition, paired element-wise with `obs`
# (fewer calls for the larger batches).
execs = [5, 10, 15, 25, 50, 100, 200]
reps = 10

with open("../speciale/rplots/bench_gram_0.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Obs", "Run", "Result"])

    # `n_exec` instead of `exec`, which would shadow the builtin.
    for o, n_exec in zip(obs, execs):
        torch.cuda.empty_cache()
        print(f"\rObs: {o}", end="")

        x = generate(o, l, 7, device = torch.device('cuda:0'))
        # Finish the data generation before any timer starts.
        torch.cuda.synchronize()

        impl = {
            "Baseline - GPU": lambda: sig1.compute_Gram(x, x, max_batch=172),
            "Improved - GPU": lambda: sig3.gram(x, x, max_batch=172),
            "Improved - GPU symmetric": lambda: sig3.gram(x, max_batch=172)
        }

        for name, func in impl.items():
            def timed_call(func=func):
                # CUDA work is queued asynchronously; wait for it so the
                # wall-clock time covers the computation, not just the launch.
                func()
                torch.cuda.synchronize()

            timing_results = timeit.repeat(timed_call, number=n_exec, repeat=reps)

            # Save the per-call mean of every repetition to the CSV.
            for run, result in enumerate(timing_results, start=1):
                writer.writerow([name, l, dyadic_order, o, run, result / n_exec])

Obs: 868



In [6]:
# Peak-memory benchmark: Gram-matrix computation at dyadic order 0 and path
# length 100 for a growing number of observations.
obs = [8 * 2**i for i in range(7)]
l = 100

with open("../speciale/rplots/bench_gram_mem_obs.csv", "w", newline="") as file:
    writer = csv.writer(file)
    header = ["Implementation", "Length", "Dyadic Order", "Obs", "Result"]
    writer.writerow(header)

    for o in obs:
        print(f"\rObs: {o}", end="")

        # Fresh data and fresh kernel objects for every batch size.
        x = generate(o, l, 7, device = torch.device('cuda:0'))
        sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), 0)
        sig3 = sigkernel2.SigKernel2(sigkernel2.LinearKernel(0.05), 0)

        impl = {
            "Baseline - GPU": lambda: sig1.compute_Gram(x, x, max_batch=172),
            "Improved - GPU": lambda: sig3.gram(x, x, max_batch=172),
            "Improved - GPU symmetric": lambda: sig3.gram(x, max_batch=172)
        }

        # One row per implementation; the last column is the peak CUDA memory
        # (MiB) returned by bench_memory.
        writer.writerows(
            [name, l, 0, o, bench_memory(func)] for name, func in impl.items()
        )

Obs: 128



Obs: 512

In [13]:
# Peak-memory benchmark: Gram-matrix computation on 32 paths of length 32
# while the dyadic order grows.
l = 32
dyadic_orders = list(range(5))

with open("../speciale/rplots/bench_gram_mem_order.csv", "w", newline="") as file:
    writer = csv.writer(file)
    header = ["Implementation", "Length", "Dyadic Order", "Result"]
    writer.writerow(header)

    for d in dyadic_orders:
        print(f"\r Dyadic order: {d}", end="")

        # New data and new kernel objects are created for every order.
        x = generate(32, l, 7, device = torch.device('cuda:0'))
        sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), d)
        sig3 = sigkernel2.SigKernel2(sigkernel2.LinearKernel(0.05), d)

        impl = {
            "Baseline - GPU": lambda: sig1.compute_Gram(x, x, max_batch=172),
            "Improved - GPU": lambda: sig3.gram(x, x, max_batch=172),
            "Improved - GPU symmetric": lambda: sig3.gram(x, max_batch=172)
        }

        for name in impl:
            # Peak CUDA memory (MiB) of one call, as reported by bench_memory.
            peak_mib = bench_memory(impl[name])
            row = [name, l, d, peak_mib]
            writer.writerow(row)

 Dyadic order: 4

In [6]:
def thin_seq(x, id):
    """Hold every path in ``x`` constant from a per-path index onwards.

    For row ``i``, every entry along dimension 1 from position ``id[i]`` to
    the end is overwritten with the entry at position ``id[i]``.

    Args:
        x: Tensor indexed as ``x[i, j, :]``; it is modified in place.
        id: Indexable collection giving the cut position for each row of ``x``.

    Returns:
        The same tensor object ``x`` after the in-place update.
    """
    n_paths = x.shape[0]
    for row in range(n_paths):
        cut = id[row]
        # Broadcast the sample at `cut` over itself and all later positions.
        x[row, cut:, :] = x[row, cut, :]
    return x
    
# Per-path cut positions for `thin_seq`: read the CSV, squeeze it, convert
# the values to an int32 tensor and subtract 1 from every entry.
# NOTE(review): the "- 1" presumably converts 1-based indices to 0-based - confirm.
idx = torch.tensor(
    pd.read_csv("../speciale/rplots/test_idx.csv").squeeze().values,
    dtype=torch.int32,
) - 1

In [7]:
# Runtime benchmark: Gram-matrix computation on paths that are held constant
# from a per-path index onwards (see `thin_seq`), for a varying number of
# observations.
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

dyadic_order = 0

sig1 = sigkernel.SigKernel(sigkernel.LinearKernel(0.05), dyadic_order)
sig3 = sigkernel2.SigKernel2(sigkernel2.LinearKernel(0.05), dyadic_order)

obs = [512, 256, 128, 64, 32, 16, 8]
# NOTE(review): `l` is only written to the "Length" column; the tensors below
# have length idx[:o].max() + 1, which may differ from 100 - confirm that the
# nominal value is what the CSV should record.
l = 100

# Calls per timeit repetition, paired element-wise with `obs`
# (fewer calls for the larger batches).
execs = [2, 7, 12, 16, 35, 60, 100]
reps = 5

with open("../speciale/rplots/bench_gram_0_var.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Obs", "Run", "Result"])

    # `n_exec` instead of `exec`, which would shadow the builtin.
    for o, n_exec in zip(obs, execs):
        torch.cuda.empty_cache()
        print(f"\rObs: {o}", end="")

        # Long enough to contain the largest cut index among the first `o`
        # paths; converted to a plain int before being used as a tensor size.
        padded_length = int(idx[0:o].max()) + 1
        x = generate(o, padded_length, 7, device = torch.device('cuda:0'))
        x = thin_seq(x, idx)
        # Finish data preparation before any timer starts.
        torch.cuda.synchronize()

        impl = {
            "Baseline - GPU": lambda: sig1.compute_Gram(x, x, max_batch=75),
            "Improved - GPU": lambda: sig3.gram(x, x, max_batch=75),
            "Improved - GPU symmetric": lambda: sig3.gram(x, max_batch=75)
        }

        for name, func in impl.items():
            torch.cuda.empty_cache()
            print(f"name: {name}", end="")

            def timed_call(func=func):
                # CUDA work is queued asynchronously; wait for it so the
                # wall-clock time covers the computation, not just the launch.
                func()
                torch.cuda.synchronize()

            timing_results = timeit.repeat(timed_call, number=n_exec, repeat=reps)

            # Save the per-call mean of every repetition to the CSV.
            for run, result in enumerate(timing_results, start=1):
                writer.writerow([name, l, dyadic_order, o, run, result / n_exec])

Obs: 8name: Baseline - GPUUUname: Improved - GPUname: Improved - GPU symmetricname: Improved - GPUname: Improved - GPU symmetricname: Improved - GPUname: Improved - GPU symmetricname: Improved - GPUname: Improved - GPU symmetricname: Improved - GPUname: Improved - GPU symmetricname: Improved - GPUname: Improved - GPU symmetric



name: Improved - GPUname: Improved - GPU symmetric