In [29]:
import torch
import sigkernel2
import sigkernel
import csv
import timeit
# torch.cuda.device(...) is a context manager for switching the current GPU,
# not a device handle; torch.device is the object that `device=` arguments accept.
device = torch.device('cuda')

In [2]:
def generate(batch_size, length, dimension, device = torch.device('cpu')):
    """Sample a batch of Gaussian random walks.

    Standard-normal increments are drawn in double precision and summed
    cumulatively along dim 1 (the `length` axis).

    Args:
        batch_size: number of walks in the batch.
        length: number of steps per walk.
        dimension: number of channels per step.
        device: torch device on which the tensor is created (CPU by default).

    Returns:
        A `torch.double` tensor of shape (batch_size, length, dimension).
    """
    shape = (batch_size, length, dimension)
    increments = torch.randn(shape, dtype=torch.double, device=device)
    return increments.cumsum(dim=1)

In [24]:
# Build the two signature-kernel objects compared in this notebook, each on
# its own library's RBF static kernel constructed with argument 1.
# The second positional argument (0) is presumably the dyadic order, which the
# benchmark cells pass by keyword -- TODO confirm against both libraries.
improved_static = sigkernel2.RBFKernel(1)
sig = sigkernel2.SigKernel(improved_static, 0)    # labelled "Improved" in the benchmarks

baseline_static = sigkernel.RBFKernel(1)
sig1 = sigkernel.SigKernel(baseline_static, 0)    # labelled "Baseline" in the benchmarks

In [25]:
# Warm-up: call every code path once on a small input so that JIT compilation
# happens here rather than inside a timed run.
X = generate(1, 128, 7, device = torch.device('cuda:0'))

# All four (strided, alt_scheme) combinations of the sigkernel2 implementation.
for use_alt in (False, True):
    for use_strided in (False, True):
        sig.compute_kernel(X, X, strided=use_strided, alt_scheme=use_alt)

# Baseline implementation; as the last expression its value is the cell output.
sig1.compute_kernel(X, X)



tensor([3.3948e+25], device='cuda:0', dtype=torch.float64)

In [51]:
# Two batches of 10 seven-dimensional random walks on the first GPU:
# X has 1023 steps, Y has 800 steps (X is drawn first, then Y).
X, Y = (generate(10, steps, 7, device = torch.device('cuda:0')) for steps in (1023, 800))

In [52]:
# Evaluate the baseline kernel of X against itself; the result (one value per
# batch element) is displayed as the cell output.
sig1.compute_kernel(X, X)

tensor([1.2157e+203, 2.1838e+203, 4.2814e+203, 4.1860e+203, 5.7354e+203,
        4.3087e+203, 4.3497e+203, 5.7231e+203, 3.1675e+203, 3.3709e+203],
       device='cuda:0', dtype=torch.float64)

In [27]:
# Micro-benchmark of each implementation variant on the current global X.
# The first line is a single throw-away repetition; the following lines time
# the four (strided, alt_scheme) combinations of `sig` and then the baseline
# `sig1`, each with 100 loops x 10 repetitions.
# NOTE(review): this cell carries execution count 27 while the cell that
# defines the large X carries 51, so the recorded timings below may have been
# taken with the small warm-up X -- re-run the notebook top to bottom to confirm.
# NOTE(review): there is no explicit torch.cuda.synchronize() here; if
# compute_kernel returns before the GPU has finished, these numbers would not
# include the full kernel run time -- verify.
%timeit -n 100 -r 1 sig.compute_kernel(X, X, strided=False, alt_scheme=False) # DUMMY
%timeit -n 100 -r 10 sig.compute_kernel(X, X, strided=False, alt_scheme=False)
%timeit -n 100 -r 10 sig.compute_kernel(X, X, strided=True, alt_scheme=False)
%timeit -n 100 -r 10 sig.compute_kernel(X, X, strided=False, alt_scheme=True)
%timeit -n 100 -r 10 sig.compute_kernel(X, X, strided=True, alt_scheme=True)
%timeit -n 100 -r 10 sig1.compute_kernel(X, X)


556 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 100 loops each)
428 µs ± 17.5 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)
447 µs ± 13.9 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)
430 µs ± 18.5 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)
427 µs ± 9.86 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)
1.23 ms ± 35.2 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [57]:
# GPU benchmark at dyadic order 0.
# For every path length, each implementation is timed `reps` times; every
# repetition is the total wall-clock time in seconds of `execs` consecutive
# calls (this is what timeit.repeat returns). One CSV row is written per
# repetition.
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

dyadic_order = 0

# Build both kernels from the same variable that is written to the CSV, so the
# "Dyadic Order" column always matches what was actually benchmarked.
sig = sigkernel2.SigKernel(sigkernel2.RBFKernel(1), dyadic_order = dyadic_order)
sig1 = sigkernel.SigKernel(sigkernel.RBFKernel(1), dyadic_order = dyadic_order)

# `t` is kept as an alias of the same list in case other cells refer to it.
lengths = t = [16 * 2**i for i in range(6)]
lengths.append(1023)

execs = 100
reps = 30


def _synced(call):
    """Wrap `call` so that it returns only after queued GPU work has finished.

    CUDA work is launched asynchronously; without a synchronisation point the
    timer can stop before the kernel has actually run.
    """
    def run():
        call()
        torch.cuda.synchronize()
    return run


with open("../speciale/rplots/bench_gpu_low.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Run", "Result"])

    for l in lengths:

        x = generate(10, l, 7, device = torch.device('cuda:0'))

        # The lambdas read `x` when called; they are only used inside this
        # iteration, so they always see the batch generated just above.
        impl = {
            "Baseline": _synced(lambda: sig1.compute_kernel(x, x)),
            "Improved": _synced(lambda: sig.compute_kernel(x, x, strided=False, alt_scheme=False)),
            "Improved Alt": _synced(lambda: sig.compute_kernel(x, x, strided=False, alt_scheme=True)),
        }

        for name, func in impl.items():
            # `setup` drains any pending GPU work (e.g. data generation) before
            # each repetition so it is not charged to the first timed call.
            timing_results = timeit.repeat(
                func, setup=torch.cuda.synchronize, number=execs, repeat=reps
            )

            # Save results to CSV, one row per repetition (1-based run index).
            for run, result in enumerate(timing_results, start=1):
                writer.writerow([name, l, dyadic_order, run, result])

In [58]:
# GPU benchmark at dyadic order 1.
# For every path length, each implementation is timed `reps` times; every
# repetition is the total wall-clock time in seconds of `execs` consecutive
# calls (this is what timeit.repeat returns). One CSV row is written per
# repetition.
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

dyadic_order = 1

# Fix: the kernels were previously built with a hard-coded dyadic order of 0
# while the CSV recorded `dyadic_order` (1). Both now use the same variable so
# the benchmark measures what its output file claims.
sig = sigkernel2.SigKernel(sigkernel2.RBFKernel(1), dyadic_order = dyadic_order)
sig1 = sigkernel.SigKernel(sigkernel.RBFKernel(1), dyadic_order = dyadic_order)

# `t` is kept as an alias of the same list in case other cells refer to it.
lengths = t = [16 * 2**i for i in range(5)]
lengths.append(511)

execs = 100
reps = 30


def _synced(call):
    """Wrap `call` so that it returns only after queued GPU work has finished.

    CUDA work is launched asynchronously; without a synchronisation point the
    timer can stop before the kernel has actually run.
    """
    def run():
        call()
        torch.cuda.synchronize()
    return run


with open("../speciale/rplots/bench_gpu_low_1.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Implementation", "Length", "Dyadic Order", "Run", "Result"])

    for l in lengths:

        x = generate(10, l, 7, device = torch.device('cuda:0'))

        # The lambdas read `x` when called; they are only used inside this
        # iteration, so they always see the batch generated just above.
        impl = {
            "Baseline": _synced(lambda: sig1.compute_kernel(x, x)),
            "Improved": _synced(lambda: sig.compute_kernel(x, x, strided=False, alt_scheme=False)),
            "Improved Alt": _synced(lambda: sig.compute_kernel(x, x, strided=False, alt_scheme=True)),
        }

        for name, func in impl.items():
            # `setup` drains any pending GPU work (e.g. data generation) before
            # each repetition so it is not charged to the first timed call.
            timing_results = timeit.repeat(
                func, setup=torch.cuda.synchronize, number=execs, repeat=reps
            )

            # Save results to CSV, one row per repetition (1-based run index).
            for run, result in enumerate(timing_results, start=1):
                writer.writerow([name, l, dyadic_order, run, result])