<a href="https://colab.research.google.com/github/rochmanofenna/BICEP/blob/main/triton_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
set -euo pipefail
pip uninstall -y torch torchvision torchaudio triton cupy-cuda118 || true

# 2. Install PyTorch nightly for CUDA 12.5
pip install --pre torch torchvision torchaudio \
    --extra-index-url https://download.pytorch.org/whl/nightly/cu125

# 3. Install Triton and CuPy matching CUDA 12.x
pip install --quiet triton==3.3.1 cupy-cuda12x

Found existing installation: torch 2.7.1
Uninstalling torch-2.7.1:
  Successfully uninstalled torch-2.7.1
Found existing installation: torchvision 0.22.1
Uninstalling torchvision-0.22.1:
  Successfully uninstalled torchvision-0.22.1
Found existing installation: torchaudio 2.7.1
Uninstalling torchaudio-2.7.1:
  Successfully uninstalled torchaudio-2.7.1
Found existing installation: triton 3.3.1
Uninstalling triton-3.3.1:
  Successfully uninstalled triton-3.3.1
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cu125
Collecting torch
  Using cached torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Using cached torchvision-0.22.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached torchaudio-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting triton==3.3.1 (from torch)
  Using cached triton-3.3.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.w

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, but you have torch 2.7.1 which is incompatible.


In [2]:
import torch, triton, cupy
print("GPU:       ", torch.cuda.get_device_name(0))
print("Torch CUDA:", torch.version.cuda, "   Triton:", triton.__version__)
print("CuPy device:", cupy.cuda.runtime.getDeviceProperties(0)["name"])

GPU:        NVIDIA A100-SXM4-40GB
Torch CUDA: 12.6    Triton: 3.3.1
CuPy device: b'NVIDIA A100-SXM4-40GB'


In [3]:
%%bash
git clone --depth 1 https://github.com/rochmanofenna/mismatch-trading.git
cd mismatch-trading

fatal: destination path 'mismatch-trading' already exists and is not an empty directory.


In [4]:
import triton
import triton.language as tl
import torch
import time

# 4.1) Define a minimal “just RNG + cumsum” kernel
@triton.jit
def fused_sde_stub(path_ptr, n_steps, stride, T, directional_bias):
    pid  = tl.program_id(0)
    path = path_ptr + pid * stride
    dt   = T / n_steps
    acc  = tl.load(path)
    for i in range(n_steps):
        rnd = tl.randn(seed=pid, offset=i)
        acc += rnd * tl.sqrt(dt)
        tl.store(path + i + 1, acc)

# 4.2) Host setup & benchmark
n_paths, n_steps = 1024, 1000
stride = n_steps + 1
paths  = torch.zeros((n_paths, stride), device='cuda', dtype=torch.float32)
grid   = (n_paths,)

# warm-up compile
fused_sde_stub[grid](paths, n_steps, stride, 1.0, 0.0)

# measure
t0 = time.time()
for _ in range(100):
    fused_sde_stub[grid](paths, n_steps, stride, 1.0, 0.0)
print("Stub avg time:", (time.time() - t0)/100)

Stub avg time: 2.5968551635742188e-05


In [5]:
# 5.1) Fuse in your apply_stochastic_controls math
@triton.jit
def fused_sde_control(path_ptr, n_steps, stride, T,
                      feedback, decay, hi_th, lo_th,
                      total_steps, base_var):
    pid  = tl.program_id(0)
    path = path_ptr + pid * stride
    dt   = T / n_steps
    acc  = tl.load(path)

    for i in range(n_steps):
        rnd = tl.randn(seed=pid, offset=i)

        # simplified control_randomness_by_state
        norm = 1.0 / total_steps
        f1   = tl.where(norm < lo_th,
                        1.5,
                        tl.where(norm > hi_th, 0.5, 1.0))
        t   = i * dt
        vf  = base_var * f1 * tl.exp(-decay * t)
        # clamp(0.5 + feedback*0.5, 0.2, 1.0)
        tmp    = 0.5 + feedback * 0.5
        scale2 = tl.maximum(tl.minimum(tmp, 1.0), 0.2)

        inc = rnd * tl.sqrt(dt) * scale2 * vf
        acc += inc
        tl.store(path + i + 1, acc)

# 5.2) Benchmark
# (reuse paths, grid from above)
fused_sde_control[grid](
    paths, n_steps, stride, 1.0,
    0.5, 0.1, 10.0, 2.0,
    float(n_steps), 1.0
)
t0 = time.time()
for _ in range(100):
    fused_sde_control[grid](
        paths, n_steps, stride, 1.0,
        0.5, 0.1, 10.0, 2.0,
        float(n_steps), 1.0
    )
print("Control avg time:", (time.time() - t0)/100)

Control avg time: 3.020763397216797e-05


In [6]:
!sudo apt-get install ninja-build

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ninja-build is already the newest version (1.10.1-1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [7]:
!ls

mismatch-trading  sample_data


In [8]:
!export CUDA_HOME=/usr/local/cuda-11.8
!export PATH=$CUDA_HOME/bin:$PATH

In [10]:
%%bash
rm -rf /root/.cache/torch_extensions

In [11]:
%%bash
# throw away any half‐built cruft:
rm -rf /root/.cache/torch_extensions

In [14]:
%cd mismatch-trading/

[Errno 2] No such file or directory: 'mismatch-trading/'
/content/mismatch-trading


In [18]:
from torch.utils.cpp_extension import load

# (no need to pre‐mkdir build_directory if you omit that argument)
sde_ext = load(
    name="sde_ext",
    sources=[
      "backends/bicep/sde_int/curand_kernel.cu",
      "backends/bicep/sde_int/binding.cu"
    ],
    extra_cuda_cflags=["-O3","-I/usr/local/cuda/include"],
    extra_ldflags=["-lcurand"],
    verbose=True,
)

Using /root/.cache/torch_extensions/py311_cu126 as PyTorch extensions root...
The input conditions for extension module sde_ext have changed. Bumping to version 2 and re-building as sde_ext_v2...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu126/sde_ext/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module sde_ext_v2...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module sde_ext_v2...


In [19]:
import torch, time

# 1) Simulation parameters
n_paths        = 1024
n_steps        = 1000
stride         = n_steps + 1

# 2) Control-kernel parameters
feedback_value = 0.5
decay_rate     = 0.1
high_threshold = 10.0
low_threshold  = 2.0
base_variance  = 1.0

# 3) Allocate the paths tensor
paths = torch.zeros((n_paths, stride),
                    device='cuda',
                    dtype=torch.float32)

# 4) Warm up the extension (flush any JIT / CUDA overhead)
for _ in range(10):
    sde_ext.sde_curand(
        paths, n_steps, stride,
        1.0,           # T
        feedback_value,
        decay_rate,
        high_threshold,
        low_threshold,
        float(n_steps),
        base_variance
    )

# 5) Create and record CUDA events
starter = torch.cuda.Event(enable_timing=True)
ender   = torch.cuda.Event(enable_timing=True)

starter.record()
for _ in range(100):
    sde_ext.sde_curand(
        paths, n_steps, stride,
        1.0,
        feedback_value,
        decay_rate,
        high_threshold,
        low_threshold,
        float(n_steps),
        base_variance
    )
ender.record()
torch.cuda.synchronize()

# 6) Report
ms = starter.elapsed_time(ender) / 100
print(f"CURAND avg time: {ms:.4f} ms   ({1e6 * ms / n_steps:.1f} μs per path)")

CURAND avg time: 0.4961 ms   (496.1 μs per path)
