<a href="https://colab.research.google.com/github/skj092/cuda-programming/blob/main/tensor_addition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch, os, math, gzip, pickle
import matplotlib.pyplot as plt
from urllib.request import urlretrieve
from pathlib import Path

from torch import tensor
import torchvision as tv
import torchvision.transforms.functional as tvf
from torchvision import io
from torch.utils.cpp_extension import load_inline

## Vector Addition

In [2]:
def kernal(x, y, res, times):
  """Write the element-wise sum of ``x`` and ``y`` into ``res`` for indices ``0 .. times-1``.

  The work is done one element at a time in a plain Python loop. ``res`` is modified in
  place and nothing is returned.

  Args:
    x, y: indexable sequences providing at least ``times`` elements each.
    res: indexable, writable sequence that receives the sums.
    times: number of leading positions to process.
  """
  positions = range(times)
  for pos in positions:
    total = x[pos] + y[pos]
    res[pos] = total

In [3]:
def add(x, y):
  """Element-wise sum of two 2-D tensors, computed by the Python-loop ``kernal``.

  Both inputs are flattened, summed element by element into a fresh buffer, and the
  buffer is reshaped back to the shape of ``x``.

  Args:
    x: 2-D tensor; its shape determines the shape of the result.
    y: tensor with exactly the same shape as ``x``.

  Returns:
    A new tensor of shape ``x.shape`` created with ``torch.zeros`` (default dtype and
    device) holding ``x + y``.

  Raises:
    ValueError: if ``x`` is not 2-D, or if ``y`` does not have the same shape as ``x``.
  """
  h, w = x.shape
  if y.shape != x.shape:
    raise ValueError(
      f"x and y must have the same shape, got {tuple(x.shape)} and {tuple(y.shape)}")
  n = h * w
  z = torch.zeros(n)
  # reshape (rather than view) also accepts non-contiguous inputs such as transposes;
  # the flattened inputs are only read, so a copy is harmless.
  kernal(x.reshape(-1), y.reshape(-1), z, n)
  return z.view(h, w)

In [4]:
# Square problem size: 5 * 4**3 == 320 rows and 320 columns.
h = w = 5 * 4**3
# Two independent uniform-random inputs of that shape (a is drawn first, then b).
a, b = (torch.rand((h, w)) for _ in range(2))

In [5]:
%%time

# Time the `add` defined in the preceding cell (one Python loop over all h*w elements).
c = add(a, b)

CPU times: user 979 ms, sys: 4.65 ms, total: 983 ms
Wall time: 1.02 s


# Using blocks and threads

In [6]:
def blk_kernal(f, blocks, threads, *args):
  """Call ``f`` once for every (block index, thread index) pair, sequentially.

  Blocks are visited in increasing order and, within each block, threads in increasing
  order. Each call is ``f(block_index, thread_index, threads, *args)``, so ``f`` receives
  the block size as its third argument followed by the forwarded extra arguments.

  Args:
    f: callable invoked for each pair.
    blocks: number of block indices to iterate over.
    threads: number of thread indices per block (also passed to ``f``).
    *args: extra positional arguments forwarded unchanged to ``f``.
  """
  launch_order = ((blk, thr) for blk in range(blocks) for thr in range(threads))
  for blk, thr in launch_order:
    f(blk, thr, threads, *args)

In [7]:
def adder(blockidx, threadidx, blockdim, x, y, res, n):
    """Add one element of ``x`` and ``y`` into ``res`` for a single (block, thread) pair.

    The flat position is ``blockidx * blockdim + threadidx``. Positions at or beyond ``n``
    are skipped, so calls that fall past the end of the data do nothing.

    Args:
        blockidx: index of the block.
        threadidx: index of the thread within its block.
        blockdim: number of threads per block.
        x, y: indexable inputs.
        res: indexable, writable output; modified in place.
        n: number of valid elements.
    """
    flat_idx = threadidx + blockdim * blockidx
    in_bounds = flat_idx < n
    if in_bounds:
        total = x[flat_idx] + y[flat_idx]
        res[flat_idx] = total

In [8]:
def add(x, y):
  """Element-wise sum of two 2-D tensors using the block/thread emulation.

  The inputs are flattened and ``adder`` is run through ``blk_kernal`` with 256 threads
  per block and just enough blocks to cover every element. Defining this function
  replaces any earlier ``add`` in the notebook namespace.

  Args:
    x: 2-D tensor; its shape determines the shape of the result.
    y: tensor with exactly the same shape as ``x``.

  Returns:
    A new tensor of shape ``x.shape`` created with ``torch.empty`` (default dtype and
    device) holding ``x + y``.

  Raises:
    ValueError: if ``x`` is not 2-D, or if ``y`` does not have the same shape as ``x``.
  """
  h, w = x.shape
  if y.shape != x.shape:
    raise ValueError(
      f"x and y must have the same shape, got {tuple(x.shape)} and {tuple(y.shape)}")
  n = h * w
  res = torch.empty(n)
  threads = 256
  # Integer ceiling division: exact for any n and already an int, no float round-trip.
  blocks = (n + threads - 1) // threads
  # reshape (rather than view) also accepts non-contiguous inputs such as transposes;
  # the flattened inputs are only read, so a copy is harmless.
  blk_kernal(adder, blocks, threads, x.reshape(-1), y.reshape(-1), res, n)
  return res.view(h, w)

In [9]:
%%time

# Time the block/thread version of `add` (nested Python loops plus a bounds check per call).
c = add(a, b)

CPU times: user 1.42 s, sys: 3.35 ms, total: 1.42 s
Wall time: 1.47 s


# Using CUDA

In [10]:
# Ask the CUDA runtime to run kernel launches synchronously, so a failing kernel is
# reported at the launch that caused it instead of at some later call.
# NOTE(review): this is expected to take effect only if set before CUDA is first
# initialized in the process — confirm no earlier cell touches the GPU.
os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [11]:
# Install helper packages: ninja (the build log further down shows a ninja build file being
# emitted for the extension) and wurlitzer (loaded as an IPython extension in the next cell).
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install -q wurlitzer ninja

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/307.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m297.0/307.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
# Enable the wurlitzer IPython extension — presumably so C-level stdout/stderr (e.g. printf
# from compiled code) appears in the cell output; confirm against the wurlitzer docs.
%load_ext wurlitzer

In [13]:
def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False):
    """Build the given CUDA/C++ sources with ``load_inline`` and return its result.

    The extension is always built under the name ``"inline_ext"``.

    Args:
        cuda_src: CUDA source code as a single string.
        cpp_src: C++ source code as a single string.
        funcs: function names passed to ``load_inline`` as ``functions``.
        opt: when truthy, ``-O2`` is passed as an extra CUDA compiler flag.
        verbose: forwarded to ``load_inline``.

    Returns:
        Whatever ``load_inline`` returns for the built extension.
    """
    cuda_flags = []
    if opt:
        cuda_flags.append("-O2")
    return load_inline(
        name="inline_ext",
        cpp_sources=[cpp_src],
        cuda_sources=[cuda_src],
        functions=funcs,
        extra_cuda_cflags=cuda_flags,
        verbose=verbose,
    )

In [16]:
# Shared preamble prepended to the inline CUDA source: headers, input-validation macros,
# and a ceiling-division helper.
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Wrapped in do/while(0) so the macro expands to exactly one statement and stays correct
// inside an unbraced if/else.
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)

// Ceiling division: number of chunks of size b needed to cover a items.
inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

In [23]:
# CUDA source for element-wise float32 addition: the device kernel plus the host wrapper
# that validates inputs, launches the kernel, and returns the result tensor.
cuda_src = cuda_begin + r'''
#include <limits>

// One thread per element; threads whose index falls past n do nothing.
__global__ void add_tensors_cuda_kernel(float *a, float *b, float *out, int n) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < n) {
    out[index] = a[index] + b[index];
  }
}

torch::Tensor add_tensors_cuda(torch::Tensor tensor1, torch::Tensor tensor2) {
  CHECK_INPUT(tensor1);
  CHECK_INPUT(tensor2);
  TORCH_CHECK(tensor1.sizes() == tensor2.sizes(), "tensors must have the same size");
  TORCH_CHECK(tensor1.scalar_type() == torch::kFloat32 && tensor2.scalar_type() == torch::kFloat32,
              "tensors must be float32");

  const int64_t n = tensor1.numel();
  auto result = torch::empty_like(tensor1);

  // A launch with zero blocks is an invalid configuration, so handle empty input here.
  if (n == 0) {
    return result;
  }
  // The kernel indexes with int; reject sizes it cannot address instead of truncating.
  TORCH_CHECK(n <= std::numeric_limits<int>::max(), "tensor has too many elements for int indexing");

  const int threads = 256;
  const int blocks = cdiv(static_cast<unsigned int>(n), (unsigned int)threads);

  add_tensors_cuda_kernel<<<blocks, threads>>>(
    tensor1.data_ptr<float>(),
    tensor2.data_ptr<float>(),
    result.data_ptr<float>(),
    static_cast<int>(n)
  );
  // Turn a failed launch into an exception right away instead of a silent bad result.
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return result;
}

'''

In [24]:
# C++ declaration of the wrapper defined in the CUDA source; passed to load_inline as the
# cpp source alongside the list of function names to expose.
cpp_src = "torch::Tensor add_tensors_cuda(torch::Tensor tensor1, torch::Tensor tensor2);"

In [25]:
# Compile and load the inline extension; verbose=True prints the build log shown below.
module = load_cuda(cuda_src, cpp_src, ['add_tensors_cuda'], verbose=True)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
The input conditions for extension module inline_ext have changed. Bumping to version 2 and re-building as inline_ext_v2...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/inline_ext/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module inline_ext_v2...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext_v2 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /root/.cache/torch_extensions/py310_cu121/inline_ext/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext_v2 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.10/dist-packages/torch/i

Loading extension module inline_ext_v2...


In [26]:
# List the loaded module's attributes to confirm `add_tensors_cuda` was exported.
dir(module)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'add_tensors_cuda']

In [29]:
%%time
a = a.to(device='cuda')
b = b.to(device='cuda')
res = module.add_tensors_cuda(a, b).cpu()
h,w = res.shape
h,w,h*w

CPU times: user 27 ms, sys: 80.5 ms, total: 108 ms
Wall time: 162 ms


(320, 320, 102400)