In [1]:
from inspect import signature

import numpy as np
from numpy import ndarray
#
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

import shell.shell
import shell.utils
import time
import cProfile
import skcuda.linalg



## Import Ottergrad

In [2]:
from Ottergrad.autograd import Tensor
from Ottergrad.autograd import Func

## Check GPU device info

In [3]:
# Look up CUDA device 0 and list every attribute it reports, one per line,
# as "<attribute> <value>".
gpu_device = cuda.Device(0)
device_attributes_tuples = gpu_device.get_attributes().items()
for attribute_pair in device_attributes_tuples:
    # Each pair is (attribute, value); unpack it straight into print().
    print(*attribute_pair)

ASYNC_ENGINE_COUNT 3
CAN_MAP_HOST_MEMORY 1
CLOCK_RATE 1200000
COMPUTE_CAPABILITY_MAJOR 7
COMPUTE_CAPABILITY_MINOR 5
COMPUTE_MODE DEFAULT
CONCURRENT_KERNELS 1
ECC_ENABLED 0
GLOBAL_L1_CACHE_SUPPORTED 1
GLOBAL_MEMORY_BUS_WIDTH 192
GPU_OVERLAP 1
INTEGRATED 0
KERNEL_EXEC_TIMEOUT 1
L2_CACHE_SIZE 3145728
LOCAL_L1_CACHE_SUPPORTED 1
MANAGED_MEMORY 1
MAXIMUM_SURFACE1D_LAYERED_LAYERS 2048
MAXIMUM_SURFACE1D_LAYERED_WIDTH 32768
MAXIMUM_SURFACE1D_WIDTH 32768
MAXIMUM_SURFACE2D_HEIGHT 65536
MAXIMUM_SURFACE2D_LAYERED_HEIGHT 32768
MAXIMUM_SURFACE2D_LAYERED_LAYERS 2048
MAXIMUM_SURFACE2D_LAYERED_WIDTH 32768
MAXIMUM_SURFACE2D_WIDTH 131072
MAXIMUM_SURFACE3D_DEPTH 16384
MAXIMUM_SURFACE3D_HEIGHT 16384
MAXIMUM_SURFACE3D_WIDTH 16384
MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS 2046
MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH 32768
MAXIMUM_SURFACECUBEMAP_WIDTH 32768
MAXIMUM_TEXTURE1D_LAYERED_LAYERS 2048
MAXIMUM_TEXTURE1D_LAYERED_WIDTH 32768
MAXIMUM_TEXTURE1D_LINEAR_WIDTH 268435456
MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH 32768
MAXIM

## PyCUDA basics: host-to-device transfer and matrix product

In [4]:
# scikit-cuda's linalg module has to be initialised before its routines are used.
skcuda.linalg.init()

# Host-side operands: a 5x5 matrix and a 1x5 row vector, both all ones.
W = np.ones((5, 5), dtype=np.float32)
x = np.ones((1, 5), dtype=np.float32)

# Upload both operands to the GPU (W first, then x).
W_gpu, x_gpu = (gpuarray.to_gpu(host_array) for host_array in (W, x))

# Reference product computed on the CPU with NumPy.
a = x.dot(W)
print(a)

# Same product computed on the GPU; the two printed results should agree.
a = skcuda.linalg.dot(x_gpu, W_gpu)
print(a)

[[5. 5. 5. 5. 5.]]
[[5. 5. 5. 5. 5.]]


  ## GPU vs CPU

In [5]:
# Benchmark inputs shared by the CPU and GPU runs.
# Seed the generator so every Restart & Run All uses identical data.
SEED = 0
rng = np.random.default_rng(SEED)

# Draw float32 samples directly: np.random.random(...).astype(np.float32)
# would first allocate a float64 array of the same shape and then copy it.
W = rng.random((5000, 5000), dtype=np.float32)
x = rng.random((10000, 5000), dtype=np.float32)
y = rng.random((10000, 5000), dtype=np.float32)

# CPU run. The timed region covers Tensor construction, the forward pass and
# the backward pass. perf_counter() is the monotonic, high-resolution clock
# intended for measuring intervals; time.time() can jump if the wall clock is
# adjusted.
start_cpu = time.perf_counter()
W_cpu = Tensor(W, device="cpu")
x_cpu = Tensor(x, device="cpu")
y_cpu = Tensor(y, device="cpu")

# Build the expression x @ W + y and wrap it in a Func.
# NOTE(review): forward()/backward() presumably evaluate the expression and
# populate the gradients later read through getgrad() - confirm in Ottergrad.
func_cpu = x_cpu @ W_cpu + y_cpu
func_cpu = Func(func_cpu)
func_cpu.forward()
func_cpu.backward()
end_cpu = time.perf_counter()

# Elapsed CPU time in seconds.
time_cpu = end_cpu - start_cpu

In [6]:
# GPU run of the same x @ W + y expression. The timed region covers Tensor
# construction on the "gpu" device, the forward pass and the backward pass.
# perf_counter() is the monotonic clock intended for measuring intervals.
start_gpu = time.perf_counter()

W_gpu = Tensor(W, device="gpu")
x_gpu = Tensor(x, device="gpu")
y_gpu = Tensor(y, device="gpu")

func_gpu = x_gpu @ W_gpu + y_gpu
func_gpu = Func(func_gpu)
func_gpu.forward()
func_gpu.backward()

# CUDA kernel launches can return before the device has finished the work.
# Wait for all activity in the current context to complete so the measurement
# includes the actual computation rather than only the launch overhead.
# NOTE(review): whether Ottergrad already synchronises internally is not
# visible from this notebook; the explicit call is harmless if it does.
cuda.Context.synchronize()

end_gpu = time.perf_counter()
# Elapsed GPU time in seconds.
time_gpu = end_gpu - start_gpu

Check that the gradient computed on the GPU matches the one computed on the CPU.

In [9]:
# Gradient of W from the CPU run.
grad_w_cpu = W_cpu.getgrad()
# Gradient of W from the GPU run, copied back to the host with .get().
grad_w_gpu_host = W_gpu.getgrad().get()
# Displayed as the cell result: True when both agree within np.allclose's
# default tolerances.
np.allclose(grad_w_cpu, grad_w_gpu_host)

True

In [8]:
# Report both wall-clock measurements (in seconds), GPU first and then CPU.
timing_report = (
    ("GPU exec time: ", time_gpu),
    ("CPU exec time: ", time_cpu),
)
for label, elapsed_seconds in timing_report:
    print(label, elapsed_seconds)

GPU exec time:  1.0856504440307617
CPU exec time:  6.0453760623931885
