In [177]:
import pycuda.autoinit
import numpy as np
from pycuda import gpuarray
import libcudnn, ctypes
import pycuda.driver as drv
import pandas as pd
from pycuda.compiler import SourceModule
from skcuda import cublas



In [2]:
# Notebook-wide globals used by the cells below.
# TODO: move these into the constructor of a sequential (model) class.

# Create the cuDNN library handle; it is handed to the layer constructors as `cudnn_context`.
cudnn_context = libcudnn.cudnnCreate()

# Softmax options looked up from the libcudnn enum tables (no tensor dimensions are set here).
# NOTE(review): neither value is used in the cells shown here — presumably for a future softmax layer.
softmax_mode = libcudnn.cudnnSoftmaxMode['CUDNN_SOFTMAX_MODE_INSTANCE']
softmax_algo = libcudnn.cudnnSoftmaxAlgorithm['CUDNN_SOFTMAX_ACCURATE']

In [3]:
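# Helper functions: each creates a cuDNN descriptor and configures it in one call.
# TODO: in the cells shown here only the tensor and activation helpers are called more than once;
# consider inlining the single-use ones (pooling, convolution, filter).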
def create_4d_tensor_desc(tensor_format, data_type, n, c, h, w):
    """Create a cuDNN tensor descriptor and configure it as a 4-D tensor.

    Args:
        tensor_format: value from ``libcudnn.cudnnTensorFormat``.
        data_type: value from ``libcudnn.cudnnDataType``.
        n, c, h, w: the four tensor extents, forwarded in this order
            (callers pass batch size, channels, height, width).

    Returns:
        The configured tensor descriptor.
    """
    extents = (n, c, h, w)
    descriptor = libcudnn.cudnnCreateTensorDescriptor()
    libcudnn.cudnnSetTensor4dDescriptor(descriptor, tensor_format, data_type, *extents)
    return descriptor

def create_pooling_2d_desc(mode, propogation_mode, windowHeight, windowWidth, verticalPadding, 
                           horizontalPadding, verticalStride, horizontalStride):
    """Create a cuDNN pooling descriptor and configure it for 2-D pooling.

    Args:
        mode: value from ``libcudnn.cudnnPoolingMode``.
        propogation_mode: NaN-propagation option
            (value from ``libcudnn.cudnnNanPropagation``).
        windowHeight, windowWidth: pooling window size.
        verticalPadding, horizontalPadding: padding per dimension.
        verticalStride, horizontalStride: stride per dimension.

    Returns:
        The configured pooling descriptor.
    """
    window = (windowHeight, windowWidth)
    padding = (verticalPadding, horizontalPadding)
    stride = (verticalStride, horizontalStride)

    descriptor = libcudnn.cudnnCreatePoolingDescriptor()
    libcudnn.cudnnSetPooling2dDescriptor(
        descriptor, mode, propogation_mode, *window, *padding, *stride)
    return descriptor

def create_activation_desc(mode, reluNanOpt, coef):
    """Create a cuDNN activation descriptor and configure it.

    Args:
        mode: value from ``libcudnn.cudnnActivationMode``.
        reluNanOpt: NaN-propagation option
            (value from ``libcudnn.cudnnNanPropagation``).
        coef: extra coefficient forwarded to cuDNN. Per the cuDNN
            documentation it is the clipping threshold for CLIPPED_RELU and
            the alpha parameter for ELU; other modes do not use it.

    Returns:
        The configured activation descriptor.
    """
    descriptor = libcudnn.cudnnCreateActivationDescriptor()
    settings = (mode, reluNanOpt, coef)
    libcudnn.cudnnSetActivationDescriptor(descriptor, *settings)
    return descriptor

def create_convolution_2d_desc(pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType):
    """Create a cuDNN convolution descriptor and configure it for 2-D convolution.

    Args:
        pad_h, pad_w: zero-padding height and width.
        u, v: vertical and horizontal filter stride.
        dilation_h, dilation_w: filter dilation per dimension.
        mode: value from ``libcudnn.cudnnConvolutionMode``.
        computeType: value from ``libcudnn.cudnnDataType`` used for computation.

    Returns:
        The configured convolution descriptor.
    """
    geometry = (pad_h, pad_w, u, v, dilation_h, dilation_w)
    descriptor = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(descriptor, *geometry, mode, computeType)
    return descriptor

def create_filter_4d_desc(data_type, tensor_format, k, c, h, w):
    """Create a cuDNN filter descriptor and configure it as a 4-D filter.

    Note the argument order: data type first, then tensor format (the reverse
    of ``create_4d_tensor_desc``).

    Args:
        data_type: value from ``libcudnn.cudnnDataType``.
        tensor_format: value from ``libcudnn.cudnnTensorFormat``.
        k, c, h, w: the four filter extents, forwarded in this order
            (callers pass output channels, input channels, height, width).

    Returns:
        The configured filter descriptor.
    """
    extents = (k, c, h, w)
    descriptor = libcudnn.cudnnCreateFilterDescriptor()
    libcudnn.cudnnSetFilter4dDescriptor(descriptor, data_type, tensor_format, *extents)
    return descriptor

In [160]:
class convolution_layer:
    """Forward-only 2-D convolution layer (convolution + bias + ReLU) on cuDNN.

    The layer is configured for stride 1, zero padding and no dilation, with
    float32 data in NCHW layout. Weights and bias are filled with ones as
    placeholders. All descriptors, the output buffer and the cuDNN workspace
    are created once in the constructor for a fixed batch size and reused by
    every ``forward`` call.
    """

    def __init__(self, cudnn_context, batch_size, in_channels, in_height, in_width,
                 out_channels, kernel_size):
        """Allocate parameters, descriptors, output buffer and workspace.

        Args:
            cudnn_context: cuDNN handle (as returned by ``libcudnn.cudnnCreate``).
            batch_size: number of images per forward call.
            in_channels: channels of the input tensor.
            in_height, in_width: spatial size of the input tensor.
            out_channels: number of filters (stored as ``n_filters``).
            kernel_size: height and width of the square filters.
        """
        # cuDNN options. Some of these (e.g. pooling_format) are not used by
        # this layer but are kept so existing attribute access keeps working.
        self.cudnn_context = cudnn_context
        self.tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
        self.data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
        self.data_type_np = np.float32
        self.pooling_format = libcudnn.cudnnPoolingMode['CUDNN_POOLING_MAX']
        self.activation_mode = libcudnn.cudnnActivationMode['CUDNN_ACTIVATION_RELU']
        self.propogation_mode = libcudnn.cudnnNanPropagation['CUDNN_PROPAGATE_NAN']
        self.convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
        # TODO: the forward algorithm is hard-coded; it could be queried from cuDNN instead.
        self.convolution_algo = libcudnn.cudnnConvolutionFwdAlgo['CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM']
        self.batch_size = batch_size

        # Layer geometry (stride 1, zero padding)
        self.in_channels = in_channels
        self.n_filters = out_channels
        self.kernel_size = kernel_size
        self.in_height = in_height
        self.in_width = in_width

        # Host-side parameters. The weight array uses the same
        # (filters, channels, height, width) order that w_desc declares below,
        # so the array shape and the cuDNN filter layout agree.
        w = np.ones((self.n_filters,
                     self.in_channels,
                     self.kernel_size,
                     self.kernel_size), dtype=self.data_type_np)
        w_bias = np.ones((self.n_filters, ), dtype=self.data_type_np)

        # Send memory to GPU
        self.w_gpu = gpuarray.to_gpu(w)
        self.w_bias_gpu = gpuarray.to_gpu(w_bias)

        # Descriptors
        self.w_bias_desc = create_4d_tensor_desc(
            self.tensor_format, self.data_type, 1, self.n_filters, 1, 1)
        self.input_desc = create_4d_tensor_desc(
            self.tensor_format, self.data_type, self.batch_size,
            self.in_channels, self.in_height, self.in_width)
        self.w_desc = create_filter_4d_desc(
            self.data_type, self.tensor_format, self.n_filters, self.in_channels,
            self.kernel_size, self.kernel_size)
        # pad 0x0, stride 1x1, dilation 1x1
        self.conv_desc = create_convolution_2d_desc(
            0, 0, 1, 1, 1, 1, self.convolution_mode, self.data_type)
        # coef (0.0) is only meaningful for clipped-ReLU / ELU activations.
        self.activation_desc = create_activation_desc(
            self.activation_mode, self.propogation_mode, 0.0)

        # Let cuDNN compute the output height/width for this configuration
        [_, _, self.out_height, self.out_width] = \
            libcudnn.cudnnGetConvolution2dForwardOutputDim(
                self.conv_desc, self.input_desc, self.w_desc)

        # Set the output descriptor and allocate space
        self.output_desc = create_4d_tensor_desc(
            self.tensor_format, self.data_type, self.batch_size,
            self.n_filters, self.out_height, self.out_width)
        self.output_gpu = gpuarray.empty(
            (self.batch_size, self.n_filters, self.out_height, self.out_width),
            self.data_type_np)
        self.ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            self.cudnn_context, self.input_desc, self.w_desc,
            self.conv_desc, self.output_desc, self.convolution_algo)

        # Reserve the scratch buffer the chosen algorithm asks for. Passing a
        # NULL workspace together with a non-zero size is rejected by cuDNN.
        # workspace_gpu keeps the allocation alive; workspace is the raw
        # device pointer handed to cuDNN (None means "no workspace").
        if self.ws_size > 0:
            self.workspace_gpu = gpuarray.empty((self.ws_size,), np.uint8)
            self.workspace = self.workspace_gpu.ptr
        else:
            self.workspace_gpu = None
            self.workspace = None

    def forward(self, input_gpu):
        """Run convolution, add the bias and apply the activation.

        Args:
            input_gpu: GPU array exposing ``.ptr``; it must match
                ``input_desc`` (batch_size, in_channels, in_height, in_width),
                float32. The shape is not checked here.

        The result is written to ``self.output_gpu``; nothing is returned.
        """
        alpha = 1.0
        beta = 0.0

        # output = conv(input, w)
        libcudnn.cudnnConvolutionForward(self.cudnn_context,
                                         alpha,
                                         self.input_desc, input_gpu.ptr,
                                         self.w_desc, self.w_gpu.ptr,
                                         self.conv_desc,
                                         self.convolution_algo,
                                         self.workspace,
                                         self.ws_size,
                                         beta,
                                         self.output_desc,
                                         self.output_gpu.ptr)
        # output = 1 * bias + 1 * output (both scaling factors are alpha)
        libcudnn.cudnnAddTensor(self.cudnn_context,
                                alpha,
                                self.w_bias_desc,
                                self.w_bias_gpu.ptr,
                                alpha,
                                self.output_desc,
                                self.output_gpu.ptr)
        # Activation applied in place on the output buffer
        libcudnn.cudnnActivationForward(self.cudnn_context,
                                        self.activation_desc,
                                        alpha,
                                        self.output_desc,
                                        self.output_gpu.ptr,
                                        beta,
                                        self.output_desc,
                                        self.output_gpu.ptr)

In [161]:
class pooling_layer:
    """Forward-only 2-D max-pooling layer on cuDNN (zero padding, square window).

    Descriptors and the output buffer are created once in the constructor for
    a fixed batch size and reused by every ``forward`` call.
    """

    @staticmethod
    def _pooled_extent(extent, window, stride):
        """Output length along one axis for an unpadded pooling window."""
        return (extent - window) // stride + 1

    def __init__(self, cudnn_context, batch_size, in_channels, in_width, in_height,
                 stride, kernel_size):
        """Create descriptors and reserve the output buffer.

        Note that ``in_width`` comes BEFORE ``in_height`` in this signature.

        Args:
            cudnn_context: cuDNN handle.
            batch_size: number of images per forward call.
            in_channels: channels of the input (pooling keeps the channel count).
            in_width, in_height: spatial size of the input tensor.
            stride: step of the pooling window in both dimensions.
            kernel_size: height and width of the square pooling window.
        """
        # Caller-supplied configuration
        self.cudnn_context = cudnn_context
        self.batch_size = batch_size
        self.in_channels = in_channels
        self.out_channels = in_channels
        self.stride = stride
        self.kernel_size = kernel_size
        self.in_height = in_height
        self.in_width = in_width

        # cuDNN options. Several of these (activation / convolution settings)
        # are not read by this layer; they are stored exactly as before.
        self.tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
        self.data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
        self.data_type_np = np.float32
        self.pooling_format = libcudnn.cudnnPoolingMode['CUDNN_POOLING_MAX']
        self.activation_mode = libcudnn.cudnnActivationMode['CUDNN_ACTIVATION_RELU']
        self.propogation_mode = libcudnn.cudnnNanPropagation['CUDNN_PROPAGATE_NAN']
        self.convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
        self.convolution_algo = libcudnn.cudnnConvolutionFwdAlgo['CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM']

        # Output size, computed by hand for zero padding.
        # TODO: query cuDNN for the pooling output dimensions instead.
        self.out_height = self._pooled_extent(self.in_height, self.kernel_size, self.stride)
        self.out_width = self._pooled_extent(self.in_width, self.kernel_size, self.stride)

        input_shape = (self.batch_size, self.in_channels, self.in_height, self.in_width)
        output_shape = (self.batch_size, self.out_channels, self.out_height, self.out_width)

        # Descriptors
        self.input_desc = create_4d_tensor_desc(
            self.tensor_format, self.data_type, *input_shape)
        self.output_desc = create_4d_tensor_desc(
            self.tensor_format, self.data_type, *output_shape)
        window = (self.kernel_size, self.kernel_size)
        step = (self.stride, self.stride)
        self.pooling_desc = create_pooling_2d_desc(
            self.pooling_format, self.propogation_mode, *window, 0, 0, *step)

        # Output buffer on the GPU
        self.output_gpu = gpuarray.empty(output_shape, self.data_type_np)

    def forward(self, input_gpu):
        """Pool ``input_gpu`` into ``self.output_gpu``; nothing is returned.

        Args:
            input_gpu: GPU array exposing ``.ptr``; it must match ``input_desc``.
        """
        # Scaling factors: take the pooled input as is, overwrite the output.
        input_scale, output_scale = 1.0, 0.0

        libcudnn.cudnnPoolingForward(
            self.cudnn_context, self.pooling_desc,
            input_scale, self.input_desc, input_gpu.ptr,
            output_scale, self.output_desc, self.output_gpu.ptr)

In [None]:
class dense_layer:
    """Forward-only fully connected layer: ``activation(x @ W.T + b)``.

    The matrix product runs on cuBLAS; the bias add and the activation run on
    cuDNN, treating the (batch_size, out_size) output as an N x C x 1 x 1
    tensor. Weights and bias are filled with zeros as placeholders.
    """

    def __init__(self, cudnn_context, batch_size, in_size, out_size,
                 activation_type='CUDNN_ACTIVATION_RELU', cublas_handle=None):
        """Allocate parameters, descriptors and the output buffer.

        Args:
            cudnn_context: cuDNN handle (bias add and activation).
            batch_size: number of rows per forward call.
            in_size: number of input features.
            out_size: number of output features.
            activation_type: key of ``libcudnn.cudnnActivationMode``.
            cublas_handle: optional cuBLAS handle to share between layers.
                When omitted a new one is created with ``cublas.cublasCreate()``
                and lives as long as the process (it is never destroyed here).
        """
        # cuDNN options. Pooling / convolution settings are not used by this
        # layer but are kept so existing attribute access keeps working.
        self.cudnn_context = cudnn_context
        self.tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
        self.data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
        self.data_type_np = np.float32
        self.pooling_format = libcudnn.cudnnPoolingMode['CUDNN_POOLING_MAX']
        self.activation_mode = libcudnn.cudnnActivationMode[activation_type]
        self.propogation_mode = libcudnn.cudnnNanPropagation['CUDNN_PROPAGATE_NAN']
        self.convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
        self.convolution_algo = libcudnn.cudnnConvolutionFwdAlgo['CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM']

        # cuBLAS needs its own handle; a cuDNN handle is not interchangeable.
        if cublas_handle is None:
            cublas_handle = cublas.cublasCreate()
        self.cublas_handle = cublas_handle

        # Store inputs
        self.batch_size = batch_size
        self.in_size = in_size
        self.out_size = out_size

        # Descriptors. coef (0.0) only matters for clipped-ReLU / ELU.
        self.activation_desc = create_activation_desc(
            self.activation_mode, self.propogation_mode, 0.0)
        # Bias is broadcast over the batch: 1 x out_size x 1 x 1 added to
        # batch_size x out_size x 1 x 1.
        self.b_desc = create_4d_tensor_desc(
            self.tensor_format, self.data_type, 1, self.out_size, 1, 1)
        self.output_desc = create_4d_tensor_desc(
            self.tensor_format, self.data_type, self.batch_size, self.out_size, 1, 1)

        # Reserve memory
        w = np.zeros((self.out_size, self.in_size), dtype=self.data_type_np)
        b = np.zeros((self.out_size,), dtype=self.data_type_np)
        self.w_gpu = gpuarray.to_gpu(w)
        self.b_gpu = gpuarray.to_gpu(b)
        self.out_gpu = gpuarray.empty((self.batch_size, self.out_size), self.data_type_np)
        # Alias matching the attribute name used by the other layer classes.
        self.output_gpu = self.out_gpu

    def forward(self, input_gpu):
        """Compute the layer output into ``self.out_gpu``; nothing is returned.

        Args:
            input_gpu: GPU array exposing ``.ptr``, expected to hold
                (batch_size, in_size) float32 values in row-major order.
                The shape is not checked here.
        """
        alpha = 1.0
        # beta must be 0: out_gpu comes from gpuarray.empty() and holds
        # arbitrary data that must be overwritten, not accumulated into.
        beta = 0.0

        # cuBLAS is column-major. A row-major (out, in) weight array is a
        # column-major (in, out) matrix with leading dimension in_size, and the
        # row-major (batch, in) input is a column-major (in, batch) matrix.
        # C(out x batch) = W^T(out x in) * X(in x batch), stored with leading
        # dimension out_size, is exactly the row-major (batch, out) output.
        cublas.cublasSgemm(self.cublas_handle,
                           't',
                           'n',
                           self.out_size, self.batch_size, self.in_size,
                           alpha,
                           self.w_gpu.ptr, self.in_size,
                           input_gpu.ptr, self.in_size,
                           beta,
                           self.out_gpu.ptr, self.out_size)
        # out = 1 * bias + 1 * out
        libcudnn.cudnnAddTensor(self.cudnn_context,
                                alpha,
                                self.b_desc,
                                self.b_gpu.ptr,
                                alpha,
                                self.output_desc,
                                self.out_gpu.ptr)
        # Activation applied in place on the output buffer
        libcudnn.cudnnActivationForward(self.cudnn_context,
                                        self.activation_desc,
                                        alpha,
                                        self.output_desc,
                                        self.out_gpu.ptr,
                                        beta,
                                        self.output_desc,
                                        self.out_gpu.ptr)

# Test convolution

In [162]:
# Small test layer: batches of two 1-channel 4x4 images, a single 3x3 filter.
conv_layer = convolution_layer(
    cudnn_context=cudnn_context,
    batch_size=2,
    in_channels=1, in_height=4, in_width=4,
    out_channels=1, kernel_size=3,
)

In [163]:
# Sanity check: workspace size in bytes and output height/width as reported by cuDNN.
print((conv_layer.ws_size, conv_layer.out_height, conv_layer.out_width))

(0, 2, 2)


In [173]:
# All-ones batch (2 images, 1 channel, 4x4) with a single negative entry in the
# first image, so exactly one convolution window produces a different value.
input_data = np.ones((2, 1, 4, 4), dtype=np.float32)
input_data[0, 0, 0, 0] = -7
input_data_gpu = gpuarray.to_gpu(input_data)

In [174]:
# Run convolution + bias + ReLU; the result is written to conv_layer.output_gpu.
conv_layer.forward(input_data_gpu)

In [175]:
# With all-ones 3x3 weights and a bias of 1, every output is 9 + 1 = 10, except
# the window that covers the -7 entry: 8 - 7 + 1 = 2.
conv_layer.output_gpu

array([[[[ 2., 10.],
         [10., 10.]]],


       [[[10., 10.],
         [10., 10.]]]], dtype=float32)

# Test pooling

In [109]:
# Small test layer: batches of two 1-channel 8x8 images, 7x7 window, stride 1.
pool_layer = pooling_layer(
    cudnn_context=cudnn_context,
    batch_size=2,
    in_channels=1, in_width=8, in_height=8,
    stride=1, kernel_size=7,
)

In [110]:
# Output extent is (8 - 7) // 1 + 1 = 2 in each dimension.
print((pool_layer.out_height, pool_layer.out_width))

(2, 2)


In [111]:
# All-ones batch (2 images, 1 channel, 8x8) with one large value in the top row
# of the first image. Note: this rebinds input_data / input_data_gpu from the
# convolution test cells.
input_data = np.ones((2, 1, 8, 8), dtype=np.float32)
input_data[0, 0, 0, 1] = 22
input_data_gpu = gpuarray.to_gpu(input_data)

In [112]:
# Max-pool the batch; the result is written to pool_layer.output_gpu.
pool_layer.forward(input_data_gpu)

In [113]:
# Only the two top-row windows of the first image contain the 22 at [0, 0, 0, 1]
# (rows 0-6); the windows starting at row 1 and the whole second image give 1.
pool_layer.output_gpu

array([[[[22., 22.],
         [ 1.,  1.]]],


       [[[ 1.,  1.],
         [ 1.,  1.]]]], dtype=float32)