In [None]:
import pycuda.autoinit
import numpy as np
from pycuda import gpuarray
import libcudnn, ctypes
import pycuda.driver as drv
import pandas as pd
from pycuda.compiler import SourceModule

In [None]:
# Notebook-wide cuDNN state shared by the cells below.
# TODO: move these into the constructor of a sequential model class.

# Create the cuDNN handle that is passed to every libcudnn call in this notebook.
cudnn_context = libcudnn.cudnnCreate()

# Softmax options looked up from the wrapper's enum tables. No tensor dimensions
# are set here; those are passed to the model constructor further down.
softmax_mode = libcudnn.cudnnSoftmaxMode['CUDNN_SOFTMAX_MODE_INSTANCE']
softmax_algo = libcudnn.cudnnSoftmaxAlgorithm['CUDNN_SOFTMAX_ACCURATE']

In [None]:
def create_4d_tensor_desc(tensor_format, data_type, n, c, h, w):
    """Create a cuDNN tensor descriptor and configure it as a 4-D tensor.

    Args:
        tensor_format: Layout value forwarded to ``cudnnSetTensor4dDescriptor``
            (the notebook passes entries of ``libcudnn.cudnnTensorFormat``).
        data_type: Element type value forwarded to the same call
            (the notebook passes entries of ``libcudnn.cudnnDataType``).
        n, c, h, w: The four dimension sizes, forwarded in this order.

    Returns:
        The descriptor returned by ``libcudnn.cudnnCreateTensorDescriptor``.
        This function never destroys it.
    """
    dims = (n, c, h, w)
    descriptor = libcudnn.cudnnCreateTensorDescriptor()
    libcudnn.cudnnSetTensor4dDescriptor(descriptor, tensor_format, data_type, *dims)
    return descriptor

def create_pooling_2d_desc(mode, propogation_mode, windowHeight, windowWidth, verticalPadding, 
                           horizontalPadding, verticalStride, horizontalStride):
    """Create a cuDNN pooling descriptor and configure it for 2-D pooling.

    Args:
        mode: Pooling mode value (the notebook passes entries of
            ``libcudnn.cudnnPoolingMode``).
        propogation_mode: NaN-propagation value (the notebook passes entries
            of ``libcudnn.cudnnNanPropagation``).
        windowHeight, windowWidth: Pooling window size.
        verticalPadding, horizontalPadding: Padding per spatial axis.
        verticalStride, horizontalStride: Stride per spatial axis.

    Returns:
        The descriptor returned by ``libcudnn.cudnnCreatePoolingDescriptor``.
        This function never destroys it.
    """
    window = (windowHeight, windowWidth)
    padding = (verticalPadding, horizontalPadding)
    stride = (verticalStride, horizontalStride)

    descriptor = libcudnn.cudnnCreatePoolingDescriptor()
    libcudnn.cudnnSetPooling2dDescriptor(descriptor, mode, propogation_mode,
                                         *window, *padding, *stride)
    return descriptor

def create_activation_desc(mode, reluNanOpt, coef):
    """Create a cuDNN activation descriptor and configure it.

    Args:
        mode: Activation mode value (the notebook passes entries of
            ``libcudnn.cudnnActivationMode``).
        reluNanOpt: NaN-propagation value, forwarded unchanged.
        coef: Coefficient forwarded unchanged to
            ``cudnnSetActivationDescriptor``.

    Returns:
        The descriptor returned by
        ``libcudnn.cudnnCreateActivationDescriptor``. This function never
        destroys it.
    """
    descriptor = libcudnn.cudnnCreateActivationDescriptor()
    settings = (mode, reluNanOpt, coef)
    libcudnn.cudnnSetActivationDescriptor(descriptor, *settings)
    return descriptor

def create_convolution_2d_desc(pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType):
    """Create a cuDNN convolution descriptor and configure it for 2-D use.

    Args:
        pad_h, pad_w: Padding per spatial axis.
        u, v: Forwarded after the padding; the caller in this notebook passes
            ``1, 1`` here (presumably the vertical/horizontal stride - confirm
            against the wrapper's documentation).
        dilation_h, dilation_w: Dilation per spatial axis.
        mode: Convolution mode value (the notebook passes entries of
            ``libcudnn.cudnnConvolutionMode``).
        computeType: Data type value used for the computation.

    Returns:
        The descriptor returned by
        ``libcudnn.cudnnCreateConvolutionDescriptor``. This function never
        destroys it.
    """
    padding = (pad_h, pad_w)
    stride = (u, v)
    dilation = (dilation_h, dilation_w)

    descriptor = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(descriptor, *padding, *stride, *dilation,
                                             mode, computeType)
    return descriptor

def create_filter_4d_desc(data_type, tensor_format, k, c, h, w):
    """Create a cuDNN filter descriptor and configure it as a 4-D filter.

    Note the argument order: ``data_type`` comes *before* ``tensor_format``
    here, the reverse of ``create_4d_tensor_desc``.

    Args:
        data_type: Element type value (the notebook passes entries of
            ``libcudnn.cudnnDataType``).
        tensor_format: Layout value (the notebook passes entries of
            ``libcudnn.cudnnTensorFormat``).
        k, c, h, w: The four filter dimensions, forwarded in this order; the
            caller passes (output channels, input channels, height, width).

    Returns:
        The descriptor returned by ``libcudnn.cudnnCreateFilterDescriptor``.
        This function never destroys it.
    """
    dims = (k, c, h, w)
    descriptor = libcudnn.cudnnCreateFilterDescriptor()
    libcudnn.cudnnSetFilter4dDescriptor(descriptor, data_type, tensor_format, *dims)
    return descriptor

In [None]:
def allocate_fwd_convolution_tensors(cudnn_context, tensor_format, data_type, batch_size, 
                                     conv_in_channels, conv_in_height, conv_in_width, 
                                     conv_out_channels, conv_kernel_size, convolution_mode,
                                     convolution_algo, padding=(0, 0), stride=(1, 1),
                                     dilation=(1, 1)):
    """Build every descriptor needed for one forward convolution layer.

    Despite the name, no device memory is allocated here: the function only
    creates descriptors and queries the workspace size.

    Args:
        cudnn_context: cuDNN handle used for the workspace-size query.
        tensor_format: Layout value used for the input, filter and output
            descriptors.
        data_type: Element type value; also used as the convolution's
            compute type.
        batch_size: Number of images per batch.
        conv_in_channels, conv_in_height, conv_in_width: Input dimensions.
        conv_out_channels: Number of filters.
        conv_kernel_size: Side length of the (square) kernel.
        convolution_mode: Convolution mode value.
        convolution_algo: Forward algorithm the workspace size is queried for.
        padding: ``(pad_h, pad_w)``; defaults to no padding.
        stride: ``(stride_h, stride_w)``; defaults to 1 in both directions.
        dilation: ``(dilation_h, dilation_w)``; defaults to 1 in both
            directions.

    Returns:
        A list ``[input_desc, filter_desc, conv_desc, output_desc,
        workspace_size]``. ``workspace_size`` is returned exactly as the
        libcudnn wrapper reports it.
    """
    pad_h, pad_w = padding
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation

    input_desc = create_4d_tensor_desc(tensor_format, data_type,
                                       batch_size, conv_in_channels,
                                       conv_in_height, conv_in_width)
    filter_desc = create_filter_4d_desc(data_type, tensor_format,
                                        conv_out_channels, conv_in_channels,
                                        conv_kernel_size, conv_kernel_size)
    conv_desc = create_convolution_2d_desc(pad_h, pad_w, stride_h, stride_w,
                                           dilation_h, dilation_w,
                                           convolution_mode, data_type)

    # Let cuDNN compute the output shape instead of re-deriving it by hand.
    n, c, h, w = libcudnn.cudnnGetConvolution2dForwardOutputDim(conv_desc,
                                                                input_desc,
                                                                filter_desc)
    output_desc = create_4d_tensor_desc(tensor_format, data_type, n, c, h, w)

    workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnn_context,
                                                                      input_desc,
                                                                      filter_desc,
                                                                      conv_desc,
                                                                      output_desc,
                                                                      convolution_algo)

    return [input_desc, filter_desc, conv_desc, output_desc, workspace_size]

In [None]:
class nn_lenet:
    """Work-in-progress LeNet forward pass built on the libcudnn wrapper.

    The constructor derives the layer dimensions, allocates the conv1 weight,
    bias and output buffers on the GPU, and creates the cuDNN descriptors for
    the layers. ``forward`` currently runs only conv1 plus its bias; the
    remaining layers are listed as a TODO inside ``forward``.
    """

    def __init__(self, cudnn_context, batch_size, width, height):
        """Set up dimensions, GPU buffers and cuDNN descriptors.

        Args:
            cudnn_context: cuDNN handle used for every libcudnn call.
            batch_size: Number of images per forward pass.
            width: Width of the single-channel input images.
            height: Height of the single-channel input images.
        """
        self.cudnn_context = cudnn_context
        self.tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
        self.data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
        self.data_type_np = np.float32
        self.pooling_format = libcudnn.cudnnPoolingMode['CUDNN_POOLING_MAX']
        self.activation_mode = libcudnn.cudnnActivationMode['CUDNN_ACTIVATION_RELU']
        self.propogation_mode = libcudnn.cudnnNanPropagation['CUDNN_PROPAGATE_NAN']
        self.convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
        # TODO: the reference LeNet code selects the algorithm through cuDNN
        # query calls; confirm that hard-coding IMPLICIT_GEMM is acceptable.
        self.convolution_algo = libcudnn.cudnnConvolutionFwdAlgo['CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM']
        self.batch_size = batch_size

        # Layer hyper-parameters. TODO: decide whether these should remain
        # attributes or simply be hard-coded.
        self.pool1_size = 2
        self.pool1_stride = 2
        self.pool2_size = 2
        self.pool2_stride = 2
        self.fc1_outputs = 500
        self.fc2_outputs = 10

        # conv1: "valid" convolution (no padding, stride 1).
        self.conv1_in_channels = 1
        self.conv1_out_channels = 20
        self.conv1_kernel_size = 5
        self.conv1_in_width = width
        self.conv1_in_height = height
        self.conv1_out_width = self.conv1_in_width - self.conv1_kernel_size + 1
        self.conv1_out_height = self.conv1_in_height - self.conv1_kernel_size + 1

        # conv2 consumes the pooled conv1 output.
        self.conv2_in_channels = self.conv1_out_channels
        self.conv2_out_channels = 50
        self.conv2_kernel_size = 5
        self.conv2_in_width = self.conv1_out_width // self.pool1_stride
        self.conv2_in_height = self.conv1_out_height // self.pool1_stride
        self.conv2_out_width = self.conv2_in_width - self.conv2_kernel_size + 1
        self.conv2_out_height = self.conv2_in_height - self.conv2_kernel_size + 1

        # Reserve memory.
        # The weight buffer is laid out (out_channels, in_channels, kh, kw) so
        # that it matches the (k, c, h, w) order given to the conv1 filter
        # descriptor.
        w_conv1 = np.zeros((self.conv1_out_channels,
                            self.conv1_in_channels,
                            self.conv1_kernel_size,
                            self.conv1_kernel_size), dtype=self.data_type_np)
        self.w_conv1_gpu = gpuarray.to_gpu(w_conv1)

        self.conv1_output_gpu = gpuarray.empty((self.batch_size,
                                                self.conv1_out_channels,
                                                self.conv1_out_height,
                                                self.conv1_out_width), self.data_type_np)

        w_conv1_bias_tensor = np.zeros((self.conv1_out_channels, ), dtype=self.data_type_np)
        self.conv1_bias_tensor_gpu = gpuarray.to_gpu(w_conv1_bias_tensor)

        # Define tensor descriptors.
        self.conv1_bias_tensor_desc = create_4d_tensor_desc(self.tensor_format, self.data_type,
                                                            1, self.conv1_out_channels, 1, 1)
        self.conv2_bias_tensor_desc = create_4d_tensor_desc(self.tensor_format, self.data_type,
                                                            1, self.conv2_out_channels, 1, 1)
        self.pooling_desc = create_pooling_2d_desc(self.pooling_format, self.propogation_mode,
                                                   self.pool1_size, self.pool1_size,
                                                   0, 0,
                                                   self.pool1_stride, self.pool1_stride)
        self.pooling2_tensor_desc = create_4d_tensor_desc(self.tensor_format,
                                                          self.data_type,
                                                          self.batch_size, self.conv2_out_channels,
                                                          self.conv2_out_height // self.pool2_stride,
                                                          self.conv2_out_width // self.pool2_stride)
        self.fc1_tensor_desc = create_4d_tensor_desc(self.tensor_format, self.data_type,
                                                     self.batch_size, self.fc1_outputs, 1, 1)
        self.fc2_tensor_desc = create_4d_tensor_desc(self.tensor_format, self.data_type,
                                                     self.batch_size, self.fc2_outputs, 1, 1)
        self.fc1_activation_desc = create_activation_desc(self.activation_mode,
                                                          self.propogation_mode, 0.0)

        [self.data_tensor_desc,
         self.conv1_filter_desc,
         self.conv1_desc,
         self.conv1_tensor_desc,
         ws_conv1_size] = allocate_fwd_convolution_tensors(self.cudnn_context, self.tensor_format, self.data_type,
                                         self.batch_size, self.conv1_in_channels, self.conv1_in_height,
                                         self.conv1_in_width, self.conv1_out_channels,
                                         self.conv1_kernel_size, self.convolution_mode,
                                         self.convolution_algo)

        # The input descriptor of conv2 doubles as the pool1 output descriptor.
        [self.pooling1_tensor_desc,
         self.conv2_filter_desc,
         self.conv2_desc,
         self.conv2_tensor_desc,
         ws_conv2_size] = allocate_fwd_convolution_tensors(self.cudnn_context, self.tensor_format, self.data_type,
                                         self.batch_size, self.conv2_in_channels, self.conv2_in_height,
                                         self.conv2_in_width, self.conv2_out_channels,
                                         self.conv2_kernel_size, self.convolution_mode,
                                         self.convolution_algo)

        # Depending on the libcudnn wrapper version the size may come back as
        # a ctypes.c_size_t rather than an int (TODO confirm); c_size_t objects
        # cannot be compared, so normalise to plain ints before taking the max.
        self.workspace_size = max(int(getattr(size, "value", size))
                                  for size in (ws_conv1_size, ws_conv2_size))

        # One scratch buffer, large enough for either convolution, is shared
        # by all layers. ``self.workspace`` holds what is handed to cuDNN:
        # the raw device pointer, or None when no workspace is required.
        if self.workspace_size > 0:
            # Keep a reference to the array so the device memory stays alive.
            self._workspace_gpu = gpuarray.empty((self.workspace_size,), np.uint8)
            self.workspace = self._workspace_gpu.ptr
        else:
            self._workspace_gpu = None
            self.workspace = None

    def forward(self, data_desc, data_gpu):
        """Run the implemented part of the network (conv1 + bias) on a batch.

        The result is written into ``self.conv1_output_gpu``; nothing is
        returned.

        Args:
            data_desc: cuDNN tensor descriptor describing ``data_gpu``.
            data_gpu: Input batch on the GPU; only its ``.ptr`` is used.
        """
        alpha = 1.0
        beta = 0.0

        # Conv1 layer: conv1_output = conv(data, w_conv1)
        libcudnn.cudnnConvolutionForward(self.cudnn_context,
                                         alpha,
                                         data_desc, data_gpu.ptr,
                                         self.conv1_filter_desc, self.w_conv1_gpu.ptr,
                                         self.conv1_desc,
                                         self.convolution_algo,
                                         self.workspace,
                                         self.workspace_size,
                                         beta,
                                         self.conv1_tensor_desc,
                                         self.conv1_output_gpu.ptr)
        # Bias: alpha is passed for both scaling factors so the bias is
        # accumulated onto the convolution result, as in the reference code.
        libcudnn.cudnnAddTensor(self.cudnn_context,
                                alpha,
                                self.conv1_bias_tensor_desc,
                                self.conv1_bias_tensor_gpu.ptr,
                                alpha,
                                self.conv1_tensor_desc,
                                self.conv1_output_gpu.ptr)

        # TODO: the remaining layers are not ported yet. Reference C++
        # implementation that still has to be translated:
        #
        # // Pool1 layer
        # checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, conv1Tensor,
        #                                conv1, &beta, pool1Tensor, pool1));
        #
        # // Conv2 layer
        # checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, pool1Tensor,
        #                                    pool1, conv2filterDesc, pconv2, conv2Desc,
        #                                    conv2algo, workspace, m_workspaceSize, &beta,
        #                                    conv2Tensor, conv2));
        # checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, conv2BiasTensor,
        #                           pconv2bias, &alpha, conv2Tensor, conv2));
        #
        # // Pool2 layer
        # checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, conv2Tensor,
        #                                conv2, &beta, pool2Tensor, pool2));
        #
        # // FC1 layer
        # // Forward propagate neurons using weights (fc1 = pfc1'*pool2)
        # checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
        #                             ref_fc1.outputs, m_batchSize, ref_fc1.inputs,
        #                             &alpha,
        #                             pfc1, ref_fc1.inputs,
        #                             pool2, ref_fc1.inputs,
        #                             &beta,
        #                             fc1, ref_fc1.outputs));
        # // Add bias using GEMM's "beta" (fc1 += pfc1bias*1_vec')
        # checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
        #                             ref_fc1.outputs, m_batchSize, 1,
        #                             &alpha,
        #                             pfc1bias, ref_fc1.outputs,
        #                             onevec, 1,
        #                             &alpha,
        #                             fc1, ref_fc1.outputs));
        #
        # // ReLU activation
        # checkCUDNN(cudnnActivationForward(cudnnHandle, fc1Activation, &alpha,
        #                                   fc1Tensor, fc1, &beta, fc1Tensor, fc1relu));
        #
        # // FC2 layer
        # // Forward propagate neurons using weights (fc2 = pfc2'*fc1relu)
        # checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
        #                             ref_fc2.outputs, m_batchSize, ref_fc2.inputs,
        #                             &alpha,
        #                             pfc2, ref_fc2.inputs,
        #                             fc1relu, ref_fc2.inputs,
        #                             &beta,
        #                             fc2, ref_fc2.outputs));
        # // Add bias using GEMM's "beta" (fc2 += pfc2bias*1_vec')
        # checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
        #                             ref_fc2.outputs, m_batchSize, 1,
        #                             &alpha,
        #                             pfc2bias, ref_fc2.outputs,
        #                             onevec, 1,
        #                             &alpha,
        #                             fc2, ref_fc2.outputs));
        #
        # // Softmax loss
        # checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
        #                                &alpha, fc2Tensor, fc2, &beta, fc2Tensor, result));

In [None]:
# Build the network for a batch of one 28x28 image, sharing the notebook-wide cuDNN handle.
model = nn_lenet(cudnn_context, batch_size=1, width=28, height=28)

In [None]:
# Build an all-zero dummy input batch. Its shape and dtype are taken from the
# model itself so they cannot drift from the dimensions the network was
# constructed with.
inf_data_shape = (model.batch_size, model.conv1_in_channels,
                  model.conv1_in_height, model.conv1_in_width)
inf_data = np.zeros(inf_data_shape, dtype=model.data_type_np)
inf_data_gpu = gpuarray.to_gpu(inf_data)
inf_data_desc = create_4d_tensor_desc(model.tensor_format, model.data_type, *inf_data_shape)

# Run the (currently conv1-only) forward pass; the result lands in model.conv1_output_gpu.
model.forward(inf_data_desc, inf_data_gpu)

In [None]:
# Copy the conv1 output back to the host and display it.
# NOTE(review): expected to be all zeros, since weights, bias and input are zero-initialised - confirm.
model.conv1_output_gpu.get()