In [37]:
import sys
import ctypes
import ctypes.util
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
import numpy as np
import cv2
import pycuda.gpuarray as gpuarray
import skcuda.cublas
import matplotlib.pyplot as plt
%matplotlib inline 

np.random.seed(1234)

try:
        libcudnn=ctypes.cdll.LoadLibrary('/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcudnn.so.5.1.10')
except OSError:
        print("OSError");
print(libcudnn.cudnnGetVersion())
libcudnn.cudnnGetErrorString.restype = ctypes.c_char_p
libcudnn.cudnnGetErrorString.argtypes = [ctypes.c_int]

def cudnnCheckStatus(status):
    print(libcudnn.cudnnGetErrorString(status))

#**********Defining Handle***************
handle=ctypes.c_void_p()
print("Handle Creation Status:"),
cudnnCheckStatus(libcudnn.cudnnCreate(ctypes.byref(handle)))


#************Image Data****************
def normalize(image_data):
    a = 0.1; b = 0.9; MIN = 0; MAX = 255
    b=a + (((image_data - MIN)*(b - a))/(MAX - MIN))
    return b

img = cv2.imread('One.jpg',0)
img=np.asarray(img)
img = normalize(img)

#************Enums*****************
tensor_format=0
data_type=1

#***********Creating descriptors******

input_desc = ctypes.c_void_p()
print("Input Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(input_desc)))

input_grad_desc = ctypes.c_void_p()
print("Input Grad Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(input_grad_desc)))

output_desc = ctypes.c_void_p()
print("Output Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(output_desc)))

output_grad_desc = ctypes.c_void_p()
print("Output Grad Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(output_grad_desc)))


filter_desc = ctypes.c_void_p()
print("Filter Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateFilterDescriptor(ctypes.byref(filter_desc)))

filter_grad_desc = ctypes.c_void_p()
print("Filter Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateFilterDescriptor(ctypes.byref(filter_grad_desc)))

bias_grad_desc = ctypes.c_void_p()
print("Bias Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateFilterDescriptor(ctypes.byref(bias_grad_desc)))


5110
Handle Creation Status: CUDNN_STATUS_SUCCESS
Input Tensor Descriptor: CUDNN_STATUS_SUCCESS
Input Grad Tensor Descriptor: CUDNN_STATUS_SUCCESS
Output Tensor Descriptor: CUDNN_STATUS_SUCCESS
Output Grad Tensor Descriptor: CUDNN_STATUS_SUCCESS
Filter Descriptor: CUDNN_STATUS_SUCCESS
Filter Grad Descriptor: CUDNN_STATUS_SUCCESS
Bias Grad Descriptor: CUDNN_STATUS_SUCCESS


In [38]:
#Convolution

#*******Enums********************
convolution_mode=0
algo=0
preference=0

#**********Dimensions************
n_i=1
c_i=1
h_i=28
w_i=28
h_k=2
w_k=2
k=1
pad_h=0
pad_w=0
stride_h=1
stride_w=1
upscalex=1
upscaley=1

#********GPU arrays***********
array_type=np.float64
#x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0],]]],dtype=array_type)
#x=np.random.rand(1,1,28,28)
x=img.reshape((n_i,c_i,h_i,w_i))
X=gpuarray.to_gpu(x)
w=np.array([[[[1.0,0.0],[0.0,0.0]]]],dtype=array_type)
W=gpuarray.to_gpu(w)

#***********Creating descriptors******
conv_desc = ctypes.c_void_p()
print("Convolution Descriptor"),
cudnnCheckStatus(libcudnn.cudnnCreateConvolutionDescriptor(ctypes.byref(conv_desc)))

#********Set descriptors*******
print("Setting Input Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
print("Setting Filter Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetFilter4dDescriptor(filter_desc,data_type,tensor_format,k,c_i,h_k,w_k));
print("Setting Convolution Desriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetConvolution2dDescriptor(conv_desc,pad_h,pad_w,stride_h,stride_w,upscalex,upscaley,convolution_mode));
        

#*********configuring the Output*********
print("Getting Output Dimensions:"),
temp_n_o = ctypes.c_int()
temp_c_o = ctypes.c_int()
temp_h_o = ctypes.c_int()
temp_w_o = ctypes.c_int()
cudnnCheckStatus(libcudnn.cudnnGetConvolution2dForwardOutputDim(conv_desc, input_desc,filter_desc, ctypes.byref(temp_n_o),ctypes.byref(temp_c_o), ctypes.byref(temp_h_o),ctypes.byref(temp_w_o)))

n_o=temp_n_o.value
c_o=temp_c_o.value
w_o=temp_w_o.value
h_o=temp_h_o.value
print("Output Dimensions: ",n_o,c_o,h_o,w_o)
Y= gpuarray.empty((n_o,c_o,h_o,w_o), array_type)
print("Setting Output Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_o,c_o,h_o,w_o));


#*********Setting Workspace********
algo=ctypes.c_int()
memory_limit=ctypes.c_size_t(1024*1024)
cudnnCheckStatus(libcudnn.cudnnGetConvolutionForwardAlgorithm(handle,input_desc,filter_desc,conv_desc,output_desc,preference,memory_limit,ctypes.byref(algo)))

workspace =ctypes.c_void_p()
workspace_size=ctypes.c_size_t(0)
print("Getting Workspace Size:"),
cudnnCheckStatus(libcudnn.cudnnGetConvolutionForwardWorkspaceSize(handle,input_desc,filter_desc,conv_desc,\
output_desc,algo,ctypes.byref(workspace_size)))
print("Workspace Size = ",workspace_size)
if workspace_size.value!=0:
    workspace= drv.mem_alloc(workspace_size.value)
    print("workspace Allocated")
else:
    print("Workspace Not allocated")

#*********ConvolutionForward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
W_data = ctypes.c_void_p(int(W.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))
print(workspace.value)
#workspace_data = ctypes.c_void_p(workspace)
print("Convolution forward Status:"),

cudnnCheckStatus(libcudnn.cudnnConvolutionForward(handle,ctypes.byref(a),input_desc,X_data,filter_desc,\
                W_data,conv_desc,algo,None,0,ctypes.byref(b),output_desc,Y_data))
print("Convolution Output")
print(Y.get())
#*********Setting bias*************
bias=np.zeros((h_o,w_o),dtype="float64")
BIAS =gpuarray.to_gpu(bias)
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_o, c_o, h_o, w_o))
#***********Adding Bias******************
a=ctypes.c_double(1.0)
b=ctypes.c_double(1.0)
B_data = ctypes.c_void_p(int(BIAS.gpudata))



print("\n\nAdding Bias Tensor Status:")

cudnnCheckStatus(libcudnn.cudnnAddTensor(handle,ctypes.byref(a),input_desc,B_data,ctypes.byref(b),output_desc,Y_data))

print("\n\nConvolutionwithbias Output")
print(Y.get())


X_conv=X
W_conv=W
Y_conv=Y
output_desc_conv=output_desc

Convolution Descriptor CUDNN_STATUS_SUCCESS
Setting Input Descriptor: CUDNN_STATUS_SUCCESS
Setting Filter Descriptor: CUDNN_STATUS_SUCCESS
Setting Convolution Desriptor: CUDNN_STATUS_SUCCESS
Getting Output Dimensions: CUDNN_STATUS_SUCCESS
('Output Dimensions: ', 1, 1, 27, 27)
Setting Output Descriptor: CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
Getting Workspace Size: CUDNN_STATUS_SUCCESS
('Workspace Size = ', c_ulong(0L))
Workspace Not allocated
None
Convolution forward Status: CUDNN_STATUS_SUCCESS
Convolution Output
[[[[ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.89372549  0.9         0.89372549  0.89686275
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9       ]
   [ 0.89058824  0.89058824  0.89058824  0.89058824  0.89058824  0.89058824
     0.89058824  0.88431373  0.88117647  0.9         0.9         0.9
     

In [39]:
#Relu Activation

X_relu=Y_conv
X=X_relu
#input_desc=output_desc_conv

#**********Defining Enumerated Types*************
cudnnDataType={'CUDNN_DATA_FLOAT': 0}
cudnnTensorFormat = {'CUDNN_TENSOR_NCHW': 0}
#cudnnPoolingMode       ={'CUDNN_POOLING_MAX':0}
cudnnNanPropagation     ={'CUDNN_NOT_PROPAGATE_NAN':0}
cudnnActivationMode     ={'CUDNN_ACTIVATION_RELU':1}

relu_mode = cudnnActivationMode['CUDNN_ACTIVATION_RELU']
reluNanOpt = cudnnNanPropagation ['CUDNN_NOT_PROPAGATE_NAN']


#**********Dimensions************
"""
n_i=1
c_i=1
h_i=4
w_i=4
"""
n_i=n_o
c_i=c_o
h_i=h_o
w_i=w_o


#********GPU arrays***********
"""
X = gpuarray.to_gpu(np.random.rand(n_i,c_i,h_i,w_i)
.astype(np.float32))
W = gpuarray.to_gpu(np.random.rand(k,h_k,w_k).astype(np.float32))
#x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0]]]],dtype='float32')
x=np.array([[[[1.0,-2.0,3.0,-4.0],[5.0,-6.0,-7.0,-8.0],[-9.0,-10.0,-11.0,-12.0],[-13.0,14.0,-15.0,-16.0]]]],dtype='float32')
X=gpuarray.to_gpu(x)
"""

#*******Creating descriptors********

activation_desc=ctypes.c_void_p()
print("Activation Descriptor"),
cudnnCheckStatus(libcudnn.cudnnCreateActivationDescriptor(ctypes.byref(activation_desc)))

#********Set descriptors*******
print("Setting Input Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

reluCeiling=ctypes.c_double(1.0)
maxpoolingNanOpt =cudnnNanPropagation['CUDNN_NOT_PROPAGATE_NAN']
print("Setting Activation Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetActivationDescriptor(activation_desc,relu_mode,reluNanOpt,ctypes.byref(reluCeiling)))


#*********configuring the Output*****************************************************************
Y = gpuarray.empty((n_i,c_i,h_i,w_i), np.float64)


print("Setting Output Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))


#*********PoolingForward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

print("Activation forward Status:"),

cudnnCheckStatus(libcudnn.cudnnActivationForward(handle,activation_desc,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))


#*********Display Output*************


print("Relu Activation Output"),
print(Y.get())
Y_relu=Y


Activation Descriptor CUDNN_STATUS_SUCCESS
Setting Input Descriptor: CUDNN_STATUS_SUCCESS
Setting Activation Descriptor: CUDNN_STATUS_SUCCESS
Setting Output Descriptor: CUDNN_STATUS_SUCCESS
Activation forward Status: CUDNN_STATUS_SUCCESS
Relu Activation Output [[[[ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.89372549  0.9         0.89372549  0.89686275
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9       ]
   [ 0.89058824  0.89058824  0.89058824  0.89058824  0.89058824  0.89058824
     0.89058824  0.88431373  0.88117647  0.9         0.9         0.9
     0.89372549  0.89372549  0.89686275  0.89058824  0.89058824  0.89058824
     0.89058824  0.89058824  0.89058824  0.89058824  0.89058824  0.89058824
     0.89058824  0.89058824  0.89058824]
   [ 0.9         0.9         0.9         0.9         0.9         0.9      

In [40]:
#Maxpool
X_pool=Y_relu
X=X_pool
#**********Defining Enumerated Types*************

cudnnDataType={'CUDNN_DATA_FLOAT': 0}
cudnnTensorFormat = {'CUDNN_TENSOR_NCHW': 0}
cudnnPoolingMode        ={'CUDNN_POOLING_MAX':0}
cudnnNanPropagation     ={'CUDNN_NOT_PROPAGATE_NAN':0}

pooling_mode=0
maxpoolingNanOpt =0

#**********Dimensions************
"""
n_i=1
c_i=1
h_i=4
w_i=4
"""
n_i=n_o
c_i=c_o
h_i=h_o
w_i=w_o

pad_h=0
pad_w=0
stride_h=1
stride_w=1
win_h=4
win_w=4

#********GPU arrays***********
#x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0]]]],dtype='float32')
#X=gpuarray.to_gpu(x)



#*******Creating descriptors********

pooling_desc=ctypes.c_void_p()
print("Pooling Descriptor"),
cudnnCheckStatus(libcudnn.cudnnCreatePoolingDescriptor(ctypes.byref(pooling_desc)))

#********Set descriptors*******

print("Setting Input Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Pooling Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetPooling2dDescriptor(pooling_desc,pooling_mode,maxpoolingNanOpt,
win_h,
win_w,
pad_h,
pad_w,
stride_h,
stride_w))
#*********configuring the Output*****************************************************************

print("Getting Output Dimensions:"),
temp_n_o = ctypes.c_int()
temp_c_o = ctypes.c_int()
temp_h_o = ctypes.c_int()
temp_w_o = ctypes.c_int()

cudnnCheckStatus(libcudnn.cudnnGetPooling2dForwardOutputDim(pooling_desc,input_desc,ctypes.byref(temp_n_o),ctypes.byref(temp_c_o), ctypes.byref(temp_h_o),ctypes.byref(temp_w_o)))
n_o=temp_n_o.value
c_o=temp_c_o.value
w_o=temp_w_o.value
h_o=temp_h_o.value


print("Output Dimensions: ",n_o,c_o,h_o,w_o),
Y = gpuarray.empty((n_o,c_o,h_o,w_o), np.float64)


print("Setting Output Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_o,c_o,h_o,w_o));


#*********PoolingForward********
a=ctypes.c_double(1.0)
b=ctypes.c_float(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

print("Pooling forward Status:"),

cudnnCheckStatus(libcudnn.cudnnPoolingForward(handle,pooling_desc,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))


#*********Display Output*************

print("Max Pooling Output")
print(Y.get())


Y_pool=Y


Pooling Descriptor CUDNN_STATUS_SUCCESS
Setting Input Descriptor: CUDNN_STATUS_SUCCESS
Setting Pooling Descriptor: CUDNN_STATUS_SUCCESS
Getting Output Dimensions: CUDNN_STATUS_SUCCESS
('Output Dimensions: ', 1, 1, 24, 24) Setting Output Descriptor: CUDNN_STATUS_SUCCESS
Pooling forward Status: CUDNN_STATUS_SUCCESS
Max Pooling Output
[[[[ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9       ]
   [ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9       ]
   [ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         

In [41]:
# Batchnorm
X_norm=Y_pool
X=X_norm

#**********Defining Enumerated Types*************
"""
cudnnDataType={'CUDNN_DATA_FLOAT': 0}
cudnnTensorFormat = {'CUDNN_TENSOR_NCHW': 0}
cudnnSoftmaxAlgorithm ={'CUDNN_SOFTMAX_FAST':0}

BatchNorm Modes
CUDNN_BATCHNORM_PER_ACTIVATION : to be used after non-convolution layers. bnBias and bnScale dimensions are 1xCxHxW
CUDNN_BATCHNORM_SPATIAL : to be used after convolutional layers. bnBias and bnScale dimensions are 1xCx1x1 

cudnnBatchNormMode={'CUDNN_BATCHNORM_PER_ACTIVATION':0,'CUDNN_BATCHNORM_SPATIAL': 1}
"""

data_type =1
batchnorm_mode=1

#**********Dimensions************
"""
n_i=1
c_i=1
h_i=1
w_i=4
"""
n_i=n_o
c_i=c_o
h_i=h_o
w_i=w_o

#********GPU arrays***********
"""
X = gpuarray.to_gpu(np.random.rand(n_i,c_i,h_i,w_i)
.astype(np.float32))
W = gpuarray.to_gpu(np.random.rand(k,h_k,w_k).astype(np.float32))
x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0]]]],dtype='float32')
"""
#x=np.array([[[[1,2,3,4],[1,2,3,4]],[[1,2,3,4],[1,2,3,4]]]])
#x=np.array([[[[1.0,2.3,4.1,0.6]]]],dtype='float64')
#X=gpuarray.to_gpu(x)
Y = gpuarray.empty((n_i,c_i,h_i,w_i), np.float64)

#*******Creating descriptors********

batchnorm_desc=ctypes.c_void_p()
print("Batchnorm TensorDescriptor: "),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(batchnorm_desc)))

#********Set descriptors*******

print("Setting Input Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))
print("Setting Output  Descriptor:"),        
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))

if batchnorm_mode==0:
	n_bn=1
	c_bn=c_i
	h_bn=h_i
	w_bn=w_i
else:
	n_bn=1
	c_bn=c_i
	h_bn=1
	w_bn=1

print("Setting BatchNorm Descriptor: "),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(batchnorm_desc,tensor_format,data_type,n_bn,c_bn,h_bn,w_bn))


#*********BatchNorm Forward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)

X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

scale_array=np.ones((n_bn,c_bn,h_bn,w_bn),dtype='float64')
scale=gpuarray.to_gpu(scale_array) #gamma
scale_p=ctypes.c_void_p(int(scale.gpudata)) 

bias_array=np.zeros((n_bn,c_bn,h_bn,w_bn),dtype='float64')
bias=gpuarray.to_gpu(bias_array) #beta
bias_p=ctypes.c_void_p(int(bias.gpudata))

exp_avg_factor=ctypes.c_double(0.1) #momentum

#running_mean=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float32)
#running_var=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float32)

running_mean_array=np.zeros((n_bn,c_bn,h_bn,w_bn),dtype='float64')
running_mean=gpuarray.to_gpu(running_mean_array)
running_mean_p=ctypes.c_void_p(int(running_mean.gpudata))

running_var_array=np.ones((n_bn,c_bn,h_bn,w_bn),dtype='float64')
running_var=gpuarray.to_gpu(running_var_array)
running_var_p=ctypes.c_void_p(int(running_var.gpudata))

#e=ctypes.c_float(libcudnn.CUDNN_BN_MIN_EPSILON)
epsilon=ctypes.c_double(0.0001)

result_save_mean=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
result_save_mean_p=ctypes.c_void_p(int(result_save_mean.gpudata))

result_save_var=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
result_save_var_p=ctypes.c_void_p(int(result_save_var.gpudata))

print("BatchNorm Status:"),

cudnnCheckStatus(libcudnn.cudnnBatchNormalizationForwardTraining(handle,batchnorm_mode,ctypes.byref(a),\
                 ctypes.byref(b),input_desc,X_data,output_desc,Y_data,batchnorm_desc,scale_p,bias_p,\
                 exp_avg_factor,running_mean_p,running_var_p,epsilon,result_save_mean_p,result_save_var_p))

print("Batchnorm Output:")
print(Y.get())
Y_norm=Y

Batchnorm TensorDescriptor:  CUDNN_STATUS_SUCCESS
Setting Input Descriptor CUDNN_STATUS_SUCCESS
Setting Output  Descriptor: CUDNN_STATUS_SUCCESS
Setting BatchNorm Descriptor:  CUDNN_STATUS_SUCCESS
BatchNorm Status: CUDNN_STATUS_SUCCESS
Batchnorm Output:
[[[[ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012

In [42]:
#Innerproduct
A=Y_norm
#skcuda.cublas.cublasCheckStatus(status)
"""
m=1
k=3
n=4
"""
m=1
k=n_o*c_o*h_o*w_o
#k=100
n=10
#x = np.float64(np.random.rand(m,k))
w = np.float64(np.random.normal(0,0.1,k*n))
w=w.reshape(k,n)
bias=np.zeros((m,n),dtype="float64")


#A = gpuarray.to_gpu(x)
B = gpuarray.to_gpu(w)
C =gpuarray.empty((m,n), np.float64)
BIAS =gpuarray.to_gpu(bias)
h =skcuda.cublas.cublasCreate()

#transa=_CUBLAS_OP[transa]

lda=m
ldb=k
ldc=m
#alf = 1.0;
#bet = 0.0;
#const float *alpha = &alf;
#const float *beta = &bet;
alpha=1.0
beta=0.0
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(input_desc)))
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(output_desc)))

cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))



#d=skcuda.cublas.cublasSgemm(h, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
status=skcuda.cublas.cublasDgemm(h, 0, 0, m, n, k, alpha,A.gpudata, lda,B.gpudata, ldb, beta,C.gpudata, ldc)
#skcuda.cublas.cublasCheckStatus(status)

a=ctypes.c_double(1.0)
b=ctypes.c_double(1.0)
X_data = ctypes.c_void_p(int(BIAS.gpudata))
Y_data = ctypes.c_void_p(int(C.gpudata))


print("Adding Bias Tensor Status:")

cudnnCheckStatus(libcudnn.cudnnAddTensor(handle,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))

print 'Input vector'
print '----------------------------------'
print A.get()
print 'Weight Vector'
print '----------------------------------'
print w
print 'Inner product'
print '----------------------------------'
print C.get()

n_o=1
c_o=1
h_o=m
w_o=n


Y_ip=C

CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
Adding Bias Tensor Status:
CUDNN_STATUS_SUCCESS
Input vector
----------------------------------
[[[[ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.220

In [43]:
# Softmax
X=Y_ip


#*******Enums************
#softmax_mode = cudnnSoftmaxMode['CUDNN_SOFTMAX_MODE_INSTANCE']
#softmax_algo = cudnnSoftmaxAlgorithm ['CUDNN_SOFTMAX_FAST']
softmax_mode=0
softmax_algo=0
#**********Dimensions************
"""
n_i=1
c_i=1
h_i=1
w_i=4
"""
n_i=1
c_i=1
h_i=1
w_i=n_o*c_o*h_o*w_o

#********GPU arrays***********
#x=np.array([[[[1,2,3,4],[1,2,3,4]],[[1,2,3,4],[1,2,3,4]]]])
#x=np.array([[[[1.0,2.3,4.1,0.6]]]],dtype='float32')
#X=gpuarray.to_gpu(x)

Y = gpuarray.empty((n_i,c_i,h_i,w_i), np.float64)

#********Set descriptors*******
print("Setting Input Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Output  Descriptor:"),        
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))

#*********Softmax Forward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

print("Softmax Activation Status:"),

cudnnCheckStatus(libcudnn.cudnnSoftmaxForward(handle,softmax_algo,softmax_mode,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))

print("Softmax Output")
print(Y.get())

Y_softmax=Y


Setting Input Descriptor CUDNN_STATUS_SUCCESS
Setting Output  Descriptor: CUDNN_STATUS_SUCCESS
Softmax Activation Status: CUDNN_STATUS_SUCCESS
Softmax Output
[[[[  1.17510182e-01   3.06731977e-04   1.09109171e-04   1.37604971e-03
      3.49025680e-02   2.67783648e-03   5.23179858e-01   7.49282571e-02
      1.63562947e-01   8.14464603e-02]]]]


In [44]:
#softmax backward

n_o=1
c_o=1
h_o=1
w_o=10

n_i=1
c_i=1
h_i=1
w_i=10

Y= Y_softmax
Y_data = ctypes.c_void_p(int(Y.gpudata))
#print(Y.get())
a=ctypes.c_double(1.0)
b=ctypes.c_double(1.0)
input_grad_array=np.array([[[[1.0,1.0,1.0,1.1,1.1,1.0,1.0,1.0,1.0,1.0]]]],dtype='float64')
output_grad_array=np.zeros((n_o,c_o,h_o,w_o),dtype='float64')

input_grad=gpuarray.to_gpu(input_grad_array)
#print(input_grad.get())

output_grad=gpuarray.empty((n_o,c_o,h_o,w_o),np.float64)
#output_grad=gpuarray.to_gpu(output_grad_array)
#print(output_grad.get())


input_grad_data=ctypes.c_void_p(int(input_grad.gpudata))
output_grad_data=ctypes.c_void_p(int(output_grad.gpudata))


#********Set descriptors*******
print("Setting Input Gradient Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Output Gradient Descriptor:"),        
#cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type,n_o,c_o,h_o,w_o))
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

#************Softmax Backward***********************
print("Softmax Backward Status:"),
cudnnCheckStatus(libcudnn.cudnnSoftmaxBackward(handle, softmax_algo, softmax_mode, ctypes.byref(a),input_desc,\
                 Y_data, input_grad_desc, input_grad_data, ctypes.byref(b),output_grad_desc, output_grad_data))

print("Softmax Output")
print(Y.get())
print("Softmax Input Gradient")
print(input_grad.get())
print("Softmax Output Gradient")
print(output_grad.get())
dY_softmax=output_grad

Setting Input Gradient Descriptor CUDNN_STATUS_SUCCESS
Setting Output Gradient Descriptor: CUDNN_STATUS_SUCCESS
Softmax Backward Status: CUDNN_STATUS_SUCCESS
Softmax Output
[[[[  1.17510182e-01   3.06731977e-04   1.09109171e-04   1.37604971e-03
      3.49025680e-02   2.67783648e-03   5.23179858e-01   7.49282571e-02
      1.63562947e-01   8.14464603e-02]]]]
Softmax Input Gradient
[[[[ 1.   1.   1.   1.1  1.1  1.   1.   1.   1.   1. ]]]]
Softmax Output Gradient
[[[[ -4.26310164e-04  -1.58283187e+01  -2.70256819e-02   1.32726671e-04
      9.39414608e-02  -6.81744059e-02  -1.90024088e-03  -2.71829360e-04
     -2.71665888e-04   1.43795726e-02]]]]


In [45]:
print B.shape
print A.shape

(576, 10)
(1, 1, 24, 24)


In [46]:
# ip backward
input_grad=dY_softmax

#inner Product Backward
N,C,H,W = A.shape
#m=N*C*H*W #input layer
m=24*24
k=10 #output layer
n=1
"""
m=3
k=2 
n=1 
"""
print("Check")

input_grad=input_grad.reshape(-1,1)
#input_grad_array=np.array([[1],[2]],dtype='float64')
#input_grad_array=np.reshape(input_grad_array,(m,k))
#input_grad=gpuarray.to_gpu(input_grad_array)
output_grad=gpuarray.empty((m,n),np.float64)
alpha=1.0
beta=0.0
lda=m
ldb=k
ldc=m
#ldc=n
status=skcuda.cublas.cublasDgemm(h, 0, 0, m, n, k, alpha,B.gpudata, lda,input_grad.gpudata,ldb, beta,\
    output_grad.gpudata, ldc)
#B=B.reshape((1,-1))
#input_grad=input_grad.reshape((-1,1))

m=10
k=1
n=N*C*H*W
"""
m=2 
k=1 
n=3 
"""
lda=m
ldb=k
ldc=m
X=A.reshape(1,-1)
weight_grad=gpuarray.empty((n,m),np.float64)
status=skcuda.cublas.cublasDgemm(h, 0, 0, m, n, k, alpha,input_grad.gpudata, lda,X.gpudata,ldb, beta,\
    weight_grad.gpudata, ldc)
"""
print("W")
print(A.get())
print("X")
print(B.get())

print("IP output")
print(C.get())
print("input gradient")
print(input_grad.get())
"""
print("output gradient (dx)")
print(output_grad.get())

print("Weights gradient (dw)")
print(weight_grad.get())

dY_ip=output_grad


Check
output gradient (dx)
[[  6.65230386e-01]
 [ -1.76503642e+00]
 [ -2.50580483e+00]
 [ -2.26070796e+00]
 [  2.17041106e+00]
 [  4.32205983e-01]
 [ -1.23985075e+00]
 [  1.65807671e-01]
 [  9.22935409e-01]
 [ -3.98842373e-01]
 [  1.59151682e+00]
 [ -3.10187939e-01]
 [  1.51580549e+00]
 [ -2.32541412e+00]
 [ -1.13490048e+00]
 [  2.57725328e-01]
 [ -1.21418557e+00]
 [  1.89507246e+00]
 [  8.49082858e-01]
 [  3.61585040e-02]
 [  1.72735902e+00]
 [  1.67602890e+00]
 [ -1.61724406e+00]
 [ -2.71371839e+00]
 [ -3.98203702e-02]
 [ -1.82623847e-01]
 [ -4.74818071e-01]
 [  4.76291222e-01]
 [ -6.59088802e-01]
 [ -1.50249904e+00]
 [ -8.31454633e-01]
 [  3.12438490e+00]
 [ -7.82960494e-01]
 [  5.70695446e-01]
 [  7.27834280e-01]
 [  1.77080151e+00]
 [  1.26705030e+00]
 [ -1.67076287e+00]
 [  2.08575524e+00]
 [ -8.43421247e-01]
 [ -5.52297428e-01]
 [ -1.39072765e+00]
 [  2.50462550e+00]
 [  2.58916900e-01]
 [  1.01283964e+00]
 [  1.13105357e+00]
 [ -2.67040519e+00]
 [ -2.03075305e+00]
 [  7.0997062

In [47]:
#print input_grad.get()
print Y.get()


[[[[  1.17510182e-01   3.06731977e-04   1.09109171e-04   1.37604971e-03
      3.49025680e-02   2.67783648e-03   5.23179858e-01   7.49282571e-02
      1.63562947e-01   8.14464603e-02]]]]


In [48]:
#Batchnorm Backward
n_i,c_i,h_i,w_i=X_norm.shape
n_o,c_o,h_o,w_o=Y_norm.shape

input_grad=dY_ip.reshape((n_o,c_o,h_o,w_o))

Y=Y_norm
Y_data = ctypes.c_void_p(int(Y.gpudata))

print("Setting Output Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_o,c_o,h_o,w_o))
print("Setting Input Grad Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_grad_desc,tensor_format,data_type,n_o,c_o,h_o,w_o))
print("Setting Output Grad Descriptor:"),        
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))


batchnorm_grad_desc = batchnorm_desc

"""
input_grad_array=np.array([[[[1,1,1,1]]]],dtype='float64')
input_grad=gpuarray.to_gpu(input_grad_array)
"""
input_grad_data=ctypes.c_void_p(int(input_grad.gpudata))

output_grad=gpuarray.empty((n_i,c_i,h_i,w_i),np.float64)
output_grad_data=ctypes.c_void_p(int(output_grad.gpudata))

scale_grad=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
scale_grad_p=ctypes.c_void_p(int(scale_grad.gpudata))
bias_grad=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
bias_grad_p=ctypes.c_void_p(int(bias_grad.gpudata))

a_data=ctypes.c_double(1.0)
b_data=ctypes.c_double(0.0)
a_param=ctypes.c_double(1.0)
b_param=ctypes.c_double(0.0)

print("Batch Norm Backward status:"),
cudnnCheckStatus(libcudnn.cudnnBatchNormalizationBackward(handle,batchnorm_mode,\
        ctypes.byref(a_data),ctypes.byref(b_data),ctypes.byref(a_param),ctypes.byref(b_param),\
            output_desc,Y_data,input_grad_desc,input_grad_data,output_grad_desc,output_grad_data,\
            batchnorm_grad_desc,scale_p,scale_grad_p,bias_grad_p,epsilon,result_save_mean_p,result_save_var_p)) 

#print("Batchnorm Output")
#print(Y.get())
#print("Batchnorm Input Gradient")
#print(input_grad.get())
print("Batchnorm Output Gradient")
print(output_grad.get())

dY_norm=output_grad

Setting Output Descriptor CUDNN_STATUS_SUCCESS
Setting Input Grad Descriptor CUDNN_STATUS_SUCCESS
Setting Output Grad Descriptor: CUDNN_STATUS_SUCCESS
Batch Norm Backward status: CUDNN_STATUS_SUCCESS
Batchnorm Output Gradient
[[[[ -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15]
   [ -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+1

In [49]:
print output_grad.shape
print X.shape
print Y.shape

(1, 1, 24, 24)
(1, 576)
(1, 1, 24, 24)


In [50]:
#Pooling Backward

input_grad=dY_norm
input_grad_data=ctypes.c_void_p(int(input_grad.gpudata))
output_grad = gpuarray.empty((n_i,c_i,h_i,w_i), np.float64)
output_grad_data=ctypes.c_void_p(int(output_grad.gpudata))  

n_i,c_i,h_i,w_i=X_pool.shape
n_o,c_o,h_o,w_o=Y_pool.shape
X=X_pool
Y=Y_pool
X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

#input_grad_array=np.array([[[[1.0,1.0,1.0],[1.0,1.0,1.0],[1.0,1.0,1.0]]]],dtype='float64')
#input_grad=gpuarray.to_gpu(input_grad_array)

                   
#********Set descriptors*******
print("Setting Output Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type, n_o, c_o, h_o, w_o))

print("Setting Input Descriptor:"),        
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Input Gradient Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_grad_desc,tensor_format,data_type, n_o, c_o, h_o, w_o))

print("Setting Output Gradient Descriptor:"),        
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
print("Pooling backward Status:"),

a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)
#cudnnCheckStatus(libcudnn.cudnnPoolingForward(handle,pooling_desc,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))
cudnnCheckStatus(libcudnn.cudnnPoolingBackward(handle,pooling_desc,ctypes.byref(a),\
                output_desc,Y_data,input_grad_desc,input_grad_data,\
                input_desc,X_data,ctypes.byref(b),output_grad_desc,output_grad_data))
print ("Output Gradient:")
print (output_grad.get())

dY_pool=output_grad

Setting Output Descriptor CUDNN_STATUS_SUCCESS
Setting Input Descriptor: CUDNN_STATUS_SUCCESS
Setting Input Gradient Descriptor CUDNN_STATUS_SUCCESS
Setting Output Gradient Descriptor: CUDNN_STATUS_SUCCESS
Pooling backward Status: CUDNN_STATUS_SUCCESS
Output Gradient:
[[[[ -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15   0.00000000e+00  -8.40810577e+15   0.00000000e+00
      0.00000000e+00  -1.26121587e+16  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15]
   [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
      0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
      0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
     -1.68162115e+16  -4.20405288e+15  -4.20405288e+15   0.00000000e+00
      0.00

In [51]:
#Relu Backward

input_grad=dY_pool
Y=Y_relu
X=X_relu
n_i,c_i,h_i,w_i=Y.shape
Y_data = ctypes.c_void_p(int(Y.gpudata))
X_data = ctypes.c_void_p(int(X.gpudata))
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)

#input_grad_array=np.ones((n_i,c_i,h_i,w_i),dtype='float64')
#output_grad_array=np.array([[[[0.0,0.0,0.0,0.0]]]],dtype='float64')

#input_grad=gpuarray.to_gpu(input_grad_array)


output_grad=gpuarray.empty((n_i,c_i,h_i,w_i),dtype='float64')
#output_grad=gpuarray.to_gpu(output_grad_array)


input_grad_data=ctypes.c_void_p(int(input_grad.gpudata))
output_grad_data=ctypes.c_void_p(int(output_grad.gpudata))


#********Set descriptors*******
print("Setting Input Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
print("Setting output Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))


print("Setting Input Gradient Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Output Gradient Descriptor:"),        
#cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type,n_o,c_o,h_o,w_o))
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

#************Activation Backward***********************
print("Activation Backward Status:"),
cudnnCheckStatus(libcudnn.cudnnActivationBackward(handle,activation_desc,ctypes.byref(a),output_desc,\
                Y_data, input_grad_desc, input_grad_data, input_desc, X_data,ctypes.byref(b),\
                 output_grad_desc,output_grad_data))
"""
print("Activation Input")
print(X.get())
print("Activation Output")
print(Y.get())
print("Activation Input Gradient")
print(input_grad.get())
"""
print("Activation Output Gradient")
print(output_grad.get())

dY_relu=output_grad


Setting Input Descriptor CUDNN_STATUS_SUCCESS
Setting output Descriptor CUDNN_STATUS_SUCCESS
Setting Input Gradient Descriptor CUDNN_STATUS_SUCCESS
Setting Output Gradient Descriptor: CUDNN_STATUS_SUCCESS
Activation Backward Status: CUDNN_STATUS_SUCCESS
Activation Output Gradient
[[[[ -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15   0.00000000e+00  -8.40810577e+15   0.00000000e+00
      0.00000000e+00  -1.26121587e+16  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
     -4.20405288e+15  -4.20405288e+15  -4.20405288e+15  -4.20405288e+15
      0.00000000e+00   0.00000000e+00   0.00000000e+00]
   [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
      0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
      0.00000000e+00  -1.68162115e+16  -4.20405288e+15  -4.20405288e+15
      0.0000000

In [52]:
b=dY_relu.get()
a=np.asarray(b)
a=a.astype(np.float32)
a

array([[[[ -4.20405277e+15,  -4.20405277e+15,  -4.20405277e+15,
           -4.20405277e+15,  -4.20405277e+15,  -4.20405277e+15,
           -4.20405277e+15,  -4.20405277e+15,  -4.20405277e+15,
            0.00000000e+00,  -8.40810554e+15,   0.00000000e+00,
            0.00000000e+00,  -1.26121586e+16,  -4.20405277e+15,
           -4.20405277e+15,  -4.20405277e+15,  -4.20405277e+15,
           -4.20405277e+15,  -4.20405277e+15,  -4.20405277e+15,
           -4.20405277e+15,  -4.20405277e+15,  -4.20405277e+15,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
         [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           -1.68162111e+16,  -4.20405277e+15,  -4.20405277e+15,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.0000000

In [53]:
#convolution backprop

input_grad=dY_relu
X=X_conv
W=W_conv
Y=Y_conv

n_i,c_i,h_i,w_i=X_conv.shape
n_o,c_o,h_o,w_o=Y_conv.shape
k,c_i,h_k,w_k=W_conv.shape

#-----------------------initialize-------------------------------------------------------
X_data = ctypes.c_void_p(int(X.gpudata))
W_data = ctypes.c_void_p(int(W.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

a=ctypes.c_double(1.0)
b=ctypes.c_double(1.0)
#input_grad_array=np.ones((n_o,c_o,h_o,w_o),dtype='float64')
#input_grad_array=np.random.rand(n_o,c_o,h_o,w_o)
#output_grad_array=np.array([[[[0.0,0.0,0.0,0.0]]]],dtype='float64')
#input_grad=gpuarray.to_gpu(input_grad_array)
input_grad_data=ctypes.c_void_p(int(input_grad.gpudata))

print("1")
#filter_grad_array=np.ones((n_i,c_i,h_k,w_k),dtype='float64')
#filter_grad=gpuarray.to_gpu(filter_grad_array)
filter_grad=gpuarray.empty((k,c_i,h_k,w_k),dtype='float64')
filter_grad_data=ctypes.c_void_p(int(filter_grad.gpudata))
#print("2")
#bias_grad_array=np.ones((n_o,c_o,h_o,w_o),dtype='float64')
#bias_grad=gpuarray.to_gpu(bias_grad_array)
#bias_grad_data=ctypes.c_void_p(int(bias_grad.gpudata))

output_grad=gpuarray.empty((n_i,c_i,h_i,w_i),dtype='float64')
#output_grad=gpuarray.to_gpu(output_grad_array)
output_grad_data=ctypes.c_void_p(int(output_grad.gpudata))


#------------------Setting Descriptors--------------------------------------

print("Setting Input Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_grad_desc,tensor_format,data_type, n_o, c_o, h_o, w_o))
print("Setting Output Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
print("Setting Filter Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetFilter4dDescriptor(filter_grad_desc,data_type,tensor_format,k,c_i,h_k,w_k))
#print("Setting Bias Grad Descriptor:"),
#cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(bias_grad_desc,tensor_format,data_type, n_o, c_o, h_o, w_o))
preference=0
memoryLimitInbytes=4*1024
data_algo=ctypes.c_int()
filter_algo=ctypes.c_int()
#algo=0
workspace =ctypes.c_void_p()
workspace_size=ctypes.c_size_t(0)


#------------------Setting Descriptors--------------------------------------

print("Setting Input Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_grad_desc,tensor_format,data_type, n_o, c_o, h_o, w_o))
print("Setting Output Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
print("Setting Filter Grad Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetFilter4dDescriptor(filter_grad_desc,data_type,tensor_format,k,c_i,h_k,w_k))
#print("Setting Bias Grad Descriptor:"),
#cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(bias_grad_desc,tensor_format,data_type, n_o, c_o, h_o, w_o))
preference=0
memoryLimitInbytes=4*1024
data_algo=ctypes.c_int()
filter_algo=ctypes.c_int()
#algo=0
workspace =ctypes.c_void_p()
workspace_size=ctypes.c_size_t(0)

#--------------------Setting functions----------------------------------------
print("Getting Convolution backward Data Algorithm: "),
cudnnCheckStatus(libcudnn.cudnnGetConvolutionBackwardDataAlgorithm(\
handle,filter_desc,input_grad_desc,conv_desc,\
output_grad_desc, preference, ctypes.c_size_t(memoryLimitInbytes),ctypes.byref(data_algo)))

print("Get convolution backward Data workspace size: "),
cudnnCheckStatus(libcudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(handle,filter_desc,input_grad_desc,\
                conv_desc,output_grad_desc,data_algo,ctypes.byref(workspace_size)))

print("Convolution backward Data: "),
cudnnCheckStatus(libcudnn.cudnnConvolutionBackwardData(handle,ctypes.byref(a),filter_desc,W_data,input_grad_desc,\
                input_grad_data,conv_desc,data_algo,workspace,ctypes.byref(workspace_size),\
                ctypes.byref(b),output_grad_desc,output_grad_data))

print("Get convolution backward filter algorithm: "),
cudnnCheckStatus(libcudnn.cudnnGetConvolutionBackwardFilterAlgorithm(handle,input_desc,input_grad_desc,conv_desc,\
        filter_grad_desc,preference,ctypes.c_size_t(memoryLimitInbytes),ctypes.byref(filter_algo)))

print("Get convolution backward Filter Workspace Size: "),
cudnnCheckStatus(libcudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(handle,input_desc,input_grad_desc,\
         conv_desc, filter_grad_desc,filter_algo,ctypes.byref(workspace_size)))

print("Convolution Backward Filter: "),
cudnnCheckStatus(libcudnn.cudnnConvolutionBackwardFilter(handle,ctypes.byref(a),input_desc,\
                X_data,input_grad_desc,input_grad_data,conv_desc,filter_algo,workspace,ctypes.byref(workspace_size),\
                 ctypes.byref(b),filter_grad_desc,filter_grad_data))

#***********Setting bias*************
bias=np.zeros((h_i,w_i),dtype="float64")
BIAS =gpuarray.to_gpu(bias)
#print "input to CONVOLUTION"
#print X.get()
#print "input gradients"
#print input_grad.get()
print "output gradients"
print output_grad.get()
print "filter gradients"
print filter_grad.get()

      
#***********Adding Bias******************
a=ctypes.c_double(1.0)
b=ctypes.c_double(1.0)
X_data = ctypes.c_void_p(int(BIAS.gpudata))



print("\n\nAdding Bias Tensor Status:")

cudnnCheckStatus(libcudnn.cudnnAddTensor(handle,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,output_grad_data))

print("\n\nBackpropWithBias Output")
print(output_grad.get())



#print "Output"
#print Y.get()

#print("Convolution Backward Bias: "),
#cudnnCheckStatus(libcudnn.cudnnConvolutionBackwardBias(handle,ctypes.byref(a),input_grad_desc,input_grad_data,bias_grad_desc,bias_grad_data))



1
Setting Input Grad Descriptor: CUDNN_STATUS_SUCCESS
Setting Output Grad Descriptor: CUDNN_STATUS_SUCCESS
Setting Filter Grad Descriptor: CUDNN_STATUS_SUCCESS
Setting Input Grad Descriptor: CUDNN_STATUS_SUCCESS
Setting Output Grad Descriptor: CUDNN_STATUS_SUCCESS
Setting Filter Grad Descriptor: CUDNN_STATUS_SUCCESS
Getting Convolution backward Data Algorithm:  CUDNN_STATUS_SUCCESS
Get convolution backward Data workspace size:  CUDNN_STATUS_SUCCESS
Convolution backward Data:  CUDNN_STATUS_SUCCESS
Get convolution backward filter algorithm:  CUDNN_STATUS_SUCCESS
Get convolution backward Filter Workspace Size:  CUDNN_STATUS_SUCCESS
Convolution Backward Filter:  CUDNN_STATUS_SUCCESS
output gradients
[[[[ -6.86860044e+03  -6.86955655e+03  -6.86328736e+03  -6.85969720e+03
     -6.86620055e+03  -6.86883809e+03  -6.86883077e+03  -6.87573418e+03
     -6.87346213e+03  -6.87231620e+03  -6.86439511e+03  -6.87386700e+03
     -6.85690612e+03  -6.86447518e+03  -6.86021994e+03  -6.86360038e+03
     -6

In [54]:
libcudnn.cudnnDestroyTensorDescriptor(input_desc)
libcudnn.cudnnDestroyTensorDescriptor(output_desc)  
libcudnn.cudnnDestroyTensorDescriptor(input_grad_desc)
libcudnn.cudnnDestroyTensorDescriptor(output_grad_desc) 
libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)
libcudnn.cudnnDestroyActivationDescriptor(activation_desc)
libcudnn.cudnnDestroyPoolingDescriptor(pooling_desc)
libcudnn.cudnnDestroyTensorDescriptor(batchnorm_desc)
#libcudnn.cudnnDestroyBatchnormDescriptor(batchnorm_grad_desc)
skcuda.cublas.cublasDestroy(h)
libcudnn.cudnnDestroy(handle)
print("Cleaned Up")

Cleaned Up
