In [2]:
#forward pass

import sys
import ctypes
import ctypes.util
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
import numpy as np
import cv2
import pycuda.gpuarray as gpuarray
import skcuda.cublas
import matplotlib.pyplot as plt
%matplotlib inline 

try:
        libcudnn=ctypes.cdll.LoadLibrary('/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcudnn.so.5.1.10')
except OSError:
        print("OSError");
print(libcudnn.cudnnGetVersion())
libcudnn.cudnnGetErrorString.restype = ctypes.c_char_p
libcudnn.cudnnGetErrorString.argtypes = [ctypes.c_int]

def cudnnCheckStatus(status):
    print(libcudnn.cudnnGetErrorString(status))

#**********Defining Handle***************
handle=ctypes.c_void_p()
print("Handle Creation Status:"),
cudnnCheckStatus(libcudnn.cudnnCreate(ctypes.byref(handle)))


#************Image Data****************
def normalize(image_data):
    a = 0.1; b = 0.9; MIN = 0; MAX = 255
    b=a + (((image_data - MIN)*(b - a))/(MAX - MIN))
    return b

img = cv2.imread('One.jpg',0)
img=np.asarray(img)
img = normalize(img)

#************Enums*****************
tensor_format=0
data_type=1

#***********Creating descriptors******
input_desc = ctypes.c_void_p()
print("Input Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(input_desc)))

input_grad_desc = ctypes.c_void_p()
print("Input Grad Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(input_grad_desc)))

output_desc = ctypes.c_void_p()
print("Output Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(output_desc)))

output_grad_desc = ctypes.c_void_p()
print("Output Grad Tensor Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(output_grad_desc)))

filter_desc = ctypes.c_void_p()
print("Filter Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnCreateFilterDescriptor(ctypes.byref(filter_desc)))



5110
Handle Creation Status: CUDNN_STATUS_SUCCESS
Input Tensor Descriptor: CUDNN_STATUS_SUCCESS
Input Grad Tensor Descriptor: CUDNN_STATUS_SUCCESS
Output Tensor Descriptor: CUDNN_STATUS_SUCCESS
Output Grad Tensor Descriptor: CUDNN_STATUS_SUCCESS
Filter Descriptor: CUDNN_STATUS_SUCCESS


In [3]:
#Convolution

#*******Enums********************
convolution_mode=0
algo=0
preference=0

#**********Dimensions************
n_i=1
c_i=1
h_i=28
w_i=28
h_k=2
w_k=2
k=1
pad_h=0
pad_w=0
stride_h=1
stride_w=1
upscalex=1
upscaley=1

#********GPU arrays***********
array_type=np.float64
#x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0],]]],dtype=array_type)
#x=np.random.rand(1,1,28,28)
x=img.reshape((n_i,c_i,h_i,w_i))
X=gpuarray.to_gpu(x)
w=np.array([[[1.0,0.0],[0.0,0.0]]],dtype=array_type)
W=gpuarray.to_gpu(w)

#***********Creating descriptors******
conv_desc = ctypes.c_void_p()
print("Convolution Descriptor"),
cudnnCheckStatus(libcudnn.cudnnCreateConvolutionDescriptor(ctypes.byref(conv_desc)))

#********Set descriptors*******
print("Setting Input Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
print("Setting Filter Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetFilter4dDescriptor(filter_desc,data_type,tensor_format,k,c_i,h_k,w_k));
print("Setting Convolution Desriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetConvolution2dDescriptor(conv_desc,pad_h,pad_w,stride_h,stride_w,upscalex,upscaley,convolution_mode));
        

#*********configuring the Output*********
print("Getting Output Dimensions:"),
temp_n_o = ctypes.c_int()
temp_c_o = ctypes.c_int()
temp_h_o = ctypes.c_int()
temp_w_o = ctypes.c_int()
cudnnCheckStatus(libcudnn.cudnnGetConvolution2dForwardOutputDim(conv_desc, input_desc,filter_desc, ctypes.byref(temp_n_o),ctypes.byref(temp_c_o), ctypes.byref(temp_h_o),ctypes.byref(temp_w_o)))

n_o=temp_n_o.value
c_o=temp_c_o.value
w_o=temp_w_o.value
h_o=temp_h_o.value
print("Output Dimensions: ",n_o,c_o,h_o,w_o)
Y= gpuarray.empty((n_o,c_o,h_o,w_o), array_type)
print("Setting Output Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_o,c_o,h_o,w_o));

#*********Setting Workspace********
algo=ctypes.c_int()
memory_limit=ctypes.c_size_t(1024*1024)
cudnnCheckStatus(libcudnn.cudnnGetConvolutionForwardAlgorithm(handle,input_desc,filter_desc,conv_desc,output_desc,preference,memory_limit,ctypes.byref(algo)))

workspace =ctypes.c_void_p()
workspace_size=ctypes.c_size_t(0)

print("Getting Workspace Size:"),
cudnnCheckStatus(libcudnn.cudnnGetConvolutionForwardWorkspaceSize(handle,input_desc,filter_desc,conv_desc,\
output_desc,algo,ctypes.byref(workspace_size)))
print("Workspace Size = ",workspace_size)
if workspace_size.value!=0:
    workspace= drv.mem_alloc(workspace_size.value)
    print("workspace Allocated")
else:
    print("Workspace Not allocated")

#*********ConvolutionForward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
W_data = ctypes.c_void_p(int(W.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))
print(workspace.value)
#workspace_data = ctypes.c_void_p(workspace)
print("Convolution forward Status:"),

cudnnCheckStatus(libcudnn.cudnnConvolutionForward(handle,ctypes.byref(a),input_desc,X_data,filter_desc,\
                W_data,conv_desc,algo,None,0,ctypes.byref(b),output_desc,Y_data))
print("Convolution Output")
print(Y.get())

Y_conv=Y
output_desc_conv=output_desc

Convolution Descriptor CUDNN_STATUS_SUCCESS
Setting Input Descriptor: CUDNN_STATUS_SUCCESS
Setting Filter Descriptor: CUDNN_STATUS_SUCCESS
Setting Convolution Desriptor: CUDNN_STATUS_SUCCESS
Getting Output Dimensions: CUDNN_STATUS_SUCCESS
('Output Dimensions: ', 1, 1, 27, 27)
Setting Output Descriptor: CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
Getting Workspace Size: CUDNN_STATUS_SUCCESS
('Workspace Size = ', c_ulong(0L))
Workspace Not allocated
None
Convolution forward Status: CUDNN_STATUS_SUCCESS
Convolution Output
[[[[ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.89372549  0.9         0.89372549  0.89686275
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9       ]
   [ 0.89058824  0.89058824  0.89058824  0.89058824  0.89058824  0.89058824
     0.89058824  0.88431373  0.88117647  0.9         0.9         0.9
     

In [4]:
#Relu Activation

X=Y_conv
#input_desc=output_desc_conv

#**********Defining Enumerated Types*************
cudnnDataType={'CUDNN_DATA_FLOAT': 0}
cudnnTensorFormat = {'CUDNN_TENSOR_NCHW': 0}
cudnnNanPropagation     ={'CUDNN_NOT_PROPAGATE_NAN':0}
cudnnActivationMode     ={'CUDNN_ACTIVATION_RELU':1}

relu_mode = cudnnActivationMode['CUDNN_ACTIVATION_RELU']
reluNanOpt = cudnnNanPropagation ['CUDNN_NOT_PROPAGATE_NAN']


#**********Dimensions************
"""
n_i=1
c_i=1
h_i=4
w_i=4
"""
n_i=n_o
c_i=c_o
h_i=h_o
w_i=w_o


#********GPU arrays***********
"""
X = gpuarray.to_gpu(np.random.rand(n_i,c_i,h_i,w_i)
.astype(np.float32))
W = gpuarray.to_gpu(np.random.rand(k,h_k,w_k).astype(np.float32))
#x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0]]]],dtype='float32')
x=np.array([[[[1.0,-2.0,3.0,-4.0],[5.0,-6.0,-7.0,-8.0],[-9.0,-10.0,-11.0,-12.0],[-13.0,14.0,-15.0,-16.0]]]],dtype='float32')
X=gpuarray.to_gpu(x)
"""

#*******Creating descriptors********

activation_desc=ctypes.c_void_p()
print("Activation Descriptor"),
cudnnCheckStatus(libcudnn.cudnnCreateActivationDescriptor(ctypes.byref(activation_desc)))

#********Set descriptors*******
print("Setting Input Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

reluCeiling=ctypes.c_double(1.0)
maxpoolingNanOpt =cudnnNanPropagation['CUDNN_NOT_PROPAGATE_NAN']
print("Setting Activation Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetActivationDescriptor(activation_desc,relu_mode,reluNanOpt,ctypes.byref(reluCeiling)))


#*********configuring the Output*****************************************************************
Y = gpuarray.empty((n_i,c_i,h_i,w_i), np.float64)


print("Setting Output Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))


#*********PoolingForward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

print("Activation forward Status:"),

cudnnCheckStatus(libcudnn.cudnnActivationForward(handle,activation_desc,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))


#*********Display Output*************


print("Relu Activation Output"),
print(Y.get())


libcudnn.cudnnDestroyActivationDescriptor(activation_desc)
print("Cleaned Up")

Y_relu=Y


Activation Descriptor CUDNN_STATUS_SUCCESS
Setting Input Descriptor: CUDNN_STATUS_SUCCESS
Setting Activation Descriptor: CUDNN_STATUS_SUCCESS
Setting Output Descriptor: CUDNN_STATUS_SUCCESS
Activation forward Status: CUDNN_STATUS_SUCCESS
Relu Activation Output [[[[ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.89372549  0.9         0.89372549  0.89686275
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9       ]
   [ 0.89058824  0.89058824  0.89058824  0.89058824  0.89058824  0.89058824
     0.89058824  0.88431373  0.88117647  0.9         0.9         0.9
     0.89372549  0.89372549  0.89686275  0.89058824  0.89058824  0.89058824
     0.89058824  0.89058824  0.89058824  0.89058824  0.89058824  0.89058824
     0.89058824  0.89058824  0.89058824]
   [ 0.9         0.9         0.9         0.9         0.9         0.9      

In [5]:
#Maxpool
X=Y_relu
#**********Defining Enumerated Types*************

cudnnDataType={'CUDNN_DATA_FLOAT': 0}
cudnnTensorFormat = {'CUDNN_TENSOR_NCHW': 0}
cudnnPoolingMode        ={'CUDNN_POOLING_MAX':0}
cudnnNanPropagation     ={'CUDNN_NOT_PROPAGATE_NAN':0}

pooling_mode=0
maxpoolingNanOpt =0

#**********Dimensions************
"""
n_i=1
c_i=1
h_i=4
w_i=4
"""
n_i=n_o
c_i=c_o
h_i=h_o
w_i=w_o

pad_h=0
pad_w=0
stride_h=1
stride_w=1
win_h=4
win_w=4

#********GPU arrays***********
#x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0]]]],dtype='float32')
#X=gpuarray.to_gpu(x)



#*******Creating descriptors********

pooling_desc=ctypes.c_void_p()
print("Pooling Descriptor"),
cudnnCheckStatus(libcudnn.cudnnCreatePoolingDescriptor(ctypes.byref(pooling_desc)))

#********Set descriptors*******

print("Setting Input Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Pooling Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetPooling2dDescriptor(pooling_desc,pooling_mode,maxpoolingNanOpt,
win_h,
win_w,
pad_h,
pad_w,
stride_h,
stride_w))
#*********configuring the Output*****************************************************************

print("Getting Output Dimensions:"),
temp_n_o = ctypes.c_int()
temp_c_o = ctypes.c_int()
temp_h_o = ctypes.c_int()
temp_w_o = ctypes.c_int()

cudnnCheckStatus(libcudnn.cudnnGetPooling2dForwardOutputDim(pooling_desc,input_desc,ctypes.byref(temp_n_o),ctypes.byref(temp_c_o), ctypes.byref(temp_h_o),ctypes.byref(temp_w_o)))
n_o=temp_n_o.value
c_o=temp_c_o.value
w_o=temp_w_o.value
h_o=temp_h_o.value


print("Output Dimensions: ",n_o,c_o,h_o,w_o),
Y = gpuarray.empty((n_o,c_o,h_o,w_o), np.float64)


print("Setting Output Descriptor:"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_o,c_o,h_o,w_o));


#*********PoolingForward********
a=ctypes.c_double(1.0)
b=ctypes.c_float(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

print("Pooling forward Status:"),

cudnnCheckStatus(libcudnn.cudnnPoolingForward(handle,pooling_desc,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))


#*********Display Output*************

print("Max Pooling Output")
print(Y.get())

#********Cleaning Up**************

libcudnn.cudnnDestroyPoolingDescriptor(pooling_desc)

print("Cleaned Up")

Y_pool=Y

Pooling Descriptor CUDNN_STATUS_SUCCESS
Setting Input Descriptor: CUDNN_STATUS_SUCCESS
Setting Pooling Descriptor: CUDNN_STATUS_SUCCESS
Getting Output Dimensions: CUDNN_STATUS_SUCCESS
('Output Dimensions: ', 1, 1, 24, 24) Setting Output Descriptor: CUDNN_STATUS_SUCCESS
Pooling forward Status: CUDNN_STATUS_SUCCESS
Max Pooling Output
[[[[ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9       ]
   [ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         0.9         0.9       ]
   [ 0.9         0.9         0.9         0.9         0.9         0.9         0.9
     0.9         

In [6]:
# Batchnorm
X=Y_pool

#**********Defining Enumerated Types*************
"""
cudnnDataType={'CUDNN_DATA_FLOAT': 0}
cudnnTensorFormat = {'CUDNN_TENSOR_NCHW': 0}
cudnnSoftmaxAlgorithm ={'CUDNN_SOFTMAX_FAST':0}

BatchNorm Modes
CUDNN_BATCHNORM_PER_ACTIVATION : to be used after non-convolution layers. bnBias and bnScale dimensions are 1xCxHxW
CUDNN_BATCHNORM_SPATIAL : to be used after convolutional layers. bnBias and bnScale dimensions are 1xCx1x1 

cudnnBatchNormMode={'CUDNN_BATCHNORM_PER_ACTIVATION':0,'CUDNN_BATCHNORM_SPATIAL': 1}
"""

data_type =1
batchnorm_mode=1

#**********Dimensions************
"""
n_i=1
c_i=1
h_i=1
w_i=4
"""
n_i=n_o
c_i=c_o
h_i=h_o
w_i=w_o

#********GPU arrays***********
"""
X = gpuarray.to_gpu(np.random.rand(n_i,c_i,h_i,w_i)
.astype(np.float32))
W = gpuarray.to_gpu(np.random.rand(k,h_k,w_k).astype(np.float32))
x=np.array([[[[1.0,2.0,3.0,4.0],[5.0,6.0,7.0,8.0],[9.0,10.0,11.0,12.0],[13.0,14.0,15.0,16.0]]]],dtype='float32')
"""
#x=np.array([[[[1,2,3,4],[1,2,3,4]],[[1,2,3,4],[1,2,3,4]]]])
#x=np.array([[[[1.0,2.3,4.1,0.6]]]],dtype='float64')
#X=gpuarray.to_gpu(x)
Y = gpuarray.empty((n_i,c_i,h_i,w_i), np.float64)

#*******Creating descriptors********

batchnorm_desc=ctypes.c_void_p()
print("Batchnorm TensorDescriptor: "),
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(batchnorm_desc)))

#********Set descriptors*******

print("Setting Input Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))
print("Setting Output  Descriptor:"),        
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))

if batchnorm_mode==0:
	n_bn=1
	c_bn=c_i
	h_bn=h_i
	w_bn=w_i
else:
	n_bn=1
	c_bn=c_i
	h_bn=1
	w_bn=1

print("Setting BatchNorm Descriptor: "),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(batchnorm_desc,tensor_format,data_type,n_bn,c_bn,h_bn,w_bn))


#*********BatchNorm Forward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)

X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

scale_array=np.ones((n_bn,c_bn,h_bn,w_bn),dtype='float')
scale=gpuarray.to_gpu(scale_array) #gamma
scale_p=ctypes.c_void_p(int(scale.gpudata)) 

bias_array=np.zeros((n_bn,c_bn,h_bn,w_bn),dtype='float32')
bias=gpuarray.to_gpu(bias_array) #beta
bias_p=ctypes.c_void_p(int(bias.gpudata))

exp_avg_factor=ctypes.c_double(0.1) #momentum

#running_mean=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float32)
#running_var=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float32)

running_mean_array=np.zeros((n_bn,c_bn,h_bn,w_bn),dtype='float64')
running_mean=gpuarray.to_gpu(running_mean_array)
running_mean_p=ctypes.c_void_p(int(running_mean.gpudata))

running_var_array=np.ones((n_bn,c_bn,h_bn,w_bn),dtype='float64')
running_var=gpuarray.to_gpu(running_var_array)
running_var_p=ctypes.c_void_p(int(running_var.gpudata))

#e=ctypes.c_float(libcudnn.CUDNN_BN_MIN_EPSILON)
epsilon=ctypes.c_double(0.0001)

result_save_mean=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
result_save_mean_p=ctypes.c_void_p(int(result_save_mean.gpudata))

result_save_var=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
result_save_var_p=ctypes.c_void_p(int(result_save_var.gpudata))

print("BatchNorm Status:"),

cudnnCheckStatus(libcudnn.cudnnBatchNormalizationForwardTraining(handle,batchnorm_mode,ctypes.byref(a),\
                 ctypes.byref(b),input_desc,X_data,output_desc,Y_data,batchnorm_desc,scale_p,bias_p,\
                 exp_avg_factor,running_mean_p,running_var_p,epsilon,result_save_mean_p,result_save_var_p))

print("Batchnorm Output:")
print(Y.get())
Y_norm=Y

Batchnorm TensorDescriptor:  CUDNN_STATUS_SUCCESS
Setting Input Descriptor CUDNN_STATUS_SUCCESS
Setting Output  Descriptor: CUDNN_STATUS_SUCCESS
Setting BatchNorm Descriptor:  CUDNN_STATUS_SUCCESS
BatchNorm Status: CUDNN_STATUS_SUCCESS
Batchnorm Output:
[[[[ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012

In [7]:
#Innerproduct
A=Y_norm
#skcuda.cublas.cublasCheckStatus(status)
"""
m=1
k=3
n=4
"""
m=1
k=n_o*c_o*h_o*w_o
#k=100
n=10
#x = np.float64(np.random.rand(m,k))
w = np.float64(np.random.rand(k,n))
bias=np.zeros((m,n),dtype="float64")


#A = gpuarray.to_gpu(x)
B = gpuarray.to_gpu(w)
C =gpuarray.empty((m,n), np.float64)
BIAS =gpuarray.to_gpu(bias)
h =skcuda.cublas.cublasCreate()

#transa=_CUBLAS_OP[transa]

lda=m
ldb=k
ldc=m
#alf = 1.0;
#bet = 0.0;
#const float *alpha = &alf;
#const float *beta = &bet;
alpha=1.0
beta=0.0
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(input_desc)))
cudnnCheckStatus(libcudnn.cudnnCreateTensorDescriptor(ctypes.byref(output_desc)))

cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))



#d=skcuda.cublas.cublasSgemm(h, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
status=skcuda.cublas.cublasDgemm(h, 0, 0, m, n, k, alpha,A.gpudata, lda,B.gpudata, ldb, beta,C.gpudata, ldc)
#skcuda.cublas.cublasCheckStatus(status)
skcuda.cublas.cublasDestroy(h)
a=ctypes.c_double(1.0)
b=ctypes.c_double(1.0)
X_data = ctypes.c_void_p(int(BIAS.gpudata))
Y_data = ctypes.c_void_p(int(C.gpudata))


print("Adding Bias Tensor Status:")

cudnnCheckStatus(libcudnn.cudnnAddTensor(handle,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))

print 'Input vector'
print '----------------------------------'
print A.get()
print 'Weight Vector'
print '----------------------------------'
print w
print 'Inner product'
print '----------------------------------'
print C.get()

n_o=1
c_o=1
h_o=m
w_o=n


Y_ip=C

CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
CUDNN_STATUS_SUCCESS
Adding Bias Tensor Status:
CUDNN_STATUS_SUCCESS
Input vector
----------------------------------
[[[[ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.220

In [8]:
# Softmax
X=Y_ip


#*******Enums************
#softmax_mode = cudnnSoftmaxMode['CUDNN_SOFTMAX_MODE_INSTANCE']
#softmax_algo = cudnnSoftmaxAlgorithm ['CUDNN_SOFTMAX_FAST']
softmax_mode=0
softmax_algo=0
#**********Dimensions************
"""
n_i=1
c_i=1
h_i=1
w_i=4
"""
n_i=1
c_i=1
h_i=1
w_i=n_o*c_o*h_o*w_o

#********GPU arrays***********
#x=np.array([[[[1,2,3,4],[1,2,3,4]],[[1,2,3,4],[1,2,3,4]]]])
#x=np.array([[[[1.0,2.3,4.1,0.6]]]],dtype='float32')
#X=gpuarray.to_gpu(x)

Y = gpuarray.empty((n_i,c_i,h_i,w_i), np.float64)

#********Set descriptors*******
print("Setting Input Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Output  Descriptor:"),        
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_desc,tensor_format,data_type,n_i,c_i,h_i,w_i))

#*********Softmax Forward********
a=ctypes.c_double(1.0)
b=ctypes.c_double(0.0)
X_data = ctypes.c_void_p(int(X.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

print("Softmax Activation Status:"),

cudnnCheckStatus(libcudnn.cudnnSoftmaxForward(handle,softmax_algo,softmax_mode,ctypes.byref(a),input_desc,X_data,ctypes.byref(b),output_desc,Y_data))

print("Softmax Output")
print(Y.get())
Y_softmax=Y


libcudnn.cudnnDestroyTensorDescriptor(input_desc)
libcudnn.cudnnDestroyTensorDescriptor(output_desc)  

libcudnn.cudnnDestroy(handle)
print("Cleaned Up")    

Setting Input Descriptor CUDNN_STATUS_SUCCESS
Setting Output  Descriptor: CUDNN_STATUS_SUCCESS
Softmax Activation Status: CUDNN_STATUS_SUCCESS
Softmax Output
[[[[  3.76739342e-04   1.87247554e-04   4.60308188e-01   2.61361425e-04
      1.67693048e-01   3.86319381e-03   2.28908606e-02   2.30154454e-02
      2.68919808e-05   3.21377024e-01]]]]
Cleaned Up


In [9]:

bb=np.array(output_grad.get())
print bb.shape

NameError: name 'output_grad' is not defined

In [12]:
#Softmax backward

n_o=1
c_o=1
h_o=1
w_o=10

n_i=1
c_i=1
h_i=1
w_i=10

Y= Y_softmax
Y_data = ctypes.c_void_p(int(Y.gpudata))
#print(Y.get())
a=ctypes.c_double(1.0)
b=ctypes.c_double(1.0)
input_grad_array=np.array([[[[1.0,1.0,1.0,1.1,1.1,1.0,1.0,1.0,1.0,1.0]]]],dtype='float64')
output_grad_array=np.zeros((n_o,c_o,h_o,w_o),dtype='float64')

input_grad=gpuarray.to_gpu(input_grad_array)
#print(input_grad.get())

output_grad=gpuarray.empty((n_o,c_o,h_o,w_o),np.float64)
#output_grad=gpuarray.to_gpu(output_grad_array)
#print(output_grad.get())


input_grad_data=ctypes.c_void_p(int(input_grad.gpudata))
output_grad_data=ctypes.c_void_p(int(output_grad.gpudata))


#********Set descriptors*******
print("Setting Input Gradient Descriptor"),
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(input_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

print("Setting Output Gradient Descriptor:"),        
#cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type,n_o,c_o,h_o,w_o))
cudnnCheckStatus(libcudnn.cudnnSetTensor4dDescriptor(output_grad_desc,tensor_format,data_type, n_i, c_i, h_i, w_i))

#************Softmax Backward***********************
print("Softmax Backward Status:"),
cudnnCheckStatus(libcudnn.cudnnSoftmaxBackward(handle, softmax_algo, softmax_mode, ctypes.byref(a),input_desc,\
                 Y_data, input_grad_desc, input_grad_data, ctypes.byref(b),output_grad_desc, output_grad_data))

print("Softmax Output")
print(Y.get())
print("Softmax Input Gradient")
print(input_grad.get())
print("Softmax Output Gradient")
print(output_grad.get())
dX_softmax=output_grad

Setting Input Gradient Descriptor CUDNN_STATUS_SUCCESS
Setting Output Gradient Descriptor: CUDNN_STATUS_SUCCESS
Softmax Backward Status: CUDNN_STATUS_BAD_PARAM
Softmax Output
[[[[  3.76739342e-04   1.87247554e-04   4.60308188e-01   2.61361425e-04
      1.67693048e-01   3.86319381e-03   2.28908606e-02   2.30154454e-02
      2.68919808e-05   3.21377024e-01]]]]
Softmax Input Gradient
[[[[ 1.   1.   1.   1.1  1.1  1.   1.   1.   1.   1. ]]]]
Softmax Output Gradient
[[[[ 1.   1.   1.   1.1  1.1  1.   1.   1.   1.   1. ]]]]


In [None]:
ss = np.array(output_grad.get())
print ss.shape


In [16]:
#inner Product Backward
m=3
k=10
n=1 



#input_grad_array=np.array([[1],[2]],dtype='float64')
#input_grad_array=np.reshape(input_grad_array,(m,k))
#input_grad=gpuarray.to_gpu(input_grad_array)
input_grad=dX_softmax

output_grad=gpuarray.empty((m,n),np.float64)

#ldc=n
h =skcuda.cublas.cublasCreate()
alpha=1.0
beta=0.0
lda=m
ldb=k
ldc=m

status=skcuda.cublas.cublasDgemm(h, 0, 0, m, n, k, alpha,A.gpudata, lda,input_grad.gpudata,ldb, beta,\
    output_grad.gpudata, ldc)

skcuda.cublas.cublasDestroy(h)
#B=B.reshape((1,-1))
#input_grad=input_grad.reshape((-1,1))

h =skcuda.cublas.cublasCreate()
m=10
k=1 #2
n=3 #3
lda=m
ldb=k
ldc=m
weight_grad=gpuarray.empty((n,m),np.float64)
status=skcuda.cublas.cublasDgemm(h, 0, 0, m, n, k, alpha,input_grad.gpudata, lda,B.gpudata,ldb, beta,\
    weight_grad.gpudata, ldc)
skcuda.cublas.cublasDestroy(h)

print("W")
print(A.get())
print("X")
print(B.get())

print("IP output")
print(C.get())
print("input gradient")
print(input_grad.get())
print("output gradient (dx)")
print(output_grad.get())

print("Weights gradient (dw)")
print(weight_grad.get())

dX_innerproduct=output_grad

W
[[[[ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796]
   [ 0.22012796  0.22012796  0.22012796  0.22012796  0.22012796  0.22012796
     0.

In [19]:
#Batchnorm Backward
X=
Y=
Y_data = ctypes.c_void_p(int(Y.gpudata))
batchnorm_grad_desc = batchnorm_desc
input_grad_desc=input_desc
output_grad_desc=output_desc

#input_grad_array=np.array([[[[1,1,1,1]]]],dtype='float64')
#input_grad=gpuarray.to_gpu(input_grad_array)
input_grad=dX_innerproduct
input_grad_data=ctypes.c_void_p(int(input_grad.gpudata))

output_grad=gpuarray.empty((n_i,c_i,h_i,w_i),np.float64)
output_grad_data=ctypes.c_void_p(int(output_grad.gpudata))

scale_grad=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
scale_grad_p=ctypes.c_void_p(int(scale_grad.gpudata))
bias_grad=gpuarray.empty((n_bn,c_bn,h_bn,w_bn),np.float64)
bias_grad_p=ctypes.c_void_p(int(bias_grad.gpudata))

a_data=ctypes.c_double(1.0)
b_data=ctypes.c_double(0.0)
a_param=ctypes.c_double(1.0)
b_param=ctypes.c_double(0.0)

print("Batch Norm Backward status:"),
cudnnCheckStatus(libcudnn.cudnnBatchNormalizationBackward(handle,batchnorm_mode,\
        ctypes.byref(a_data),ctypes.byref(b_data),ctypes.byref(a_param),ctypes.byref(b_param),\
            output_desc,Y_data,input_grad_desc,input_grad_data,output_grad_desc,output_grad_data,\
            batchnorm_grad_desc,scale_p,scale_grad_p,bias_grad_p,epsilon,result_save_mean_p,result_save_var_p)) 

print("Batchnorm Output")
print(Y.get())
print("Batchnorm Input Gradient")
print(input_grad.get())
print("Batchnorm Output Gradient")
print(output_grad.get())


Batch Norm Backward status: CUDNN_STATUS_BAD_PARAM
Batchnorm Output
[[ 0.57407927 -0.12504272  7.68217724  0.20842994  6.67241621  2.90177525
   4.68101873  4.68644653 -2.06564615  7.32289596]]
Batchnorm Input Gradient
[[ 2.24530518]
 [ 2.24530518]
 [ 2.24530518]]
Batchnorm Output Gradient
[[[[  1.14016224e-187   2.74735249e+123   2.66262241e+123   1.11778711e+138
      1.50326261e-269  -1.37695515e+228   1.32437141e+123   1.40958566e+123
      2.40514545e-268   1.32340306e+123]]]]


In [None]:
libcudnn.cudnnDestroyTensorDescriptor(input_desc)
libcudnn.cudnnDestroyTensorDescriptor(output_desc)  
libcudnn.cudnnDestroyTensorDescriptor(input_grad_desc)
libcudnn.cudnnDestroyTensorDescriptor(output_grad_desc) 
skcuda.cublas.cublasDestroy(h)
libcudnn.cudnnDestroy(handle)
print("Cleaned Up")