## 1. Set up a Neural Network
Set up the MNIST data sets

In [54]:
from keras.datasets import mnist
from keras.utils import np_utils

# Load MNIST through Keras (requires the keras package), flatten every image
# into a single row, and divide by 255.0 to scale the pixel values.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Take the row count from the arrays themselves rather than hard-coding
# 60000 / 10000, so the cell keeps working on a subset of the data.
X_train = X_train.reshape(len(X_train), -1) / 255.0
X_test = X_test.reshape(len(X_test), -1) / 255.0

# One-hot encode the digit labels. The width is fixed at 10 so both splits
# always get the same number of columns, even if a class is absent from one.
y_train = np_utils.to_categorical(y_train, num_classes=10)
y_test = np_utils.to_categorical(y_test, num_classes=10)

print('X_train.shape',X_train.shape)
print('y_train.shape',y_train.shape)

X_train.shape (60000, 784)
y_train.shape (60000, 10)


Set up the neural network

In [55]:
import numpy as np
import reikna.cluda as cluda
from reikna.core import Type
from reikna.cbrng import CBRNG
from reikna.cbrng.samplers import uniform_float
from reikna.cbrng.bijections import threefry

# Select the GPU backend. Use cluda.cuda_api() for CUDA or cluda.ocl_api() for
# OpenCL; swap the call below to switch between them.
api = cluda.cuda_api()
# `thr` is the handle used throughout the notebook to compile kernels and to
# allocate / upload device arrays.
thr = api.Thread.create()

In [60]:
import os
# NOTE(review): presumably controls PyOpenCL's compiler-output reporting and
# only matters for the OpenCL backend; confirm the meaning of '-1'.
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '-1'

'input :784 units, output layer:10 units'
layers = (784, 10)
# Zero-filled float32 weight matrix on the host, then uploaded to the device.
NN = np.zeros(layers, dtype=np.float32)
NN_dev = thr.to_device(NN)
print('before random set')
print(NN_dev.get())

'set up GPU random set Neural Networks fun'
# Describe the array to fill and the sampler, then build the counter-based RNG.
weights_type = Type(np.float32, shape=layers)
sampler = uniform_float(threefry(32, 4), np.float32)
rng = CBRNG(weights_type, 1, sampler)
counters_dev = thr.to_device(rng.create_counters())
GPU_randomSet = rng.compile(thr)

# Overwrite the device weights with freshly sampled values.
GPU_randomSet(counters_dev, NN_dev)
print()
print('after random set')
print(NN_dev.get())

before random set
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]

after random set
[[ 0.12403855  0.02336662  0.59507793 ...,  0.09912867  0.13376518
   0.83598381]
 [ 0.49977431  0.38667753  0.17460006 ...,  0.11854074  0.23574686
   0.53133935]
 [ 0.84471881  0.56952435  0.06624993 ...,  0.98948461  0.56452423
   0.04352978]
 ..., 
 [ 0.36141056  0.64748794  0.9488861  ...,  0.28697175  0.60381103
   0.89384705]
 [ 0.43408123  0.8563509   0.53729534 ...,  0.80452663  0.39777288
   0.3031466 ]
 [ 0.80537182  0.9238919   0.13718309 ...,  0.68108571  0.78619164
   0.42821193]]


Rescale the weights into the range [-1, 1]

In [61]:
# Compile an element-wise kernel that rewrites every value v of `input` in
# place as  from + v * (to - from).
# Each work-item handles one element and computes its flat index as
# id0 * get_global_size(1) + id1, so the launch's global_size is expected to
# equal the 2-D shape of the array being rescaled.
program = thr.compile("""
KERNEL void gpu_rearrange(
    GLOBAL_MEM float *input,
    const float from,
    const float to
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    input[IDX] = from+input[IDX] * (to-from) ; // calculate product value, it can be +,/,<... etc.
}
""")
# `program` is reassigned by later cells, so grab the kernel handle now; make
# sure the attribute name matches the KERNEL name in the source string above.
GPUrearrange = program.gpu_rearrange

In [62]:
# Rescale the device weights in place with from=-1 and to=1; one work-item per
# weight element.
GPUrearrange(
    NN_dev,
    np.float32(-1.0),
    np.float32(1.0),
    local_size=(1, 1),
    global_size=NN_dev.shape,
)
NN_dev

array([[ -7.51922905e-01,  -9.53266740e-01,   1.90155864e-01, ...,
         -8.01742673e-01,  -7.32469678e-01,   6.71967626e-01],
       [ -4.51385975e-04,  -2.26644933e-01,  -6.50799870e-01, ...,
         -7.62918532e-01,  -5.28506279e-01,   6.26786947e-02],
       [  6.89437628e-01,   1.39048696e-01,  -8.67500126e-01, ...,
          9.78969216e-01,   1.29048467e-01,  -9.12940443e-01],
       ..., 
       [ -2.77178884e-01,   2.94975877e-01,   8.97772193e-01, ...,
         -4.26056504e-01,   2.07622051e-01,   7.87694097e-01],
       [ -1.31837547e-01,   7.12701797e-01,   7.45906830e-02, ...,
          6.09053254e-01,  -2.04454243e-01,  -3.93706799e-01],
       [  6.10743642e-01,   8.47783804e-01,  -7.25633860e-01, ...,
          3.62171412e-01,   5.72383285e-01,  -1.43576145e-01]], dtype=float32)

Set up the GPU feedforward function

In [66]:
from reikna.linalg import MatrixMul
'set up GPU feedforward fun'
batchSize = 3000


def getRandBatch():
    """Return `batchSize` random row indices into X_train (np.random.choice defaults)."""
    return np.random.choice(np.arange(len(X_train)), batchSize)


# Upload one float32 mini-batch; it also serves as the shape template for the
# matrix-multiplication computation compiled below.
batchTrain = X_train[getRandBatch(), :].astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)

# Output buffer: one row per sample, one column per output unit.
predict_dev = thr.array((batchSize, layers[-1]), dtype=np.float32)

# predict = batchTrain . NN, compiled once and reused for every batch.
feedforward_op = MatrixMul(batchTrain_dev, NN_dev, out_arr=predict_dev)
GPUfeedforward = feedforward_op.compile(thr)

In [67]:
# Run the compiled product on the device and show the raw scores.
GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
print(predict_dev)
print("Check GPU result with CPU result if they are the same :")
# Same product computed on the host with NumPy, used as the reference.
cpu_reference = batchTrain.dot(NN_dev.get())
relative_error = np.linalg.norm(predict_dev.get() - cpu_reference) / np.linalg.norm(cpu_reference)
print(relative_error < 1e-6)

[[  1.61323667   2.01963687   3.31880426 ...,  10.61067677  -3.63300228
   -1.79879069]
 [-12.04747677  -1.93427289   3.86403012 ...,   4.32552052  -4.82424212
   -2.34765768]
 [ -3.16366839  -6.27524328  -0.18959212 ...,   5.3775568   -1.33155012
   -1.25523257]
 ..., 
 [ -7.9624629    3.3482852    7.71547127 ...,   5.91617012   4.04497671
    2.27989793]
 [ -3.23057485   4.60989571   5.82851648 ...,   3.45486617   1.28478599
    0.23329422]
 [ -9.43785954   4.39097881  10.85242271 ...,   4.54710913   1.99141037
    1.45276427]]
Check GPU result with CPU result if they are the same :
True


Pre-reading: http://cs231n.github.io/linear-classify/#softmax

In [68]:
'set up a GPU softmax fun'
# Row-wise, in-place softmax over a 2-D float array.
# Launch with global_size equal to the array shape: the kernel takes the row
# from get_global_id(0) and the row length from get_global_size(1).
# Only the work-item with column index 0 does any work for its row, because
# the normalisation needs the sum over the whole row.
program = thr.compile("""
KERNEL void gpu_softmax(
    GLOBAL_MEM float *input
    )
{
    const SIZE_T i0 = get_global_id(0);
    const SIZE_T i1 = get_global_id(1);
    // One work-item per row: softmax must reduce over [i,0] .. [i,end].
    if(i1>0)return;

    const int cols = get_global_size(1);
    const int IDX = i0*cols;

    // Seed the running maximum with the first element of the row, not 0.
    // With a 0 seed, a row whose scores are all very negative is never
    // shifted, every exp() underflows to 0, and the division yields NaN.
    float row_max = input[IDX];
    for(int i=1;i<cols;i++){
        if(row_max<input[IDX+i])row_max=input[IDX+i];
    }

    // Exponentiate the shifted scores in place and accumulate their sum.
    float s = 0.0f;
    for(int i=0;i<cols;i++){
      input[IDX+i]=exp(input[IDX+i]-row_max);
      s+=input[IDX+i];
    }

    // Normalise so the row sums to 1; s >= 1 because the max maps to exp(0).
    for(int i=0;i<cols;i++){
      input[IDX+i]/=s;
    }
}
""")
GPUsoftmax = program.gpu_softmax
GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)
predict_dev

array([[  1.20096382e-04,   1.80313174e-04,   6.61071739e-04, ...,
          9.70663309e-01,   6.32582839e-07,   3.96009182e-06],
       [  4.65533745e-08,   1.14831177e-03,   3.78643751e-01, ...,
          6.00694358e-01,   6.38207493e-05,   7.59502233e-04],
       [  1.57253089e-04,   7.00260625e-06,   3.07768444e-03, ...,
          8.05390060e-01,   9.82376863e-04,   1.06028432e-03],
       ..., 
       [  1.22112780e-07,   9.97600891e-03,   7.86324203e-01, ...,
          1.30069375e-01,   2.00228598e-02,   3.42737813e-03],
       [  1.30953458e-05,   3.32805067e-02,   1.12571947e-01, ...,
          1.04849674e-02,   1.19705254e-03,   4.18269279e-04],
       [  1.51609614e-09,   1.53643906e-03,   9.83297110e-01, ...,
          1.79606408e-03,   1.39442753e-04,   8.13701408e-05]], dtype=float32)

Summary

In [69]:
# Draw one mini-batch and upload its inputs and one-hot labels as float32.
batchIdx = getRandBatch()
batchTrain = X_train[batchIdx].astype(np.float32)
batchTrainLabels = y_train[batchIdx].astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)
batchTrainLabels_dev = thr.to_device(batchTrainLabels)

# Fresh random weights; the rescaling kernel is deliberately not applied here.
GPU_randomSet(counters_dev, NN_dev)

# Forward pass: linear scores, then row-wise softmax in place.
GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
GPUsoftmax(predict_dev, local_size=(1, 1), global_size=predict_dev.shape)

'calculate errors cross entropy'
# Element-wise difference between the predictions and the one-hot labels.
errors_dev = predict_dev - batchTrainLabels_dev
errors_dev

array([[  1.10526034e-03,   2.36217733e-02,   3.41599365e-03, ...,
         -9.87945318e-01,   1.69304222e-01,   1.57591549e-03],
       [  6.69368878e-02,   5.01299463e-03,   3.07187557e-01, ...,
         -7.53791332e-01,   8.32950845e-02,   2.66553257e-02],
       [  1.71011857e-06,   7.38984440e-04,   6.61693321e-06, ...,
          1.13188245e-04,   2.92522600e-04,   2.29449256e-06],
       ..., 
       [ -9.99879658e-01,   8.34345166e-03,   1.51391199e-04, ...,
          1.00565485e-05,   2.00507187e-04,   1.46632592e-05],
       [  2.39652756e-04,   1.04264647e-01,   2.77398364e-03, ...,
          1.43363429e-02,   1.28449360e-02,   1.52608831e-04],
       [  8.57720908e-04,   6.90310895e-01,   6.23268681e-03, ...,
          4.25609313e-02,   9.19866040e-02,   5.38685685e-03]], dtype=float32)

## 2. Set up backpropagation (BP)

In [70]:
"get errors like this : errors_dev=batchTrainLabels_dev-predict_dev"

# Compile an element-wise subtraction kernel: res_dev[i] = a_dev[i] - b_dev[i].
# Each work-item handles one element and computes its flat index as
# id0 * get_global_size(1) + id1, so global_size is expected to equal the 2-D
# shape of the arrays.
program = thr.compile("""
KERNEL void gpu_minus(
    GLOBAL_MEM float *a_dev,
    GLOBAL_MEM float *b_dev,
    GLOBAL_MEM float *res_dev
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    res_dev[IDX] = a_dev[IDX] - b_dev[IDX] ; 
}
""")
GPUminus = program.gpu_minus
# Overwrite the existing errors_dev buffer with (labels - predictions).
GPUminus(batchTrainLabels_dev, predict_dev, errors_dev, local_size=(1,1), global_size=errors_dev.shape)

In [72]:
'set up a GPU errors back fun'
# Output buffer shaped like the weight matrix: (input units, output units).
errors_back_dev = thr.array((layers[0], layers[-1]), dtype=np.float32)

# transposed_a=True makes the compiled product use the transpose of the batch,
# so the (batchSize, inputs) batch and the (batchSize, outputs) errors yield an
# (inputs, outputs) result.
errors_back_op = MatrixMul(
    batchTrain_dev,
    errors_dev,
    out_arr=errors_back_dev,
    transposed_a=True,
)
GPUerrorsBack = errors_back_op.compile(thr)
GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)
errors_back_dev.get().max()

217.10446

In [73]:
"update weights like this : NN_dev+= lr * errors_back_dev/batchSize"

# Compile the in-place weight-update kernel:
#     NN_dev[i] += lr * errors_back_dev[i] / batchSize
# Each work-item handles one weight and computes its flat index as
# id0 * get_global_size(1) + id1, so global_size must equal NN_dev.shape and
# errors_back_dev must have that same shape.
program = thr.compile("""
KERNEL void gpu_updateW(
    GLOBAL_MEM float *NN_dev,
    GLOBAL_MEM float *errors_back_dev,
    const float lr,
    const float batchSize
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    NN_dev[IDX] += lr * errors_back_dev[IDX]/batchSize ; 
}
""")
GPUupdateW = program.gpu_updateW

# Define the learning rate before its first use. Without this the call below
# raises NameError on a fresh kernel (Restart & Run All).
lr = 1e-3
# Scalars are passed as float32 to match the kernel's `const float` parameters.
GPUupdateW(NN_dev, errors_back_dev, np.float32(lr), np.float32(batchSize), local_size=(1,1), global_size=NN_dev.shape)

In [74]:
# Re-draw the weights, then rescale them in place with from=0 and to=0.1.
GPU_randomSet(counters_dev, NN_dev)
GPUrearrange(
    NN_dev,
    np.float32(0),
    np.float32(0.1),
    local_size=(1, 1),
    global_size=NN_dev.shape,
)

In [None]:
import time

# Mini-batch gradient training of the single-layer softmax classifier on the GPU.
lr = 1e-3
start = time.time()
for i in range(100):
    # Sample a batch and upload its inputs and one-hot labels as float32.
    batchIdx = getRandBatch()
    batchTrain = X_train[batchIdx,:].astype(np.float32)
    batchTrain_dev = thr.to_device(batchTrain)

    batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
    batchTrainLabels_dev = thr.to_device(batchTrainLabels)

    # Forward pass: linear scores followed by the in-place row-wise softmax.
    GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
    GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)

    'calculate errors cross entropy'
    # errors = labels - predictions, then project them back onto the weights.
    GPUminus(batchTrainLabels_dev, predict_dev, errors_dev, local_size=(1,1), global_size=errors_dev.shape)
    GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)

    # Apply the back-projected errors (same shape as NN_dev) to the weights.
    # Passing errors_dev with errors_dev.shape here would index far beyond the
    # end of NN_dev and would also use the wrong tensor for the update.
    GPUupdateW(NN_dev, errors_back_dev, np.float32(lr), np.float32(batchSize), local_size=(1,1), global_size=NN_dev.shape)
#     if i%100 == 0:
#         print((np.argmax(predict_dev.get(),1)==np.argmax(batchTrainLabels,1)).sum()/batchSize*100)

# Wait for all queued GPU work to finish so the measured time covers the
# actual computation and not just the kernel launches.
thr.synchronize()
print('cost time:',time.time()-start)

## 3. Make a CPU version

In [79]:
def softmax(x):
    """Row-wise softmax of a 2-D array.

    The maximum of each row is subtracted before exponentiating, then every
    row of exponentials is divided by its own sum.
    """
    row_peak = x.max(1).reshape(-1, 1)
    exps = np.exp(x - row_peak)
    row_total = exps.sum(1).reshape(-1, 1)
    return exps / row_total
# Copy the current device weights back to the host; the CPU training loop
# updates this NumPy array in place.
NN=NN_dev.get()

In [100]:
lr = 1e-4
# batchTrain = X_train[:,:].astype(np.float32)
# batchTrainLabels = y_train[:,:].astype(np.float32)
import time
start = time.time()
for j in range((int)(1e0)):
    batchIdx = getRandBatch()
    batchTrain = X_train[batchIdx,:].astype(np.float32)
    batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
    
    %time y = softmax(batchTrain.dot(NN))
    %time error = batchTrainLabels - y
    %time NN += batchTrain.T.dot(error)/batchSize * lr
    
    if (j% 100) == 0:
        print("acc:",(np.argmax(y,1)==np.argmax(batchTrainLabels,1)).sum()/batchSize*100)   
print('cost time:',time.time()-start)

Wall time: 9 ms
Wall time: 3 ms
Wall time: 27 ms
acc: 14.9
cost time: 0.0760042667388916
