## 1. Set up a Neural Networks 
set up MNIST data sets

In [706]:
from keras.datasets import mnist
from keras.utils import np_utils

# set up MNIST data sets, required keras lib
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(60000, 784)/255.0
X_test = X_test.reshape(10000, 784)/255.0

y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

print('X_train.shape',X_train.shape)
print('y_train.shape',y_train.shape)

X_train.shape (60000, 784)
y_train.shape (60000, 10)


set up Neural Networks

In [707]:
import numpy as np
import reikna.cluda as cluda
from reikna.core import Type
from reikna.cbrng import CBRNG
from reikna.cbrng.samplers import uniform_float
from reikna.cbrng.bijections import threefry

api = cluda.ocl_api()#.cuda_api() switch opencl and cuda
thr = api.Thread.create()

In [708]:
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '-1'
'input :784 units, output layer:10 units'
layers=(784, 10)
NN = np.zeros(layers).astype(np.float32)
NN_dev = thr.to_device(NN)
print('before random set')
print(NN_dev.get())

'set up GPU random set Neural Networks fun'
rng = CBRNG(Type(np.float32 , shape=layers), 1, uniform_float(threefry(32, 4), np.float32 ))
counters_dev = thr.to_device(rng.create_counters())
GPU_randomSet=rng.compile(thr)

GPU_randomSet(counters_dev, NN_dev)
print()
print('after random set')
print(NN_dev.get())

before random set
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]

after random set
[[ 0.22819683  0.24527091  0.83517373 ...,  0.29089949  0.45058417
   0.52518374]
 [ 0.85281593  0.81345785  0.09425479 ...,  0.34834602  0.20637314
   0.04884203]
 [ 0.6472804   0.18995953  0.2313401  ...,  0.46528244  0.27973965
   0.23528051]
 ..., 
 [ 0.41609126  0.47612658  0.2670033  ...,  0.67392647  0.47403955
   0.46945018]
 [ 0.35290349  0.63495708  0.06728859 ...,  0.98033309  0.38221839
   0.54361045]
 [ 0.11656856  0.27305785  0.25426731 ...,  0.36772919  0.81402475
   0.3140361 ]]


rearrange the weights in [ -1 , 1 ]

In [None]:
program = thr.compile("""
KERNEL void gpu_rearrange(
    GLOBAL_MEM float *input,
    const float from,
    const float to
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    input[IDX] = from+input[IDX] * (to-from) ; // calculate product value, it can be +,/,<... etc.
}
""")
#have attention to the function name if it is same as above
GPUrearrange = program.gpu_rearrange

In [None]:
GPUrearrange(NN_dev, (np.float32)(-1.0), (np.float32)(1.0), local_size=(1,1), global_size=NN_dev.shape)
NN_dev

set up GPU feedforward fun

In [739]:
from reikna.linalg import MatrixMul
'set up GPU feedforward fun'
batchSize = 60000
getRandBatch = lambda : np.random.choice(np.arange(len(X_train)),batchSize)

batchTrain = X_train[getRandBatch(),:].copy().astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)

predict_dev = thr.array((batchSize,layers[-1]), dtype=np.float32)

GPUfeedforward = MatrixMul(batchTrain_dev, NN_dev, out_arr=predict_dev).compile(thr)

In [740]:
GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
print(predict_dev)
print("Check GPU result with CPU result if they are the same :")
print(np.linalg.norm(predict_dev.get() - batchTrain.dot(NN_dev.get())) / np.linalg.norm(batchTrain.dot(NN_dev.get())) < 1e-6)

[[ 5.45752048  0.61648786  5.63830566 ...,  4.54402876  4.45077896
   5.38192654]
 [ 2.59078813  4.14450932  3.84620118 ...,  3.42415428  3.1559236
   2.82430506]
 [ 1.09660375  2.34249163  1.06969988 ...,  5.81089449  3.90435743
   5.16018057]
 ..., 
 [ 1.71481991  3.92238975  6.3789053  ...,  4.45873117  7.23615074
   5.70820808]
 [ 0.84289247  3.48989296  2.84652066 ...,  7.20613623  6.45906734
   9.21541023]
 [ 3.74016953  3.06753683  4.05717421 ...,  4.07021856  4.7097764
   4.49157715]]
Check GPU result with CPU result if they are the same :
True


Pre-reading: http://cs231n.github.io/linear-classify/#softmax

In [741]:
'set up a GPU softmax fun'
program = thr.compile("""
KERNEL void gpu_softmax(
    GLOBAL_MEM float *input
    )
{
    const SIZE_T i0 = get_global_id(0);
    const SIZE_T i1 = get_global_id(1);
    //why terminate?  because softmax need to sum up from [i,0] to [i,end]
    if(i1>0)return;

    int IDX = i0*get_global_size(1)+i1;
    float s = 0.0;
    float max = 0.0;
    for(int i=0;i<get_global_size(1);i++){
        if(max<input[IDX+i])max=input[IDX+i];
    }
    for(int i=0;i<get_global_size(1);i++){
      input[IDX+i]=exp(input[IDX+i]-max);
      s+=input[IDX+i];
    };
    for(int i=0;i<get_global_size(1);i++){
      input[IDX+i]/=s;
    };
}
""")
GPUsoftmax = program.gpu_softmax
GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)
predict_dev

array([[  1.58712178e-01,   1.25365064e-03,   1.90162227e-01, ...,
          6.36628121e-02,   5.79946525e-02,   1.47156760e-01],
       [  1.93628371e-02,   9.15675387e-02,   6.79497644e-02, ...,
          4.45548818e-02,   3.40725146e-02,   2.44559254e-02],
       [  3.48071521e-03,   1.20990323e-02,   3.38831800e-03, ...,
          3.88202995e-01,   5.76846600e-02,   2.02515095e-01],
       ..., 
       [  1.25370175e-03,   1.14006512e-02,   1.32978305e-01, ...,
          1.94921512e-02,   3.13384682e-01,   6.79987147e-02],
       [  1.72172862e-04,   2.42964225e-03,   1.27681857e-03, ...,
          9.98817757e-02,   4.73193042e-02,   7.44908392e-01],
       [  3.65274884e-02,   1.86422858e-02,   5.01525700e-02, ...,
          5.08110672e-02,   9.63196158e-02,   7.74376243e-02]], dtype=float32)

Summary

In [742]:
batchIdx = getRandBatch()
batchTrain = X_train[batchIdx,:].astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)

batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
batchTrainLabels_dev = thr.to_device(batchTrainLabels)

GPU_randomSet(counters_dev, NN_dev)
# GPUrearrange(NN_dev, (np.float32)(0), (np.float32)(0.1), local_size=(1,1), global_size=NN_dev.shape)

GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)

'calculate errors cross entropy'
errors_dev=(predict_dev-batchTrainLabels_dev)
errors_dev

array([[  9.33410004e-02,   1.07645174e-03,   2.18172297e-01, ...,
          3.29565793e-01,   1.62964657e-01,   7.62007236e-02],
       [  2.72556627e-03,   1.32118922e-03,   9.06981230e-01, ...,
          4.39324006e-02,   1.09446961e-02,   6.54740084e-04],
       [  2.17844978e-01,   1.45267113e-03,   1.59617916e-01, ...,
          5.70728490e-03,   3.51215184e-01,   5.89348492e-04],
       ..., 
       [  5.14842896e-03,   7.49961194e-03,  -2.80558884e-01, ...,
          1.18471928e-01,   1.74991158e-03,   1.01075498e-02],
       [  4.52455861e-05,   3.93370690e-04,   6.51068211e-01, ...,
          4.26313467e-02,   9.34747804e-04,   2.30411533e-04],
       [  4.92520891e-02,   7.53213465e-03,   1.07967839e-01, ...,
         -9.85917568e-01,   4.26144950e-04,   6.46910295e-02]], dtype=float32)

## 2. Set up BP

In [743]:
'set up a GPU errors back fun'
errors_back_dev = thr.array((layers[0],layers[-1]), dtype=np.float32)

GPUerrorsBack = MatrixMul(batchTrain_dev, errors_dev, out_arr=errors_back_dev, transposed_a=True).compile(thr)
GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)
errors_back_dev.get().max()

10248.24

In [744]:
GPU_randomSet(counters_dev, NN_dev)
GPUrearrange(NN_dev, (np.float32)(0), (np.float32)(0.1), local_size=(1,1), global_size=NN_dev.shape)

<pyopencl.cffi_cl.Event at 0x138f2a58>

In [745]:
lr = 1e-3
batchTrain_dev = thr.to_device(X_train[:,:].astype(np.float32))
batchTrainLabels_dev = thr.to_device(y_train[:,:].astype(np.float32))
for i in range(1000):
#     batchIdx = getRandBatch()
#     batchTrain = X_train[batchIdx,:].astype(np.float32)
#     batchTrain_dev = thr.to_device(batchTrain)

#     batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
#     batchTrainLabels_dev = thr.to_device(batchTrainLabels)

    GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
    GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)

    'calculate errors cross entropy'
    errors_dev=batchTrainLabels_dev-predict_dev
    GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)
    errors_mean_dev=errors_back_dev/batchSize
    NN_dev+= lr * errors_mean_dev
    if i%100 == 0:
        print((np.argmax(predict_dev.get(),1)==np.argmax(batchTrainLabels,1)).sum()/batchSize*100)

9.905
9.91833333333


KeyboardInterrupt: 

## 3. make a CPU version

In [None]:
def softmax(x):
    tmp=np.exp(x-x.max(1).reshape(-1,1))
    return tmp/tmp.sum(1).reshape(-1,1)
NN=NN_dev.get()

In [737]:
lr = 1e-4
batchTrain = X_train[:,:].astype(np.float32)
batchTrainLabels = y_train[:,:].astype(np.float32)
for j in range((int)(1e3)):
#     batchIdx = getRandBatch()
#     batchTrain = X_train[batchIdx,:].astype(np.float32)

#     batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
    
    y = softmax(batchTrain.dot(NN))
    error = batchTrainLabels - y
    NN += batchTrain.T.dot(error)/60000 * lr
    
    if (j% 100) == 0:
        print("acc:",(np.argmax(y,1)==np.argmax(batchTrainLabels,1)).sum()/60000*100)    

acc: 88.1983333333
acc: 88.2
acc: 88.205
acc: 88.205


KeyboardInterrupt: 

CPU is faster than GPU!!!

The main reason here is not at GPU, but the data transformation from host to GPU ( to_device() )