## 1. Set up a Neural Network
Set up the MNIST data sets.

In [1]:
from keras.datasets import mnist
from keras.utils import np_utils

# Fetch MNIST through Keras (the keras package must be installed) as separate
# train and test (images, labels) pairs.
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

# Flatten every image into a 784-value row, then divide by 255.0 so the pixel
# values (presumably 0-255 -- confirm) end up in [0, 1].
PIXEL_SCALE = 255.0
X_train = X_train.reshape(60000, 784) / PIXEL_SCALE
X_test = X_test.reshape(10000, 784) / PIXEL_SCALE

# Convert the label vectors to categorical matrices; the printed shape below
# shows y_train becoming (60000, 10).
y_train, y_test = (np_utils.to_categorical(labels) for labels in (y_train, y_test))

print('X_train.shape', X_train.shape)
print('y_train.shape', y_train.shape)

Using TensorFlow backend.


X_train.shape (60000, 784)
y_train.shape (60000, 10)


Set up the neural network.

In [3]:
import numpy as np
import reikna.cluda as cluda
from reikna.core import Type
from reikna.cbrng import CBRNG
from reikna.cbrng.samplers import uniform_float
from reikna.cbrng.bijections import threefry

# Select the OpenCL backend; call cluda.cuda_api() here instead to run on CUDA.
api = cluda.ocl_api()
# Create the Thread object that every later cell uses to compile kernels and
# move arrays to and from the device.
thr = api.Thread.create()

In [4]:
import os
# Set PyOpenCL's compiler-output environment variable.
# NOTE(review): '-1' is an unusual value for this setting -- confirm the intent.
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '-1'

'input :784 units, output layer:10 units'
layers = (784, 10)

# Zero-filled float32 weight matrix on the host, then a copy on the device.
NN = np.zeros(layers, dtype=np.float32)
NN_dev = thr.to_device(NN)
print('before random set')
print(NN_dev.get())

'set up GPU random set Neural Networks fun'
# Counter-based RNG that fills a float32 array of shape `layers` with uniform
# samples (the printed result below lies between 0 and 1).
weight_type = Type(np.float32, shape=layers)
weight_sampler = uniform_float(threefry(32, 4), np.float32)
rng = CBRNG(weight_type, 1, weight_sampler)
counters_dev = thr.to_device(rng.create_counters())
GPU_randomSet = rng.compile(thr)

# Overwrite the device weights with random values and show them.
GPU_randomSet(counters_dev, NN_dev)
print()
print('after random set')
print(NN_dev.get())

before random set
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]

after random set
[[ 0.08641637  0.90209508  0.62375396 ...,  0.91747504  0.25345078
   0.09279578]
 [ 0.72694129  0.47865188  0.01610436 ...,  0.96699399  0.86852717
   0.48502406]
 [ 0.222192    0.93154973  0.27536535 ...,  0.96508378  0.54829693
   0.88557684]
 ..., 
 [ 0.08711699  0.15364091  0.27513865 ...,  0.94850802  0.26529047
   0.10553091]
 [ 0.81097257  0.95353609  0.65544355 ...,  0.93285435  0.40868947
   0.06189557]
 [ 0.95610756  0.24175578  0.44507435 ...,  0.35118732  0.38971031
   0.7225948 ]]


Rescale the weights into the range [-1, 1].

In [5]:
# Compile an elementwise kernel that rewrites every element x of a float buffer
# in place as  from + x * (to - from).
# The kernel is launched on a 2-D grid and addresses the buffer as a flat array
# at index id0 * get_global_size(1) + id1, so the launch's global size must
# match the array's 2-D shape.
program = thr.compile("""
KERNEL void gpu_rearrange(
    GLOBAL_MEM float *input,
    const float from,
    const float to
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    input[IDX] = from+input[IDX] * (to-from) ; // calculate product value, it can be +,/,<... etc.
}
""")
# The attribute name must match the KERNEL function name in the source string.
# Note that `program` is rebound by later cells, so keep this handle.
GPUrearrange = program.gpu_rearrange

In [6]:
# Rescale the device weights in place with from=-1 and to=1, one work-item per
# matrix element, then display the device array.
GPUrearrange(
    NN_dev,
    np.float32(-1.0),
    np.float32(1.0),
    local_size=(1, 1),
    global_size=NN_dev.shape,
)
NN_dev

array([[-0.82716727,  0.80419016,  0.24750793, ...,  0.83495009,
        -0.49309844, -0.81440842],
       [ 0.45388258, -0.04269624, -0.96779126, ...,  0.93398798,
         0.73705435, -0.02995187],
       [-0.55561602,  0.86309946, -0.44926929, ...,  0.93016756,
         0.09659386,  0.77115369],
       ..., 
       [-0.82576603, -0.69271815, -0.44972271, ...,  0.89701605,
        -0.46941906, -0.78893816],
       [ 0.62194514,  0.90707219,  0.3108871 , ...,  0.86570871,
        -0.18262106, -0.87620884],
       [ 0.91221511, -0.51648843, -0.1098513 , ..., -0.29762536,
        -0.22057939,  0.4451896 ]], dtype=float32)

Set up the GPU feedforward function.

In [7]:
from reikna.linalg import MatrixMul
'set up GPU feedforward fun'
batchSize = 3000


def getRandBatch():
    """Return `batchSize` random row indices into X_train.

    Uses np.random.choice with its default settings, so indices are drawn
    with replacement and may repeat within a batch.
    """
    return np.random.choice(np.arange(len(X_train)), batchSize)


# Pick one random batch, convert it to float32 and upload it to the device.
batchTrain = X_train[getRandBatch(), :].astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)

# Device buffer for the layer output: batchSize rows, layers[-1] columns.
predict_dev = thr.array((batchSize, layers[-1]), dtype=np.float32)

# Compile the matrix product batchTrain . NN for these array shapes; the
# result is written to the output array passed at call time.
GPUfeedforward = MatrixMul(batchTrain_dev, NN_dev, out_arr=predict_dev).compile(thr)

In [8]:
# Run the compiled product on the device and show the raw scores.
GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
print(predict_dev)

# Recompute the same product on the host with NumPy and compare the two
# through the relative norm of their difference.
print("Check GPU result with CPU result if they are the same :")
cpu_predict = batchTrain.dot(NN_dev.get())
relative_error = np.linalg.norm(predict_dev.get() - cpu_predict) / np.linalg.norm(cpu_predict)
print(relative_error < 1e-6)

[[ -5.54050255e+00  -2.77163315e+00  -8.97107983e+00 ...,  -4.37363815e+00
   -5.39937496e+00   2.37878108e+00]
 [ -4.75659227e+00  -4.12313223e+00  -3.85019755e+00 ...,  -8.35910261e-01
   -7.29273272e+00  -7.66714621e+00]
 [ -2.36363101e+00  -5.04455614e+00  -6.03383493e+00 ...,  -5.89509428e-01
   -4.30992365e+00  -6.30287266e+00]
 ..., 
 [ -7.79163408e+00   4.09991056e-01  -1.82047501e+01 ...,  -2.44125462e+00
    2.19995594e+00   8.05452228e-01]
 [ -9.77492237e+00  -2.42421460e+00   6.35660112e-01 ...,   6.26923609e+00
    8.61009210e-03   1.02177513e+00]
 [ -3.30745697e+00  -7.00298643e+00  -9.92344093e+00 ...,   9.89926338e+00
   -4.15765858e+00  -4.97645140e+00]]
Check GPU result with CPU result if they are the same :
True


Pre-reading: http://cs231n.github.io/linear-classify/#softmax

In [10]:
'set up a GPU softmax fun'
# In-place, row-wise softmax over a 2-D float buffer.
#
# Launch contract (unchanged): global_size must equal the array's 2-D shape.
# Only the work-item with column index 0 of each row does any work; it reads
# the row width from get_global_size(1) and processes the whole row serially.
#
# Fix: the running maximum used to start at 0.0f. For a row whose values are
# all strongly negative nothing was subtracted, every exp() underflowed to 0
# and the final division produced NaN. The maximum is now seeded from the
# row's first element, so the largest exponent is always exp(0) = 1 and the
# sum is never 0. The local was also renamed from `max`, which hid the
# built-in max() function.
program = thr.compile("""
KERNEL void gpu_softmax(
    GLOBAL_MEM float *input
    )
{
    const SIZE_T i0 = get_global_id(0);
    const SIZE_T i1 = get_global_id(1);
    // softmax needs the sum over [i,0]..[i,end], so a single work-item per
    // row performs the reduction and all others exit immediately.
    if(i1>0)return;

    const int n_cols = (int)get_global_size(1);
    const int row_start = (int)i0*n_cols;

    // Pass 1: true row maximum, seeded from the first element.
    float row_max = input[row_start];
    for(int i=1;i<n_cols;i++){
        if(row_max<input[row_start+i])row_max=input[row_start+i];
    }

    // Pass 2: shifted exponentials and their sum.
    float s = 0.0f;
    for(int i=0;i<n_cols;i++){
        input[row_start+i]=exp(input[row_start+i]-row_max);
        s+=input[row_start+i];
    }

    // Pass 3: normalise so the row sums to 1.
    for(int i=0;i<n_cols;i++){
        input[row_start+i]/=s;
    }
}
""")
GPUsoftmax = program.gpu_softmax

# NOTE: the kernel overwrites predict_dev, so re-running this cell without
# re-running the feedforward cell applies softmax to an already-normalised
# result.
GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)
predict_dev

array([[ 0.08838614,  0.08839467,  0.08838559, ...,  0.08838741,
         0.08838623,  0.08996862],
       [ 0.08768382,  0.08768389,  0.08768395, ...,  0.0876884 ,
         0.08768373,  0.08768373],
       [ 0.08725574,  0.08657068,  0.08653916, ...,  0.09094588,
         0.08662512,  0.08653475],
       ..., 
       [ 0.08533687,  0.08533698,  0.08533687, ...,  0.08533688,
         0.08533749,  0.08533702],
       [ 0.08741203,  0.08742375,  0.08766253, ...,  0.19459687,
         0.08754575,  0.08778082],
       [ 0.08562313,  0.08562313,  0.08562313, ...,  0.08765012,
         0.08562313,  0.08562313]], dtype=float32)

Summary

In [69]:
# Draw a fresh random batch of images and upload it as float32.
batchIdx = getRandBatch()
batchTrain = X_train[batchIdx,:].astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)

# Matching label rows for the same indices, also uploaded as float32.
batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
batchTrainLabels_dev = thr.to_device(batchTrainLabels)

# Refill the weights with fresh random values. The rescaling step below is
# disabled, so the weights keep the generator's raw output range.
GPU_randomSet(counters_dev, NN_dev)
# GPUrearrange(NN_dev, (np.float32)(0), (np.float32)(0.1), local_size=(1,1), global_size=NN_dev.shape)

# Forward pass: matrix product into predict_dev, then in-place softmax.
GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)

'calculate errors cross entropy'
# Elementwise difference predict - labels, computed on the device; this also
# allocates the errors_dev buffer that later cells write into.
# NOTE(review): the sign here is the opposite of the labels - predict
# convention used by the GPUminus call and the weight update further down,
# which overwrite this buffer before it is used for training.
errors_dev=(predict_dev-batchTrainLabels_dev)
errors_dev

array([[  1.10526034e-03,   2.36217733e-02,   3.41599365e-03, ...,
         -9.87945318e-01,   1.69304222e-01,   1.57591549e-03],
       [  6.69368878e-02,   5.01299463e-03,   3.07187557e-01, ...,
         -7.53791332e-01,   8.32950845e-02,   2.66553257e-02],
       [  1.71011857e-06,   7.38984440e-04,   6.61693321e-06, ...,
          1.13188245e-04,   2.92522600e-04,   2.29449256e-06],
       ..., 
       [ -9.99879658e-01,   8.34345166e-03,   1.51391199e-04, ...,
          1.00565485e-05,   2.00507187e-04,   1.46632592e-05],
       [  2.39652756e-04,   1.04264647e-01,   2.77398364e-03, ...,
          1.43363429e-02,   1.28449360e-02,   1.52608831e-04],
       [  8.57720908e-04,   6.90310895e-01,   6.23268681e-03, ...,
          4.25609313e-02,   9.19866040e-02,   5.38685685e-03]], dtype=float32)

## 2. Set up backpropagation (BP)

In [70]:
"get errors like this : errors_dev=batchTrainLabels_dev-predict_dev"

# Compile an elementwise subtraction kernel: res_dev = a_dev - b_dev.
# Like the other kernels here it is launched on a 2-D grid and addresses all
# three buffers as flat arrays at index id0 * get_global_size(1) + id1, so the
# three arrays are expected to share the shape passed as global_size.
program = thr.compile("""
KERNEL void gpu_minus(
    GLOBAL_MEM float *a_dev,
    GLOBAL_MEM float *b_dev,
    GLOBAL_MEM float *res_dev
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    res_dev[IDX] = a_dev[IDX] - b_dev[IDX] ; 
}
""")
GPUminus = program.gpu_minus
# Overwrite errors_dev (allocated by an earlier cell) with labels - predict.
GPUminus(batchTrainLabels_dev, predict_dev, errors_dev, local_size=(1,1), global_size=errors_dev.shape)

In [72]:
'set up a GPU errors back fun'
# Output buffer with the same shape as the weight matrix: layers[0] x layers[-1].
errors_back_dev = thr.array((layers[0],layers[-1]), dtype=np.float32)

# Compile a matrix product with the first operand transposed, i.e.
# batchTrain^T . errors, which maps the per-sample errors back onto the
# weight-matrix shape.
GPUerrorsBack = MatrixMul(batchTrain_dev, errors_dev, out_arr=errors_back_dev, transposed_a=True).compile(thr)
GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)
# Show the largest entry of the result as a quick magnitude check.
errors_back_dev.get().max()

217.10446

In [73]:
"update weights like this : NN_dev+= lr * errors_back_dev/batchSize"

# Learning rate for the demonstration update at the end of this cell.
# It used to be defined only in a later cell, so running the notebook top to
# bottom on a fresh kernel failed here with NameError. The value matches the
# one the training-loop cell assigns.
lr = 1e-3

# Compile the weight-update kernel: NN_dev += lr * errors_back_dev / batchSize,
# elementwise. Both buffers are addressed as flat arrays at index
# id0 * get_global_size(1) + id1, so the kernel must be launched with
# global_size equal to the weight matrix shape and given a gradient buffer of
# that same shape.
program = thr.compile("""
KERNEL void gpu_updateW(
    GLOBAL_MEM float *NN_dev,
    GLOBAL_MEM float *errors_back_dev,
    const float lr,
    const float batchSize
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    NN_dev[IDX] += lr * errors_back_dev[IDX]/batchSize ; 
}
""")
GPUupdateW = program.gpu_updateW

# Apply one update step; scalars are passed as float32 to match the kernel's
# `const float` parameters.
GPUupdateW(NN_dev, errors_back_dev, np.float32(lr), np.float32(batchSize), local_size=(1,1), global_size=NN_dev.shape)

In [74]:
# Start training from fresh weights: draw new random values, then rescale them
# in place with from=0 and to=0.1.
GPU_randomSet(counters_dev, NN_dev)
GPUrearrange(
    NN_dev,
    np.float32(0),
    np.float32(0.1),
    local_size=(1, 1),
    global_size=NN_dev.shape,
)

In [103]:
# GPU training loop: 100 mini-batch gradient steps on the single-layer softmax
# classifier, timed end to end (host-to-device batch uploads included).
lr = 1e-3
import time
start = time.time()
for i in range(100):
    # Sample a batch and upload the images and their labels as float32.
    batchIdx = getRandBatch()
    batchTrain = X_train[batchIdx,:].astype(np.float32)
    batchTrain_dev = thr.to_device(batchTrain)

    batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
    batchTrainLabels_dev = thr.to_device(batchTrainLabels)

    # Forward pass: scores, then in-place row-wise softmax.
    GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
    GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)

    'calculate errors cross entropy'
    # errors = labels - predict, then errors_back = batchTrain^T . errors.
    GPUminus(batchTrainLabels_dev, predict_dev, errors_dev, local_size=(1,1), global_size=errors_dev.shape)
    GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)

    # Fix: the update must consume the back-propagated gradient
    # (errors_back_dev, same shape as NN_dev) and be launched over NN_dev's
    # shape. It previously received errors_dev with global_size=errors_dev.shape,
    # which used the wrong quantity and indexed batchSize x 10 elements of the
    # much smaller weight buffer, i.e. wrote out of bounds.
    GPUupdateW(NN_dev, errors_back_dev, np.float32(lr), np.float32(batchSize), local_size=(1,1), global_size=NN_dev.shape)

    # Optional progress check (copies predictions back to the host each time):
    # if i%100 == 0:
    #     print((np.argmax(predict_dev.get(),1)==np.argmax(batchTrainLabels,1)).sum()/batchSize*100)

# Wait for all queued device work to finish so the elapsed time covers the
# actual computation rather than just the kernel launches.
thr.synchronize()
print('cost time:',time.time()-start)

cost time: 3.9852280616760254


## 3. Make a CPU version

In [79]:
def softmax(x):
    """Return the row-wise softmax of a 2-D NumPy array.

    Each row has its own maximum subtracted before exponentiation so that
    large values do not overflow, and is then divided by the row's sum of
    exponentials, making every row of the result sum to 1.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array; the reduction runs along axis 1.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `x`.
    """
    row_peaks = x.max(axis=1).reshape(-1, 1)
    exponentials = np.exp(x - row_peaks)
    row_totals = exponentials.sum(axis=1).reshape(-1, 1)
    return exponentials / row_totals
# Copy the current device weights to the host; the CPU loop below updates this
# array in place, so it starts from whatever state the device weights are in.
NN=NN_dev.get()

In [100]:
# CPU (NumPy) counterpart of the GPU training loop.
# NOTE(review): lr (1e-4) and the iteration count (1) differ from the GPU loop
# (1e-3, 100 iterations), so the two 'cost time' figures are not directly
# comparable -- confirm whether that is intended.
lr = 1e-4
import time

n_steps = int(1e0)
start = time.time()
for j in range(n_steps):
    # Sample a batch of images and labels as float32.
    batchIdx = getRandBatch()
    batchTrain = X_train[batchIdx, :].astype(np.float32)
    batchTrainLabels = y_train[batchIdx, :].astype(np.float32)

    # Forward pass, error (labels - prediction) and in-place weight update.
    logits = batchTrain.dot(NN)
    y = softmax(logits)
    error = batchTrainLabels - y
    NN += batchTrain.T.dot(error)/batchSize * lr

    # Report batch accuracy (in percent) every 100th step.
    if j % 100 == 0:
        hits = (np.argmax(y, 1) == np.argmax(batchTrainLabels, 1)).sum()
        print("acc:", hits/batchSize*100)
print('cost time:',time.time()-start)

Wall time: 9 ms
Wall time: 3 ms
Wall time: 27 ms
acc: 14.9
cost time: 0.0760042667388916
