## 1. Set up a Neural Network
Set up the MNIST data sets

In [1]:
from keras.datasets import mnist
from keras.utils import np_utils

# Load the MNIST data sets (requires the keras package).
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image into one row and divide by 255.0. The number of samples
# is taken from the data itself and the number of pixels per image is inferred
# by reshape, so no data-set sizes are hard-coded here.
X_train = X_train.reshape(len(X_train), -1)/255.0
X_test = X_test.reshape(len(X_test), -1)/255.0

# Convert the integer class labels into one row per sample, one column per class.
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

print('X_train.shape',X_train.shape)
print('y_train.shape',y_train.shape)

Using TensorFlow backend.


X_train.shape (60000, 784)
y_train.shape (60000, 10)


Set up the neural network

In [2]:
import numpy as np
import reikna.cluda as cluda
from reikna.core import Type
from reikna.cbrng import CBRNG
from reikna.cbrng.samplers import uniform_float
from reikna.cbrng.bijections import threefry

# Pick the compute backend. This notebook uses OpenCL; the original note says
# cuda_api() can be swapped in to use CUDA instead.
api = cluda.ocl_api()  # alternative: cluda.cuda_api()
# Thread object used by the following cells to compile kernels and to create
# and copy device arrays.
thr = api.Thread.create()

In [3]:
import os

# Environment variable read by PyOpenCL.
# NOTE(review): presumably controls compiler-output reporting; confirm that '-1' is the intended value.
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '-1'

'input :784 units, output layer:10 units'
layers = (784, 10)

# Start from an all-zero float32 weight matrix on the host and copy it to the device.
NN = np.zeros(layers, dtype=np.float32)
NN_dev = thr.to_device(NN)
print('before random set')
print(NN_dev.get())

'set up GPU random set Neural Networks fun'
# Counter-based random number generator that fills a float32 array of shape
# `layers` using the uniform_float sampler on top of the threefry bijection.
rng = CBRNG(
    Type(np.float32, shape=layers),
    1,
    uniform_float(threefry(32, 4), np.float32),
)
counters_dev = thr.to_device(rng.create_counters())
GPU_randomSet = rng.compile(thr)

# Overwrite the device weights with freshly generated random values.
GPU_randomSet(counters_dev, NN_dev)
print()
print('after random set')
print(NN_dev.get())

before random set
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]

after random set
[[  7.32800841e-01   8.28513563e-01   5.42470515e-01 ...,   7.22194314e-01
    6.18905425e-01   5.99146783e-01]
 [  3.99667680e-01   2.76426405e-01   3.96757483e-01 ...,   3.25989783e-01
    1.95984095e-01   6.40559010e-04]
 [  9.37009275e-01   6.49996817e-01   3.28292876e-01 ...,   9.54684675e-01
    4.18014288e-01   7.68346786e-01]
 ..., 
 [  7.48250246e-01   2.66601145e-01   9.70116079e-01 ...,   3.01580012e-01
    8.58210742e-01   6.39033556e-01]
 [  9.61607039e-01   5.93545675e-01   2.29457393e-01 ...,   2.07748637e-01
    9.61972833e-01   8.11869085e-01]
 [  5.91419935e-01   1.44220488e-02   4.18172687e-01 ...,   8.73038411e-01
    3.65534961e-01   3.94895285e-01]]


Rescale the weights into the range [-1, 1]

In [4]:
# Compile an element-wise kernel that rewrites every value v of a float array
# in place as  from + v * (to - from).
# The kernel is meant to be launched with a 2-D global size; each work-item
# handles the element at flat index  id0 * get_global_size(1) + id1, so the
# global size should equal the 2-D shape of the (row-major) array.
program = thr.compile("""
KERNEL void gpu_rearrange(
    GLOBAL_MEM float *input,
    const float from,
    const float to
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    input[IDX] = from+input[IDX] * (to-from) ; 
}
""")
# Keep a handle to the compiled kernel for later launches.
GPUrearrange = program.gpu_rearrange

In [5]:
# Apply the rearrange kernel to the device weights with from=-1 and to=1,
# one work-item per matrix element.
GPUrearrange(
    NN_dev,
    np.float32(-1.0),
    np.float32(1.0),
    global_size=NN_dev.shape,
    local_size=(1, 1),
)
NN_dev

array([[ 0.46560168,  0.65702713,  0.08494103, ...,  0.44438863,
         0.23781085,  0.19829357],
       [-0.20066464, -0.44714719, -0.20648503, ..., -0.34802043,
        -0.60803181, -0.99871886],
       [ 0.87401855,  0.29999363, -0.34341425, ...,  0.90936935,
        -0.16397142,  0.53669357],
       ..., 
       [ 0.49650049, -0.46679771,  0.94023216, ..., -0.39683998,
         0.71642148,  0.27806711],
       [ 0.92321408,  0.18709135, -0.54108524, ..., -0.5845027 ,
         0.92394567,  0.62373817],
       [ 0.18283987, -0.97115588, -0.16365463, ...,  0.74607682,
        -0.26893008, -0.21020943]], dtype=float32)

Set up the GPU feedforward function

In [6]:
from reikna.linalg import MatrixMul
'set up GPU feedforward fun'
batchSize = 3000


def getRandBatch():
    """Return batchSize randomly chosen row indices into X_train as an int32 array."""
    candidates = np.arange(len(X_train))
    return np.random.choice(candidates, batchSize).astype(np.int32)


# Draw one batch and upload it; the matrix product below is set up against
# these device arrays.
batchTrain = X_train[getRandBatch()].astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)

# Device buffer for the layer output: one row per sample, one column per output unit.
predict_dev = thr.array((batchSize, layers[-1]), dtype=np.float32)

# Describe the product batchTrain_dev . NN_dev -> predict_dev, then compile it for this thread.
feedforward = MatrixMul(batchTrain_dev, NN_dev, out_arr=predict_dev)
GPUfeedforward = feedforward.compile(thr)

In [7]:
# Run the compiled matrix product on the device.
GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
print(predict_dev)
print("Check GPU result with CPU result if they are the same :")
# Compare against the same product computed with numpy, using the relative
# Frobenius-norm difference.
cpu_result = batchTrain.dot(NN_dev.get())
relative_diff = np.linalg.norm(predict_dev.get() - cpu_result) / np.linalg.norm(cpu_result)
print(relative_diff < 1e-6)

[[-2.73258877 -2.45554996  1.83694613 ..., -3.51368475 -4.25749969
   1.54934692]
 [ 4.49922514  0.65834707  9.97752953 ..., -1.88652945 -0.33362553
  -3.00773001]
 [ 3.848526   -4.36007071  9.3614769  ...,  6.2664299  -6.49548721
  -7.46111631]
 ..., 
 [ 3.20629644 -0.84305209  2.04479122 ..., -9.59047508 -1.53382289
  -5.06183577]
 [-1.09001207 -0.83752155 -2.12862182 ..., -3.34624171 -1.56633997
   1.45993412]
 [ 2.57909489  1.7980994   0.74181187 ...,  1.27912402  6.56489849
  -7.73308754]]
Check GPU result with CPU result if they are the same :
True


Pre-reading: http://cs231n.github.io/linear-classify/#softmax

In [8]:
# Set up a GPU softmax function.
#
# The kernel applies a numerically stable softmax in place to every row of a
# row-major 2-D float array. It must be launched with global_size equal to the
# array's 2-D shape: the work-item with column index 0 processes its whole
# row, all other work-items of that row return immediately.
program = thr.compile("""
KERNEL void gpu_softmax(
    GLOBAL_MEM float *input
    )
{
    const SIZE_T i0 = get_global_id(0);
    const SIZE_T i1 = get_global_id(1);
    // Why terminate? Softmax needs the sum over [i0,0] .. [i0,end], so a
    // single work-item per row does all the work for that row.
    if(i1>0)return;

    const int n_cols = (int)get_global_size(1);
    const int row_start = (int)i0*n_cols;

    // Seed the running maximum with the first element of the row. Seeding it
    // with 0 yields the wrong maximum when every value in the row is
    // negative; for very negative rows all exp() calls then underflow to 0,
    // the sum becomes 0 and the division produces NaN.
    float row_max = input[row_start];
    for(int i=1;i<n_cols;i++){
        if(row_max<input[row_start+i])row_max=input[row_start+i];
    }

    // Exponentiate the shifted values in place and accumulate their sum.
    float s = 0.0f;
    for(int i=0;i<n_cols;i++){
      input[row_start+i]=exp(input[row_start+i]-row_max);
      s+=input[row_start+i];
    }

    // Normalise so that the row sums to 1.
    for(int i=0;i<n_cols;i++){
      input[row_start+i]/=s;
    }
}
""")
GPUsoftmax = program.gpu_softmax
# Turn the feedforward output into per-row probabilities, in place.
GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)
predict_dev

array([[  6.42055795e-07,   8.47011165e-07,   6.19579005e-05, ...,
          2.93999705e-07,   1.39737310e-07,   4.64722689e-05],
       [  1.41008140e-03,   3.02811277e-05,   3.37630421e-01, ...,
          2.37654695e-06,   1.12295857e-05,   7.74488853e-07],
       [  3.80493514e-03,   1.03609295e-06,   9.43172991e-01, ...,
          4.27001640e-02,   1.22461444e-07,   4.66263863e-08],
       ..., 
       [  2.21340675e-02,   3.85879393e-04,   6.92828791e-03, ...,
          6.13047249e-08,   1.93398664e-04,   5.67880306e-06],
       [  1.05689876e-02,   1.36046885e-02,   3.74085526e-03, ...,
          1.10704510e-03,   6.56397175e-03,   1.35350823e-01],
       [  4.95724613e-03,   2.27017049e-03,   7.89438898e-04, ...,
          1.35104673e-03,   2.66841263e-01,   1.64708695e-07]], dtype=float32)

Summary

In [9]:
# Draw a new batch of images together with the matching label rows and upload both.
batchIdx = getRandBatch()
batchTrain = X_train[batchIdx].astype(np.float32)
batchTrainLabels = y_train[batchIdx].astype(np.float32)
batchTrain_dev = thr.to_device(batchTrain)
batchTrainLabels_dev = thr.to_device(batchTrainLabels)

# Regenerate the random weights and rearrange them with from=-1, to=1.
GPU_randomSet(counters_dev, NN_dev)
GPUrearrange(
    NN_dev,
    np.float32(-1),
    np.float32(1),
    global_size=NN_dev.shape,
    local_size=(1, 1),
)

# Forward pass: matrix product followed by the in-place softmax.
GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
GPUsoftmax(predict_dev, global_size=predict_dev.shape, local_size=(1, 1))

# Element-wise difference of the two device arrays; the result is kept as errors_dev.
errors_dev = predict_dev - batchTrainLabels_dev
errors_dev

array([[  2.53711075e-01,   4.58997056e-06,   5.70994765e-02, ...,
         -9.99999404e-01,   7.84337608e-05,   6.61649287e-01],
       [  8.42314184e-01,   3.19325886e-06,   9.87458900e-02, ...,
          3.13316741e-05,   8.44048486e-07,   2.50185898e-04],
       [  7.51112704e-04,  -9.98892307e-01,   5.49790263e-02, ...,
          1.35208402e-05,   9.70839551e-07,   2.05836864e-03],
       ..., 
       [  1.08720349e-06,   4.73830570e-03,  -9.98037875e-01, ...,
          4.05637650e-07,   4.03369768e-07,   4.98181616e-05],
       [  3.38208564e-02,   9.27042711e-05,   9.65389669e-01, ...,
          1.06395625e-07,   1.24160419e-04,   3.87191860e-04],
       [  1.01235946e-05,   3.74680189e-06,   2.94366419e-05, ...,
          1.44698337e-04,   6.81707636e-02,   6.25829995e-01]], dtype=float32)

## 2. Set up Backpropagation

In [10]:
"get errors like this : errors_dev=batchTrainLabels_dev-predict_dev"

# Compile an element-wise subtraction kernel: res_dev[i] = a_dev[i] - b_dev[i].
# It is meant to be launched with a 2-D global size; each work-item handles
# the element at flat index  id0 * get_global_size(1) + id1.
program = thr.compile("""
KERNEL void gpu_minus(
    GLOBAL_MEM float *a_dev,
    GLOBAL_MEM float *b_dev,
    GLOBAL_MEM float *res_dev
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    res_dev[IDX] = a_dev[IDX] - b_dev[IDX] ; 
}
""")
GPUminus = program.gpu_minus
# Overwrite errors_dev with (labels - predictions), one work-item per element.
GPUminus(batchTrainLabels_dev, predict_dev, errors_dev, local_size=(1,1), global_size=errors_dev.shape)

<pyopencl.cffi_cl.Event at 0x13bd24630>

In [11]:
'set up a GPU errors back fun'
# Output buffer with the same shape as the weight matrix: layers[0] x layers[-1].
errors_back_dev = thr.array((layers[0], layers[-1]), dtype=np.float32)

# Matrix product with the first operand transposed (transposed_a=True),
# i.e. batchTrain_dev^T . errors_dev, written into errors_back_dev.
errorsBack = MatrixMul(
    batchTrain_dev,
    errors_dev,
    transposed_a=True,
    out_arr=errors_back_dev,
)
GPUerrorsBack = errorsBack.compile(thr)
GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)

# Largest entry of the result, copied back to the host for inspection.
errors_back_dev.get().max()

278.229

In [12]:
"update weights like this : NN_dev+= lr * errors_back_dev/batchSize"
# Learning rate passed to the update kernel below.
lr = 1e-3
# Compile an element-wise, in-place update kernel:
#   NN_dev[i] += lr * errors_back_dev[i] / batchSize
# It is meant to be launched with a 2-D global size; each work-item handles
# the element at flat index  id0 * get_global_size(1) + id1.
program = thr.compile("""
KERNEL void gpu_updateW(
    GLOBAL_MEM float *NN_dev,
    GLOBAL_MEM float *errors_back_dev,
    const float lr,
    const float batchSize
    )
{
    const SIZE_T id0 = get_global_id(0);
    const SIZE_T id1 = get_global_id(1);
    int IDX = id0*get_global_size(1)+id1;
    NN_dev[IDX] += lr * errors_back_dev[IDX]/batchSize ; 
}
""")
GPUupdateW = program.gpu_updateW
# Apply one update step; the scalars are passed as float32 to match the
# kernel's `const float` parameters.
GPUupdateW(NN_dev, errors_back_dev, (np.float32)(lr), (np.float32)(batchSize), local_size=(1,1), global_size=NN_dev.shape)

<pyopencl.cffi_cl.Event at 0x13bd24ef0>

In [13]:
# Regenerate the random weights, then rearrange them with from=0 and to=0.1.
GPU_randomSet(counters_dev, NN_dev)
GPUrearrange(
    NN_dev,
    np.float32(0),
    np.float32(0.1),
    global_size=NN_dev.shape,
    local_size=(1, 1),
)

<pyopencl.cffi_cl.Event at 0x13bdb0ac8>

In [14]:
lr = 1e-3
# Alternative: upload the full training set once instead of one batch per step.
# batchTrain_dev = thr.to_device(X_train[:,:].astype(np.float32))
# batchTrainLabels_dev = thr.to_device(y_train[:,:].astype(np.float32))

import time
start = time.time()
for i in range(100):
    # Draw a random batch on the host and copy images and labels to the device.
    batchIdx = getRandBatch()
    batchTrain = X_train[batchIdx,:].astype(np.float32)
    batchTrain_dev = thr.to_device(batchTrain)

    batchTrainLabels = y_train[batchIdx,:].astype(np.float32)
    batchTrainLabels_dev = thr.to_device(batchTrainLabels)

    # Forward pass: matrix product followed by the in-place softmax.
    GPUfeedforward(predict_dev, batchTrain_dev, NN_dev)
    GPUsoftmax(predict_dev, local_size=(1,1), global_size=predict_dev.shape)

    # Backward pass: errors = labels - predictions, propagate them through the
    # transposed batch, then update the weights in place.
    GPUminus(batchTrainLabels_dev, predict_dev, errors_dev, local_size=(1,1), global_size=errors_dev.shape)
    GPUerrorsBack(errors_back_dev, batchTrain_dev, errors_dev)
    GPUupdateW(NN_dev, errors_back_dev, (np.float32)(lr), (np.float32)(batchSize), local_size=(1,1), global_size=NN_dev.shape)
    if i%100 == 0:
        # Batch accuracy in percent.
        print((np.argmax(predict_dev.get(),1)==np.argmax(batchTrainLabels_dev.get(),1)).sum()/batchSize*100)
# Device operations may be queued asynchronously; wait for all of them to
# finish so the measured time covers the GPU work and not just the enqueueing.
thr.synchronize()
print('cost time:',time.time()-start)

12.8666666667
cost time: 2.0018649101257324


## 3. make a CPU version

In [15]:
def softmax(x):
    """Row-wise softmax of a 2-D array.

    The maximum of each row is subtracted before exponentiating, then every
    row is divided by its own sum, so each row of the result sums to 1.
    """
    row_peak = np.max(x, axis=1).reshape(-1, 1)
    exps = np.exp(x - row_peak)
    row_total = np.sum(exps, axis=1).reshape(-1, 1)
    return exps / row_total
# Copy the current device weights back to the host; the CPU loop below
# updates this host array.
NN=NN_dev.get()

In [16]:
lr = 1e-3
# batchTrain = X_train[:,:].astype(np.float32)
# batchTrainLabels = y_train[:,:].astype(np.float32)
import time
start = time.time()
for step in range(100):
    # Draw a random batch of images with the matching label rows.
    batchIdx = getRandBatch()
    batchTrain = X_train[batchIdx].astype(np.float32)
    batchTrainLabels = y_train[batchIdx].astype(np.float32)

    # Forward pass, error against the labels, then the weight update.
    y = softmax(batchTrain.dot(NN))
    error = batchTrainLabels - y
    grad = batchTrain.T.dot(error)/batchSize
    NN += grad * lr

    if step % 100 == 0:
        # Batch accuracy in percent.
        hits = (np.argmax(y,1)==np.argmax(batchTrainLabels,1)).sum()
        print("acc:",hits/batchSize*100)
print('cost time:',time.time()-start)

acc: 22.3666666667
cost time: 1.7499561309814453


CPU is faster than GPU !!!

The main reason is that transferring the X_train batches to the device ( to_device() ) on every step costs too much time.

Try restarting the kernel and using the full training data set, which needs only a single transfer ( to_device() ).

CPU 100 times : cost time: 7.946720123291016

GPU 100 times : cost time: 0.19799399375915527

## Summary

This tutorial may seem a little difficult, but the CPU version shows that this kind of machine learning is not so complex (except for some of the math).

If you give a read of GPU codes only, you will find that things at GPU are also simple.

The next tutorial will introduce the Genetic Algorithm, which does not have a GPU solution in Python but is easy to code.