In [None]:
import sys
import numpy as np
import tensorflow as tf
from datetime import datetime

### List of CPU and GPUs
How do you get the list of available CPUs and GPUs?

In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices reported by TensorFlow.

    Despite the function name, no filtering by device type is applied:
    every entry returned by ``device_lib.list_local_devices()`` is included.
    """
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names
get_available_gpus()

What is __XLA__?
XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear algebra that optimizes TensorFlow computations. The results are improvements in speed, memory usage, and portability on server and mobile platforms. Initially, most users will not see large benefits from XLA, but are welcome to experiment by using XLA via just-in-time (JIT) compilation or ahead-of-time (AOT) compilation. Developers targeting new hardware accelerators are especially encouraged to try out XLA.

The XLA framework is experimental and in active development. In particular, while it is unlikely that the semantics of existing operations will change, it is expected that more operations will be added to cover important use cases. The team welcomes feedback from the community about missing functionality and community contributions via GitHub.

### Logging device

I also recommend logging device placement when using GPUs, as this lets you easily debug issues relating to different device usage. This prints the usage of devices to the log, allowing you to see when devices change and how that affects the graph.

You can see that a, b and c are all run on GPU0

### ***Run it in a terminal***

In [None]:
def print_logging_device():
    """Multiply two small constant matrices with device-placement logging on.

    Builds a (2x3) x (3x2) matmul, runs it in a session created with
    ``log_device_placement=True`` and prints the resulting 2x2 matrix.
    """
    # Creates a graph.
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
    c = tf.matmul(a, b)
    # Creates a session with log_device_placement set to True.
    config = tf.ConfigProto(log_device_placement=True)
    # The context manager closes the session once the result has been printed,
    # so its resources are not held for the rest of the notebook.
    with tf.Session(config=config) as sess:
        # Runs the op. print(...) with one argument behaves the same on
        # Python 2 and Python 3.
        print(sess.run(c))
print_logging_device()

### Multiplication on gpu0 and cpu

In [None]:
print("------- Multiplication on gpu0 vs cpu ---------")
def matrix_mul(device_name, matrix_sizes):
    """Time ``reduce_sum(M @ M^T)`` for random square matrices on one device.

    Args:
        device_name: TensorFlow device string the ops are pinned to,
            e.g. "/cpu:0" or "/gpu:0".
        matrix_sizes: iterable of side lengths; one ``size x size`` random
            matrix is built and timed per entry.

    Returns:
        List of elapsed wall-clock times in milliseconds, one per entry of
        ``matrix_sizes``, in the same order.
    """
    time_values = []
    for size in matrix_sizes:
        with tf.device(device_name):
            # The matrix must actually have the requested size; a fixed
            # (2, 2) shape would benchmark the same tiny product every time.
            random_matrix = tf.random_uniform(shape=(size, size), minval=0, maxval=1)
            dot_operation = tf.matmul(random_matrix, tf.transpose(random_matrix))
            sum_operation = tf.reduce_sum(dot_operation)

        with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as session:
            start_time = datetime.now()
            session.run(sum_operation)
            # Stop the clock before the session is torn down.
            elapsed = datetime.now() - start_time
        # total_seconds() covers the whole duration; .microseconds is only the
        # sub-second component and would wrap around for runs longer than 1 s.
        elapsed_ms = elapsed.total_seconds() * 1000
        time_values.append(elapsed_ms)
        print("matrix shape:" + str(size) + "  --" + device_name + " time: " + str(elapsed_ms))
    return time_values


# Side lengths to benchmark: 100, 200, ..., 900. Materialised as a list so it
# slices and prints the same way on Python 2 and Python 3.
matrix_sizes = list(range(100, 1000, 100))
# Run the same benchmark on the first GPU and on the CPU.
time_values_gpu = matrix_mul("/gpu:0", matrix_sizes)
time_values_cpu = matrix_mul("/cpu:0", matrix_sizes)
print("GPU time" + str(time_values_gpu))
print("CPUtime" + str(time_values_cpu))
print("--------------------------------")


In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# Each curve is labelled with the device it was actually measured on.
plt.plot(matrix_sizes[:len(time_values_gpu)], time_values_gpu, label='gpu')
plt.plot(matrix_sizes[:len(time_values_cpu)], time_values_cpu, label='cpu')
# matrix_mul stores its timings in milliseconds, not seconds.
plt.ylabel('Time (ms)')
plt.xlabel('Size of Matrix ')
plt.legend(loc='best')
plt.show()

# Using Multi GPUs

In [None]:
print("------- Multi GPU ---------")
def multi_gpu(devices=('/cpu', '/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3')):
    """Run one 3x3 ones-matrix product per device and print their sum.

    Args:
        devices: non-empty sequence of TensorFlow device strings; one matmul
            is pinned to each entry. The default reproduces the original
            five-device demo (CPU plus four GPUs).
            NOTE(review): presumably every listed device must exist on the
            machine, since soft placement is not enabled - confirm.

    The per-device products are added on '/cpu:0' and the result is printed.
    Device placement is logged because the session is created with
    ``log_device_placement=True``.
    """
    products = []
    for device in devices:
        with tf.device(device):
            a = tf.ones(shape=[3, 3], dtype=tf.float32)
            b = tf.ones(shape=[3, 3], dtype=tf.float32)
            products.append(tf.matmul(a, b))
    with tf.device('/cpu:0'):
        # Named `total` so the built-in sum() is not shadowed.
        total = tf.add_n(products)
    # The context manager closes the session after the result is printed.
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        print(sess.run(total))

multi_gpu()
print("--------------------------------")

## Single vs Multi GPU

In [None]:
#Processing Units logs
log_device_placement = True

#num of multiplications to perform
n = 10

# Side length of the square test matrices. This has to be an int: it is passed
# to np.random.rand as a dimension, and a float such as 1e3 is rejected there.
matrix_size = 1000

In [None]:
# Example: compute A^n + B^n + C^n + D^n, on one GPU and then across several

# Four random square float32 matrices, generated in the order A, B, C, D
A, B, C, D = (
    np.random.rand(matrix_size, matrix_size).astype('float32')
    for _ in range(4)
)

# Lists that collect the result tensors (c1: single GPU, c2: multi GPU)
c1, c2 = [], []

# Define matrix power
def matpow(M, n):
    """Return a tensor for the matrix power M**n, built from tf.matmul ops.

    Args:
        M: square matrix tensor (anything tf.matmul accepts on both sides).
        n: exponent. For n <= 1 the input M is returned unchanged, so values
            below 1 are treated like 1 rather than producing an identity
            matrix or an inverse.

    Returns:
        M itself when n <= 1, otherwise the result of n - 1 chained
        ``tf.matmul(M, ...)`` calls.
    """
    result = M
    remaining = n
    # M**n needs n - 1 multiplications. Stopping at 1 (not 0) avoids the
    # off-by-one that would yield M**(n + 1), and the loop keeps large
    # exponents from hitting Python's recursion limit.
    while remaining > 1:
        result = tf.matmul(M, result)
        remaining -= 1
    return result

In [None]:
# Single GPU computing

with tf.device('/gpu:0'):
    # Raise A, B, C and D to the n-th power with matpow, all on the same GPU.
    # c1 is rebuilt from scratch here, so re-running this cell does not keep
    # appending duplicate terms to a list left over from a previous run.
    c1 = [matpow(tf.constant(matrix), n) for matrix in (A, B, C, D)]

with tf.device('/cpu:0'):
    # Addition of all elements in c1. Named sum_single rather than `sum` so
    # the built-in sum() stays usable in the rest of the notebook.
    sum_single = tf.add_n(c1)

# The timed span covers session creation as well as running the op.
t1_1 = datetime.now()
with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:
    # Runs the op.
    sess.run(sum_single)
t2_1 = datetime.now()

In [None]:
# Multi GPU computing: every matrix is raised to the n-th power on its own GPU
#   /gpu:0 -> A, /gpu:1 -> B, /gpu:2 -> C, /gpu:3 -> D
# c2 is rebuilt from scratch here, so re-running this cell does not keep
# appending duplicate terms to a list left over from a previous run.
c2 = []
for device_name, matrix in zip(['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3'],
                               [A, B, C, D]):
    with tf.device(device_name):
        c2.append(matpow(tf.constant(matrix), n))

with tf.device('/cpu:0'):
    # Addition of all elements in c2. Named sum_multi rather than `sum` so
    # the built-in sum() stays usable in the rest of the notebook.
    sum_multi = tf.add_n(c2)

# The timed span covers session creation as well as running the op.
t1_2 = datetime.now()
with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:
    # Runs the op.
    sess.run(sum_multi)
t2_2 = datetime.now()

In [None]:
# Wall-clock durations of the two timed runs (t2 - t1 is a datetime.timedelta).
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print("Single GPU computation time: " + str(t2_1-t1_1))
print("Multi GPU computation time: " + str(t2_2-t1_2))

Ref: Based on https://github.com/aymericdamien/TensorFlow-Examples/