In [2]:
!pwd

'pwd' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
from __future__ import print_function
import time
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as pyplot


## track performance of machine 
#CPU: top -p `pgrep "python"`  #enter s1 to tell update the stat every second
#GPU: watch -n 1 nvidia-smi

In [2]:
def benchmark(devices):
  '''Benchmark each device by computing matrix products'''
  times = {device: [] for device in devices}
  sizes = range(100, 7000, 500)

  for size in sizes:

    print(f"Calculating {size}x{size} matrix product")

    for device in devices:

      shape = (size, size)
      data_type = tf.float32
      with tf.device(device):
        mat1 = tf.random_uniform(shape=shape, minval=0, maxval=1, dtype=data_type)
        mat2 = tf.random_uniform(shape=shape, minval=0, maxval=1, dtype=data_type)
        matmul = tf.matmul(mat1, mat2)

      with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
        start_time = time.time()
        result = session.run(matmul)
        time_taken = time.time() - start_time
        print(f"{device} took {round(time_taken,2)}s")
        times[device].append(time_taken)

  return times, sizes


def plot_results(devices, sizes, times):
  '''Plot the benchmark results'''
  fig, (ax1, ax2) = pyplot.subplots(2, 1, sharex=True)
 
  for device in devices:
    ax1.plot(sizes, times[device], 'o-', label=device)
  ax1.set_ylabel('Compute Time')
  ax1.set_title('Device Compute Time vs. Matrix size')
  ax1.legend(devices, loc=2)
 
  ax2.plot(sizes, np.divide(times[devices[1]], times[devices[0]]), 'o-', label=device)
  ax2.set_ylabel('GPU Speedup')
  ax2.set_xlabel('Matrix size')
  ax2.set_title('GPU Speedup vs. Matrix size')
 
  pyplot.show()


def experiment():
  '''Run an experiment that compares CPU and GPU device performance'''
  devices = ["/gpu:0", "/cpu:0"]
  times, sizes = benchmark(devices)
  plot_results(devices, sizes, times)

experiment()

Calculating 100x100 matrix product


InvalidArgumentError: Cannot assign a device for operation random_uniform/RandomUniform: node random_uniform/RandomUniform (defined at <ipython-input-2-9f9a841b0bc0>:15) was explicitly assigned to /device:GPU:0 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device. The requested device appears to be a GPU, but CUDA is not enabled.
	 [[node random_uniform/RandomUniform (defined at <ipython-input-2-9f9a841b0bc0>:15) ]]

Caused by op 'random_uniform/RandomUniform', defined at:
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
    app.start()
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\asyncio\base_events.py", line 541, in run_forever
    self._run_once()
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\asyncio\base_events.py", line 1786, in _run_once
    handle._run()
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\ioloop.py", line 688, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\ioloop.py", line 741, in _run_callback
    ret = callback()
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\gen.py", line 814, in inner
    self.ctx_run(self.run)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\ipykernel\kernelbase.py", line 545, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\IPython\core\interactiveshell.py", line 2887, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\IPython\core\interactiveshell.py", line 2932, in _run_cell
    return runner(coro)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\IPython\core\interactiveshell.py", line 3156, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\IPython\core\interactiveshell.py", line 3347, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\IPython\core\interactiveshell.py", line 3427, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-9f9a841b0bc0>", line 53, in <module>
    experiment()
  File "<ipython-input-2-9f9a841b0bc0>", line 50, in experiment
    times, sizes = benchmark(devices)
  File "<ipython-input-2-9f9a841b0bc0>", line 15, in benchmark
    mat1 = tf.random_uniform(shape=shape, minval=0, maxval=1, dtype=data_type)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tensorflow\python\ops\random_ops.py", line 247, in random_uniform
    rnd = gen_random_ops.random_uniform(shape, dtype, seed=seed1, seed2=seed2)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tensorflow\python\ops\gen_random_ops.py", line 776, in random_uniform
    name=name)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tensorflow\python\framework\ops.py", line 3300, in create_op
    op_def=op_def)
  File "C:\Users\dbwab\.conda\envs\ztdl\lib\site-packages\tensorflow\python\framework\ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Cannot assign a device for operation random_uniform/RandomUniform: node random_uniform/RandomUniform (defined at <ipython-input-2-9f9a841b0bc0>:15) was explicitly assigned to /device:GPU:0 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device. The requested device appears to be a GPU, but CUDA is not enabled.
	 [[node random_uniform/RandomUniform (defined at <ipython-input-2-9f9a841b0bc0>:15) ]]


The upper graph shows the time it takes for each device to compute matrix products for different sizes of matrices, with the CPU in orange. The lower graph shows the GPU speedup, which is the ratio of the CPU compute time to the GPU compute time.

Here are some conclusions you may have reached:


#The CPU time grows proportional to the size of the matrix squared or cubed. That is, doubling the matrix size results in a 4X to 8X increase in compute time. This follows the growth in the number of computations required to calculate matrix products of different sizesThe GPU time grows almost linearly with the size of the matrix for the sizes used in the experiment. That is doubling the size doubles the time. This indicates that the GPU hasn't reached capacity. It is able to add more compute cores to complete the computation in much shorter times than a CPU.

The compute time for matrices of size less than 1000 is similar for GPU and CPU. Sometimes the CPU performs better than GPU for these small sizes. Transfering results from GPU to CPU and initializing GPU programs create some overhead in using a GPU. In general, GPUs excel for large-scale problems.The speedup increases with the matrix size. The maximum reached is around 20X. A 20X speedup can mean a compute job finishing in 1 day on CPU, or in 1 hour on GPU. 

For larger problems, GPUs can offer speedups in the hundreds.

In this Lab Step, you analyzed the results of the experiment and gained an understanding of when GPUs can offer substantial improvements and when they are overkill. For a complete comparison you need to consider the cost of running GPU instances to running CPU-only instances. Usually, if you have problems large enough to be compute-bound, the price increase of using GPUs is significantly less than the speedup achieved by using GPUs. This results in a reduced overall price.For very large problems, you can utilize multiple GPUs in a cluster. 

AWS provides a machine learning CloudFormation template to help with setting up an auto-scaled cluster of GPUs for machine learning tasks. Challenge (Optional)If you have time remaining in your Lab Session, try to modify the code to see what size of matrix causes the GPU to become compute-bound. Observe how high the temperature reaches while running in a compute-bound state for an extended period.  Also, see what the maximum speedup you can achieve by using the GPU is.