In [1]:
""" Test script that uses two GPUs, one per sub-process,
via the Python multiprocessing module.  Each GPU fits a logistic regression model. """


# These imports will not trigger any theano GPU binding
from multiprocessing import Process, Manager
import numpy as np
import os

def f(shared_args, private_args):
    """Build and fit a logistic regression model on one GPU.

    Adapted from
    http://deeplearning.net/software/theano/tutorial/examples.html#a-real-example-logistic-regression

    Theano is imported inside the function so that the GPU binding happens
    in the calling (sub-)process rather than at module import time.

    Args:
        shared_args: Indexable container whose first element is a dict with
            the keys 'N' (number of samples), 'n_features' and 'n_steps'
            (number of training iterations).
        private_args: Dict with the key 'gpu'; its value is handed to
            ``theano.sandbox.cuda.use`` to select the device.

    Returns:
        None.  The initial model, the final model, the targets and the
        predictions are printed to stdout.
    """

    # Import sandbox.cuda first to bind the specified GPU to this process,
    # then import the remaining theano modules.
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    # NOTE(review): this is numpy's global RNG.  If the sub-processes are
    # forked they presumably inherit the same RNG state and draw identical
    # data and initial weights -- confirm whether that is intended.
    rng = np.random

    # Pull the problem size and the number of training steps from the
    # shared arguments.
    shared_args_dict = shared_args[0]
    N = shared_args_dict['N']
    feats = shared_args_dict['n_features']
    training_steps = shared_args_dict['n_steps']

    # Random training set: an (N, feats) input matrix and N binary targets.
    # The order of the RNG calls (inputs, targets, then weights below) is
    # kept so the drawn values match the original implementation.
    D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))

    # Declare Theano symbolic variables
    x = T.matrix("x")
    y = T.vector("y")
    w = theano.shared(rng.randn(feats), name="w")
    b = theano.shared(0., name="b")
    print("Initial model:")
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print("%s %s" % (w.get_value(), b.get_value()))

    # Construct Theano expression graph
    p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))        # Probability that target = 1
    prediction = p_1 > 0.5                         # The prediction thresholded
    xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1)  # Cross-entropy loss
    cost = xent.mean() + 0.01 * (w ** 2).sum()     # The cost to minimize
    gw, gb = T.grad(cost, [w, b])                  # Gradient of the cost

    # Compile.  allow_input_downcast reassures the compiler that we are ok using
    # 64 bit floating point numbers on the cpu, but only 32 bit floats on the gpu.
    train = theano.function(
        inputs=[x, y],
        outputs=[prediction, xent],
        updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)),
        allow_input_downcast=True)
    predict = theano.function(
        inputs=[x], outputs=prediction, allow_input_downcast=True)

    # Train.  Each call applies one gradient-descent update to w and b; the
    # returned prediction/loss values are not needed here.
    for _ in range(training_steps):
        train(D[0], D[1])

    print("Final model:")
    print("%s %s" % (w.get_value(), b.get_value()))
    print("target values for D: %s" % (D[1],))
    print("prediction on D: %s" % (predict(D[0]),))



if __name__ == '__main__':

    # Arguments common to both workers are kept in a Manager-backed list so
    # that each sub-process can read them through a proxy.  The Manager class
    # is a convenient way to implement this.
    # See: http://docs.python.org/2/library/multiprocessing.html#managers
    #
    # Important: managers store information in mutable *proxy* data structures
    # but any mutation of a value read from the proxy must be explicitly
    # written back to the manager.  The dict is therefore filled in completely
    # before it is stored in the managed list.
    manager = Manager()

    common_settings = {
        'N': 400,
        'n_features': 784,
        'n_steps': 10000,
    }
    args = manager.list()
    args.append(common_settings)

    # One worker per GPU; each gets its own private dict naming the device.
    gpu_names = ('gpu0', 'gpu1')
    workers = [Process(target=f, args=(args, {'gpu': gpu_name}))
               for gpu_name in gpu_names]

    # Launch every worker before waiting on any of them so they run
    # concurrently, then wait for all of them to finish.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

ERROR (theano.sandbox.cuda): ERROR: Not using GPU. Initialisation of device 0 failed:
initCnmem: cnmemInit call failed! Reason=CNMEM_STATUS_OUT_OF_MEMORY. numdev=1

ERROR (theano.sandbox.cuda): ERROR: Not using GPU. Initialisation of device gpu failed:
initCnmem: cnmemInit call failed! Reason=CNMEM_STATUS_OUT_OF_MEMORY. numdev=1

Process Process-3:
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-1-2837a9bf904c>", line 17, in f
    import theano.sandbox.cuda
  File "/home/ubuntu/.local/lib/python2.7/site-packages/theano/__init__.py", line 111, in <module>
    theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
  File "/home/ubuntu/.local/lib/python2.7/site-packages/theano/sandbox/cuda/tests/test_driver.py", line 29, in test_nvidia_driver1
    A = cuda.shared_

Initial model:
[ -2.56179933e-01   6.18712655e-01   1.02705605e+00  -6.70996065e-01
  -1.20930083e+00   8.43825347e-01   8.74943103e-01  -8.43005680e-02
  -1.25670704e+00  -3.28611649e-01   4.14106854e-01   1.10094314e+00
   1.98969590e-02   2.72391231e-01   7.23872058e-01  -5.92196349e-01
  -1.09857687e-01   1.73370342e+00  -3.11692267e-01   5.11087971e-01
  -1.02661673e+00  -1.16882182e+00  -6.77368641e-01  -3.15455121e-01
  -1.32469230e+00  -1.16127621e+00   1.45456856e+00   2.57588895e+00
  -2.25824074e-02  -2.39695160e-01  -4.69894436e-01   1.48111230e-01
  -5.29882742e-01   1.12209310e+00   1.73338003e+00  -2.94644037e-01
   2.96336644e-01   2.09995544e+00  -8.51032255e-02  -9.12045713e-01
   7.63093696e-01   1.08296296e+00  -5.15658458e-01   4.50786532e-02
  -2.16772813e+00  -4.29496839e-01   5.26844359e-01   3.44125377e-02
   9.24468420e-01  -4.08535591e-01   3.15661443e-01  -3.52225170e-01
   8.39324873e-01   2.84695177e-01   1.41319873e+00   2.00442285e-01
  -9.04817545e-01  