-
Notifications
You must be signed in to change notification settings - Fork 117
[test] Add check for IPCMagic #1360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
79deac5
add ipcmagic check
rsarm bf0b81d
use default module version
rsarm f66118e
fix comments
rsarm ac53d19
add license
rsarm 9e8dbbd
Merge branch 'master' into check/ipcmagic
b8585f6
Merge branch 'master' into check/ipcmagic
32d6270
Merge branch 'master' into check/ipcmagic
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| # Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich) | ||
| # ReFrame Project Developers. See the top-level LICENSE file for details. | ||
| # | ||
| # SPDX-License-Identifier: BSD-3-Clause | ||
|
|
||
| import reframe as rfm | ||
| import reframe.utility.sanity as sn | ||
| from reframe.core.backends import getlauncher | ||
|
|
||
|
|
||
| # Run-only ReFrame check that executes `ipython tf-hvd-sgd-ipc-tf-1.14.py` | ||
| # with two tasks (one per node) and validates the script's standard output. | ||
| @rfm.simple_test | ||
| class IPCMagicCheck(rfm.RunOnlyRegressionTest): | ||
| def __init__(self): | ||
| self.descr = 'Distributed training with TensorFlow using ipyparallel' | ||
| self.valid_systems = ['daint:gpu', 'dom:gpu'] | ||
| self.valid_prog_environs = ['PrgEnv-gnu'] | ||
| self.modules = ['ipcmagic'] | ||
| # Shell commands emitted before the executable is run. | ||
| self.pre_run = [ | ||
| 'module unload dask', | ||
| 'module load Horovod/0.16.4-CrayGNU-19.10-tf-1.14.0' | ||
| ] | ||
| self.num_tasks = 2 | ||
| self.num_tasks_per_node = 1 | ||
| self.executable = 'ipython' | ||
| self.executable_opts = ['tf-hvd-sgd-ipc-tf-1.14.py'] | ||
| # Sanity: collect every `nid<digits>` match from stdout, require at | ||
| # least one match, and require the first two matches to differ. | ||
| # NOTE(review): presumably these are the hostnames printed by the | ||
| # engines, so differing values mean two distinct nodes - confirm. | ||
| nids = sn.extractall(r'nid(?P<nid>\d+)', | ||
| self.stdout, 'nid', str) | ||
| self.sanity_patterns = sn.all([ | ||
| sn.assert_ne(nids, []), | ||
| sn.assert_ne(nids[0], nids[1]) | ||
| ]) | ||
| # Per-system reference values, keyed like `perf_patterns` below. | ||
| # NOTE(review): tuples look like ReFrame's | ||
| # (value, lower threshold, upper threshold, unit) - confirm. | ||
| self.reference = { | ||
| 'daint:gpu': { | ||
| 'slope': (2.0, -0.1, 0.1, None), | ||
| 'offset': (0.0, -0.1, 0.1, None), | ||
| 'retries': (0, None, None, None), | ||
| 'time': (10, None, None, 's'), | ||
| }, | ||
| 'dom:gpu': { | ||
| 'slope': (2.0, -0.1, 0.1, None), | ||
| 'offset': (0.0, -0.1, 0.1, None), | ||
| 'retries': (0, None, None, None), | ||
| 'time': (10, None, None, 's'), | ||
| } | ||
| } | ||
| # Performance values parsed from stdout: | ||
| # slope / offset - floats following `slope=` and `offset=` | ||
| # retries - 4 minus the number of | ||
| # 'IPCluster is already running' lines | ||
| # time - seconds reported after 'IPCluster is ready!' | ||
| self.perf_patterns = { | ||
| 'slope': sn.extractsingle(r'slope=(?P<slope>\S+)', | ||
| self.stdout, 'slope', float), | ||
| 'offset': sn.extractsingle(r'offset=(?P<offset>\S+)', | ||
| self.stdout, 'offset', float), | ||
| 'retries': 4 - sn.count(sn.findall(r'IPCluster is already running', | ||
| self.stdout)), | ||
| 'time': sn.extractsingle(r'IPCluster is ready\!\s+' | ||
| r'\((?P<time>\d+) seconds\)', | ||
| self.stdout, 'time', float) | ||
| } | ||
| self.maintainers = ['RS', 'TR'] | ||
| self.tags = {'production'} | ||
|
|
||
| # Hook executed before the run stage: replaces the job launcher. | ||
| @rfm.run_before('run') | ||
| def prepare_run(self): | ||
| # Change the job launcher since `ipython` | ||
| # needs to be launched without `srun`. | ||
| self.job.launcher = getlauncher('local')() | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| # Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich) | ||
| # ReFrame Project Developers. See the top-level LICENSE file for details. | ||
| # | ||
| # SPDX-License-Identifier: BSD-3-Clause | ||
|
|
||
| import ipcmagic | ||
| import ipyparallel as ipp | ||
|
|
||
|
|
||
| # Print the version reported by the `%ipcluster` line magic. | ||
| get_ipython().run_line_magic('ipcluster', '--version') | ||
|
|
||
||
|
|
||
||
| # Request a cluster via `%ipcluster start -n 2 --mpi`. | ||
| get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi') | ||
| # Repeat a few times in case of `TimeoutError`. | ||
| # After the cluster starts, the following calls do nothing | ||
| # other than print "IPCluster is already running". | ||
| # This mimics what the user would do in such case. | ||
| get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi') | ||
| get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi') | ||
| get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi') | ||
| get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi') | ||
|
|
||
||
| # Create an ipyparallel client. | ||
| c = ipp.Client() | ||
|
|
||
||
| print('cluster ids:', c.ids) | ||
|
|
||
||
| # The `%%px` cells below are submitted through the `px` cell magic. | ||
| # This one prints the current user's processes whose `ps` line contains "ip". | ||
| get_ipython().run_cell_magic('px', '', 'import os\nprint(os.popen("ps -u $USER | grep ip").read())') | ||
|
|
||
||
| # Evaluate the hostname inside the `px` cell. | ||
| get_ipython().run_cell_magic('px', '', 'import socket\nsocket.gethostname()') | ||
|
|
||
||
| get_ipython().run_cell_magic('px', '', 'import numpy as np\nimport tensorflow as tf\nimport horovod.tensorflow as hvd') | ||
|
|
||
||
| get_ipython().run_cell_magic('px', '', 'hvd.init()') | ||
|
|
||
||
| # Synthetic linear-regression data: y = 2.0 * x + 0.0 + noise. | ||
| get_ipython().run_cell_magic('px', '', '# Note that the generated rando data is different from one node to the other\nnsamples = 1000\nref_slope = 2.0\nref_offset = 0.0\nnoise = np.random.random((nsamples, 1)) - 0.5\nx_train = np.random.random((nsamples, 1)) - 0.5\ny_train = ref_slope * x_train + ref_offset + noise') | ||
|
|
||
||
| get_ipython().run_cell_magic('px', '', '#input pipeline\ndataset = tf.data.Dataset.from_tensor_slices((x_train.astype(np.float32),\n y_train.astype(np.float32)))\ndataset = dataset.shard(hvd.size(), hvd.rank())\ndataset = dataset.batch(500)\ndataset = dataset.repeat(500)\niterator = dataset.make_one_shot_iterator()\nnext_item = iterator.get_next()') | ||
|
|
||
||
| get_ipython().run_cell_magic('px', '', '# Define the model\nslope = tf.Variable(np.random.randn())\noffset = tf.Variable(np.random.randn())\n\nx, y = next_item # The model is the continuation of the pipeline\n\ny_hat = slope * x + offset\n\nloss = tf.losses.mean_squared_error(y_hat, y)\n\nopt = tf.train.GradientDescentOptimizer(.5)\ntrain = hvd.DistributedOptimizer(opt).minimize(loss)') | ||
|
|
||
||
| get_ipython().run_cell_magic('px', '', 'hooks = [hvd.BroadcastGlobalVariablesHook(0)]') | ||
|
|
||
||
| # Training loop; each step appends [slope, offset, loss] to `history`. | ||
| get_ipython().run_cell_magic('px', '', "history = []\n\nwith tf.train.MonitoredTrainingSession(hooks=hooks) as sess:\n # Initialization of the variables `slope` and `offset`\n # is done automatically by tf.train.MonitoredTrainingSession\n print('rank', hvd.rank(),\n 'inital slope = %12.6f\\n initial offset = %12.6f' %\n sess.run((slope, offset)))\n while not sess.should_stop():\n _, loss_val, m, n = sess.run((train, loss, slope, offset))\n history.append([sess.run(slope), sess.run(offset), loss_val])") | ||
|
|
||
||
| # Print the last recorded values as `slope=... offset=... loss=...`; | ||
| # the ReFrame check extracts `slope=` and `offset=` from stdout. | ||
| get_ipython().run_cell_magic('px', '', "print('slope=%f offset=%f loss=%f' % tuple(history[-1]))") | ||
|
|
||
||
| # Stop the cluster via `%ipcluster stop`. | ||
| get_ipython().run_line_magic('ipcluster', 'stop') |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.