Skip to content

Commit

Permalink
Add rig-counters utility.
Browse files Browse the repository at this point in the history
This application reports router counter value changes (e.g. dropped packets)
and may be used to discover if packets are being dropped in your application.
  • Loading branch information
mossblaser committed Jul 14, 2015
1 parent adc40bb commit e941ae8
Show file tree
Hide file tree
Showing 5 changed files with 570 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.rst
Expand Up @@ -89,6 +89,8 @@ The utilities provided by Rig can be broken down approximately as follows:
from a SpiNNaker application.
* ``rig-ps``: No-nonsense command line utility for listing all applications
(and their locations) in a SpiNNaker machine.
* ``rig-counters``: No-nonsense command line utility which can
non-intrusively monitor a SpiNNaker system for dropped packets.

Python Version Support
----------------------
Expand Down
67 changes: 67 additions & 0 deletions docs/source/utility_apps.rst
Expand Up @@ -143,3 +143,70 @@ command)::
X Y P State Application App ID
--- --- --- ----------------- ---------------- ------
0 0 3 sync0 network_tester 66


``rig-counters``
================

The ``rig-counters`` command reads the router diagnostic counters for all chips
in a SpiNNaker system and reports any changes in value. This can be useful, for
example, when checking if (and where) an application is dropping packets.

In the simplest use case, simply call ``rig-counters`` with a SpiNNaker
hostname as an argument, run your application and then press enter to see how
many packets were dropped::

$ rig-counters HOSTNAME
time,dropped_multicast
<press enter>
8.7,234

In the example above, 234 packets were dropped. Note that the output is in the
form of a CSV file. You can give the ``--multiple`` option to allow multiple
samples to be captured. In the example below we capture four samples::

$ rig-counters HOSTNAME --multiple > out.csv
<press enter>
<press enter>
<press enter>
<press enter>
<press enter> ^C
$ cat out.csv
time,dropped_multicast
1.0,12
1.4,34
2.3,23
2.7,11

Instead of manually pressing enter to trigger a sample, you can use the
``--command`` argument to report the number of dropped packets during the
execution of your program::

$ rig-counters HOSTNAME --command ./my_program my_args
time,dropped_multicast
10.4,102

You can also report each router's counter values individually using the
``--detailed`` option::

$ rig-counters HOSTNAME --detailed
time,x,y,dropped_multicast
<press enter>
10.4,0,0,10
10.4,0,1,2
10.4,0,2,5
...

Other router counter values can be reported too, see ``rig-counters --help``
for more details.

.. warning::

``rig-counters`` works by polling the router in every chip in a SpiNNaker
machine. This process takes some time (i.e. it isn't atomic) and also
results in P2P messages being sent through the SpiNNaker network.

The system is polled once when the utility is started and then once more
for each sample requested (e.g. every time you press enter). As a result,
you should be careful to only start or trigger a poll when the machine is
otherwise idle, for example, before or after your application runs.
204 changes: 204 additions & 0 deletions rig/scripts/rig_counters.py
@@ -0,0 +1,204 @@
"""A minimal command-line utility which samples counter values around the
machine.
Installed as "rig-counters" by setuptools.
"""

import sys
import argparse
import subprocess
import time

import rig

from six import iteritems
from six.moves import input

from rig.machine_control import MachineController

from rig.machine_control.machine_controller import RouterDiagnostics

from rig.machine_control.scp_connection import TimeoutError


def sample_counters(mc, machine):
    """Read the router diagnostics of every chip yielded by `machine`.

    Parameters
    ----------
    mc
        Controller object providing ``get_router_diagnostics(x, y)``.
    machine
        Iterable of ``(x, y)`` chip coordinates.

    Returns
    -------
    dict
        Maps each ``(x, y)`` to whatever ``mc.get_router_diagnostics(x, y)``
        returned for that chip.
    """
    samples = {}
    for x, y in machine:
        samples[(x, y)] = mc.get_router_diagnostics(x, y)
    return samples


def deltas(last, now):
    """Compute how much every counter changed between two samples.

    Parameters
    ----------
    last : dict
        Earlier sample, mapping chip coordinates to counter tuples.
    now : dict
        Later sample; must contain every key present in `last`.

    Returns
    -------
    dict
        Maps each key of `last` to a RouterDiagnostics holding the
        per-counter difference ``now - last``.
    """
    changes = {}
    for chip, before in last.items():
        after = now[chip]
        # Masking to 32 bits keeps the difference non-negative when a
        # counter has wrapped around between the two samples.
        changes[chip] = RouterDiagnostics(
            *[(new - old) & 0xFFFFFFFF for old, new in zip(before, after)])
    return changes


def monitor_counters(mc, output, counters, detailed, f):
    """Write counter changes to `output` as CSV, one sample per yield of `f`.

    Parameters
    ----------
    mc
        Controller used to discover the machine and read its counters.
    output
        File-like object; only its ``write`` method is used.
    counters : list of str
        Names of the counter attributes to report, in column order.
    detailed : bool
        If true, emit one row per chip (with ``x`` and ``y`` columns);
        otherwise emit a single row per sample summed over all chips.
    f : callable
        Called once with no arguments; a new sample is taken every time the
        iterable it returns produces a value.
    """
    # CSV header
    coordinate_columns = "x,y," if detailed else ""
    output.write("time,{}{}\n".format(coordinate_columns,
                                      ",".join(counters)))

    machine = mc.get_machine()

    # Baseline against which the first reported change is measured
    previous = sample_counters(mc, machine)

    started = time.time()

    for _ in f():
        # Each row reports the change since the previous sample, not since
        # the baseline.
        current = sample_counters(mc, machine)
        changes = deltas(previous, current)
        previous = current

        elapsed = time.time() - started

        if not detailed:
            # Accumulate each requested counter across every chip
            totals = [0] * len(counters)
            for xy in machine:
                chip_changes = changes[xy]
                totals = [total + getattr(chip_changes, name)
                          for total, name in zip(totals, counters)]
            output.write("{:0.1f},{}\n".format(
                elapsed, ",".join(str(total) for total in totals)))
            continue

        for x, y in machine:
            chip_changes = changes[(x, y)]
            values = ",".join(str(getattr(chip_changes, name))
                              for name in counters)
            output.write("{:0.1f},{},{},{}\n".format(elapsed, x, y, values))


def press_enter(multiple=False, silent=False):
    """Build a generator function which yields once per line of user input.

    Parameters
    ----------
    multiple : bool
        If false the generator finishes after the first line; if true it
        keeps yielding until input ends or the user interrupts.
    silent : bool
        If true, nothing is written to stderr (no prompt and no trailing
        newline).

    Returns
    -------
    callable
        A zero-argument generator function yielding each line read.
    """

    def wait_for_enter():
        try:
            keep_going = True
            while keep_going:
                if not silent:
                    sys.stderr.write("<press enter> ")
                    sys.stderr.flush()
                yield input()
                # Only loop again when multiple samples were requested
                keep_going = multiple
        except (EOFError, KeyboardInterrupt):
            # Ctrl+D or Ctrl+C ends sampling quietly. The newline stops the
            # shell prompt being drawn on the same line as ours.
            if not silent:
                sys.stderr.write("\n")
                sys.stderr.flush()

    return wait_for_enter


def run_command(command):
    """Build a generator function which yields once after `command` exits.

    Parameters
    ----------
    command
        Passed unchanged to ``subprocess.call``.

    Returns
    -------
    callable
        A zero-argument generator function. The command is only run when the
        generator is first advanced; it then yields a single empty string.
    """
    def wait_for_command():
        try:
            subprocess.call(command)
        except KeyboardInterrupt:
            # An interrupted command still counts as finished, so the single
            # sample below is taken regardless.
            pass
        yield ""

    return wait_for_command


def main(args=None):
    """Command-line entry point for the ``rig-counters`` utility.

    Parameters
    ----------
    args : list of str or None
        Command-line arguments, handed to ``ArgumentParser.parse_args``.

    Returns
    -------
    int
        Process exit status: 0 on success, 1 if a TimeoutError was raised
        while talking to the machine, 2 if the software version string
        reported by chip (0, 0) does not contain "SpiNNaker".

    Invalid command lines (including ``--command`` without a command) are
    rejected by argparse, which prints a usage message and exits.
    """
    parser = argparse.ArgumentParser(
        description="Report changes in router diagnostic counters in a "
                    "SpiNNaker system.")
    parser.add_argument("--version", "-V", action="version",
                        version="%(prog)s {}".format(rig.__version__))

    parser.add_argument("hostname", type=str,
                        help="hostname or IP of SpiNNaker system")

    parser.add_argument("--detailed", "-d", action="store_true",
                        help="give counter values for each chip individually "
                             "(by default, just a sum is given)")
    # The informational messages (the "<press enter>" prompt) are written to
    # stderr so that they never end up in the CSV output.
    parser.add_argument("--silent", "-s", action="store_true",
                        help="do not produce informational messages on STDERR")
    parser.add_argument("--output", "-o", type=str, default="-",
                        metavar="FILENAME",
                        help="filename to write recorded counter values to "
                             "or - for stdout (default: %(default)s)")

    when_group = parser.add_mutually_exclusive_group()
    when_group.add_argument("--command", "-c", nargs=argparse.REMAINDER,
                            help="report the difference in counter values "
                                 "before and after executing the supplied "
                                 "command")
    when_group.add_argument("--multiple", "-m", action="store_true",
                            help="allow recording of multiple snapshots "
                                 "(default: just one snapshot)")

    counter_group = parser.add_argument_group(
        "counter selection arguments",
        description="Any subset of these counters may be selected for output. "
                    "If none are specified, only dropped multicast packets "
                    "will be reported.")
    abbreviations = {
        "local": "loc",
        "external": "ext",
        "dropped": "drop",
        "multicast": "mc",
        "nearest-neighbour": "nn",
        "fixed-route": "fr",
        "counter": "c",
    }
    # Every counter gets a long option derived from its field name plus an
    # abbreviated alias, e.g. --dropped-multicast and --drop-mc.
    for counter in RouterDiagnostics._fields:
        arg_name = "--{}".format(counter.replace("_", "-"))
        short_name = arg_name
        for full, abbrev in iteritems(abbreviations):
            short_name = short_name.replace(full, abbrev)
        counter_group.add_argument(arg_name, short_name,
                                   dest="counters",
                                   action="append_const", const=counter)

    args = parser.parse_args(args)

    # nargs=REMAINDER is satisfied by zero tokens, so a bare "--command"
    # yields an empty list. Reject it here rather than polling the whole
    # machine and then failing when trying to execute nothing.
    if args.command is not None and not args.command:
        parser.error("argument --command/-c: expected a command to execute")

    try:
        mc = MachineController(args.hostname)
        info = mc.get_software_version(0, 0)
        if "SpiNNaker" not in info.version_string:
            sys.stderr.write("{}: error: unknown architecture '{}'\n".format(
                parser.prog, info.version_string.strip("\x00")))
            return 2

        counters = args.counters or ["dropped_multicast"]

        # Decide what triggers a sample: the supplied command finishing or
        # the user pressing enter.
        if args.command is None:
            f = press_enter(args.multiple, args.silent)
        else:
            f = run_command(args.command)

        if args.output == "-":
            output = sys.stdout
        else:
            output = open(args.output, "w")

        try:
            monitor_counters(mc, output, counters, args.detailed, f)
        finally:
            # Never close stdout; only files opened above.
            if output is not sys.stdout:  # pragma: no branch
                output.close()  # pragma: no branch
    except TimeoutError:
        sys.stderr.write("{}: error: command timed out\n".format(
            parser.prog))
        return 1

    return 0


# When executed directly as a script, run main() and use its return value
# as the process exit status.
if __name__ == "__main__": # pragma: no cover
    sys.exit(main())
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -104,6 +104,7 @@ def get_new_url(url):
"rig-discover = rig.scripts.rig_discover:main",
"rig-iobuf = rig.scripts.rig_iobuf:main",
"rig-ps = rig.scripts.rig_ps:main",
"rig-counters = rig.scripts.rig_counters:main",
],
}
)

0 comments on commit e941ae8

Please sign in to comment.