Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Allow better testing of flappy compilers by specifying max failures in a single SCALE before stopping that test for that compiler
  • Loading branch information
Geoffrey Broadwell committed Nov 1, 2014
1 parent 3e2d044 commit fa5cc93
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
3 changes: 2 additions & 1 deletion bench
Expand Up @@ -445,12 +445,13 @@ multi MAIN ('history', *@timings, :$format?, :$style?, :$outfile?,
multi MAIN ('stress', *@components, :$variants?, :$tests?,
:$tests-tagged = 'stress',
:$runs = 10, :$enough-time?,
:$max-failures = Int($runs / 2 + .5),
:$min-scaling-points?,
Bool :$verbose?) {
needs-setup('stress test Perls');

my @options = as-options(:$tests, :$tests-tagged, :$runs, :$enough-time,
:$min-scaling-points, :$verbose);
:$max-failures, :$min-scaling-points, :$verbose);

for-checkouts @components, -> $comp, $name, $checkout {
chdir "$COMPONENTS_DIR/$name/$checkout";
Expand Down
32 changes: 22 additions & 10 deletions timeall
Expand Up @@ -21,6 +21,7 @@ use Cwd;

my %DEFAULT = (
TEST_RUNS => 2, # times to run each test, taking best time
MAX_FAILURES => 5, # maximum number of failed runs by a given compiler at a given SCALE to still continue that test
MIN_STARTUP_RUNS => 10, # minimum times to run startup time test
ENOUGH_TIME => 1.0, # minimum run time (seconds) to give clean timing data for scalable tests
MIN_SCALING_POINTS => 3, # minimum different points to test on each scaling curve
Expand Down Expand Up @@ -176,7 +177,8 @@ sub known_tags {
sub process_options_and_arguments {
my %opt;
GetOptions(\%opt, 'help|h|?!', 'man!', 'verbose!', 'format=s', 'outfile=s',
'runs=i', 'enough-time=f', 'min-scaling-points=i',
'runs=i', 'max-failures=i',
'enough-time=f', 'min-scaling-points=i',
'list-variants!', 'list-tests!', 'tests=s',
'list-tags!', 'list-tests-tagged=s', 'tests-tagged=s')
or pod2usage(-verbose => 0);
Expand Down Expand Up @@ -314,23 +316,24 @@ sub run_all_tests {
my $overhead_runs = $startup_runs;
my $enough_time = $opt->{'enough-time'} || $DEFAULT{ENOUGH_TIME};
my $scale_points = $opt->{'min-scaling-points'} || $DEFAULT{MIN_SCALING_POINTS};
my $max_failures = $opt->{'max-failures'} // $DEFAULT{MAX_FAILURES};
my $empty_test = $TESTS{e}[0];

say "Measuring startup times ...";
# If user aborts now, don't bother catching it; normal testing hasn't begun
my $times = time_all_compilers($empty_test, $startup_runs, 0, $enough_time, 0, $opt->{verbose});
my $times = time_all_compilers($empty_test, $startup_runs, 0, $enough_time, 0, 0, $opt->{verbose});
my $startup = best_times($times);
$startup->{$_} = $startup->{$_}{1}{time} for keys %$startup;

my @results;
# Catch user aborts so that results for completed tests can be returned
eval { run_tests(\@TESTS, \@results, $startup, $runs, $overhead_runs, $enough_time, $scale_points, $opt->{verbose}) };
eval { run_tests(\@TESTS, \@results, $startup, $runs, $overhead_runs, $enough_time, $scale_points, $max_failures, $opt->{verbose}) };
warn "\n$@\n" if $@;
return (\@results, $startup);
}

sub run_tests {
my ($tests, $results, $startup, $runs, $overhead_runs, $enough_time, $min_scaling_points, $verbose) = @_;
my ($tests, $results, $startup, $runs, $overhead_runs, $enough_time, $min_scaling_points, $max_failures, $verbose) = @_;

my @enabled = grep { $_->{enabled} } @$tests;
my $testcount = @enabled;
Expand All @@ -342,7 +345,7 @@ sub run_tests {

# Let user aborts fall out to run_all_tests() so that the last test
# timed won't have partial (and possibly misleading) timing data
my $raw_times = time_all_compilers($test, $runs, $overhead_runs, $enough_time, $min_scaling_points, $verbose, $startup);
my $raw_times = time_all_compilers($test, $runs, $overhead_runs, $enough_time, $min_scaling_points, $max_failures, $verbose, $startup);
my $best = best_times($raw_times);
push @$results, {
name => $name,
Expand All @@ -354,7 +357,7 @@ sub run_tests {
}

sub time_all_compilers {
my ($test, $runs, $overhead_runs, $enough_time, $min_scaling_points, $verbose, $startup) = @_;
my ($test, $runs, $overhead_runs, $enough_time, $min_scaling_points, $max_failures, $verbose, $startup) = @_;

my $test_type = $test->{type};
my $cwd = cwd;
Expand Down Expand Up @@ -411,7 +414,8 @@ sub time_all_compilers {
my $run_times = time_command(\@compile, \@run, $overhead_runs, 0, 0, $expected->(0), $verbose);
push @all_times, @{$run_times || []};

if (!$run_times || grep { $_->{failed} } @$run_times) {
if (!$run_times
|| (grep { $_->{failed} } @$run_times) > $max_failures) {
warn "Compiler $name is failing at scale=0 for test $test->{name}, aborting remaining runs.\n";
}
else {
Expand All @@ -427,7 +431,8 @@ sub time_all_compilers {
push @all_times, @{$run_times || []};
$scale_points++;

if (!$run_times || grep { $_->{failed} } @$run_times) {
if (!$run_times
|| (grep { $_->{failed} } @$run_times) > $max_failures) {
warn "Compiler $name is failing at scale=$scale for test $test->{name}, aborting remaining runs.\n";
last;
}
Expand All @@ -442,7 +447,8 @@ sub time_all_compilers {
else {
my $run_times = time_command(\@compile, \@run, $runs, 1, 1, $expected->(1), $verbose);
push @all_times, @{$run_times || []};
if (!$run_times || grep { $_->{failed} } @$run_times) {
if (!$run_times
|| (grep { $_->{failed} } @$run_times) > $max_failures) {
warn "Compiler $name is failing for test $test->{name}, continuing to next compiler/test.\n";
}
}
Expand Down Expand Up @@ -654,7 +660,7 @@ timeall -- Benchmark Perl-family compilers against each other
timeall [--help|-h|-?] [--man] [--list-variants] [--list-tests]
[--list-tags] [--list-tests-tagged=required-tag,-forbidden-tag]
timeall [--verbose] [--runs=2] [--enough-time=3.0]
timeall [--verbose] [--runs=2] [--max-failures=0] [--enough-time=3.0]
[--format=json] [--outfile=path/to/file.ext]
[--tests=list,of,tests] [--tests-tagged=required,-forbidden]
[<enabled compiler variants>]
Expand Down Expand Up @@ -750,6 +756,12 @@ reaches diminishing returns. Many times it's more effective to just turn off
as many background processes (such as email clients) as possible while
benchmarking.
=item --max-failures=5
The maximum number of failed runs that are tolerated at a given SCALE for a
given compiler.  Once more runs than this have failed, that compiler is
assumed over-stressed and testing continues with the next compiler or next
test.  Use --max-failures=0 to stop at the first failed run.
=item --enough-time=1.0
Set the amount of time a scalable test must run (not including startup and
Expand Down

0 comments on commit fa5cc93

Please sign in to comment.