Skip to content

Commit

Permalink
Merge pull request #831 from shawnlaffan/faster_RPE2_maybe
Browse files (browse the repository at this point in the history)
Speed improvements for PE, RPE etc
  • Loading branch information
shawnlaffan committed Oct 24, 2022
2 parents f0852df + 8badc66 commit 0247368
Show file tree
Hide file tree
Showing 11 changed files with 369 additions and 161 deletions.
57 changes: 57 additions & 0 deletions etc/benchmark_scripts/bench_hash_sum.pl
@@ -0,0 +1,57 @@
use Benchmark qw {:all};
use 5.016;      # implies strict
use warnings;   # was missing: 'use 5.016' enables strict but not warnings
use Data::Dumper;

use List::Util qw {sum};

# Benchmark fixture: a largish hash ('a' .. 'zzm' keys) of pseudo-random
# numeric values, shared by all the summation variants below.
my @keys = 'a' .. 'zzm';
my %base_hash;
@base_hash{@keys} = map {rand() + $_} (1..@keys);

# Print each variant's result once so differences are visible
# (values differ per run since rand() is unseeded).
say sum_foreach();
say lu_sum();
say lu_sum0();
say lu_reduce();
say pf_values();


# Compare throughput; -2 means run each sub for at least 2 CPU seconds.
cmpthese (
    -2,
    {
        foreach   => \&sum_foreach,
        lu_sum    => \&lu_sum,
        lu_sum0   => \&lu_sum0,
        lu_reduce => \&lu_reduce,
        pf_values => \&pf_values,
    }
);

# Sum the hash values with an explicit named loop variable.
sub sum_foreach {
    my $total;
    for my $value (values %base_hash) {
        $total += $value;
    }
    return $total;
}

# Sum via List::Util::sum (returns undef for an empty list).
sub lu_sum {
    return sum values %base_hash;
}

# Sum via List::Util::sum with a leading 0 so an empty
# hash yields 0 rather than undef (the sum0 semantics).
sub lu_sum0 {
    my $total = sum (0, values %base_hash);
    return $total;
}

# Sum via a fully qualified List::Util::reduce call
# ($a/$b are the reduction accumulator and next element).
sub lu_reduce {
    return List::Util::reduce {$a + $b} values %base_hash;
}

# Sum with a postfix-foreach statement modifier.
sub pf_values {
    my $total;
    $total += $_ foreach values %base_hash;
    return $total;
}

4 changes: 2 additions & 2 deletions lib/Biodiverse/BaseData/ManageOutputs.pm
Expand Up @@ -141,7 +141,7 @@ sub delete_output {
delete $self->{SPATIAL_OUTPUTS}{$name};
}
elsif ( $type =~ /Cluster|Tree|RegionGrower/ ) {
my $x = eval { $object->delete_cached_values_below };
my $x = eval { $object->delete_all_cached_values };
$self->{CLUSTER_OUTPUTS}{$name} = undef;
delete $self->{CLUSTER_OUTPUTS}{$name};
}
Expand Down Expand Up @@ -396,7 +396,7 @@ sub delete_cluster_output_cached_values {
my $self = shift;
print "[BASEDATA] Deleting cached values in cluster trees\n";
foreach my $cluster ( $self->get_cluster_output_refs ) {
$cluster->delete_cached_values_below(@_);
$cluster->delete_all_cached_values(@_);
}

return;
Expand Down
2 changes: 1 addition & 1 deletion lib/Biodiverse/Cluster.pm
Expand Up @@ -2245,7 +2245,7 @@ sub cluster {
$root_node->ladderise;

if ($args{clear_cached_values}) {
$root_node->delete_cached_values_below;
$self->delete_all_cached_values;
}
$root_node->number_terminal_nodes;

Expand Down
15 changes: 6 additions & 9 deletions lib/Biodiverse/GUI/GUIManager.pm
Expand Up @@ -1795,9 +1795,8 @@ sub do_trim_tree_to_basedata {
return if $response ne 'ok'; # they chickened out

my $new_tree = $phylogeny->clone;
$new_tree->delete_cached_values;
$new_tree->reset_total_length;
$new_tree->reset_total_length_below;
#$new_tree->delete_cached_values; # we use the caches so clear up at the end
$new_tree->reset_total_length; # also handles each node

my $trim_to_lca = $checkbox->get_active;
my $merge_knuckle_nodes = $knuckle_checkbox->get_active;
Expand All @@ -1817,12 +1816,6 @@ sub do_trim_tree_to_basedata {
if ($trim_to_lca) {
$new_tree->trim_to_last_common_ancestor;
}
# clear the caches --after-- all the above method calls
# that use them internally
foreach my $node ( $new_tree->get_node_refs ) {
$node->delete_cached_values;
}
$new_tree->delete_cached_values;
}

if ($merge_knuckle_nodes) {
Expand All @@ -1837,6 +1830,10 @@ sub do_trim_tree_to_basedata {

}

# clear the caches --after-- all the above method calls
# that use them internally
$new_tree->delete_all_cached_values;

$new_tree->set_param( NAME => $chosen_name );

# now we add it if it is not already in the list
Expand Down
75 changes: 64 additions & 11 deletions lib/Biodiverse/Indices/Phylogenetic.pm
Expand Up @@ -19,6 +19,7 @@ use Scalar::Util qw /blessed/;
our $VERSION = '3.99_004';

use constant HAVE_BD_UTILS => eval 'require Biodiverse::Utils';
use constant HAVE_BD_UTILS_108 => HAVE_BD_UTILS && eval '$Biodiverse::Utils::VERSION >= 1.08';

use constant HAVE_PANDA_LIB
=> !$ENV{BD_NO_USE_PANDA} && eval 'require Panda::Lib';
Expand Down Expand Up @@ -609,24 +610,27 @@ sub get_path_lengths_to_root_node {

# now loop through the labels and get the path to the root node
my $path_hash = {};
my @collected_paths; # used if we have B::Utils 1.07 or greater
foreach my $label (grep exists $all_nodes->{$_}, keys %$label_list) {
# Could assign to $current_node here, but profiling indicates it
# takes meaningful chunks of time for large data sets
my $current_node = $all_nodes->{$label};
my $sub_path = $cache && $path_cache->{$current_node};

if (!$sub_path) {
$sub_path = $current_node->get_path_to_root_node (cache => $cache);
my @p = map (($_->get_name), @$sub_path);
$sub_path = \@p;
$sub_path = $current_node->get_path_name_array_to_root_node_aa (!$cache);
if ($cache) {
$path_cache->{$current_node} = $sub_path;
}
}

# This is a bottleneck for large data sets,
# so use an XSUB if possible.
if (HAVE_BD_UTILS) {
if (HAVE_BD_UTILS_108) {
# collect them all and process in an xsub
push @collected_paths, $sub_path;
}
elsif (HAVE_BD_UTILS) {
Biodiverse::Utils::add_hash_keys_until_exists (
$path_hash,
$sub_path,
Expand All @@ -649,8 +653,15 @@ sub get_path_lengths_to_root_node {

# Assign the lengths once each.
# ~15% faster than repeatedly assigning in the slice above
# but first option is faster still
my $len_hash = $tree_ref->get_node_length_hash;
if (HAVE_BD_UTILS) {
if (HAVE_BD_UTILS_108) {
# get keys and vals in one call
Biodiverse::Utils::XS::add_hash_keys_and_vals_until_exists_AoA (
$path_hash, \@collected_paths, $len_hash,
);
}
elsif (HAVE_BD_UTILS) {
Biodiverse::Utils::copy_values_from ($path_hash, $len_hash);
}
else {
Expand Down Expand Up @@ -1476,6 +1487,7 @@ sub get_metadata__calc_pe {
get_pe_element_cache
get_path_length_cache
set_path_length_cache_by_group_flag
get_inverse_range_weighted_path_lengths
/],
pre_calc => ['calc_abc'], # don't need calc_abc2 as we don't use its counts
uses_nbr_lists => 1, # how many lists it must have
Expand Down Expand Up @@ -1664,6 +1676,44 @@ sub get_node_range_hash_as_lists {
return wantarray ? %results : \%results;
}

sub get_metadata_get_inverse_range_weighted_path_lengths {
    my %metadata = (
        #  The name must be the calculation sub's own name, matching the
        #  convention used elsewhere in this file (e.g. get_node_range_hash).
        #  It was previously a copy-paste of another sub's metadata name.
        name => 'get_inverse_range_weighted_path_lengths',
        description
            => "Get a hash of the node lengths divided by their ranges\n"
             . "Forms the basis of the PE calcs for equal area cells",
        required_args   => ['tree_ref'],
        pre_calc_global => ['get_node_range_hash'],
        indices => {
            inverse_range_weighted_node_lengths => {
                description => 'Hash of node lengths divided by their ranges',
            },
        },
    );
    return $metadata_class->new(\%metadata);
}

#  Build a hash of tree node lengths divided by their ranges.
#  Nodes with a zero or missing range are skipped to
#  avoid division by zero.
#  Returns (inverse_range_weighted_node_lengths => \%hash),
#  as a list or hash ref depending on calling context.
sub get_inverse_range_weighted_path_lengths {
    my ($self, %args) = @_;

    my $tree_ref   = $args{tree_ref};
    my $range_hash = $args{node_range};

    my %inv_weighted;

  NODE:
    foreach my $node_ref ($tree_ref->get_node_refs) {
        my $node_name = $node_ref->get_name;
        #  false range (0, '' or undef) => cannot weight this node
        my $range = $range_hash->{$node_name}
          or next NODE;
        $inv_weighted{$node_name} = $node_ref->get_length / $range;
    }

    my %results
      = (inverse_range_weighted_node_lengths => \%inv_weighted);

    return wantarray ? %results : \%results;
}


sub get_metadata_get_node_range_hash {
my %metadata = (
name => 'get_node_range_hash',
Expand Down Expand Up @@ -1734,12 +1784,15 @@ sub get_node_range_hash {
}
}
$count ++;
$progress = $count / $to_do;
$progress_text = int (100 * $progress);
$progress_bar->update(
"Calculating node ranges\n($progress_text)",
$progress,
);
# fewer progress calls as we get heaps with large data sets
if (not $count % 20) {
$progress = $count / $to_do;
$progress_text = int (100 * $progress);
$progress_bar->update(
"Calculating node ranges\n($count of $to_do)",
$progress,
);
}
}

my %results = (node_range => \%node_range);
Expand Down
76 changes: 35 additions & 41 deletions lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm
Expand Up @@ -4,14 +4,10 @@ use warnings;

our $VERSION = '3.99_004';

use constant HAVE_PANDA_LIB
=> !$ENV{BD_NO_USE_PANDA} && eval 'require Panda::Lib';
use constant HAVE_DATA_RECURSIVE
=> !$ENV{BD_NO_USE_PANDA} && eval 'require Data::Recursive';

use feature 'refaliasing';
no warnings 'experimental::refaliasing';

use List::Util qw /sum/;

sub _calc_pe {
my $self = shift;
Expand All @@ -20,7 +16,8 @@ sub _calc_pe {
my $tree_ref = $args{trimmed_tree};
my $results_cache = $args{PE_RESULTS_CACHE};
my $element_list_all = $args{element_list_all};
\my %node_ranges = $args{node_range};
\my %node_ranges = $args{node_range};
\my %rw_node_lengths = $args{inverse_range_weighted_node_lengths};

my $bd = $args{basedata_ref} || $self->get_basedata_ref;

Expand Down Expand Up @@ -57,37 +54,41 @@ sub _calc_pe {
# else build them and cache them
else {
my $labels = $bd->get_labels_in_group_as_hash_aa ($group);

# This is a slow point for many calcs but the innards are cached
# and the cost is amortised when PD is also calculated
my $nodes_in_path = $self->get_path_lengths_to_root_node (
@_,
labels => $labels,
el_list => [$group],
);

my ($gp_score, %gp_wts, %gp_ranges);

# slice assignment wasn't faster according to nytprof and benchmarking
#@gp_ranges{keys %$nodes_in_path} = @$node_ranges{keys %$nodes_in_path};

# refaliasing avoids hash deref overheads below
\my %node_lengths = $nodes_in_path;

# loop over the nodes and run the calcs
NODE:
foreach my $node_name (keys %node_lengths) {
# Not sure we even need to test for zero ranges.
# We should never suffer this given the pre_calcs.
my $range = $node_ranges{$node_name}
|| next NODE;
my $wt = $node_lengths{$node_name} / $range;
$gp_score += $wt;
$gp_wts{$node_name} = $wt;
$gp_ranges{$node_name} = $range;
}
my %gp_wts = %rw_node_lengths{keys %$nodes_in_path};
my $gp_score = sum values %gp_wts;

# old approach - left here as notes for the
# non-equal area case in the future
# # loop over the nodes and run the calcs
# refaliasing avoids hash deref overheads below in the loop
# albeit the loop is not used any more...
# \my %nodes = $nodes_in_path;
#NODE:
# foreach my $node_name (keys %node_lengths) {
# # Not sure we even need to test for zero ranges.
# # We should never suffer this given the pre_calcs.
# #my $range = $node_ranges{$node_name}
# # or next NODE;
# #my $wt = $node_lengths{$node_name} / $range;
# my $wt = $rw_node_lengths{$node_name};
# #say STDERR (sprintf ('%s %f %f', $node_name, $wt, $wt2));
# $gp_score += $wt;
# #$gp_wts{$node_name} = $wt;
# #$gp_ranges{$node_name} = $range;
# }

$results_this_gp = {
PE_WE => $gp_score,
PE_WTLIST => \%gp_wts,
PE_RANGELIST => \%gp_ranges,
};

$results_cache->{$group} = $results_this_gp;
Expand All @@ -107,18 +108,6 @@ sub _calc_pe {
@local_ranges{keys %$hashref} = (1) x scalar keys %$hashref;
}
else {
# ranges are invariant, so can be crashed together
my $hash_ref = $results_this_gp->{PE_RANGELIST};
if (HAVE_DATA_RECURSIVE) {
Data::Recursive::hash_merge (\%ranges, $hash_ref, Data::Recursive::LAZY());
}
elsif (HAVE_PANDA_LIB) {
Panda::Lib::hash_merge (\%ranges, $hash_ref, Panda::Lib::MERGE_LAZY());
}
else {
@ranges{keys %$hash_ref} = values %$hash_ref;
}

# refalias might be a nano-optimisation here...
\my %wt_hash = $results_this_gp->{PE_WTLIST};

Expand All @@ -145,11 +134,16 @@ sub _calc_pe {
$PE_WE_P = eval {$PE_WE / $total_tree_length};
}

# need the collated versions
# need the collated versions for multiple elements
if (scalar @$element_list_all > 1) {
$results{PE_WE} = $PE_WE;
$results{PE_WTLIST} = \%wts;
$results{PE_RANGELIST} = \%ranges;
my %nranges = %node_ranges{keys %wts};
$results{PE_RANGELIST} = \%nranges;
}
else {
my %nranges = %node_ranges{keys %{$results{PE_WTLIST}}};
$results{PE_RANGELIST} = \%nranges;
}

# need to set these
Expand Down

0 comments on commit 0247368

Please sign in to comment.