Skip to content

Commit

Permalink
Merge pull request #831 from shawnlaffan/faster_RPE2_maybe
Browse files (browse the repository at this point in the history)
Speed improvements for PE, RPE etc
  • Loading branch information
shawnlaffan committed Oct 24, 2022
2 parents f0852df + 8badc66 commit 0247368
Show file tree
Hide file tree
Showing 11 changed files with 369 additions and 161 deletions.
57 changes: 57 additions & 0 deletions etc/benchmark_scripts/bench_hash_sum.pl
@@ -0,0 +1,57 @@
use Benchmark qw {:all};
use 5.016;      # implies strict
use warnings;   # was missing: 'use 5.016' enables strict but not warnings
use Data::Dumper;

use List::Util qw {sum};

# Benchmark fixture: a largish hash ('a' .. 'zzm' keys) of pseudo-random
# numeric values, shared by all the summation variants below.
my @keys = 'a' .. 'zzm';
my %base_hash;
@base_hash{@keys} = map {rand() + $_} (1..@keys);

# Print each variant's result once so differences are visible
# (values differ per run since rand() is unseeded).
say sum_foreach();
say lu_sum();
say lu_sum0();
say lu_reduce();
say pf_values();


# Compare throughput; -2 means run each sub for at least 2 CPU seconds.
cmpthese (
    -2,
    {
        foreach   => \&sum_foreach,
        lu_sum    => \&lu_sum,
        lu_sum0   => \&lu_sum0,
        lu_reduce => \&lu_reduce,
        pf_values => \&pf_values,
    }
);

# Sum the hash values with an explicit named loop variable.
sub sum_foreach {
    my $total;
    for my $value (values %base_hash) {
        $total += $value;
    }
    return $total;
}

# Sum via List::Util::sum (returns undef for an empty list).
sub lu_sum {
    return sum values %base_hash;
}

# Sum via List::Util::sum with a leading 0 so an empty
# hash yields 0 rather than undef (the sum0 semantics).
sub lu_sum0 {
    my $total = sum (0, values %base_hash);
    return $total;
}

# Sum via a fully qualified List::Util::reduce call
# ($a/$b are the reduction accumulator and next element).
sub lu_reduce {
    return List::Util::reduce {$a + $b} values %base_hash;
}

# Sum with a postfix-foreach statement modifier.
sub pf_values {
    my $total;
    $total += $_ foreach values %base_hash;
    return $total;
}

4 changes: 2 additions & 2 deletions lib/Biodiverse/BaseData/ManageOutputs.pm
Expand Up @@ -141,7 +141,7 @@ sub delete_output {
delete $self->{SPATIAL_OUTPUTS}{$name};
}
elsif ( $type =~ /Cluster|Tree|RegionGrower/ ) {
my $x = eval { $object->delete_cached_values_below };
my $x = eval { $object->delete_all_cached_values };
$self->{CLUSTER_OUTPUTS}{$name} = undef;
delete $self->{CLUSTER_OUTPUTS}{$name};
}
Expand Down Expand Up @@ -396,7 +396,7 @@ sub delete_cluster_output_cached_values {
my $self = shift;
print "[BASEDATA] Deleting cached values in cluster trees\n";
foreach my $cluster ( $self->get_cluster_output_refs ) {
$cluster->delete_cached_values_below(@_);
$cluster->delete_all_cached_values(@_);
}

return;
Expand Down
2 changes: 1 addition & 1 deletion lib/Biodiverse/Cluster.pm
Expand Up @@ -2245,7 +2245,7 @@ sub cluster {
$root_node->ladderise;

if ($args{clear_cached_values}) {
$root_node->delete_cached_values_below;
$self->delete_all_cached_values;
}
$root_node->number_terminal_nodes;

Expand Down
15 changes: 6 additions & 9 deletions lib/Biodiverse/GUI/GUIManager.pm
Expand Up @@ -1795,9 +1795,8 @@ sub do_trim_tree_to_basedata {
return if $response ne 'ok'; # they chickened out

my $new_tree = $phylogeny->clone;
$new_tree->delete_cached_values;
$new_tree->reset_total_length;
$new_tree->reset_total_length_below;
#$new_tree->delete_cached_values; # we use the caches so clear up at the end
$new_tree->reset_total_length; # also handles each node

my $trim_to_lca = $checkbox->get_active;
my $merge_knuckle_nodes = $knuckle_checkbox->get_active;
Expand All @@ -1817,12 +1816,6 @@ sub do_trim_tree_to_basedata {
if ($trim_to_lca) {
$new_tree->trim_to_last_common_ancestor;
}
# clear the caches --after-- all the above method calls
# that use them internally
foreach my $node ( $new_tree->get_node_refs ) {
$node->delete_cached_values;
}
$new_tree->delete_cached_values;
}

if ($merge_knuckle_nodes) {
Expand All @@ -1837,6 +1830,10 @@ sub do_trim_tree_to_basedata {

}

# clear the caches --after-- all the above method calls
# that use them internally
$new_tree->delete_all_cached_values;

$new_tree->set_param( NAME => $chosen_name );

# now we add it if it is not already in the list
Expand Down
75 changes: 64 additions & 11 deletions lib/Biodiverse/Indices/Phylogenetic.pm
Expand Up @@ -19,6 +19,7 @@ use Scalar::Util qw /blessed/;
our $VERSION = '3.99_004';

use constant HAVE_BD_UTILS => eval 'require Biodiverse::Utils';
use constant HAVE_BD_UTILS_108 => HAVE_BD_UTILS && eval '$Biodiverse::Utils::VERSION >= 1.08';

use constant HAVE_PANDA_LIB
=> !$ENV{BD_NO_USE_PANDA} && eval 'require Panda::Lib';
Expand Down Expand Up @@ -609,24 +610,27 @@ sub get_path_lengths_to_root_node {

# now loop through the labels and get the path to the root node
my $path_hash = {};
my @collected_paths; # used if we have B::Utils 1.07 or greater
foreach my $label (grep exists $all_nodes->{$_}, keys %$label_list) {
# Could assign to $current_node here, but profiling indicates it
# takes meaningful chunks of time for large data sets
my $current_node = $all_nodes->{$label};
my $sub_path = $cache && $path_cache->{$current_node};

if (!$sub_path) {
$sub_path = $current_node->get_path_to_root_node (cache => $cache);
my @p = map (($_->get_name), @$sub_path);
$sub_path = \@p;
$sub_path = $current_node->get_path_name_array_to_root_node_aa (!$cache);
if ($cache) {
$path_cache->{$current_node} = $sub_path;
}
}

# This is a bottleneck for large data sets,
# so use an XSUB if possible.
if (HAVE_BD_UTILS) {
if (HAVE_BD_UTILS_108) {
# collect them all and process in an xsub
push @collected_paths, $sub_path;
}
elsif (HAVE_BD_UTILS) {
Biodiverse::Utils::add_hash_keys_until_exists (
$path_hash,
$sub_path,
Expand All @@ -649,8 +653,15 @@ sub get_path_lengths_to_root_node {

# Assign the lengths once each.
# ~15% faster than repeatedly assigning in the slice above
# but first option is faster still
my $len_hash = $tree_ref->get_node_length_hash;
if (HAVE_BD_UTILS) {
if (HAVE_BD_UTILS_108) {
# get keys and vals in one call
Biodiverse::Utils::XS::add_hash_keys_and_vals_until_exists_AoA (
$path_hash, \@collected_paths, $len_hash,
);
}
elsif (HAVE_BD_UTILS) {
Biodiverse::Utils::copy_values_from ($path_hash, $len_hash);
}
else {
Expand Down Expand Up @@ -1476,6 +1487,7 @@ sub get_metadata__calc_pe {
get_pe_element_cache
get_path_length_cache
set_path_length_cache_by_group_flag
get_inverse_range_weighted_path_lengths
/],
pre_calc => ['calc_abc'], # don't need calc_abc2 as we don't use its counts
uses_nbr_lists => 1, # how many lists it must have
Expand Down Expand Up @@ -1664,6 +1676,44 @@ sub get_node_range_hash_as_lists {
return wantarray ? %results : \%results;
}

sub get_metadata_get_inverse_range_weighted_path_lengths {
    my %metadata = (
        #  The name must be the calculation sub's own name, matching the
        #  convention used elsewhere in this file (e.g. get_node_range_hash).
        #  It was previously a copy-paste of another sub's metadata name.
        name => 'get_inverse_range_weighted_path_lengths',
        description
            => "Get a hash of the node lengths divided by their ranges\n"
             . "Forms the basis of the PE calcs for equal area cells",
        required_args   => ['tree_ref'],
        pre_calc_global => ['get_node_range_hash'],
        indices => {
            inverse_range_weighted_node_lengths => {
                description => 'Hash of node lengths divided by their ranges',
            },
        },
    );
    return $metadata_class->new(\%metadata);
}

#  Build a hash of tree node lengths divided by their ranges.
#  Nodes with a zero or missing range are skipped to
#  avoid division by zero.
#  Returns (inverse_range_weighted_node_lengths => \%hash),
#  as a list or hash ref depending on calling context.
sub get_inverse_range_weighted_path_lengths {
    my ($self, %args) = @_;

    my $tree_ref   = $args{tree_ref};
    my $range_hash = $args{node_range};

    my %inv_weighted;

  NODE:
    foreach my $node_ref ($tree_ref->get_node_refs) {
        my $node_name = $node_ref->get_name;
        #  false range (0, '' or undef) => cannot weight this node
        my $range = $range_hash->{$node_name}
          or next NODE;
        $inv_weighted{$node_name} = $node_ref->get_length / $range;
    }

    my %results
      = (inverse_range_weighted_node_lengths => \%inv_weighted);

    return wantarray ? %results : \%results;
}


sub get_metadata_get_node_range_hash {
my %metadata = (
name => 'get_node_range_hash',
Expand Down Expand Up @@ -1734,12 +1784,15 @@ sub get_node_range_hash {
}
}
$count ++;
$progress = $count / $to_do;
$progress_text = int (100 * $progress);
$progress_bar->update(
"Calculating node ranges\n($progress_text)",
$progress,
);
# fewer progress calls as we get heaps with large data sets
if (not $count % 20) {
$progress = $count / $to_do;
$progress_text = int (100 * $progress);
$progress_bar->update(
"Calculating node ranges\n($count of $to_do)",
$progress,
);
}
}

my %results = (node_range => \%node_range);
Expand Down
76 changes: 35 additions & 41 deletions lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm
Expand Up @@ -4,14 +4,10 @@ use warnings;

our $VERSION = '3.99_004';

use constant HAVE_PANDA_LIB
=> !$ENV{BD_NO_USE_PANDA} && eval 'require Panda::Lib';
use constant HAVE_DATA_RECURSIVE
=> !$ENV{BD_NO_USE_PANDA} && eval 'require Data::Recursive';

use feature 'refaliasing';
no warnings 'experimental::refaliasing';

use List::Util qw /sum/;

sub _calc_pe {
my $self = shift;
Expand All @@ -20,7 +16,8 @@ sub _calc_pe {
my $tree_ref = $args{trimmed_tree};
my $results_cache = $args{PE_RESULTS_CACHE};
my $element_list_all = $args{element_list_all};
\my %node_ranges = $args{node_range};
\my %node_ranges = $args{node_range};
\my %rw_node_lengths = $args{inverse_range_weighted_node_lengths};

my $bd = $args{basedata_ref} || $self->get_basedata_ref;

Expand Down Expand Up @@ -57,37 +54,41 @@ sub _calc_pe {
# else build them and cache them
else {
my $labels = $bd->get_labels_in_group_as_hash_aa ($group);

# This is a slow point for many calcs but the innards are cached
# and the cost is amortised when PD is also calculated
my $nodes_in_path = $self->get_path_lengths_to_root_node (
@_,
labels => $labels,
el_list => [$group],
);

my ($gp_score, %gp_wts, %gp_ranges);

# slice assignment wasn't faster according to nytprof and benchmarking
#@gp_ranges{keys %$nodes_in_path} = @$node_ranges{keys %$nodes_in_path};

# refaliasing avoids hash deref overheads below
\my %node_lengths = $nodes_in_path;

# loop over the nodes and run the calcs
NODE:
foreach my $node_name (keys %node_lengths) {
# Not sure we even need to test for zero ranges.
# We should never suffer this given the pre_calcs.
my $range = $node_ranges{$node_name}
|| next NODE;
my $wt = $node_lengths{$node_name} / $range;
$gp_score += $wt;
$gp_wts{$node_name} = $wt;
$gp_ranges{$node_name} = $range;
}
my %gp_wts = %rw_node_lengths{keys %$nodes_in_path};
my $gp_score = sum values %gp_wts;

# old approach - left here as notes for the
# non-equal area case in the future
# # loop over the nodes and run the calcs
# refaliasing avoids hash deref overheads below in the loop
# albeit the loop is not used any more...
# \my %nodes = $nodes_in_path;
#NODE:
# foreach my $node_name (keys %node_lengths) {
# # Not sure we even need to test for zero ranges.
# # We should never suffer this given the pre_calcs.
# #my $range = $node_ranges{$node_name}
# # or next NODE;
# #my $wt = $node_lengths{$node_name} / $range;
# my $wt = $rw_node_lengths{$node_name};
# #say STDERR (sprintf ('%s %f %f', $node_name, $wt, $wt2));
# $gp_score += $wt;
# #$gp_wts{$node_name} = $wt;
# #$gp_ranges{$node_name} = $range;
# }

$results_this_gp = {
PE_WE => $gp_score,
PE_WTLIST => \%gp_wts,
PE_RANGELIST => \%gp_ranges,
};

$results_cache->{$group} = $results_this_gp;
Expand All @@ -107,18 +108,6 @@ sub _calc_pe {
@local_ranges{keys %$hashref} = (1) x scalar keys %$hashref;
}
else {
# ranges are invariant, so can be crashed together
my $hash_ref = $results_this_gp->{PE_RANGELIST};
if (HAVE_DATA_RECURSIVE) {
Data::Recursive::hash_merge (\%ranges, $hash_ref, Data::Recursive::LAZY());
}
elsif (HAVE_PANDA_LIB) {
Panda::Lib::hash_merge (\%ranges, $hash_ref, Panda::Lib::MERGE_LAZY());
}
else {
@ranges{keys %$hash_ref} = values %$hash_ref;
}

# refalias might be a nano-optimisation here...
\my %wt_hash = $results_this_gp->{PE_WTLIST};

Expand All @@ -145,11 +134,16 @@ sub _calc_pe {
$PE_WE_P = eval {$PE_WE / $total_tree_length};
}

# need the collated versions
# need the collated versions for multiple elements
if (scalar @$element_list_all > 1) {
$results{PE_WE} = $PE_WE;
$results{PE_WTLIST} = \%wts;
$results{PE_RANGELIST} = \%ranges;
my %nranges = %node_ranges{keys %wts};
$results{PE_RANGELIST} = \%nranges;
}
else {
my %nranges = %node_ranges{keys %{$results{PE_WTLIST}}};
$results{PE_RANGELIST} = \%nranges;
}

# need to set these
Expand Down

0 comments on commit 0247368

Please sign in to comment.