Permalink
Browse files

moving get_module_versions_in_release to batch mode

  • Loading branch information...
1 parent 3957e82 commit aa4922104423490a559e2ff9898bdf0d41924cc2 @semuel committed Jul 9, 2013
Showing with 133 additions and 41 deletions.
  1. +4 −0 Changes
  2. +38 −7 lib/Dist/Surveyor.pm
  3. +83 −31 lib/Dist/Surveyor/Inquiry.pm
  4. +2 −2 lib/Dist/Surveyor/MakeCpan.pm
  5. +6 −1 t/02-inquery.t
View
@@ -1,8 +1,12 @@
Revision history for Dist-Surveyor
{{$NEXT}}
+ Adding keep_alive to connections - major speedup
+ Changed to batch get_module_versions_in_release - another 30% speedup
0.011 2013-07-08 16:11:57 Asia/Tokyo
+ More refactoring
+ added a missing 'use version' in Inquiry.pm
0.010 2013-07-05 11:15:05 Asia/Tokyo
Major refactoring done
View
@@ -377,20 +377,52 @@ sub get_installed_mod_info {
return $mi;
}
-# pick_best_cpan_dist_release - memoized
+# pick_best_cpan_dist_release
# for each %$ccdr adds a fraction_installed based on %$installed_mod_info
# returns ref to array of %$ccdr values that have the max fraction_installed
-
+#
+# this is the biggest time - consuming function in the module, as it done
+# for each module. And the time is mostly spent on doing calls to MetaAPI.
+# I optimized it to batch the calls three at a time, cutting 30% of the
+# running time.
sub pick_best_cpan_dist_release {
my ($ccdr, $installed_mod_info) = @_;
- for my $release (sort keys %$ccdr) {
- my $release_info = $ccdr->{$release};
+ my $set_fractions = sub {
+ my ($release_info, $release, $mods_in_rel) = @_;
$release_info->{fraction_installed}
- = dist_fraction_installed($release_info->{author}, $release, $installed_mod_info);
+ = dist_fraction_installed($release_info->{author}, $release, $mods_in_rel, $installed_mod_info);
$release_info->{percent_installed} # for informal use
= sprintf "%.2f", $release_info->{fraction_installed} * 100;
+ };
+
+ my @waiting_releases;
+ my $process_waiting = sub {
+ my @params = map { [ $_->[2], $_->[1] ] } @waiting_releases;
+ my $results = get_module_versions_in_releases(@params);
+ foreach my $rec (@waiting_releases) {
+ my $key = "$rec->[2]/$rec->[1]";
+ my $mods_in_rel = $results->{$key};
+ die "pick_best_cpan_dist_release: no results for $key"
+ unless $mods_in_rel;
+ $set_fractions->($rec->[0], $rec->[1], $mods_in_rel);
+ }
+ @waiting_releases = ();
+ };
+
+ for my $release (sort keys %$ccdr) {
+ my $release_info = $ccdr->{$release};
+ my $author = $release_info->{author};
+ if (my $mods_in_rel = get_module_versions_in_release_cached($author, $release)) {
+ $set_fractions->($release_info, $release, $mods_in_rel);
+ next;
+ }
+ push @waiting_releases, [ $release_info, $release, $author ];
+ if (scalar(@waiting_releases) > 2) {
+ $process_waiting->();
+ }
}
+ $process_waiting->() if @waiting_releases;
my $max_fraction_installed = max( map { $_->{fraction_installed} } values %$ccdr );
my @best = grep { $_->{fraction_installed} == $max_fraction_installed } values %$ccdr;
@@ -402,10 +434,9 @@ sub pick_best_cpan_dist_release {
# returns a number from 0 to 1 representing the fraction of the modules
# in a particular release match the coresponding modules in %$installed_mod_info
sub dist_fraction_installed {
- my ($author, $release, $installed_mod_info) = @_;
+ my ($author, $release, $mods_in_rel, $installed_mod_info) = @_;
my $tag = "$author/$release";
- my $mods_in_rel = get_module_versions_in_release($author, $release);
my $mods_in_rel_count = keys %$mods_in_rel;
my $mods_inst_count = sum( map {
my $mi = $installed_mod_info->{ $_->{name} };
@@ -66,7 +66,8 @@ our @ISA = qw{Exporter};
our @EXPORT = qw{
get_candidate_cpan_dist_releases
get_candidate_cpan_dist_releases_fallback
- get_module_versions_in_release
+ get_module_versions_in_releases
+ get_module_versions_in_release_cached
get_release_info
};
@@ -101,7 +102,6 @@ sub perma_cache {
my @memoize_subs = qw(
get_candidate_cpan_dist_releases
get_candidate_cpan_dist_releases_fallback
- get_module_versions_in_release
get_release_info
);
for my $subname (@memoize_subs) {
@@ -308,13 +308,18 @@ sub _process_response {
return \%dists;
}
-=head2 get_module_versions_in_release($author, $release)
+=head2 get_module_versions_in_releases([$author, $release], [$author2, $release2], ...)
Receive release info, such as:
- get_module_versions_in_release('SEMUELF', 'Dist-Surveyor-0.009')
+ get_module_versions_in_releases(['SEMUELF', 'Dist-Surveyor-0.009'])
-And returns a hashref, that contains one entry for each module that exists
+We are using this function for a batch of releases for each time, as it is
+called a lot, and takes most of the program running time. see also
+get_module_versions_in_release_cached for caching.
+
+And returns a hashref, with key is "author/release", and values are hashrefs.
+This hashref contains one entry for each module that exists
in the release. module information is the format:
'Dist::Surveyor' => {
@@ -329,23 +334,42 @@ possibilities and aren't actually installed, so generally it's quiet
=cut
-sub get_module_versions_in_release {
- my ($author, $release) = @_;
+sub get_module_versions_in_releases {
+ my @releases = @_;
+
+ my $err_str = '[' . join(', ', map "$_->[0]=>$_->[1]", @releases) . ']';
+
+ my @r_filters;
+ foreach my $rec (@releases) {
+ my ($author, $release) = @$rec;
+ push @r_filters, {
+ 'and' => [
+ {"term" => {"release" => $release }},
+ {"term" => {"author" => $author }},
+ {"term" => {"mime" => "text/x-script.perl-module"}},
+ ]
+ };
+ }
+ my $filter;
+ if (scalar(@r_filters) == 1) {
+ $filter = $r_filters[0];
+ }
+ else {
+ $filter = { 'or' => \@r_filters };
+ }
$metacpan_calls++;
+ my $query = {
+ "size" => $metacpan_size,
+ "query" => { "filtered" => {
+ "filter" => $filter,
+ "query" => {"match_all" => {}},
+ }},
+ "fields" => ["path","name","_source.module", "_source.stat.size",
+ "_source.author", "_source.release"],
+ };
+
my $results = eval {
- my $query = {
- "size" => $metacpan_size,
- "query" => { "filtered" => {
- "filter" => {"and" => [
- {"term" => {"release" => $release }},
- {"term" => {"author" => $author }},
- {"term" => {"mime" => "text/x-script.perl-module"}},
- ]},
- "query" => {"match_all" => {}},
- }},
- "fields" => ["path","name","_source.module", "_source.stat.size"],
- };
my $response = $ua->post(
'http://api.metacpan.org/v0/file',
Content_Type => 'application/json',
@@ -355,21 +379,23 @@ sub get_module_versions_in_release {
decode_json $response->decoded_content;
};
if (not $results) {
- warn "Failed get_module_versions_in_release for $author/$release: $@";
+ warn "Failed get_module_versions_in_release for $err_str: $@";
return {};
}
my $hits = $results->{hits}{hits};
- die "get_module_versions_in_release($author, $release): too many results"
+ die "get_module_versions_in_release($err_str): too many results"
if @$hits >= $metacpan_size;
- my %modules_in_release;
+ my %modules_in_all_release;
for my $hit (@$hits) {
my $path = $hit->{fields}{path};
+ my $mod_author = $hit->{fields}{"_source.author"};
+ my $mod_release = $hit->{fields}{"_source.release"};
# XXX try to ignore files that won't get installed
# XXX should use META noindex!
if ($path =~ m!^(?:t|xt|tests?|inc|samples?|ex|examples?|bak|local-lib)\b!) {
- warn "$author/$release: ignored non-installed module $path\n"
+ warn "$mod_author/$mod_release: ignored non-installed module $path\n"
if $DEBUG;
next;
}
@@ -385,28 +411,28 @@ sub get_module_versions_in_release {
# can't rely on $path including the full package name.
(my $filebasename = $hit->{fields}{name}) =~ s/\.pm$//;
if ($mod->{name} !~ m/\b$filebasename$/) {
- warn "$author/$release: ignored $mod->{name} in $path\n"
+ warn "$mod_author/$mod_release: ignored $mod->{name} in $path\n"
if $DEBUG;
next;
}
# warn if package previously seen in this release
# with a different version or file size
- if (my $prev = $modules_in_release{$mod->{name}}) {
+ if (my $prev = $modules_in_all_release{"$mod_author/$mod_release"}->{$mod->{name}}) {
my $version_obj = eval { version->parse($mod->{version}) };
- die "$author/$release: $mod $mod->{version}: $@" if $@;
+ die "$mod_author/$mod_release: $mod $mod->{version}: $@" if $@;
if ($VERBOSE) {
# XXX could add a show-only-once cache here
my $msg = "$mod->{name} $mod->{version} ($size) seen in $path after $prev->{path} $prev->{version} ($prev->{size})";
- warn "$release: $msg\n"
+ warn "$mod_release: $msg\n"
if ($version_obj != version->parse($prev->{version}) or $size != $prev->{size});
}
}
# keep result small as Storable thawing this is major runtime cost
# (specifically we avoid storing a version_obj here)
- $modules_in_release{$mod->{name}} = {
+ $modules_in_all_release{"$mod_author/$mod_release"}->{$mod->{name}} = {
name => $mod->{name},
path => $path,
version => $mod->{version},
@@ -415,10 +441,36 @@ sub get_module_versions_in_release {
}
}
- warn "\n$author/$release contains: @{[ map { qq($_->{name} $_->{version}) } values %modules_in_release ]}\n"
- if $DEBUG;
+ # warn "\n$author/$release contains: @{[ map { qq($_->{name} $_->{version}) } values %modules_in_release ]}\n"
+ # if $DEBUG;
- return \%modules_in_release;
+ foreach my $release (keys %modules_in_all_release) {
+ my $key = join('', 'gmvir_cache', "\034", $release);
+ $memoize_cache{$key} = $modules_in_all_release{$release};
+ }
+
+ return \%modules_in_all_release;
+}
+
+=head2 get_module_versions_in_release_cached($author, $release)
+
+Check if we have the result of get_module_versions_in_releases for one release
+cached. So we will be able to bundle requests effectively, and not bundle
+a cached request with uncached requests.
+
+return undef if not cached, or the hashref for that release if cached.
+
+=cut
+
+sub get_module_versions_in_release_cached {
+ my ($author, $release) = @_;
+ my $key = join('', 'gmvir_cache', "\034", $author, '/', $release);
+ if (exists $memoize_cache{$key}) {
+ return $memoize_cache{$key};
+ }
+ else {
+ return;
+ }
}
=head1 License, Copyright
@@ -158,8 +158,8 @@ sub add_release {
warn "$mirror_status $main_url\n" if $verbose;
}
-
- my $mods_in_rel = get_module_versions_in_release($ri->{author}, $ri->{name});
+ my $mods_in_rel = get_module_versions_in_release_cached($ri->{author}, $ri->{name});
+ $mods_in_rel ||= get_module_versions_in_releases([$ri->{author}, $ri->{name}]);
if (!keys %$mods_in_rel) { # XXX hack for common::sense
(my $dist_as_pkg = $ri->{distribution}) =~ s/-/::/g;
View
@@ -4,7 +4,12 @@ use Dist::Surveyor::Inquiry;
use Data::Dumper;
use Test::More;
-my $module_data = get_module_versions_in_release('SEMUELF', 'Dist-Surveyor-0.009');
+is(get_module_versions_in_release_cached('SEMUELF', 'Dist-Surveyor-0.009'), undef, "Still not in cache");
+my $modules_data = get_module_versions_in_releases(['SEMUELF', 'Dist-Surveyor-0.009']);
+is(scalar keys %$modules_data, 1, "get_module_versions_in_releases returned one answer");
+ok(exists $modules_data->{'SEMUELF/Dist-Surveyor-0.009'}, 'and the right one, too');
+isnt(get_module_versions_in_release_cached('SEMUELF', 'Dist-Surveyor-0.009'), undef, "now in cache");
+my $module_data = $modules_data->{'SEMUELF/Dist-Surveyor-0.009'};
my $expected = {
'Dist::Surveyor' => {
'version' => '0.009',

0 comments on commit aa49221

Please sign in to comment.