Skip to content

Commit

Permalink
Merge pull request #6 from andrewjpage/master
Browse files Browse the repository at this point in the history
speedup searching fastas
  • Loading branch information
andrewjpage committed Apr 26, 2013
2 parents d9f2252 + aa9866e commit fa3ed20
Show file tree
Hide file tree
Showing 25 changed files with 1,117 additions and 6 deletions.
19 changes: 19 additions & 0 deletions bin/plot_pan_genome_groups
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env perl

package Bio::PanGenome::Main::PlotPanGenomeGroups;

# ABSTRACT: Take in the groups file and output some summary plots
# PODNAME: plot_pan_genome_groups

=head1 SYNOPSIS
Take in the groups file and output some summary plots
=cut

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::PanGenome::CommandLine::PlotPanGenomeGroups;

Bio::PanGenome::CommandLine::PlotPanGenomeGroups->new(args => \@ARGV, script_name => $0)->run;
19 changes: 19 additions & 0 deletions bin/query_pan_genome
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env perl

package Bio::PanGenome::Main::QueryPanGenome;

# ABSTRACT: Take in a groups file and the protein fasta files and output selected data
# PODNAME: query_pan_genome

=head1 SYNOPSIS
Take in a groups file and the protein fasta files and output selected data
=cut

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::PanGenome::CommandLine::QueryPanGenome;

Bio::PanGenome::CommandLine::QueryPanGenome->new(args => \@ARGV, script_name => $0)->run;
1 change: 1 addition & 0 deletions dist.ini
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ requires = makeblastdb
requires = cd-hit
requires = mcl
requires = mcxdeblast
requires = fasta_grep

[@Basic]
[PruneCruft]
Expand Down
27 changes: 23 additions & 4 deletions lib/Bio/PanGenome.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ use Bio::PanGenome::CombinedProteome;
use Bio::PanGenome::External::Cdhit;
use Bio::PanGenome::External::Mcl;
use Bio::PanGenome::InflateClusters;
use Bio::PanGenome::AnalyseGroups;
use Bio::PanGenome::GroupLabels;

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
Expand All @@ -26,10 +28,11 @@ has 'mcl_exec' => ( is => 'ro', isa => 'Str', default => 'mcl' );
sub run {
my ($self) = @_;

my $output_combined_filename = 'combined_files.faa';
my $output_cd_hit_filename = 'clustered.faa';
my $output_combined_filename = 'combined_files';
my $output_cd_hit_filename = 'clustered';
my $output_blast_results_filename = 'blast_results';
my $output_mcl_filename = 'uninflated_mcl_groups';
my $output_inflate_clusters_filename = 'inflated_mcl_groups';

my $combine_fasta_files = Bio::PanGenome::CombinedProteome->new(
proteome_files => $self->fasta_files,
Expand Down Expand Up @@ -64,11 +67,27 @@ sub run {
my $inflate_clusters = Bio::PanGenome::InflateClusters->new(
clusters_filename => $cdhit_obj->clusters_filename,
mcl_filename => $output_mcl_filename,
output_file => $self->output_filename
output_file => $output_inflate_clusters_filename
);
$inflate_clusters->inflate();

my $group_labels = Bio::PanGenome::GroupLabels->new(
groups_filename => $output_inflate_clusters_filename,
output_filename => $self->output_filename
);
$group_labels->add_labels();

my $analyse_groups_obj = Bio::PanGenome::AnalyseGroups->new(
fasta_files => $self->fasta_files,
groups_filename => $self->output_filename
);
$analyse_groups_obj->create_plots();

# Cleanup files
unlink($output_blast_results_filename);
unlink($output_combined_filename);
unlink($output_cd_hit_filename );
unlink($output_mcl_filename );
unlink($output_inflate_clusters_filename);
}

no Moose;
Expand Down
175 changes: 175 additions & 0 deletions lib/Bio/PanGenome/AnalyseGroups.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package Bio::PanGenome::AnalyseGroups;

# ABSTRACT: Take in a groups file and the original FASTA files and create plots and stats

=head1 SYNOPSIS
Take in a groups file and the original FASTA files and create plots and stats
use Bio::PanGenome::AnalyseGroups;
my $plot_groups_obj = Bio::PanGenome::AnalyseGroups->new(
fasta_files => $fasta_files,
groups_filename => $groups_filename,
output_filename => $output_filename
);
$plot_groups_obj->create_plots();
=cut

use Moose;
use Bio::PanGenome::Exceptions;
use Bio::PanGenome::Plot::FreqOfGenes;

has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'groups_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'summary_of_groups' );

has '_number_of_isolates' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_builder__number_of_isolates' );
has '_number_of_genes_per_file' =>
( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__number_of_genes_per_file' );
has '_genes_to_file' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__genes_to_file' );
has '_freq_groups_per_genome' =>
( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_builder__freq_groups_per_genome' );
has '_groups_to_genes' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__groups_to_genes' );
has '_genes_to_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__genes_to_groups' );

# Fixme: Same files opened multiple times

sub _builder__number_of_isolates {
my ($self) = @_;
return @{ $self->fasta_files };
}

sub _builder__number_of_genes_per_file {
my ($self) = @_;

my %gene_count;
for my $filename ( @{ $self->fasta_files } ) {

# Count the number of sequence description lines
open( my $fh, '-|', 'grep \> ' . $filename . ' | wc -l' );
my $count = <$fh>;
chomp($count);
$count =~ s/^\s*(.*?)\s*$/$1/;
$gene_count{$filename} = $count;
close($fh);
}
return \%gene_count;
}

sub _freq_dist_of_genes {
my ($self) = @_;
my @gene_counts = values %{ $self->_number_of_genes_per_file };
my %gene_freq;
for my $gene_count (@gene_counts) {
$gene_freq{$gene_count}++;
}
return \%gene_freq;
}

sub _builder__genes_to_file {
my ($self) = @_;
my %genes_to_file;
for my $filename ( @{ $self->fasta_files } ) {
open(my $fh, '-|', 'grep \> '.$filename.' | awk \'{print $1}\' | sed \'s/>//\' ');
while(<$fh>)
{
chomp;
$genes_to_file{ $_ } = $filename;
}
close($fh);
}
return \%genes_to_file;
}

sub _count_num_files_in_group {
my ( $self, $genes ) = @_;
my $count = 0;
my %filename_freq;
for my $gene ( @{$genes} ) {
next if ( $gene eq "" );
if ( defined( $self->_genes_to_file->{$gene} ) ) {
$filename_freq{ $self->_genes_to_file->{$gene} }++;
}
}
my @uniq_filenames = keys %filename_freq;
return @uniq_filenames;
}

sub _builder__genes_to_groups {
my ($self) = @_;
my %genes_to_groups;

open( my $fh, $self->groups_filename )
or Bio::PanGenome::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename );
while (<$fh>) {
chomp;
my $line = $_;
if ( $line =~ /^(.+): (.+)$/ ) {
my $group_name = $1;
my $genes = $2;
my @elements = split( /\s+/, $genes );

for my $gene (@elements) {
$genes_to_groups{$gene} = $group_name;
}
}
}
return \%genes_to_groups;
}

sub _builder__groups_to_genes {
my ($self) = @_;
my %groups_to_genes;

open( my $fh, $self->groups_filename )
or Bio::PanGenome::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename );
while (<$fh>) {
chomp;
my $line = $_;
if ( $line =~ /^(.+): (.+)$/ ) {
my $group_name = $1;
my $genes = $2;
my @elements = split( /\s+/, $genes );
$groups_to_genes{$group_name} = \@elements;
}
}
return \%groups_to_genes;
}

sub _builder__freq_groups_per_genome {
my ($self) = @_;
my @group_count;

open( my $fh, $self->groups_filename )
or Bio::PanGenome::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename );
while (<$fh>) {
chomp;
my $line = $_;

# Remove the group name
$line =~ s!^(.+: )?!!;
my @elements = split( /\s+/, $line );
my $number_of_files_in_group = $self->_count_num_files_in_group( \@elements );
$number_of_files_in_group = ( $number_of_files_in_group * 100 / $self->_number_of_isolates );
push( @group_count, $number_of_files_in_group );

}
close($fh);
my @sorted_group_count = sort { $b <=> $a } @group_count;
return \@sorted_group_count;
}

sub create_plots {
my ($self) = @_;

my $plot_groups_obj =
Bio::PanGenome::Plot::FreqOfGenes->new( freq_groups_per_genome => $self->_freq_groups_per_genome );
$plot_groups_obj->create_plot();
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;

2 changes: 1 addition & 1 deletion lib/Bio/PanGenome/ChunkFastaFile.pm
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use Cwd;
use File::Temp;

has 'fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'target_chunk_size' => ( is => 'ro', isa => 'Int', default => 100000 );
has 'target_chunk_size' => ( is => 'ro', isa => 'Int', default => 200000 );
has 'sequence_file_names' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_sequence_file_names' );
has '_working_directory' =>
( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
Expand Down
99 changes: 99 additions & 0 deletions lib/Bio/PanGenome/CommandLine/PlotPanGenomeGroups.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package Bio::PanGenome::CommandLine::PlotPanGenomeGroups;

# ABSTRACT: Take in the groups file and output some summary plots

=head1 SYNOPSIS
Take in the groups file and output some summary plots
=cut

use Moose;
use Getopt::Long qw(GetOptionsFromArray);
use Bio::PanGenome::AnalyseGroups;

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'groups_filename' => ( is => 'rw', isa => 'Str' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );

has '_error_message' => ( is => 'rw', isa => 'Str' );

sub BUILD {
my ($self) = @_;

my ( $fasta_files, $output_filename, $groups_filename, $help );

GetOptionsFromArray(
$self->args,
'o|output=s' => \$output_filename,
'g|groups_filename=s' => \$groups_filename,
'h|help' => \$help,
);

if ( @{ $self->args } == 0 ) {
$self->_error_message("Error: You need to provide a FASTA file");
}

$self->output_filename($output_filename) if ( defined($output_filename) );
if ( defined($groups_filename) && (-e $groups_filename))
{
$self->groups_filename($groups_filename) ;
}
else
{
$self->_error_message("Error: Cant access the groups file $groups_filename");
}

for my $filename ( @{ $self->args } ) {
if ( !-e $filename ) {
$self->_error_message("Error: Cant access file $filename");
last;
}
}
$self->fasta_files( $self->args );

}

sub run {
my ($self) = @_;

( !$self->help ) or die $self->usage_text;
if ( defined( $self->_error_message ) ) {
print $self->_error_message . "\n";
die $self->usage_text;
}

my $plot_groups_obj = Bio::PanGenome::AnalyseGroups->new(
fasta_files => $self->fasta_files,
groups_filename => $self->groups_filename,
output_filename => $self->output_filename
);
$plot_groups_obj->create_plots();
}

sub usage_text {
my ($self) = @_;

return <<USAGE;
Usage: plot_pan_genome_groups [options]
Take in the groups file and output some summary plots
# Create summary plots
plot_pan_genome_groups -g groupfile example.faa
# Provide an output filename
plot_pan_genome_groups -g groupfile -o results *.faa
# This help message
plot_pan_genome_groups -h
USAGE
}

__PACKAGE__->meta->make_immutable;
no Moose;
1;
Loading

0 comments on commit fa3ed20

Please sign in to comment.