Skip to content

Commit

Permalink
Merge pull request #89 from andrewjpage/add_gnu_parallel_support
Browse files Browse the repository at this point in the history
Add gnu parallel support
  • Loading branch information
andrewjpage committed Feb 11, 2015
2 parents 4a8ae3a + ca8d45e commit 51854b9
Show file tree
Hide file tree
Showing 33 changed files with 194 additions and 110 deletions.
1 change: 1 addition & 0 deletions dist.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ requires = fasta_grep
requires = bedtools
requires = muscle
requires = revtrans.py
requires = parallel


[@Basic]
Expand Down
11 changes: 9 additions & 2 deletions lib/Bio/PanGenome.pm
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ has 'output_filename' => ( is => 'rw', isa => 'Str', default =
has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' );
has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'group_statisics.csv' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 );
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );
has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default => 'mcxdeblast' );
Expand All @@ -40,7 +41,8 @@ has 'perc_identity' => ( is => 'ro', isa => 'Num', default =
has 'dont_delete_files' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );

has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );

Expand Down Expand Up @@ -71,6 +73,7 @@ sub run {
number_of_input_files => $number_of_input_files,
output_filtered_clustered_fasta => $output_filtered_clustered_fasta,
job_runner => $self->job_runner,
cpus => $self->cpus
);

$iterative_cdhit->run();
Expand All @@ -79,6 +82,7 @@ sub run {
fasta_file => $output_cd_hit_filename,
blast_results_file_name => $output_blast_results_filename,
job_runner => $self->job_runner,
cpus => $self->cpus,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec,
perc_identity => $self->perc_identity
Expand All @@ -95,6 +99,7 @@ sub run {
mcxdeblast_exec => $self->mcxdeblast_exec,
mcl_exec => $self->mcl_exec,
job_runner => $self->job_runner,
cpus => $self->cpus,
output_file => $output_mcl_filename
);
$mcl->run();
Expand All @@ -104,6 +109,7 @@ sub run {

my $post_analysis = Bio::PanGenome::External::PostAnalysis->new(
job_runner => $self->job_runner,
cpus => $self->cpus,
fasta_files => $self->fasta_files,
input_files => $self->input_files,
output_filename => $self->output_filename,
Expand All @@ -115,7 +121,8 @@ sub run {
dont_delete_files => $self->dont_delete_files,
dont_create_rplots => $self->dont_create_rplots,
verbose_stats => $self->verbose_stats,
translation_table => $self->translation_table
translation_table => $self->translation_table,
group_limit => $self->group_limit,
);
$post_analysis->run();

Expand Down
32 changes: 24 additions & 8 deletions lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,21 @@ has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' )
has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 98 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 98 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );

has '_error_message' => ( is => 'rw', isa => 'Str' );
has 'run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );

sub BUILD {
my ($self) = @_;

my ( $fasta_files, $dont_create_rplots, $dont_delete_files, $perc_identity, $output_filename, $job_runner, $makeblastdb_exec,$mcxdeblast_exec,$mcl_exec, $blastp_exec, $apply_unknowns_filter, $cpus,$output_multifasta_files, $verbose_stats, $translation_table, $run_qc, $help );
my ( $fasta_files, $create_rplots,$group_limit, $max_threads, $dont_delete_files, $perc_identity, $output_filename, $job_runner, $makeblastdb_exec,$mcxdeblast_exec,$mcl_exec, $blastp_exec, $apply_unknowns_filter, $cpus,$output_multifasta_files, $verbose_stats, $translation_table, $run_qc, $help );

GetOptionsFromArray(
$self->args,
Expand All @@ -55,9 +56,10 @@ sub BUILD {
'e|output_multifasta_files' => \$output_multifasta_files,
'i|perc_identity=i' => \$perc_identity,
'dont_delete_files' => \$dont_delete_files,
'dont_create_rplots' => \$dont_create_rplots,
'create_rplots' => \$create_rplots,
'verbose_stats' => \$verbose_stats,
't|translation_table=i' => \$translation_table,
'group_limit=i' => \$group_limit,
'qc|run_qc' => \$run_qc,
'h|help' => \$help,
);
Expand All @@ -78,9 +80,10 @@ sub BUILD {
$self->apply_unknowns_filter($apply_unknowns_filter) if ( defined($apply_unknowns_filter) );
$self->output_multifasta_files($output_multifasta_files) if ( defined($output_multifasta_files) );
$self->dont_delete_files($dont_delete_files) if ( defined($dont_delete_files) );
$self->dont_create_rplots($dont_create_rplots) if (defined($dont_create_rplots) );
$self->dont_create_rplots(0) if (defined($create_rplots) );
$self->verbose_stats($verbose_stats) if ( defined $verbose_stats );
$self->translation_table($translation_table) if (defined($translation_table) );
$self->group_limit($group_limit) if ( defined($group_limit) );
$self->run_qc($run_qc) if ( defined( $run_qc ) );

for my $filename ( @{ $self->args } ) {
Expand All @@ -106,6 +109,7 @@ sub run {
input_files => $self->fasta_files,
job_runner => $self->job_runner,
apply_unknowns_filter => $self->apply_unknowns_filter,
cpus => $self->cpus,
translation_table => $self->translation_table
);

Expand All @@ -122,14 +126,16 @@ sub run {
fasta_files => $prepare_input_files->fasta_files,
output_filename => $self->output_filename,
job_runner => $self->job_runner,
cpus => $self->cpus,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec,
output_multifasta_files => $self->output_multifasta_files,
perc_identity => $self->perc_identity,
dont_delete_files => $self->dont_delete_files,
dont_create_rplots => $self->dont_create_rplots,
verbose_stats => $self->verbose_stats,
translation_table => $self->translation_table
translation_table => $self->translation_table,
group_limit => $self->group_limit
);
$pan_genome_obj->run();
}
Expand Down Expand Up @@ -164,6 +170,16 @@ sub usage_text {
# Include full annotation and inference in group statistics
create_pan_genome --verbose_stats *.gff
# Run sequentially without LSF
create_pan_genome -j Local *.gff
# Run locally with GNU parallel and 4 processors
create_pan_genome -j Parallel -p 4 *.gff
# Increase the groups/clusters limit (default 50,000). If you need to change this you're
# probably trying to work with data from more than one species (which this script wasn't designed for).
create_pan_genome --group_limit 60000 *.gff
# Generate QC report detailing top genus and species for each assembly
create_pan_genome -qc *.gff
Expand Down
15 changes: 11 additions & 4 deletions lib/Bio/PanGenome/CommandLine/IterativeCdhit.pm
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ has 'output_filtered_clustered_fasta' => ( is => 'rw', isa => 'Str', default =>
has 'lower_bound_percentage' => ( is => 'rw', isa => 'Num', default => 0.98 );
has 'upper_bound_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'step_size_percentage' => ( is => 'rw', isa => 'Num', default => 0.005 );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );


sub BUILD {
my ($self) = @_;

my ( $output_cd_hit_filename,$lower_bound_percentage,$upper_bound_percentage,$step_size_percentage, $output_combined_filename, $number_of_input_files, $output_filtered_clustered_fasta,
my ( $output_cd_hit_filename,$cpus,$lower_bound_percentage,$upper_bound_percentage,$step_size_percentage, $output_combined_filename, $number_of_input_files, $output_filtered_clustered_fasta,
$help );

GetOptionsFromArray(
Expand All @@ -42,6 +43,7 @@ sub BUILD {
'l|lower_bound_percentage=s' => \$lower_bound_percentage,
'u|upper_bound_percentage=s' => \$upper_bound_percentage,
's|step_size_percentage=s' => \$step_size_percentage,
'cpus=i' => \$cpus,
'h|help' => \$help,
);

Expand All @@ -52,6 +54,7 @@ sub BUILD {
$self->output_cd_hit_filename($output_cd_hit_filename) if ( defined($output_cd_hit_filename) );
$self->output_combined_filename($output_combined_filename) if ( defined($output_combined_filename) );
$self->number_of_input_files($number_of_input_files) if ( defined($number_of_input_files) );
$self->cpus($cpus) if ( defined($cpus) );
$self->output_filtered_clustered_fasta($output_filtered_clustered_fasta)
if ( defined($output_filtered_clustered_fasta) );

Expand All @@ -73,7 +76,8 @@ sub run {
output_filtered_clustered_fasta => $self->output_filtered_clustered_fasta,
lower_bound_percentage => $self->lower_bound_percentage,
upper_bound_percentage => $self->upper_bound_percentage,
step_size_percentage => $self->step_size_percentage
step_size_percentage => $self->step_size_percentage,
cpus => $self->cpus

);
$obj->run;
Expand All @@ -86,10 +90,13 @@ sub usage_text {
Usage: iterative_cdhit [options]
Iteratively cluster a set of proteins with CD-hit, lowering the threshold each time, extracting core genes (1 per isolate) to another file, and removing them from the input proteins file.
# Basic usage where you have a single isolate
# Basic usage where you have a single isolate
iterative_cdhit -m proteome_fasta.faa
# Where you have 10 isolates
# Use multiple CPUs
iterative_cdhit -m proteome_fasta.faa --cpus 8
# Where you have 10 isolates
iterative_cdhit -m proteome_fasta.faa -n 10
# Specify the output file name for the cdhit results
Expand Down
35 changes: 23 additions & 12 deletions lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use Getopt::Long qw(GetOptionsFromArray);
use Bio::PanGenome::PostAnalysis;
use File::Find::Rule;
use Bio::PanGenome::External::ProteinMuscleAlignmentFromNucleotides;

use File::Path qw(remove_tree);

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
Expand All @@ -28,10 +28,12 @@ has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'g
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'clusters_filename' => ( is => 'rw', isa => 'Str' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );


sub BUILD {
Expand All @@ -40,7 +42,7 @@ sub BUILD {
my (
$output_filename, $dont_create_rplots, $dont_delete_files, $output_pan_geneome_filename,
$job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename,
$fasta_files, $input_files, $verbose_stats, $translation_table, $help
$fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit
);


Expand All @@ -57,7 +59,9 @@ sub BUILD {
'dont_delete_files' => \$dont_delete_files,
'dont_create_rplots' => \$dont_create_rplots,
'verbose_stats' => \$verbose_stats,
'processors=i' => \$cpus,
't|translation_table=i' => \$translation_table,
'group_limit=i' => \$group_limit,
'h|help' => \$help,
);

Expand All @@ -74,6 +78,8 @@ sub BUILD {
$self->dont_create_rplots($dont_create_rplots) if (defined($dont_create_rplots) );
$self->verbose_stats($verbose_stats) if (defined($verbose_stats));
$self->translation_table($translation_table) if (defined($translation_table) );
$self->cpus($cpus) if ( defined($cpus) );
$self->group_limit($group_limit) if ( defined($group_limit) );

}

Expand All @@ -97,19 +103,23 @@ sub run {
dont_delete_files => $self->dont_delete_files,
dont_create_rplots => $self->dont_create_rplots,
verbose_stats => $self->verbose_stats,
group_limit => $self->group_limit,
);
$obj->run();


if($self->output_multifasta_files == 1)

my $output_gene_files = $self->_find_input_files;
my $seg = Bio::PanGenome::External::ProteinMuscleAlignmentFromNucleotides->new(
fasta_files => $output_gene_files,
job_runner => $self->job_runner,
translation_table => $self->translation_table,
cpus => $self->cpus
);
$seg->run();

# Cleanup intermediate multifasta files
if($self->output_multifasta_files == 0)
{
my $output_gene_files = $self->_find_input_files;
my $seg = Bio::PanGenome::External::ProteinMuscleAlignmentFromNucleotides->new(
fasta_files => $output_gene_files,
job_runner => $self->job_runner,
translation_table => $self->translation_table
);
$seg->run();
remove_tree('pan_genome_sequences');
}
}

Expand Down Expand Up @@ -151,6 +161,7 @@ sub usage_text {
-c output_clusters_filename
-f file_of_proteins
-i file_of_gffs
--processors number of processors
--verbose_stats
# This help message
Expand Down
9 changes: 8 additions & 1 deletion lib/Bio/PanGenome/CommandLine/ParallelAllAgainstAllBlastp.pm
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'blast_results' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );

Expand All @@ -29,14 +30,15 @@ has '_error_message' => ( is => 'rw', isa => 'Str' );
sub BUILD {
my ($self) = @_;

my ( $fasta_files, $output_filename, $job_runner, $makeblastdb_exec, $blastp_exec, $help );
my ( $fasta_files, $output_filename, $job_runner, $makeblastdb_exec, $blastp_exec, $help, $cpus );

GetOptionsFromArray(
$self->args,
'o|output=s' => \$output_filename,
'j|job_runner=s' => \$job_runner,
'm|makeblastdb_exec=s' => \$makeblastdb_exec,
'b|blastp_exec=s' => \$blastp_exec,
'p|processors=i' => \$cpus,
'h|help' => \$help,
);

Expand All @@ -49,6 +51,7 @@ sub BUILD {
$self->job_runner($job_runner) if ( defined($job_runner) );
$self->makeblastdb_exec($makeblastdb_exec) if ( defined($makeblastdb_exec) );
$self->blastp_exec($blastp_exec) if ( defined($blastp_exec) );
$self->cpus($cpus) if ( defined($cpus) );

for my $filename ( @{ $self->args } ) {
if ( !-e $filename ) {
Expand Down Expand Up @@ -94,6 +97,7 @@ sub run {
fasta_file => $output_combined_filename,
blast_results_file_name => $self->output_filename,
job_runner => $self->job_runner,
cpus => $self->cpus,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec
);
Expand All @@ -112,6 +116,9 @@ sub usage_text {
# Provide an output filename
parallel_all_against_all_blastp -o blast_results example.faa
# number of processors to use
parallel_all_against_all_blastp -p 10 example.faa
# This help message
parallel_all_against_all_blastp -h
Expand Down

0 comments on commit 51854b9

Please sign in to comment.