Skip to content

Commit

Permalink
Merge pull request #4 from andrewjpage/master
Browse files Browse the repository at this point in the history
run mcl and inflate results
  • Loading branch information
andrewjpage committed Apr 23, 2013
2 parents 6e71494 + 862abb7 commit f4bd8d4
Show file tree
Hide file tree
Showing 17 changed files with 456 additions and 124 deletions.
2 changes: 2 additions & 0 deletions dist.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ repository.type = git
requires = blastp
requires = makeblastdb
requires = cd-hit
requires = mcl
requires = mcxdeblast

[@Basic]
[PruneCruft]
Expand Down
73 changes: 71 additions & 2 deletions lib/Bio/PanGenome.pm
Original file line number Diff line number Diff line change
@@ -1,8 +1,77 @@
package Bio::PanGenome;

# ABSTRACT: Create a pan genome

=head1 SYNOPSIS
Create a pan genome
=cut

use Moose;
# ABSTRACT: Pan Genomes
use Bio::PanGenome::ParallelAllAgainstAllBlast;
use Bio::PanGenome::CombinedProteome;
use Bio::PanGenome::External::Cdhit;
use Bio::PanGenome::External::Mcl;
use Bio::PanGenome::InflateClusters;

# Proteome FASTA files to combine, and the name given to the final
# (inflated) cluster output.
has fasta_files => (
    is  => 'rw',
    isa => 'ArrayRef',
);
has output_filename => (
    is      => 'rw',
    isa     => 'Str',
    default => 'clustered_proteins',
);

# Job runner name handed to the all-against-all blast step.
has job_runner => (
    is      => 'rw',
    isa     => 'Str',
    default => 'LSF',
);

# Executables handed to the blast step.
has makeblastdb_exec => (
    is      => 'rw',
    isa     => 'Str',
    default => 'makeblastdb',
);
has blastp_exec => (
    is      => 'rw',
    isa     => 'Str',
    default => 'blastp',
);

# Executables handed to the MCL step.
has mcxdeblast_exec => (
    is      => 'ro',
    isa     => 'Str',
    default => 'mcxdeblast',
);
has mcl_exec => (
    is      => 'ro',
    isa     => 'Str',
    default => 'mcl',
);

# Drive the whole pipeline, one step after the other:
#   1. combine the input FASTA files into a single proteome file,
#   2. cluster that file with the Cdhit wrapper,
#   3. blast the clustered file all against all,
#   4. group the blast results with the Mcl wrapper,
#   5. inflate the MCL groups using the Cdhit clusters file.
# The steps communicate through fixed intermediate filenames; only the final
# output name comes from the 'output_filename' attribute.
# Returns whatever the final inflate() call returns.
sub run {
    my ($self) = @_;

    # Fixed names of the files handed from one step to the next.
    my %intermediate_file = (
        combined => 'combined_files.faa',
        cdhit    => 'clustered.faa',
        blast    => 'blast_results',
        mcl      => 'uninflated_mcl_groups',
    );

    # Step 1: merge all proteomes into one file.
    my $proteome_combiner = Bio::PanGenome::CombinedProteome->new(
        proteome_files        => $self->fasta_files,
        output_filename       => $intermediate_file{combined},
        apply_unknowns_filter => 1,
    );
    $proteome_combiner->create_combined_proteome_file;

    # Step 2: cluster the combined proteome.
    my $cdhit = Bio::PanGenome::External::Cdhit->new(
        input_file  => $intermediate_file{combined},
        output_base => $intermediate_file{cdhit},
    );
    $cdhit->run();

    # Step 3: all against all blast of the clustered proteins.
    my $all_against_all = Bio::PanGenome::ParallelAllAgainstAllBlast->new(
        fasta_file              => $intermediate_file{cdhit},
        blast_results_file_name => $intermediate_file{blast},
        job_runner              => $self->job_runner,
        makeblastdb_exec        => $self->makeblastdb_exec,
        blastp_exec             => $self->blastp_exec,
    );
    $all_against_all->run();

    # Step 4: group the blast results.
    my $mcl_runner = Bio::PanGenome::External::Mcl->new(
        blast_results   => $intermediate_file{blast},
        mcxdeblast_exec => $self->mcxdeblast_exec,
        mcl_exec        => $self->mcl_exec,
        output_file     => $intermediate_file{mcl},
    );
    $mcl_runner->run();

    # Step 5: inflate the MCL groups with the clusters file from step 2.
    my $inflater = Bio::PanGenome::InflateClusters->new(
        clusters_filename => $cdhit->clusters_filename,
        mcl_filename      => $intermediate_file{mcl},
        output_file       => $self->output_filename,
    );

    # TODO: intermediate files are not cleaned up yet.
    return $inflater->inflate();
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;
1;
95 changes: 45 additions & 50 deletions lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
Original file line number Diff line number Diff line change
@@ -1,47 +1,59 @@
package Bio::PanGenome::CommandLine::CreatePanGenome;

# ABSTRACT: Create a pan genome from a set of proteins in a FASTA file
# ABSTRACT: Take in FASTA files of proteins and cluster them

=head1 SYNOPSIS
Create a pan genome from a set of proteins in a FASTA file
Take in FASTA files of proteins and cluster them
=cut

use Moose;
use Getopt::Long qw(GetOptionsFromArray);
use Bio::PanGenome::CombinedProteome;
use Bio::PanGenome::External::Cdhit;
use Bio::PanGenome::External::Makeblastdb;
use Bio::PanGenome::External::Blastp;
use Bio::PanGenome::GGFile;
use Bio::PanGenome;

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str' );
has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has '_error_message' => ( is => 'rw', isa => 'Str' );
# Settings that can be overridden from the command line.
has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );

# These two must be 'rw': BUILD assigns the parsed option values through the
# accessors, and a read-only accessor throws when it is called with an
# argument.
has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' );

has '_error_message' => ( is => 'rw', isa => 'Str' );

sub BUILD {
my ($self) = @_;

my ( $fasta_files, $output_filename, $help );
my ( $fasta_files, $output_filename, $job_runner, $makeblastdb_exec,$mcxdeblast_exec,$mcl_exec, $blastp_exec, $help );

GetOptionsFromArray(
$self->args,
'o|output=s' => \$output_filename,
'h|help' => \$help,
'o|output=s' => \$output_filename,
'j|job_runner=s' => \$job_runner,
'm|makeblastdb_exec=s' => \$makeblastdb_exec,
'b|blastp_exec=s' => \$blastp_exec,
'd|mcxdeblast_exec' => \$mcxdeblast_exec,
'c|mcl_exec' => \$mcl_exec,
'h|help' => \$help,
);

$self->output_filename($output_filename) if ( defined($output_filename) );


if ( @{ $self->args } == 0 ) {
$self->_error_message("Error: You need to provide at least 1 FASTA file");
$self->_error_message("Error: You need to provide a FASTA file");
}

$self->output_filename($output_filename) if ( defined($output_filename) );
$self->job_runner($job_runner) if ( defined($job_runner) );
$self->makeblastdb_exec($makeblastdb_exec) if ( defined($makeblastdb_exec) );
$self->blastp_exec($blastp_exec) if ( defined($blastp_exec) );
$self->mcxdeblast_exec($mcxdeblast_exec) if ( defined($mcxdeblast_exec) );
$self->mcl_exec($mcl_exec) if ( defined($mcl_exec) );

for my $filename ( @{ $self->args } ) {
if ( !-e $filename ) {
$self->_error_message("Error: Cant access file $filename");
Expand All @@ -60,46 +72,29 @@ sub run {
print $self->_error_message . "\n";
die $self->usage_text;
}

my $combined_proteome_obj = Bio::PanGenome::CombinedProteome->new(
proteome_files => $self->fasta_files,
output_filename => 'combined_proteome.faa'

my $pan_genome_obj = Bio::PanGenome->new(
fasta_files => $self->fasta_files,
output_filename => $self->output_filename,
job_runner => $self->job_runner,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec
);
$combined_proteome_obj->create_combined_proteome_file;
print "Created combined file:\n";
my $percentage_sequences_ignored = (($combined_proteome_obj->number_of_sequences_ignored/$combined_proteome_obj->number_of_sequences_seen)*100);
print $percentage_sequences_ignored." percent of sequences ignored\n";

print "Clustering the data:\n";
my $cdhit_obj = Bio::PanGenome::External::Cdhit->new( input_file => 'combined_proteome.faa', output_base => 'clustered.faa');
$cdhit_obj->run();

print "Creating a blast database:\n";
my $blast_database= Bio::PanGenome::External::Makeblastdb->new(fasta_file => 'clustered.faa');
$blast_database->run();

print "Blasting all against all:\n";
my $blastp_obj = Bio::PanGenome::External::Blastp->new(
fasta_file => 'clustered.faa',
blast_database => $blast_database->output_database,
output_file => 'results.out'
);
$blastp_obj->run();

$pan_genome_obj->run();
}

sub usage_text {
my ($self) = @_;

return <<USAGE;
Usage: create_pan_geneome [options]
Create a pan genome from a set of proteins in a FASTA file
# Create a pan genome from some FASTA files
create_pan_geneome *.faa
Take in FASTA files of proteins and cluster them
# Take in FASTA files of proteins and cluster them
create_pan_geneome example.faa
# Provide an output filename
create_pan_geneome -o outputfile.faa *.faa
create_pan_geneome -o results *.faa
# This help message
create_pan_geneome -h
Expand Down
6 changes: 6 additions & 0 deletions lib/Bio/PanGenome/External/Cdhit.pm
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 0
has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 0.99 );
has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' );

# Name of the clusters file that goes with this run: the 'output_base' value
# with a '.clstr' extension appended.
sub clusters_filename {
    my $self = shift;

    my $base = $self->output_base;
    return "${base}.clstr";
}

sub _command_to_run {
my ($self) = @_;
return join(
Expand Down
54 changes: 54 additions & 0 deletions lib/Bio/PanGenome/External/Mcl.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package Bio::PanGenome::External::Mcl;

# ABSTRACT: Wrapper around MCL which takes in blast results and outputs clustered results

=head1 SYNOPSIS
Wrapper around MCL which takes in blast results and outputs clustered results
use Bio::PanGenome::External::Mcl;
my $mcl= Bio::PanGenome::External::Mcl->new(
blast_results => 'db',
mcxdeblast_exec => 'mcxdeblast',
mcl_exec => 'mcl',
output_file => 'output.groups'
);
$mcl->run();
=cut

use Moose;

# Blast results file given to mcxdeblast (mandatory).
has blast_results => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Executables used to build the command line.
has mcxdeblast_exec => (
    is      => 'ro',
    isa     => 'Str',
    default => 'mcxdeblast',
);
has mcl_exec => (
    is      => 'ro',
    isa     => 'Str',
    default => 'mcl',
);

# File named after mcl's '-o' option.
has output_file => (
    is      => 'ro',
    isa     => 'Str',
    default => 'output_groups',
);

# Value passed to mcl's '-I' option.
has _inflation_value => (
    is      => 'ro',
    isa     => 'Num',
    default => 1.5,
);

# Shell redirection appended to the end of the command line.
has _logging => (
    is      => 'ro',
    isa     => 'Str',
    default => '2> /dev/null',
);

# Build the shell command line as a single space separated string:
# mcxdeblast is run on the blast results file and its output is piped into
# mcl, which is given the inflation value, the output file and, as the last
# element, the logging redirection.
sub _command_to_run {
    my $self = shift;

    # Left hand side of the pipe.
    my @deblast_stage = (
        $self->mcxdeblast_exec,
        '-m9',
        '--line-mode=abc',
        $self->blast_results,
    );

    # Right hand side of the pipe.
    my @mcl_stage = (
        $self->mcl_exec,
        '-',
        '--abc',
        '-I' => $self->_inflation_value,
        '-o' => $self->output_file,
    );

    return join q{ }, @deblast_stage, '|', @mcl_stage, $self->_logging;
}

# Execute the command built by _command_to_run through system().
#
# The outcome of the command is inspected and any failure is reported with a
# warning instead of being silently dropped. The method still always returns
# 1, so existing callers are unaffected.
sub run {
    my ($self) = @_;

    my $command = $self->_command_to_run;
    system($command);

    # Copy the status right away, before anything else can overwrite $?.
    my $wait_status = $?;

    if ( $wait_status == -1 ) {
        warn "Failed to execute '$command': $!\n";
    }
    elsif ( $wait_status & 127 ) {
        warn sprintf( "Command '%s' died with signal %d\n", $command, $wait_status & 127 );
    }
    elsif ( $wait_status >> 8 ) {
        warn sprintf( "Command '%s' failed with exit status %d\n", $command, $wait_status >> 8 );
    }

    return 1;
}

no Moose;
__PACKAGE__->meta->make_immutable;
1;
69 changes: 0 additions & 69 deletions lib/Bio/PanGenome/GGFile.pm

This file was deleted.

Loading

0 comments on commit f4bd8d4

Please sign in to comment.