Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Duplicate sequences in pan genome reference fasta #182

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dist.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = Bio-Roary
version = 3.3.0
version = 3.3.1
author = Andrew J. Page <ap13@sanger.ac.uk>
license = GPL_3
copyright_holder = Wellcome Trust Sanger Institute
Expand Down
3 changes: 2 additions & 1 deletion lib/Bio/Roary/BedFromGFFRole.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ use Bio::Tools::GFF;

has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS|ncRNA|tRNA|tmRNA|rRNA)' );
has 'min_gene_size_in_nucleotides' => ( is => 'ro', isa => 'Int', default => 120 );
has 'output_directory' => ( is => 'ro', isa => 'Str', default => '.' );

# Returns the name of the intermediate BED file, built from the object's
# output_filename with an 'intermediate.bed' extension.
# NOTE(review): this text is a diff rendered without +/- markers. The first
# return appears to be the removed line and the second (which prefixes
# output_directory) the added one -- confirm against the real file.
sub _bed_output_filename {
my ($self) = @_;
return join( '.', ( $self->output_filename, 'intermediate.bed' ) );
return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.bed' ) )));
}

sub _create_bed_file_from_gff {
Expand Down
16 changes: 10 additions & 6 deletions lib/Bio/Roary/CommandLine/ExtractProteomeFromGff.pm
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,25 @@ has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'gff_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_suffix' => ( is => 'rw', isa => 'Str', default => 'proteome.faa' );
has 'output_suffix' => ( is => 'rw', isa => 'Str', default => 'proteome.faa' );
has '_error_message' => ( is => 'rw', isa => 'Str' );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'output_directory' => ( is => 'rw', isa => 'Str', default => '.' );

sub BUILD {
my ($self) = @_;

my ( $gff_files, $output_suffix, $apply_unknowns_filter, $help, $translation_table,$verbose,$cmd_version );
my ( $gff_files, $output_suffix, $apply_unknowns_filter, $help, $translation_table, $verbose, $cmd_version, $output_directory );

GetOptionsFromArray(
$self->args,
'o|output_suffix=s' => \$output_suffix,
'f|apply_unknowns_filter=i' => \$apply_unknowns_filter,
't|translation_table=i' => \$translation_table,
'v|verbose' => \$verbose,
'd|output_directory=s' => \$output_directory,
'w|version' => \$cmd_version,
'h|help' => \$help,
);
Expand All @@ -53,14 +55,15 @@ sub BUILD {
if ( $self->version ) {
die($self->_version());
}

if ( @{ $self->args } == 0 ) {
$self->_error_message("Error: You need to provide a GFF file");
}

$self->output_suffix($output_suffix) if ( defined($output_suffix) ) ;
$self->apply_unknowns_filter($apply_unknowns_filter) if ( defined($apply_unknowns_filter) );
$self->translation_table($translation_table) if (defined($translation_table) );
$self->translation_table($translation_table) if ( defined($translation_table) );
$self->output_directory($output_directory) if ( defined($output_directory) );

for my $filename ( @{ $self->args } ) {
if ( !-e $filename ) {
Expand All @@ -75,7 +78,6 @@ sub BUILD {
sub run {
my ($self) = @_;


if ( defined( $self->_error_message ) ) {
print $self->_error_message . "\n";
die $self->usage_text;
Expand All @@ -87,7 +89,8 @@ sub run {
gff_file => $gff_file,
output_filename => $filename . '.' . $self->output_suffix,
apply_unknowns_filter => $self->apply_unknowns_filter,
translation_table => $self->translation_table
translation_table => $self->translation_table,
output_directory => $self->output_directory,
);
$obj->fasta_file();
}
Expand All @@ -105,6 +108,7 @@ Options: -o STR output suffix [proteome.faa]
-t INT translation table [11]
-f filter sequences with missing data
-v verbose output to STDOUT
-d STR output directory
-w print version and exit
-h this help message

Expand Down
10 changes: 8 additions & 2 deletions lib/Bio/Roary/CommandLine/Roary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use Bio::Roary::External::CheckTools;
use File::Which;
use File::Path qw(make_path);
use Cwd qw(abs_path getcwd);
use File::Temp;
extends 'Bio::Roary::CommandLine::Common';

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
Expand Down Expand Up @@ -50,8 +51,8 @@ has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'kraken_db' => ( is => 'rw', isa => 'Str', default => '/lustre/scratch108/pathogen/pathpipe/kraken/minikraken_20140330/' );

has 'run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );
has '_working_directory' => ( is => 'rw', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );

sub BUILD {
my ($self) = @_;
Expand Down Expand Up @@ -188,6 +189,9 @@ sub BUILD {
}
push(@{$self->fasta_files}, abs_path($filename ));
}


$self->_working_directory(File::Temp->newdir( DIR => getcwd, CLEANUP => 0 )) if( $self->dont_delete_files );
}

sub _setup_output_directory {
Expand Down Expand Up @@ -235,7 +239,9 @@ sub run {
apply_unknowns_filter => $self->apply_unknowns_filter,
cpus => $self->cpus,
translation_table => $self->translation_table,
verbose => $self->verbose
verbose => $self->verbose,
working_directory => $self->_working_directory,

);

if ( $self->run_qc ) {
Expand Down
24 changes: 20 additions & 4 deletions lib/Bio/Roary/External/PostAnalysis.pm
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Perform the post analysis
=cut

use Moose;
use Cwd qw(getcwd);
with 'Bio::Roary::JobRunner::Role';

has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
Expand All @@ -36,6 +37,9 @@ has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50
has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 );
has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_gff_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__gff_fofn' );
has '_fasta_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__fasta_fofn' );

# Overload Role
has '_memory_required_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__memory_required_in_mb' );
Expand Down Expand Up @@ -72,11 +76,23 @@ sub _build__memory_required_in_mb {
return $memory_required;
}

# Builder for the lazy '_gff_fofn' attribute.
#
# Returns the path, inside the working directory, of the file-of-filenames
# that lists the input GFF files. The working directory object is used in
# string context when the path is joined.
sub _build__gff_fofn
{
    my ($self) = @_;

    # join() already supplies the '/' separator, so the file name must not
    # carry its own leading slash (that produced "dir//_gff_files").
    return join( '/', ( $self->_working_directory, '_gff_files' ) );
}

# Builder for the lazy '_fasta_fofn' attribute.
#
# Returns the path, inside the working directory, of the file-of-filenames
# that lists the FASTA files. The working directory object is used in
# string context when the path is joined.
sub _build__fasta_fofn
{
    my ($self) = @_;

    # join() already supplies the '/' separator, so the file name must not
    # carry its own leading slash (that produced "dir//_fasta_files").
    return join( '/', ( $self->_working_directory, '_fasta_files' ) );
}


sub _output_gff_files
{
my ($self) = @_;
open(my $out_fh, '>', '_gff_files');
open(my $out_fh, '>', $self->_gff_fofn);
for my $filename (@{$self->input_files})
{
print {$out_fh} $filename."\n";
Expand All @@ -87,7 +103,7 @@ sub _output_gff_files
sub _output_fasta_files
{
my ($self) = @_;
open(my $out_fh, '>', '_fasta_files');
open(my $out_fh, '>', $self->_fasta_fofn);
for my $filename (@{$self->fasta_files})
{
print {$out_fh} $filename."\n";
Expand Down Expand Up @@ -131,8 +147,8 @@ sub _command_to_run {
'-s', $self->output_statistics_filename,
'-c', $self->clusters_filename,
$output_multifasta_files_flag,
'-i', '_gff_files',
'-f', '_fasta_files',
'-i', $self->_gff_fofn,
'-f', $self->_fasta_fofn,
'-t', $self->translation_table,
$dont_delete_files_flag,
$dont_create_rplots_flag,
Expand Down
27 changes: 12 additions & 15 deletions lib/Bio/Roary/ExtractProteomeFromGFF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,20 @@ with 'Bio::Roary::BedFromGFFRole';
has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 );
has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );

has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );


has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );

# Builder for the lazy 'fasta_file' attribute.
# Runs the pipeline steps in order (extract nucleotide regions, convert to
# protein, clean up the FASTA, remove intermediate files), then filters the
# resulting FASTA and returns its path.
# NOTE(review): this text is a diff rendered without +/- markers. The first
# filter/return pair appears to be the removed code and the second pair
# (which prefixes output_directory) the added code -- confirm against the
# real file.
sub _build_fasta_file {
my ($self) = @_;
$self->_extract_nucleotide_regions;
$self->_convert_nucleotide_to_protein;
$self->_cleanup_fasta;
$self->_cleanup_intermediate_files;
$self->_filter_fasta_sequences( $self->output_filename );
return $self->output_filename;
$self->_filter_fasta_sequences( join('/',($self->output_directory,$self->output_filename)) );
return join('/',($self->output_directory,$self->output_filename));
}

sub _build__working_directory_name {
Expand All @@ -68,17 +65,17 @@ sub _cleanup_intermediate_files {

# Returns the name of the intermediate nucleotide FASTA file, built from
# output_filename with an 'intermediate.fa' extension.
# NOTE(review): this text is a diff rendered without +/- markers. The first
# return appears to be the removed line and the second (which prefixes
# output_directory) the added one -- confirm against the real file.
sub _nucleotide_fasta_file_from_gff_filename {
my ($self) = @_;
return join( '.', ( $self->output_filename, 'intermediate.fa' ) );
return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.fa' ) )));
}

# Returns the name of the intermediate extracted-nucleotide FASTA file, built
# from output_filename with an 'intermediate.extracted.fa' extension.
# NOTE(review): this text is a diff rendered without +/- markers. The first
# return appears to be the removed line and the second (which prefixes
# output_directory) the added one -- confirm against the real file.
sub _extracted_nucleotide_fasta_file_from_bed_filename {
my ($self) = @_;
return join( '.', ( $self->output_filename, 'intermediate.extracted.fa' ) );
return join('/',($self->output_directory,join( '.', ( $self->output_filename,'intermediate.extracted.fa' ) )));
}

# Returns the name of the unfiltered FASTA file, built from output_filename
# with an 'unfiltered.fa' extension.
# NOTE(review): this text is a diff rendered without +/- markers. The first
# return appears to be the removed line and the second (which prefixes
# output_directory) the added one -- confirm against the real file.
sub _unfiltered_output_filename {
my $self = shift;
return join( '.', ( $self->output_filename, 'unfiltered.fa' ) );
return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'unfiltered.fa' ) )));
}


Expand Down Expand Up @@ -113,7 +110,7 @@ sub _extract_nucleotide_regions {
sub _cleanup_fasta {
my $self = shift;
my $infile = $self->_unfiltered_output_filename;
my $outfile = $self->output_filename;
my $outfile = join('/',($self->output_directory,$self->output_filename));
return unless ( -e $infile );

open( my $in, '<', $infile );
Expand All @@ -129,7 +126,7 @@ sub _cleanup_fasta {

# Returns the name of the intermediate translated FASTA file, built from
# output_filename with an 'intermediate.translate.fa' extension.
# NOTE(review): this text is a diff rendered without +/- markers. The first
# return appears to be the removed line and the second (which prefixes
# output_directory) the added one -- confirm against the real file.
sub _fastatranslate_filename {
my ($self) = @_;
return join( '.', ( $self->output_filename, 'intermediate.translate.fa' ) );
return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.translate.fa' ) )));
}

sub _fastatranslate {
Expand Down
12 changes: 7 additions & 5 deletions lib/Bio/Roary/ExtractProteomeFromGFFs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,18 @@ use Moose;
use Bio::Roary::Exceptions;
use Bio::Roary::ExtractProteomeFromGFF;
use File::Basename;
use Cwd qw(getcwd);
use File::Temp;
with 'Bio::Roary::JobRunner::Role';

has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_fasta_files' );
has 'fasta_files_to_gff_files' =>
( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_fasta_files_to_gff_files' );
has 'fasta_files_to_gff_files' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_fasta_files_to_gff_files' );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has '_queue' => ( is => 'rw', isa => 'Str', default => 'small' );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );

sub _build__extract_proteome_objects
{
Expand Down Expand Up @@ -61,10 +63,10 @@ sub _build_fasta_files_to_gff_files {
my $output_suffix = "proteome.faa";

my $output_filename = $filename.'.'.$output_suffix;
$fasta_files{ $filename } = $gff_filename_without_directory.'.'.$output_suffix;
push(@commands_to_run, "extract_proteome_from_gff --translation_table ".$self->translation_table." --apply_unknowns_filter ".$self->apply_unknowns_filter." -o $output_suffix $filename");
$fasta_files{ $filename } = $self->working_directory.'/'.$gff_filename_without_directory.'.'.$output_suffix;
push(@commands_to_run, "extract_proteome_from_gff --translation_table ".$self->translation_table." --apply_unknowns_filter ".$self->apply_unknowns_filter." -d ".$self->working_directory." -o $output_suffix $filename");
}
# Farm out the computation and block until its ready
#Farm out the computation and block until its ready
my $job_runner_obj = $self->_job_runner_class->new( commands_to_run => \@commands_to_run, memory_in_mb => $self->_memory_required_in_mb, queue => $self->_queue, cpus => $self->cpus);
$job_runner_obj->run();

Expand Down
3 changes: 2 additions & 1 deletion lib/Bio/Roary/GroupStatistics.pm
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Add labels to the groups
use Moose;
use POSIX;
use Text::CSV;
use File::Basename;
use Bio::SeqIO;
use Bio::Roary::Exceptions;
use Bio::Roary::AnalyseGroups;
Expand Down Expand Up @@ -63,7 +64,7 @@ sub _header {
my @header = @{ $self->fixed_headers };

for my $filename ( @{ $self->_sorted_file_names } ) {
my $filename_cpy = $filename;
my $filename_cpy = basename($filename);
$filename_cpy =~ s!\.gff\.proteome\.faa!!;
push( @header, $filename_cpy );
}
Expand Down
7 changes: 6 additions & 1 deletion lib/Bio/Roary/Output/GroupsMultifastaNucleotide.pm
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ sub _build__input_seqio {
return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
}

# Name of the intermediate BED file: the object's output_filename with an
# 'intermediate.bed' extension appended after a dot.
sub _bed_output_filename {
    my $self = shift;
    my @name_parts = ( $self->output_filename, 'intermediate.bed' );
    return join( '.', @name_parts );
}

sub populate_files {
my ($self) = @_;
while ( my $input_seq = $self->_input_seqio->next_seq() )
Expand All @@ -63,7 +68,7 @@ sub populate_files {
if(! defined($self->pan_reference_groups_seen->{$current_group}))
{
my $pan_output_seq = $self->_pan_genome_reference_io_obj($current_group);
$pan_output_seq->write_seq($input_seq);
$pan_output_seq->write_seq(Bio::Seq->new( -display_id => $input_seq->display_id, -desc => $current_group, -seq => $input_seq->seq ) );
$self->pan_reference_groups_seen->{$current_group} = 1;
}

Expand Down
21 changes: 10 additions & 11 deletions lib/Bio/Roary/Output/GroupsMultifastasNucleotide.pm
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,15 @@ use Bio::Roary::Exceptions;
use Bio::Roary::AnalyseGroups;
use Bio::Roary::Output::GroupsMultifastaNucleotide;

has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 );
has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
has 'dont_delete_files' => ( is => 'ro', isa => 'Bool', default => 0 );

has 'output_directory' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_directory');

has '_number_of_groups' => ( is => 'rw', isa => 'Num', lazy_build => 1 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );
has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 );
has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
has 'dont_delete_files' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'output_directory' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_directory');
has '_number_of_groups' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__number_of_groups' );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );

sub _build_output_directory
{
Expand All @@ -58,6 +56,7 @@ sub create_files {
}

make_path($self->output_directory);
unlink('pan_genome_reference.fa');

my $number_of_gff_files = @{$self->gff_files};
my %pan_reference_groups_seen;
Expand Down
Loading