Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept genbank files #121

Merged
merged 8 commits into from
May 14, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bin/create_pan_genome_plots.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/software/pathogen/external/apps/usr/local/bin/Rscript
#!/usr/bin/env Rscript
# ABSTRACT: Create R plots
# PODNAME: create_plots.R

Expand Down
2 changes: 1 addition & 1 deletion dist.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = Bio-Roary
version = 2.1.2
version = 2.2.0
author = Andrew J. Page <ap13@sanger.ac.uk>
license = GPL_3
copyright_holder = Wellcome Trust Sanger Institute
Expand Down
11 changes: 11 additions & 0 deletions lib/Bio/Roary/CommandLine/Common.pm
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ Common command line settings

use Moose;
use FindBin;
use Log::Log4perl qw(:easy);

has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger');

sub _build_logger
{
my ($self) = @_;
Log::Log4perl->easy_init(level => $ERROR);
my $logger = get_logger();
return $logger;
}


sub run {
Expand Down
193 changes: 100 additions & 93 deletions lib/Bio/Roary/CommandLine/Roary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,41 +13,47 @@ use Getopt::Long qw(GetOptionsFromArray);
use Bio::Roary;
use Bio::Roary::PrepareInputFiles;
use Bio::Roary::QC::Report;
use File::Which;
use File::Which;
extends 'Bio::Roary::CommandLine::Common';

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local' );
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );
has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 98 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'dont_run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_split_groups' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 1 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );

has '_error_message' => ( is => 'rw', isa => 'Str' );
has 'run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local' );
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );
has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 98 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'dont_run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_split_groups' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 1 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );

has '_error_message' => ( is => 'rw', isa => 'Str' );
has 'run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );

sub BUILD {
my ($self) = @_;

my ( $fasta_files,$verbose, $create_rplots,$group_limit, $dont_run_qc, $max_threads, $dont_delete_files, $dont_split_groups, $perc_identity, $output_filename, $job_runner, $makeblastdb_exec,$mcxdeblast_exec,$mcl_exec, $blastp_exec, $apply_unknowns_filter, $cpus,$output_multifasta_files, $verbose_stats, $translation_table, $run_qc, $core_definition, $help );
my (
$fasta_files, $verbose, $create_rplots, $group_limit, $dont_run_qc,
$max_threads, $dont_delete_files, $dont_split_groups, $perc_identity, $output_filename,
$job_runner, $makeblastdb_exec, $mcxdeblast_exec, $mcl_exec, $blastp_exec,
$apply_unknowns_filter, $cpus, $output_multifasta_files, $verbose_stats, $translation_table,
$run_qc, $core_definition, $help
);

GetOptionsFromArray(
$self->args,
Expand All @@ -56,7 +62,7 @@ sub BUILD {
'm|makeblastdb_exec=s' => \$makeblastdb_exec,
'b|blastp_exec=s' => \$blastp_exec,
'd|mcxdeblast_exec=s' => \$mcxdeblast_exec,
'c|mcl_exec=s' => \$mcl_exec,
'c|mcl_exec=s' => \$mcl_exec,
'p|processors=i' => \$cpus,
'apply_unknowns_filter=i' => \$apply_unknowns_filter,
'e|output_multifasta_files' => \$output_multifasta_files,
Expand All @@ -68,17 +74,26 @@ sub BUILD {
't|translation_table=i' => \$translation_table,
'group_limit=i' => \$group_limit,
'qc|run_qc' => \$run_qc,
'dont_run_qc' => \$dont_run_qc,
'dont_run_qc' => \$dont_run_qc,
'cd|core_definition=i' => \$core_definition,
'v|verbose' => \$verbose,
'v|verbose' => \$verbose,
'h|help' => \$help,
);

$self->help($help) if(defined($help));
if ( @{ $self->args } == 0 ) {
$self->_error_message("Error: You need to provide a GFF file");
}

print "\nPlease cite Roary if you use any of the results it produces:
\"Roary: Rapid large-scale prokaryote pan genome analysis\",
Andrew J. Page, Carla A. Cummins, Martin Hunt, Vanessa K. Wong, Sandra Reuter, Matthew T. G. Holden, Maria Fookes, Jacqueline A. Keane, Julian Parkhill,
bioRxiv doi: http://dx.doi.org/10.1101/019315\n\n";

if ( defined($verbose) ) {
$self->verbose($verbose);
$self->logger->level(10000);
}
$self->help($help) if ( defined($help) );
if(@{$self->args} == 0)
{
$self->logger->error("Error: You need to provide a GFF file");
}
$self->output_filename($output_filename) if ( defined($output_filename) );
$self->job_runner($job_runner) if ( defined($job_runner) );
$self->makeblastdb_exec($makeblastdb_exec) if ( defined($makeblastdb_exec) );
Expand All @@ -87,58 +102,50 @@ sub BUILD {
$self->mcl_exec($mcl_exec) if ( defined($mcl_exec) );
$self->cpus($cpus) if ( defined($cpus) );
$self->perc_identity($perc_identity) if ( defined($perc_identity) );
$self->apply_unknowns_filter($apply_unknowns_filter) if ( defined($apply_unknowns_filter) );
if ( defined($output_multifasta_files) )
{
if(which('revtrans.py'))
{
$self->output_multifasta_files($output_multifasta_files) ;
}
else
{
print "WARNING: revtrans.py not found in your PATH so cannot generate multiFASTA alignments, skipping for now.\n";
}
}
$self->dont_delete_files($dont_delete_files) if ( defined($dont_delete_files) );
$self->dont_split_groups($dont_split_groups) if ( defined($dont_split_groups) );
$self->dont_create_rplots(0) if ( defined($create_rplots) );
$self->verbose_stats($verbose_stats) if ( defined $verbose_stats );
$self->translation_table($translation_table) if ( defined($translation_table) );
$self->group_limit($group_limit) if ( defined($group_limit) );
$self->verbose($verbose) if ( defined($verbose) );

if ( defined( $run_qc ) )
{
if(which('kraken') && which('kraken-report'))
{
$self->run_qc($run_qc) ;
}
else
{
print "WARNING: kraken or kraken-report not found in your PATH so cannot run QC, skipping for now.\n";
}
}

if($self->cpus > 1)
{
$self->job_runner('Parallel');
}

$self->core_definition( $core_definition/100 ) if ( defined($core_definition) );
$self->apply_unknowns_filter($apply_unknowns_filter)
if ( defined($apply_unknowns_filter) );

if ( defined($output_multifasta_files) ) {
if ( which('revtrans.py') ) {
$self->output_multifasta_files($output_multifasta_files);
}
else {
$self->logger->warn("revtrans.py not found in your PATH so cannot generate multiFASTA alignments, skipping for now.");
}
}
$self->dont_delete_files($dont_delete_files) if ( defined($dont_delete_files) );
$self->dont_split_groups($dont_split_groups) if ( defined($dont_split_groups) );
$self->dont_create_rplots(0) if ( defined($create_rplots) );
$self->verbose_stats($verbose_stats) if ( defined $verbose_stats );
$self->translation_table($translation_table) if ( defined($translation_table) );
$self->group_limit($group_limit) if ( defined($group_limit) );


if ( defined($run_qc) ) {
if ( which('kraken') && which('kraken-report') ) {
$self->run_qc($run_qc);
}
else {
$self->logger->warn("kraken or kraken-report not found in your PATH so cannot run QC, skipping for now.");
}
}

if ( $self->cpus > 1 ) {
$self->job_runner('Parallel');
}

$self->core_definition( $core_definition / 100 ) if ( defined($core_definition) );

for my $filename ( @{ $self->args } ) {
if ( !-e $filename ) {
$self->_error_message("Error: Cant access file $filename");
$self->logger->error("Error: Cant access file $filename");
last;
}
}
$self->fasta_files( $self->args );

}




sub run {
my ($self) = @_;

Expand All @@ -147,24 +154,24 @@ sub run {
print $self->_error_message . "\n";
die $self->usage_text;
}
print "Extracting proteins from GFF files\n" if($self->verbose);

$self->logger->info("Extracting proteins from GFF files");
my $prepare_input_files = Bio::Roary::PrepareInputFiles->new(
input_files => $self->fasta_files,
job_runner => $self->job_runner,
apply_unknowns_filter => $self->apply_unknowns_filter,
cpus => $self->cpus,
translation_table => $self->translation_table,
verbose => $self->verbose
input_files => $self->fasta_files,
job_runner => $self->job_runner,
apply_unknowns_filter => $self->apply_unknowns_filter,
cpus => $self->cpus,
translation_table => $self->translation_table,
verbose => $self->verbose
);

if( $self->run_qc ){
print "Running Kraken on each input assembly\n" if($self->verbose);
if ( $self->run_qc ) {
$self->logger->info("Running Kraken on each input assembly");
my $qc_input_files = Bio::Roary::QC::Report->new(
input_files => $self->fasta_files,
job_runner => $self->job_runner,
cpus => $self->cpus,
verbose => $self->verbose
cpus => $self->cpus,
verbose => $self->verbose
);
$qc_input_files->report;
}
Expand All @@ -186,8 +193,8 @@ sub run {
translation_table => $self->translation_table,
group_limit => $self->group_limit,
core_definition => $self->core_definition,
verbose => $self->verbose
);
verbose => $self->verbose
);
$pan_genome_obj->run();
}

Expand Down
27 changes: 20 additions & 7 deletions lib/Bio/Roary/ExtractProteomeFromGFF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ has '_working_directory' =>
( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS|ncRNA|tRNA|tmRNA|rRNA)' );

sub _build_fasta_file
{
Expand Down Expand Up @@ -90,8 +91,10 @@ sub _create_bed_file_from_gff {
my $cmd =
'sed -n \'/##gff-version 3/,/##FASTA/p\' '
. $self->gff_file
.' | awk \'BEGIN {FS="\t"};{ if ($3 ~/'. $self->_tags_to_filter. '/) print $0;}\' '
. ' | grep -v \'^#\' | awk \'{if ($5 - $4 >= '.$self->min_gene_size_in_nucleotides.') print $1"\t"($4-1)"\t"($5)"\t"$9"\t1\t"$7}\' | sed \'s/ID=//\' | sed \'s/;[^\t]*\t/\t/g\' > '
. $self->_bed_output_filename;
$self->logger->debug($cmd);
system($cmd);
}

Expand All @@ -102,6 +105,7 @@ sub _create_nucleotide_fasta_file_from_gff {
. $self->gff_file
. ' | grep -v \'##FASTA\' > '
. $self->_nucleotide_fasta_file_from_gff_filename;
$self->logger->debug($cmd);
system($cmd);
}

Expand All @@ -119,10 +123,12 @@ sub _extract_nucleotide_regions {
. ' -fo '
. $self->_extracted_nucleotide_fasta_file_from_bed_filename
. ' -name > /dev/null 2>&1';
system($cmd);
unlink($self->_nucleotide_fasta_file_from_gff_filename);
unlink($self->_bed_output_filename);
unlink($self->_nucleotide_fasta_file_from_gff_filename.'.fai');

$self->logger->debug($cmd);
system($cmd);
unlink($self->_nucleotide_fasta_file_from_gff_filename);
unlink($self->_bed_output_filename);
unlink($self->_nucleotide_fasta_file_from_gff_filename.'.fai');
}

sub _cleanup_fasta {
Expand Down Expand Up @@ -170,15 +176,13 @@ sub _convert_nucleotide_to_protein
unlink($self->_extracted_nucleotide_fasta_file_from_bed_filename);
}



sub _does_sequence_contain_too_many_unknowns
{
my ($self, $sequence_obj) = @_;
my $maximum_number_of_Xs = int(($sequence_obj->length()*$self->maximum_percentage_of_unknowns)/100);
my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
if($number_of_Xs_found > $maximum_number_of_Xs)
{
{
return 1;
}
else
Expand All @@ -194,6 +198,8 @@ sub _filter_fasta_sequences
my $temp_output_file = $filename.'.tmp.filtered.fa';
my $out_fasta_obj = Bio::SeqIO->new( -file => ">".$temp_output_file, -format => 'Fasta');
my $fasta_obj = Bio::SeqIO->new( -file => $filename, -format => 'Fasta');

my $sequence_found = 0;

while(my $seq = $fasta_obj->next_seq())
{
Expand All @@ -203,7 +209,14 @@ sub _filter_fasta_sequences
}
$seq->desc(undef);
$out_fasta_obj->write_seq($seq);
$sequence_found = 1;
}

if($sequence_found == 0)
{
$self->logger->error("Could not extract any protein sequences from ".$self->gff_file.". Does the file contain the assembly as well as the annotation?");
}

# Replace the original file.
move($temp_output_file, $filename);
return 1;
Expand Down
Loading