Skip to content

Commit

Permalink
Merge pull request #143 from andrewjpage/blast_errors
Browse files Browse the repository at this point in the history
Fix input files with duplicate IDs
  • Loading branch information
andrewjpage committed Jun 8, 2015
2 parents a2a9965 + 29f760a commit 8157771
Show file tree
Hide file tree
Showing 28 changed files with 2,178 additions and 47 deletions.
1 change: 0 additions & 1 deletion bin/create_pan_genome
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Create a pan genome from a set of proteome FASTA files

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::CreatePanGenome;

Bio::Roary::CommandLine::CreatePanGenome->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/extract_proteome_from_gff
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Take in GFF files and output the proteome

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::ExtractProteomeFromGff;

Bio::Roary::CommandLine::ExtractProteomeFromGff->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/iterative_cdhit
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Iteratively run cdhit

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::IterativeCdhit;

Bio::Roary::CommandLine::IterativeCdhit->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/pan_genome_core_alignment
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ package Bio::Roary::Main::RoaryCoreAlignment;

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::RoaryCoreAlignment;

Bio::Roary::CommandLine::RoaryCoreAlignment->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/pan_genome_post_analysis
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Perform the post analysis on the pan genome

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::RoaryPostAnalysis;

Bio::Roary::CommandLine::RoaryPostAnalysis->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/pan_genome_reorder_spreadsheet
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Take in a tree and a spreadsheet and output a reordered spreadsheet

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::RoaryReorderSpreadsheet;

Bio::Roary::CommandLine::RoaryReorderSpreadsheet->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/parallel_all_against_all_blastp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Take in a FASTA file of proteins and blast against itself

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::ParallelAllAgainstAllBlastp;

Bio::Roary::CommandLine::ParallelAllAgainstAllBlastp->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/protein_muscle_alignment_from_nucleotides
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Take in a multifasta file of nucleotides, convert to proteins and align with mus

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::ProteinMuscleAlignmentFromNucleotides;

Bio::Roary::CommandLine::ProteinMuscleAlignmentFromNucleotides->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/query_pan_genome
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Take in a groups file and the protein fasta files and output selected data

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::QueryRoary;

Bio::Roary::CommandLine::QueryRoary->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/roary
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Create a pan genome from a set of proteome FASTA files

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::Roary;

Bio::Roary::CommandLine::Roary->new(args => \@ARGV, script_name => $0)->run;
1 change: 0 additions & 1 deletion bin/transfer_annotation_to_groups
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Take in a groups file and a set of GFF files and transfer the consensus annotati

BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::Roary::CommandLine::TransferAnnotationToGroups;

Bio::Roary::CommandLine::TransferAnnotationToGroups->new(args => \@ARGV, script_name => $0)->run;
2 changes: 1 addition & 1 deletion dist.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = Bio-Roary
version = 2.3.1
version = 2.3.2
author = Andrew J. Page <ap13@sanger.ac.uk>
license = GPL_3
copyright_holder = Wellcome Trust Sanger Institute
Expand Down
11 changes: 11 additions & 0 deletions lib/Bio/Roary/CommandLine/Roary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use Getopt::Long qw(GetOptionsFromArray);
use Bio::Roary;
use Bio::Roary::PrepareInputFiles;
use Bio::Roary::QC::Report;
use Bio::Roary::ReformatInputGFFs;
use File::Which;
extends 'Bio::Roary::CommandLine::Common';

Expand Down Expand Up @@ -154,6 +155,16 @@ sub run {

( !$self->help ) or die $self->usage_text;

$self->logger->info("Fixing input GFF files");
my $reformat_input_files = Bio::Roary::ReformatInputGFFs->new( gff_files => $self->fasta_files, logger => $self->logger );
$reformat_input_files->fix_duplicate_gene_ids();
if(@{$reformat_input_files->fixed_gff_files} == 0)
{
$self->logger->error("All input files have been excluded from analysis. Please check you have valid GFF files, with annotation and a FASTA sequence at the end. Better still, reannotate your FASTA file with PROKKA.");
die();
}
$self->fasta_files($reformat_input_files->fixed_gff_files);

$self->logger->info("Extracting proteins from GFF files");
my $prepare_input_files = Bio::Roary::PrepareInputFiles->new(
input_files => $self->fasta_files,
Expand Down
8 changes: 8 additions & 0 deletions lib/Bio/Roary/ParallelAllAgainstAllBlast.pm
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder

has '_memory_required_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__memory_required_in_mb' );


sub BUILD {
my ($self) = @_;
$self->_makeblastdb_obj();
}


sub _build__blast_database {
my ($self) = @_;
return $self->_makeblastdb_obj->output_database;
Expand Down Expand Up @@ -106,6 +113,7 @@ sub run {
my ($self) = @_;
my @expected_output_files;
my @commands_to_run;

for my $filename ( @{ $self->_sequence_file_names } ) {
my ( $filename_without_directory, $directories, $suffix ) = fileparse($filename);
my $output_seq_results_file =
Expand Down
162 changes: 162 additions & 0 deletions lib/Bio/Roary/ReformatInputGFFs.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package Bio::Roary::ReformatInputGFFs;

# ABSTRACT: Take in gff files and add suffix where a gene id is seen twice

=head1 SYNOPSIS
Take in gff files and add suffix where a gene id is seen twice
use Bio::Roary::ReformatInputGFFs;
my $obj = Bio::Roary::PrepareInputFiles->new(
gff_files => ['abc.gff','ddd.faa'],
);
$obj->fix_duplicate_gene_ids;
$obj->fixed_gff_files;
=cut

use Moose;
use Bio::Roary::Exceptions;
use Cwd;
use Log::Log4perl qw(:easy);
use Bio::Tools::GFF;
use File::Path qw(make_path);
use File::Basename;

has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS|ncRNA|tRNA|tmRNA|rRNA)' );
has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' );
has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 );

has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );

sub _build_logger {
my ($self) = @_;
Log::Log4perl->easy_init( level => $ERROR );
my $logger = get_logger();
return $logger;
}

sub fix_duplicate_gene_ids {
my ($self) = @_;

my %gene_ids_seen_before;
for my $file ( @{ $self->gff_files } ) {

my $ids_seen = 0;
my $ids_from_file = $self->_get_ids_for_gff_file($file);

if ( @{$ids_from_file} < 1 ) {
$self->logger->warn(
"Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
);
}
else {
for my $gene_id ( @{$ids_from_file} ) {
if ( $gene_ids_seen_before{$gene_id} ) {
$self->logger->warn(
"Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix. New GFF in the fixed_input_files directory. $file "
);
my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file);
push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
$ids_seen = 1;
last;
}
$gene_ids_seen_before{$gene_id}++;
}
if ( $ids_seen == 0 ) {
push( @{ $self->fixed_gff_files }, $file );
}
}
}
return 1;
}

sub _add_suffix_to_gene_ids_and_return_new_file {
my ( $self, $input_file ) = @_;
my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
my $output_file = $self->output_directory . '/' . $filename . $suffix;

open( my $input_gff_fh, $input_file );
open( my $out_gff_fh, '>', $output_file );

my $found_fasta = 0;
while (<$input_gff_fh>) {
my $line = $_;

if ( $line =~ /^\#\#FASTA/ ) {
$found_fasta = 1;
}

if ( $line =~ /\#/ || $found_fasta == 1 ) {
print {$out_gff_fh} $line;
next;
}

my @cells = split( /\t/, $line );
my @tags = split( /;/, $cells[8] );
my $found_id = 0;
for ( my $i = 0 ; $i < @tags ; $i++ ) {
if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
my $current_id = $2;
$current_id .= '___' . $self->suffix_counter;
$tags[$i] = $1 . $current_id . $3;
$self->suffix_counter( $self->suffix_counter + 1 );
$found_id++;
last;
}
}
if ( $found_id == 0 ) {
unshift( @tags, 'ID=id___' . $self->suffix_counter );
$self->suffix_counter( $self->suffix_counter + 1 );
}
$cells[8] = join( ';', @tags );
print {$out_gff_fh} join( "\t", @cells );
}

if ( $found_fasta == 0 ) {
$self->logger->warn(
"Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" );
return undef;
}
close($out_gff_fh);
close($input_gff_fh);
return $output_file;
}

sub _get_ids_for_gff_file {
my ( $self, $file ) = @_;
my @gene_ids;
my $tags_regex = $self->_tags_to_filter;
my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 );
while ( my $feature = $gffio->next_feature() ) {
next if !( $feature->primary_tag =~ /$tags_regex/ );
my $gene_id = $self->_get_feature_id($feature);
push( @gene_ids, $gene_id ) if ( defined($gene_id) );
}
return \@gene_ids;
}

sub _get_feature_id {
my ( $self, $feature ) = @_;
my ( $gene_id, @junk );
if ( $feature->has_tag('ID') ) {
( $gene_id, @junk ) = $feature->get_tag_values('ID');
}
elsif ( $feature->has_tag('locus_tag') ) {
( $gene_id, @junk ) = $feature->get_tag_values('locus_tag');
}
else {
return undef;
}
$gene_id =~ s!["']!!g;
return undef if ( $gene_id eq "" );
return $gene_id;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;
10 changes: 5 additions & 5 deletions t/Bio/Roary/CommandLine/Roary.t
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ system('touch empty_file');

' --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
[ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
' -t 1 --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
' -j Local -t 1 --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
[ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
' -j Parallel --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
[ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
' -t 1 -j Parallel --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
[ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
' --dont_split_groups t/data/genbank_gbff/genbank1.gff t/data/genbank_gbff/genbank2.gff t/data/genbank_gbff/genbank3.gff' =>
' -j Local --dont_split_groups t/data/genbank_gbff/genbank1.gff t/data/genbank_gbff/genbank2.gff t/data/genbank_gbff/genbank3.gff' =>
[ 'gene_presence_absence.csv', 't/data/genbank_gbff/genbank_gene_presence_absence.csv' ],
'-h' =>
[ 'empty_file', 't/data/empty_file' ],
Expand All @@ -46,7 +46,7 @@ cleanup_files();


%scripts_and_expected_files = (
' --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
' -j Local --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
[ 'clustered_proteins', 't/data/clustered_proteins_pan_genome' ],
' -j Parallel --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
[ 'clustered_proteins', 't/data/clustered_proteins_pan_genome' ],
Expand All @@ -72,12 +72,12 @@ ok((-e 'query_5.gff.proteome.faa'),'Check protein query_5.gff.proteome.faa is no
cleanup_files();

%scripts_and_expected_files = (
'-j Local --dont_delete_files t/data/locus_tag_gffs/query_1.gff t/data/locus_tag_gffs/query_2.gff t/data/locus_tag_gffs/query_3.gff ' =>
'-j Local --dont_delete_files t/data/locus_tag_gffs/query_1.gff t/data/locus_tag_gffs/query_2.gff t/data/locus_tag_gffs/query_5.gff ' =>
[ 'empty_file', 't/data/empty_file' ],
);
mock_execute_script_and_check_output_sorted_groups( $script_name, \%scripts_and_expected_files, [0,6,7,8,9] );

for my $filename (('query_1.gff.proteome.faa','query_2.gff.proteome.faa','query_3.gff.proteome.faa')) {
for my $filename (('query_1.gff.proteome.faa','query_2.gff.proteome.faa','query_5.gff.proteome.faa')) {
is( read_file($filename), read_file( 't/data/locus_tag_gffs/' . $filename . '.expected' ), "content of proteome $filename as expected" );
}

Expand Down
Loading

0 comments on commit 8157771

Please sign in to comment.