Merge pull request #143 from andrewjpage/blast_errors

Fix input files with duplicate IDs
sanger-pathogens · Jun 8, 2015 · 8157771 · 8157771
2 parents a2a9965 + 29f760a
commit 8157771
Show file tree

Hide file tree

Showing 28 changed files with 2,178 additions and 47 deletions.
diff --git a/bin/create_pan_genome b/bin/create_pan_genome
@@ -13,7 +13,6 @@ Create a pan genome from a set of proteome FASTA files
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::CreatePanGenome;
 
 Bio::Roary::CommandLine::CreatePanGenome->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/extract_proteome_from_gff b/bin/extract_proteome_from_gff
@@ -13,7 +13,6 @@ Take in GFF files and output the proteome
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::ExtractProteomeFromGff;
 
 Bio::Roary::CommandLine::ExtractProteomeFromGff->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/iterative_cdhit b/bin/iterative_cdhit
@@ -13,7 +13,6 @@ Iteratively run cdhit
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::IterativeCdhit;
 
 Bio::Roary::CommandLine::IterativeCdhit->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/pan_genome_core_alignment b/bin/pan_genome_core_alignment
@@ -13,7 +13,6 @@ package Bio::Roary::Main::RoaryCoreAlignment;
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::RoaryCoreAlignment;
 
 Bio::Roary::CommandLine::RoaryCoreAlignment->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/pan_genome_post_analysis b/bin/pan_genome_post_analysis
@@ -13,7 +13,6 @@ Perform the post analysis on the pan genome
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::RoaryPostAnalysis;
 
 Bio::Roary::CommandLine::RoaryPostAnalysis->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/pan_genome_reorder_spreadsheet b/bin/pan_genome_reorder_spreadsheet
@@ -13,7 +13,6 @@ Take in a tree and a spreadsheet and output a reordered spreadsheet
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::RoaryReorderSpreadsheet;
 
 Bio::Roary::CommandLine::RoaryReorderSpreadsheet->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/parallel_all_against_all_blastp b/bin/parallel_all_against_all_blastp
@@ -13,7 +13,6 @@ Take in a FASTA file of proteins and blast against itself
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::ParallelAllAgainstAllBlastp;
 
 Bio::Roary::CommandLine::ParallelAllAgainstAllBlastp->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/protein_muscle_alignment_from_nucleotides b/bin/protein_muscle_alignment_from_nucleotides
@@ -13,7 +13,6 @@ Take in a multifasta file of nucleotides, convert to proteins and align with mus
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::ProteinMuscleAlignmentFromNucleotides;
 
 Bio::Roary::CommandLine::ProteinMuscleAlignmentFromNucleotides->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/query_pan_genome b/bin/query_pan_genome
@@ -13,7 +13,6 @@ Take in a groups file and the protein fasta files and output selected data
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::QueryRoary;
 
 Bio::Roary::CommandLine::QueryRoary->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/roary b/bin/roary
@@ -13,7 +13,6 @@ Create a pan genome from a set of proteome FASTA files
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::Roary;
 
 Bio::Roary::CommandLine::Roary->new(args => \@ARGV, script_name => $0)->run;
diff --git a/bin/transfer_annotation_to_groups b/bin/transfer_annotation_to_groups
@@ -13,7 +13,6 @@ Take in a groups file and a set of GFF files and transfer the consensus annotati
 
 BEGIN { unshift( @INC, '../lib' ) }
 BEGIN { unshift( @INC, './lib' ) }
-#BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
 use Bio::Roary::CommandLine::TransferAnnotationToGroups;
 
 Bio::Roary::CommandLine::TransferAnnotationToGroups->new(args => \@ARGV, script_name => $0)->run;
diff --git a/dist.ini b/dist.ini
@@ -1,5 +1,5 @@
 name    = Bio-Roary
-version = 2.3.1
+version = 2.3.2
 author  = Andrew J. Page <ap13@sanger.ac.uk>
 license = GPL_3
 copyright_holder = Wellcome Trust Sanger Institute

diff --git a/lib/Bio/Roary/CommandLine/Roary.pm b/lib/Bio/Roary/CommandLine/Roary.pm
@@ -13,6 +13,7 @@ use Getopt::Long qw(GetOptionsFromArray);
 use Bio::Roary;
 use Bio::Roary::PrepareInputFiles;
 use Bio::Roary::QC::Report;
+use Bio::Roary::ReformatInputGFFs;
 use File::Which;
 extends 'Bio::Roary::CommandLine::Common';
 
@@ -154,6 +155,16 @@ sub run {
 
     ( !$self->help ) or die $self->usage_text;
 
+    $self->logger->info("Fixing input GFF files"); 
+    my $reformat_input_files = Bio::Roary::ReformatInputGFFs->new( gff_files => $self->fasta_files, logger => $self->logger );
+	$reformat_input_files->fix_duplicate_gene_ids();
+	if(@{$reformat_input_files->fixed_gff_files} == 0)
+	{
+		$self->logger->error("All input files have been excluded from analysis. Please check you have valid GFF files, with annotation and a FASTA sequence at the end. Better still, reannotate your FASTA file with PROKKA.");
+		die();
+	}
+	$self->fasta_files($reformat_input_files->fixed_gff_files);
+
     $self->logger->info("Extracting proteins from GFF files");
     my $prepare_input_files = Bio::Roary::PrepareInputFiles->new(
         input_files           => $self->fasta_files,

diff --git a/lib/Bio/Roary/ParallelAllAgainstAllBlast.pm b/lib/Bio/Roary/ParallelAllAgainstAllBlast.pm
@@ -42,6 +42,13 @@ has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder
 
 has '_memory_required_in_mb'  => ( is => 'ro', isa => 'Int',  lazy => 1, builder => '_build__memory_required_in_mb' );
 
+
+sub BUILD {
+    my ($self) = @_;
+	$self->_makeblastdb_obj();
+}
+
+
 sub _build__blast_database {
     my ($self) = @_;
     return $self->_makeblastdb_obj->output_database;
@@ -106,6 +113,7 @@ sub run {
     my ($self) = @_;
     my @expected_output_files;
     my @commands_to_run;
+
     for my $filename ( @{ $self->_sequence_file_names } ) {
         my ( $filename_without_directory, $directories, $suffix ) = fileparse($filename);
         my $output_seq_results_file =

diff --git a/lib/Bio/Roary/ReformatInputGFFs.pm b/lib/Bio/Roary/ReformatInputGFFs.pm
@@ -0,0 +1,162 @@
+package Bio::Roary::ReformatInputGFFs;
+
+# ABSTRACT: Take in gff files and add suffix where a gene id is seen twice
+
+=head1 SYNOPSIS
+
+Take in gff files and add suffix where a gene id is seen twice
+   use Bio::Roary::ReformatInputGFFs;
+   
+   my $obj = Bio::Roary::PrepareInputFiles->new(
+     gff_files   => ['abc.gff','ddd.faa'],
+   );
+   $obj->fix_duplicate_gene_ids;
+   $obj->fixed_gff_files;
+
+=cut
+
+use Moose;
+use Bio::Roary::Exceptions;
+use Cwd;
+use Log::Log4perl qw(:easy);
+use Bio::Tools::GFF;
+use File::Path qw(make_path);
+use File::Basename;
+
+has 'gff_files'        => ( is => 'ro', isa  => 'ArrayRef', required => 1 );
+has 'logger'           => ( is => 'ro', lazy => 1,          builder  => '_build_logger' );
+has '_tags_to_filter'  => ( is => 'ro', isa  => 'Str',      default  => '(CDS|ncRNA|tRNA|tmRNA|rRNA)' );
+has 'output_directory' => ( is => 'ro', isa  => 'Str',      default  => 'fixed_input_files' );
+has 'suffix_counter'   => ( is => 'rw', isa  => 'Int',      default  => 1 );
+
+has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
+
+sub _build_logger {
+    my ($self) = @_;
+    Log::Log4perl->easy_init( level => $ERROR );
+    my $logger = get_logger();
+    return $logger;
+}
+
+sub fix_duplicate_gene_ids {
+    my ($self) = @_;
+
+    my %gene_ids_seen_before;
+    for my $file ( @{ $self->gff_files } ) {
+
+        my $ids_seen      = 0;
+        my $ids_from_file = $self->_get_ids_for_gff_file($file);
+
+        if ( @{$ids_from_file} < 1 ) {
+            $self->logger->warn(
+                "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
+            );
+        }
+        else {
+            for my $gene_id ( @{$ids_from_file} ) {
+                if ( $gene_ids_seen_before{$gene_id} ) {
+                    $self->logger->warn(
+  "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix.  New GFF in the fixed_input_files directory.  $file "
+                    );
+                    my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file);
+                    push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
+                    $ids_seen = 1;
+                    last;
+                }
+                $gene_ids_seen_before{$gene_id}++;
+            }
+            if ( $ids_seen == 0 ) {
+                push( @{ $self->fixed_gff_files }, $file );
+            }
+        }
+    }
+    return 1;
+}
+
+sub _add_suffix_to_gene_ids_and_return_new_file {
+    my ( $self, $input_file ) = @_;
+    my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
+    make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
+    my $output_file = $self->output_directory . '/' . $filename . $suffix;
+
+    open( my $input_gff_fh, $input_file );
+    open( my $out_gff_fh, '>', $output_file );
+
+    my $found_fasta = 0;
+    while (<$input_gff_fh>) {
+        my $line = $_;
+
+        if ( $line =~ /^\#\#FASTA/ ) {
+            $found_fasta = 1;
+        }
+
+        if ( $line =~ /\#/ || $found_fasta == 1 ) {
+            print {$out_gff_fh} $line;
+            next;
+        }
+
+        my @cells = split( /\t/, $line );
+        my @tags  = split( /;/,  $cells[8] );
+        my $found_id = 0;
+        for ( my $i = 0 ; $i < @tags ; $i++ ) {
+            if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
+                my $current_id = $2;
+                $current_id .= '___' . $self->suffix_counter;
+                $tags[$i] = $1 . $current_id . $3;
+                $self->suffix_counter( $self->suffix_counter + 1 );
+                $found_id++;
+                last;
+            }
+        }
+        if ( $found_id == 0 ) {
+            unshift( @tags, 'ID=id___' . $self->suffix_counter );
+            $self->suffix_counter( $self->suffix_counter + 1 );
+        }
+        $cells[8] = join( ';', @tags );
+        print {$out_gff_fh} join( "\t", @cells );
+    }
+
+    if ( $found_fasta == 0 ) {
+        $self->logger->warn(
+            "Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" );
+        return undef;
+    }
+    close($out_gff_fh);
+    close($input_gff_fh);
+    return $output_file;
+}
+
+sub _get_ids_for_gff_file {
+    my ( $self, $file ) = @_;
+    my @gene_ids;
+    my $tags_regex = $self->_tags_to_filter;
+    my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 );
+    while ( my $feature = $gffio->next_feature() ) {
+        next if !( $feature->primary_tag =~ /$tags_regex/ );
+        my $gene_id = $self->_get_feature_id($feature);
+        push( @gene_ids, $gene_id ) if ( defined($gene_id) );
+    }
+    return \@gene_ids;
+}
+
+sub _get_feature_id {
+    my ( $self, $feature ) = @_;
+    my ( $gene_id, @junk );
+    if ( $feature->has_tag('ID') ) {
+        ( $gene_id, @junk ) = $feature->get_tag_values('ID');
+    }
+    elsif ( $feature->has_tag('locus_tag') ) {
+        ( $gene_id, @junk ) = $feature->get_tag_values('locus_tag');
+    }
+    else {
+        return undef;
+    }
+    $gene_id =~ s!["']!!g;
+    return undef if ( $gene_id eq "" );
+    return $gene_id;
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
diff --git a/t/Bio/Roary/CommandLine/Roary.t b/t/Bio/Roary/CommandLine/Roary.t
@@ -28,13 +28,13 @@ system('touch empty_file');
 
       ' --dont_split_groups   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
         [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],     
-      ' -t 1 --dont_split_groups   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
+      ' -j Local -t 1 --dont_split_groups   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
         [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
       ' -j Parallel  --dont_split_groups t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
         [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],     
       ' -t 1 -j Parallel --dont_split_groups  t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
         [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
-      ' --dont_split_groups t/data/genbank_gbff/genbank1.gff t/data/genbank_gbff/genbank2.gff t/data/genbank_gbff/genbank3.gff' =>
+      ' -j Local --dont_split_groups t/data/genbank_gbff/genbank1.gff t/data/genbank_gbff/genbank2.gff t/data/genbank_gbff/genbank3.gff' =>
         [ 'gene_presence_absence.csv', 't/data/genbank_gbff/genbank_gene_presence_absence.csv' ],
       '-h' =>
         [ 'empty_file', 't/data/empty_file' ],
@@ -46,7 +46,7 @@ cleanup_files();
 
 
 %scripts_and_expected_files = (
-' --dont_split_groups   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
+' -j Local --dont_split_groups   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
   [ 'clustered_proteins', 't/data/clustered_proteins_pan_genome' ],
 ' -j Parallel --dont_split_groups  t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
   [ 'clustered_proteins', 't/data/clustered_proteins_pan_genome' ],
@@ -72,12 +72,12 @@ ok((-e 'query_5.gff.proteome.faa'),'Check protein query_5.gff.proteome.faa is no
 cleanup_files();
 
 %scripts_and_expected_files = (
-'-j Local --dont_delete_files t/data/locus_tag_gffs/query_1.gff t/data/locus_tag_gffs/query_2.gff t/data/locus_tag_gffs/query_3.gff ' =>
+'-j Local --dont_delete_files t/data/locus_tag_gffs/query_1.gff t/data/locus_tag_gffs/query_2.gff t/data/locus_tag_gffs/query_5.gff ' =>
   [  'empty_file', 't/data/empty_file'  ],
   );
 mock_execute_script_and_check_output_sorted_groups( $script_name, \%scripts_and_expected_files, [0,6,7,8,9] );
 
-for my $filename (('query_1.gff.proteome.faa','query_2.gff.proteome.faa','query_3.gff.proteome.faa')) {
+for my $filename (('query_1.gff.proteome.faa','query_2.gff.proteome.faa','query_5.gff.proteome.faa')) {
     is( read_file($filename), read_file( 't/data/locus_tag_gffs/' . $filename . '.expected' ), "content of proteome $filename as expected" );
 }