sanger-pathogens · andrewjpage · Nov 19, 2013 · Nov 13, 2013 · Nov 14, 2013 · Nov 15, 2013
diff --git a/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm b/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
@@ -51,7 +51,7 @@ sub BUILD {
     );
 
     if ( @{ $self->args } == 0 ) {
-        $self->_error_message("Error: You need to provide a FASTA file");
+        $self->_error_message("Error: You need to provide a GFF file");
     }
 
     $self->output_filename($output_filename)   if ( defined($output_filename) );

diff --git a/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm b/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm
@@ -0,0 +1,63 @@
+package Bio::PanGenome::ContigsToGeneIDsFromGFF;
+
+# ABSTRACT: Parse a GFF and efficiently extract ordered gene ids on each contig
+
+=head1 SYNOPSIS
+
+Parse a GFF and efficiently extract ordered gene ids on each contig
+   use Bio::PanGenome::ContigsToGeneIDsFromGFF;
+
+   my $obj = Bio::PanGenome::ContigsToGeneIDsFromGFF->new(
+     gff_file   => 'abc.gff'
+   );
+   $obj->contig_to_ids;
+
+=cut
+
+use Moose;
+use Bio::Tools::GFF;
+with 'Bio::PanGenome::ParseGFFAnnotationRole';
+
+has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids');
+
+# Build the mapping from contig name to the ordered list of gene IDs on it.
+# The GFF file is parsed manually because the BioPerl module is too slow.
+# Lines without an ID attribute are skipped.
+# Returns a HashRef of ArrayRefs; dies if the input pipeline cannot be opened.
+sub _build_contig_to_ids
+{
+  my ($self) = @_;
+  my %contigs_to_ids;
+
+  open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file: $!";
+  while ( my $line = <$fh> )
+  {
+    chomp $line;
+
+    # GFF3 does not require a ';' after the final attribute, so the ID is
+    # terminated by either the next ';' or the end of the line.
+    next unless $line =~ /ID=([^;]+)/;
+    my $id_name = $1;
+
+    # The first tab-separated field is used as the contig name; IDs are
+    # appended in the order they are read.
+    my ($contig) = split( /\t/, $line );
+    push( @{ $contigs_to_ids{$contig} }, $id_name );
+  }
+  close($fh);
+
+  return \%contigs_to_ids;
+}
+
+# Compose the awk command that keeps rows whose 3rd column matches the tag filter and prints columns 1 and 9.
+sub _build__awk_filter {
+    my ($self) = @_;
+    my $tag_pattern = $self->_tags_to_filter;
+    my $template    = q[awk 'BEGIN {FS="\t"};{ if ($3 ~/%s/) print $1"\t"$9;}' ];
+    return sprintf( $template, $tag_pattern );
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
diff --git a/lib/Bio/PanGenome/External/PostAnalysis.pm b/lib/Bio/PanGenome/External/PostAnalysis.pm
@@ -97,6 +97,7 @@ sub run {
     my ($self) = @_;
     my @commands_to_run;
     push( @commands_to_run, $self->_command_to_run );
+    print $self->_command_to_run."\n";
 
     my $job_runner_obj = $self->_job_runner_class->new(
         commands_to_run => \@commands_to_run,

diff --git a/lib/Bio/PanGenome/GeneNamesFromGFF.pm b/lib/Bio/PanGenome/GeneNamesFromGFF.pm
@@ -16,12 +16,7 @@ Parse a GFF and efficiently extract ID -> Gene Name
 
 use Moose;
 use Bio::Tools::GFF;
-
-has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
-
-has '_tags_to_filter' => ( is => 'ro', isa => 'Str',             default => 'CDS' );
-has '_gff_parser'     => ( is => 'ro', isa => 'Bio::Tools::GFF', lazy    => 1, builder => '_build__gff_parser' );
-has '_awk_filter'     => ( is => 'ro', isa => 'Str',             lazy    => 1, builder => '_build__awk_filter' );
+with 'Bio::PanGenome::ParseGFFAnnotationRole';
 
 has 'ids_to_gene_name' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_ids_to_gene_name' );
 has 'ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
@@ -65,21 +60,6 @@ sub _build_ids_to_gene_name {
     return \%id_to_gene_name;
 }
 
-
-
-sub _gff_fh_input_string {
-    my ($self) = @_;
-    return 'sed -n \'/##gff-version 3/,/##FASTA/p\' '.$self->gff_file.'| grep -v \'##FASTA\''." | " .  $self->_awk_filter;
-}
-
-sub _build__awk_filter {
-    my ($self) = @_;
-    return
-        'awk \'BEGIN {FS="\t"};{ if ($3 ~/'
-      . $self->_tags_to_filter
-      . '/) print $9;}\' ';
-}
-
 no Moose;
 __PACKAGE__->meta->make_immutable;
 

diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm
@@ -26,6 +26,7 @@ use Bio::PanGenome::AnnotateGroups;
 has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnnotateGroups', required => 1 );
 has 'analyse_groups_obj'  => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups',  required => 1 );
 has 'output_filename'     => ( is => 'ro', isa => 'Str',                            default  => 'group_statitics.csv' );
+has 'groups_to_contigs'   => ( is => 'ro', isa => 'Maybe[HashRef]');
 
 has '_output_fh'         => ( is => 'ro', lazy => 1,           builder => '_build__output_fh' );
 has '_text_csv_obj'      => ( is => 'ro', isa  => 'Text::CSV', lazy    => 1, builder => '_build__text_csv_obj' );
@@ -49,7 +50,7 @@ sub _build__text_csv_obj {
 sub fixed_headers {
     my ($self) = @_;
     my @header =
-      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate' );
+      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'Accessory Fragement','Accessory Order with Fragment', 'QC' );
     return \@header;
 }
 
@@ -130,10 +131,25 @@ sub _row {
     my $annotated_group_name = $self->annotate_groups_obj->_groups_to_consensus_gene_names->{$group};
 
     my $duplicate_gene_name = $self->_non_unique_name_for_group($annotated_group_name);
-
+
+    my $genome_number = '';
+    my $qc_comment = '';
+    my $order_within_fragement = '';
+    my $accessory_order_within_fragement = '';
+    my $accessory_genome_number = '';
+    if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name}))
+    {
+      $genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label};
+      $qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment};
+      $order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{order};
+
+      $accessory_genome_number = $self->groups_to_contigs->{$annotated_group_name}->{accessory_label};
+      $accessory_order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{accessory_order};
+    }
+
     my @row = (
         $annotated_group_name,  $duplicate_gene_name,    $annotation,
-        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate
+        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$accessory_genome_number,$accessory_order_within_fragement,$qc_comment
     );
 
     for my $filename ( @{ $self->_sorted_file_names } ) {