From 10422bc979248541b0b956ce393e298528a2a4e4 Mon Sep 17 00:00:00 2001
From: andrewjpage <andrewjpage@gmail.com>
Date: Wed, 13 Nov 2013 13:24:27 +0000
Subject: [PATCH 1/5] create the list of genes and their links

---
 lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm | 63 +++++++++++++++++
 lib/Bio/PanGenome/GeneNamesFromGFF.pm        | 22 +-----
 lib/Bio/PanGenome/OrderGenes.pm              | 71 ++++++++++++++++++++
 lib/Bio/PanGenome/ParseGFFAnnotationRole.pm  | 32 +++++++++
 lib/Bio/PanGenome/PostAnalysis.pm            |  7 ++
 t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t    | 22 ++++++
 t/Bio/PanGenome/OrderGenes.t                 | 27 ++++++++
 7 files changed, 223 insertions(+), 21 deletions(-)
 create mode 100644 lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm
 create mode 100644 lib/Bio/PanGenome/OrderGenes.pm
 create mode 100644 lib/Bio/PanGenome/ParseGFFAnnotationRole.pm
 create mode 100644 t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t
 create mode 100644 t/Bio/PanGenome/OrderGenes.t

diff --git a/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm b/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm
new file mode 100644
index 0000000..61b75ba
--- /dev/null
+++ b/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm
@@ -0,0 +1,63 @@
+package Bio::PanGenome::ContigsToGeneIDsFromGFF;
+
+# ABSTRACT: Parse a GFF and efficiently and extract ordered gene ids on each contig
+
+=head1 SYNOPSIS
+
+Parse a GFF and efficiently and extract ordered gene ids on each contig
+   use Bio::PanGenome::ContigsToGeneIDsFromGFF;
+   
+   my $obj = Bio::PanGenome::ContigsToGeneIDsFromGFF->new(
+     gff_file   => 'abc.gff'
+   );
+   $obj->contig_to_ids;
+
+=cut
+
+use Moose;
+use Bio::Tools::GFF;
+with 'Bio::PanGenome::ParseGFFAnnotationRole';
+
+has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids');
+
+# Manually parse the GFF file because the BioPerl module is too slow
+sub _build_contig_to_ids
+{
+  my ($self) = @_;
+  my %contigs_to_ids;
+  
+  open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file";
+  while(<$fh>)
+  {
+    chomp;
+    my $line = $_;   
+    my $id_name;
+    if($line =~/ID=([^;]+);/)
+    {
+      $id_name= $1;
+    }
+    else
+    {
+      next;
+    }
+    
+    my @annotation_elements = split(/\t/,$line);
+    # Map gene IDs to the contig
+    push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name);
+  }
+  close($fh);
+  return \%contigs_to_ids;
+}
+
+sub _build__awk_filter {
+    my ($self) = @_;
+    return
+        'awk \'BEGIN {FS="\t"};{ if ($3 ~/'
+      . $self->_tags_to_filter
+      . '/) print $1"\t"$9;}\' ';
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
diff --git a/lib/Bio/PanGenome/GeneNamesFromGFF.pm b/lib/Bio/PanGenome/GeneNamesFromGFF.pm
index ad695ae..f5a38b3 100644
--- a/lib/Bio/PanGenome/GeneNamesFromGFF.pm
+++ b/lib/Bio/PanGenome/GeneNamesFromGFF.pm
@@ -16,12 +16,7 @@ Parse a GFF and efficiently extract ID -> Gene Name
 
 use Moose;
 use Bio::Tools::GFF;
-
-has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
-
-has '_tags_to_filter' => ( is => 'ro', isa => 'Str',             default => 'CDS' );
-has '_gff_parser'     => ( is => 'ro', isa => 'Bio::Tools::GFF', lazy    => 1, builder => '_build__gff_parser' );
-has '_awk_filter'     => ( is => 'ro', isa => 'Str',             lazy    => 1, builder => '_build__awk_filter' );
+with 'Bio::PanGenome::ParseGFFAnnotationRole';
 
 has 'ids_to_gene_name' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_ids_to_gene_name' );
 has 'ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
@@ -65,21 +60,6 @@ sub _build_ids_to_gene_name {
     return \%id_to_gene_name;
 }
 
-
-
-sub _gff_fh_input_string {
-    my ($self) = @_;
-    return 'sed -n \'/##gff-version 3/,/##FASTA/p\' '.$self->gff_file.'| grep -v \'##FASTA\''." | " .  $self->_awk_filter;
-}
-
-sub _build__awk_filter {
-    my ($self) = @_;
-    return
-        'awk \'BEGIN {FS="\t"};{ if ($3 ~/'
-      . $self->_tags_to_filter
-      . '/) print $9;}\' ';
-}
-
 no Moose;
 __PACKAGE__->meta->make_immutable;
 
diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm
new file mode 100644
index 0000000..0f98472
--- /dev/null
+++ b/lib/Bio/PanGenome/OrderGenes.pm
@@ -0,0 +1,71 @@
+package Bio::PanGenome::OrderGenes;
+
+# ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes
+
+=head1 SYNOPSIS
+
+Take in the analyse groups and create a matrix of what genes are beside what other genes
+   use Bio::PanGenome::OrderGenes;
+   
+   my $obj = Bio::PanGenome::OrderGenes->new(
+     analyse_groups_obj => $analyse_groups_obj,
+     gff_files => ['file1.gff','file2.gff']
+   );
+   $obj->create_matrix;
+
+=cut
+
+use Moose;
+use Bio::PanGenome::Exceptions;
+use Bio::PanGenome::AnalyseGroups;
+use Bio::PanGenome::ContigsToGeneIDsFromGFF;
+
+has 'gff_files'           => ( is => 'ro', isa => 'ArrayRef',  required => 1 );
+has 'analyse_groups_obj'  => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups',  required => 1 );
+has 'group_order'         => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_group_order');
+
+sub _build_group_order
+{
+  my ($self) = @_;
+  my %group_order;
+  
+  # Open each GFF file
+  for my $filename (@{$self->gff_files})
+  {
+    my $contigs_to_ids_obj = Bio::PanGenome::ContigsToGeneIDsFromGFF->new(gff_file   => $filename);
+    
+    # Loop over each contig in the GFF file
+    for my $contig_name (keys %{$contigs_to_ids_obj->contig_to_ids})
+    {
+      my @groups_on_contig;
+      # loop over each gene in each contig in the GFF file
+      for my $gene_id (@{$contigs_to_ids_obj->contig_to_ids->{$contig_name}})
+      {
+        # convert to group name
+        my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id};
+        next unless(defined($group_name));
+        push(@groups_on_contig, $group_name);
+      }
+      
+      for(my $i = 1; $i < @groups_on_contig; $i++)
+      {
+        my $group_from = $groups_on_contig[$i -1];
+        my $group_to = $groups_on_contig[$i];
+        $group_order{$group_from}{$group_to}++;
+        # TODO: remove because you only need half the matix
+        $group_order{$group_to}{$group_from}++;
+      }
+    }
+  }
+  return \%group_order;
+}
+
+sub create_matrix
+{
+  my ($self) = @_;
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
diff --git a/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm
new file mode 100644
index 0000000..f67161a
--- /dev/null
+++ b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm
@@ -0,0 +1,32 @@
+package Bio::PanGenome::ParseGFFAnnotationRole;
+
+# ABSTRACT: A role for parsing a gff file efficiently
+
+=head1 SYNOPSIS
+
+with 'Bio::PanGenome::ParseGFFAnnotationRole';
+
+=cut
+use Moose::Role;
+use Bio::Tools::GFF;
+
+has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
+
+has '_tags_to_filter' => ( is => 'ro', isa => 'Str',             default => 'CDS' );
+has '_gff_parser'     => ( is => 'ro', isa => 'Bio::Tools::GFF', lazy    => 1, builder => '_build__gff_parser' );
+has '_awk_filter'     => ( is => 'ro', isa => 'Str',             lazy    => 1, builder => '_build__awk_filter' );
+
+sub _gff_fh_input_string {
+    my ($self) = @_;
+    return 'sed -n \'/##gff-version 3/,/##FASTA/p\' '.$self->gff_file.'| grep -v \'##FASTA\''." | " .  $self->_awk_filter;
+}
+
+sub _build__awk_filter {
+    my ($self) = @_;
+    return
+        'awk \'BEGIN {FS="\t"};{ if ($3 ~/'
+      . $self->_tags_to_filter
+      . '/) print $9;}\' ';
+}
+
+1;
diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm
index 3a226d9..4661afb 100644
--- a/lib/Bio/PanGenome/PostAnalysis.pm
+++ b/lib/Bio/PanGenome/PostAnalysis.pm
@@ -17,6 +17,7 @@ use Bio::PanGenome::Output::OneGenePerGroupFasta;
 use Bio::PanGenome::GroupStatistics;
 use Bio::PanGenome::Output::GroupsMultifastasNucleotide;
 use Bio::PanGenome::Output::NumberOfGroups;
+use Bio::PanGenome::OrderGenes;
 
 has 'fasta_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
 has 'input_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
@@ -64,6 +65,12 @@ sub run {
     );
     $analyse_groups_obj->create_plots();
     
+    my $order_genes_obj = Bio::PanGenome::OrderGenes->new(
+      analyse_groups_obj => $analyse_groups_obj,
+      gff_files => $self->input_files,
+    );
+    $order_genes_obj->group_order;
+    
 
     my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new(
         analyse_groups  => $analyse_groups_obj,
diff --git a/t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t b/t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t
new file mode 100644
index 0000000..3b6273e
--- /dev/null
+++ b/t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t
@@ -0,0 +1,22 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Data::Dumper;
+use File::Slurp;
+
+BEGIN { unshift( @INC, './lib' ) }
+
+BEGIN {
+    use Test::Most;
+    use_ok('Bio::PanGenome::ContigsToGeneIDsFromGFF');
+}
+
+
+ok(my $obj = Bio::PanGenome::ContigsToGeneIDsFromGFF->new(
+  gff_file   => 't/data/query_1.gff'
+),'Initialise contigs to gene ids obj');
+
+print Dumper $obj->contig_to_ids;
+
+
+done_testing();
diff --git a/t/Bio/PanGenome/OrderGenes.t b/t/Bio/PanGenome/OrderGenes.t
new file mode 100644
index 0000000..0206962
--- /dev/null
+++ b/t/Bio/PanGenome/OrderGenes.t
@@ -0,0 +1,27 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Data::Dumper;
+use File::Slurp;
+
+BEGIN { unshift( @INC, './lib' ) }
+
+BEGIN {
+    use Test::Most;
+    use_ok('Bio::PanGenome::OrderGenes');
+    use Bio::PanGenome::AnalyseGroups;
+}
+
+my $analyse_groups = Bio::PanGenome::AnalyseGroups->new(
+    fasta_files     => ['t/data/query_1.fa','t/data/query_2.fa','t/data/query_3.fa'],
+    groups_filename => 't/data/query_groups'
+);
+
+ok(my $obj = Bio::PanGenome::OrderGenes->new(
+  analyse_groups_obj => $analyse_groups,
+  gff_files   => ['t/data/query_1.gff','t/data/query_2.gff','t/data/query_3.gff'],
+),'Initialise order genes object');
+
+print Dumper $obj->group_order;
+
+done_testing();

From d5472dee24063b2c2a2047bca6593688acd23171 Mon Sep 17 00:00:00 2001
From: andrewjpage <andrewjpage@gmail.com>
Date: Thu, 14 Nov 2013 13:51:56 +0000
Subject: [PATCH 2/5] breath first search

---
 .../PanGenome/CommandLine/CreatePanGenome.pm  |  2 +-
 lib/Bio/PanGenome/External/PostAnalysis.pm    |  1 +
 lib/Bio/PanGenome/GroupStatistics.pm          | 15 ++-
 lib/Bio/PanGenome/OrderGenes.pm               | 91 ++++++++++++++++++-
 lib/Bio/PanGenome/ParseGFFAnnotationRole.pm   |  2 +-
 lib/Bio/PanGenome/PostAnalysis.pm             | 27 +++---
 6 files changed, 117 insertions(+), 21 deletions(-)

diff --git a/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm b/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
index aaf5958..f5b673b 100644
--- a/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
+++ b/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
@@ -51,7 +51,7 @@ sub BUILD {
     );
     
     if ( @{ $self->args } == 0 ) {
-        $self->_error_message("Error: You need to provide a FASTA file");
+        $self->_error_message("Error: You need to provide a GFF file");
     }
 
     $self->output_filename($output_filename)   if ( defined($output_filename) );
diff --git a/lib/Bio/PanGenome/External/PostAnalysis.pm b/lib/Bio/PanGenome/External/PostAnalysis.pm
index 8cd7b9a..68380a8 100644
--- a/lib/Bio/PanGenome/External/PostAnalysis.pm
+++ b/lib/Bio/PanGenome/External/PostAnalysis.pm
@@ -97,6 +97,7 @@ sub run {
     my ($self) = @_;
     my @commands_to_run;
     push( @commands_to_run, $self->_command_to_run );
+    print $self->_command_to_run."\n";
 
     my $job_runner_obj = $self->_job_runner_class->new(
         commands_to_run => \@commands_to_run,
diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm
index 5671c90..8d70704 100644
--- a/lib/Bio/PanGenome/GroupStatistics.pm
+++ b/lib/Bio/PanGenome/GroupStatistics.pm
@@ -26,6 +26,7 @@ use Bio::PanGenome::AnnotateGroups;
 has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnnotateGroups', required => 1 );
 has 'analyse_groups_obj'  => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups',  required => 1 );
 has 'output_filename'     => ( is => 'ro', isa => 'Str',                            default  => 'group_statitics.csv' );
+has 'groups_to_contigs'   => ( is => 'ro', isa => 'Maybe[HashRef]');
 
 has '_output_fh'         => ( is => 'ro', lazy => 1,           builder => '_build__output_fh' );
 has '_text_csv_obj'      => ( is => 'ro', isa  => 'Text::CSV', lazy    => 1, builder => '_build__text_csv_obj' );
@@ -49,7 +50,7 @@ sub _build__text_csv_obj {
 sub fixed_headers {
     my ($self) = @_;
     my @header =
-      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate' );
+      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome', 'QC' );
     return \@header;
 }
 
@@ -130,10 +131,18 @@ sub _row {
     my $annotated_group_name = $self->annotate_groups_obj->_groups_to_consensus_gene_names->{$group};
 
     my $duplicate_gene_name = $self->_non_unique_name_for_group($annotated_group_name);
-
+    
+    my $genome_number = '';
+    my $qc_comment = '';
+    if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name}))
+    {
+      $genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label};
+      $qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment};
+    }
+    
     my @row = (
         $annotated_group_name,  $duplicate_gene_name,    $annotation,
-        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate
+        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$qc_comment
     );
 
     for my $filename ( @{ $self->_sorted_file_names } ) {
diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm
index 0f98472..cb28b4f 100644
--- a/lib/Bio/PanGenome/OrderGenes.pm
+++ b/lib/Bio/PanGenome/OrderGenes.pm
@@ -11,7 +11,7 @@ Take in the analyse groups and create a matrix of what genes are beside what oth
      analyse_groups_obj => $analyse_groups_obj,
      gff_files => ['file1.gff','file2.gff']
    );
-   $obj->create_matrix;
+   $obj->groups_to_contigs;
 
 =cut
 
@@ -19,10 +19,26 @@ use Moose;
 use Bio::PanGenome::Exceptions;
 use Bio::PanGenome::AnalyseGroups;
 use Bio::PanGenome::ContigsToGeneIDsFromGFF;
+use Boost::Graph;
 
 has 'gff_files'           => ( is => 'ro', isa => 'ArrayRef',  required => 1 );
 has 'analyse_groups_obj'  => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups',  required => 1 );
 has 'group_order'         => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_group_order');
+has 'group_graphs'        => ( is => 'ro', isa => 'Boost::Graph',  lazy => 1, builder => '_build_group_graphs');
+has 'groups_to_contigs'        => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_groups_to_contigs');
+
+has '_groups'             => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_groups');
+
+sub _build_groups
+{
+  my ($self) = @_;
+  my %groups;
+  for my $group_name (@{$self->analyse_groups_obj->_groups})
+  {
+    $groups{$group_name}++;
+  }
+  return \%groups;
+}
 
 sub _build_group_order
 {
@@ -57,14 +73,83 @@ sub _build_group_order
       }
     }
   }
+
   return \%group_order;
 }
 
-sub create_matrix
+sub _build_group_graphs
 {
-  my ($self) = @_;
+  my($self) = @_;
+  return  Boost::Graph->new(directed=>0);
+}
+
+
+sub _add_groups_to_graph
+{
+  my($self) = @_;
+
+  for my $current_group (keys %{$self->group_order()})
+  {
+    for my $group_to (keys %{$self->group_order->{$current_group}})
+    {
+      my $weight = 1.0/ ($self->group_order->{$current_group}->{$group_to}) ;
+      $self->group_graphs->add_edge(node1=>$current_group, node2=>$group_to, weight=>$weight);
+    }
+  }
+
+}
+
+
+sub _get_connected_nodes
+{
+  my($self) = @_;
+  
+  my @all_groups = keys %{$self->_groups};
+  my $search_group = $all_groups[0];
+  
+  my $connected_groups = $self->group_graphs->breadth_first_search($search_group);
+  for my $group_name (@{$connected_groups })
+  {
+    delete($self->_groups->{$group_name});
+  }
+  if(defined($self->_groups->{$search_group}))
+  {
+    delete($self->_groups->{$search_group});
+    push(@{$connected_groups},$search_group );
+  }
+
+  return  $connected_groups;
+}
+
+sub _build_groups_to_contigs
+{
+  my($self) = @_;
+  $self->_add_groups_to_graph;
+
+  my %groups_to_contigs;
+  my $counter = 1;
+  while((keys %{$self->_groups} ) > 0)
+  {
+    my $contig_groups = $self->_get_connected_nodes;
+    for my $group_name (@{$contig_groups})
+    {
+      $groups_to_contigs{$group_name}{label} = $counter;
+      $groups_to_contigs{$group_name}{comment} = '';
+      if(@{$contig_groups} == 1)
+      {
+        $groups_to_contigs{$group_name}{comment} = 'Contamination';
+      }
+    }
+    $counter++;
+  }
+  
+  $self->group_graphs->connected_components;
+
+
+  return \%groups_to_contigs;
 }
 
+
 no Moose;
 __PACKAGE__->meta->make_immutable;
 
diff --git a/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm
index f67161a..154ec92 100644
--- a/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm
+++ b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm
@@ -12,7 +12,7 @@ use Bio::Tools::GFF;
 
 has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
 
-has '_tags_to_filter' => ( is => 'ro', isa => 'Str',             default => 'CDS' );
+has '_tags_to_filter' => ( is => 'ro', isa => 'Str',             default => '(CDS|ncRNA|tRNA|tmRNA)' );
 has '_gff_parser'     => ( is => 'ro', isa => 'Bio::Tools::GFF', lazy    => 1, builder => '_build__gff_parser' );
 has '_awk_filter'     => ( is => 'ro', isa => 'Str',             lazy    => 1, builder => '_build__awk_filter' );
 
diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm
index 4661afb..51f6329 100644
--- a/lib/Bio/PanGenome/PostAnalysis.pm
+++ b/lib/Bio/PanGenome/PostAnalysis.pm
@@ -69,7 +69,7 @@ sub run {
       analyse_groups_obj => $analyse_groups_obj,
       gff_files => $self->input_files,
     );
-    $order_genes_obj->group_order;
+   $order_genes_obj->groups_to_contigs;
     
 
     my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new(
@@ -81,7 +81,8 @@ sub run {
     my $group_statistics = Bio::PanGenome::GroupStatistics->new(
         output_filename     => $self->output_statistics_filename,
         annotate_groups_obj => $annotate_groups,
-        analyse_groups_obj  => $analyse_groups_obj
+        analyse_groups_obj  => $analyse_groups_obj,
+        groups_to_contigs   => $order_genes_obj->groups_to_contigs
     );
     $group_statistics->create_spreadsheet;
     
@@ -100,17 +101,17 @@ sub run {
       $group_multifastas_nucleotides->create_files();
     }
 
-    unlink($output_mcl_filename);
-    unlink($output_inflate_clusters_filename);
-    unlink($output_group_labels_filename);
-    unlink($output_combined_filename);
-    unlink( $self->clusters_filename);
-    unlink( $self->clusters_filename . '.clstr' );
-    unlink( $self->clusters_filename . '.bak.clstr' );
-    unlink('_gff_files');
-    unlink('_fasta_files');
-    unlink('_clustered_filtered.fa');
-    unlink($input_cd_hit_groups_file);
+   # unlink($output_mcl_filename);
+   # unlink($output_inflate_clusters_filename);
+   # unlink($output_group_labels_filename);
+   # unlink($output_combined_filename);
+   # unlink( $self->clusters_filename);
+   # unlink( $self->clusters_filename . '.clstr' );
+   # unlink( $self->clusters_filename . '.bak.clstr' );
+   # unlink('_gff_files');
+   # unlink('_fasta_files');
+   # unlink('_clustered_filtered.fa');
+   # unlink($input_cd_hit_groups_file);
 
 }
 

From 94e8819824383999f6347a41d623a6e6cc391bea Mon Sep 17 00:00:00 2001
From: andrewjpage <andrewjpage@gmail.com>
Date: Fri, 15 Nov 2013 09:25:44 +0000
Subject: [PATCH 3/5] boost

---
 lib/Bio/PanGenome/GroupStatistics.pm |   6 +-
 lib/Bio/PanGenome/OrderGenes.pm      | 135 +++++++++++++++++++--------
 2 files changed, 101 insertions(+), 40 deletions(-)

diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm
index 8d70704..2bf5491 100644
--- a/lib/Bio/PanGenome/GroupStatistics.pm
+++ b/lib/Bio/PanGenome/GroupStatistics.pm
@@ -50,7 +50,7 @@ sub _build__text_csv_obj {
 sub fixed_headers {
     my ($self) = @_;
     my @header =
-      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome', 'QC' );
+      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'QC' );
     return \@header;
 }
 
@@ -134,15 +134,17 @@ sub _row {
     
     my $genome_number = '';
     my $qc_comment = '';
+    my $order_within_fragement = '';
     if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name}))
     {
       $genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label};
       $qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment};
+      $order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{order};
     }
     
     my @row = (
         $annotated_group_name,  $duplicate_gene_name,    $annotation,
-        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$qc_comment
+        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$qc_comment
     );
 
     for my $filename ( @{ $self->_sorted_file_names } ) {
diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm
index cb28b4f..849e117 100644
--- a/lib/Bio/PanGenome/OrderGenes.pm
+++ b/lib/Bio/PanGenome/OrderGenes.pm
@@ -20,12 +20,14 @@ use Bio::PanGenome::Exceptions;
 use Bio::PanGenome::AnalyseGroups;
 use Bio::PanGenome::ContigsToGeneIDsFromGFF;
 use Boost::Graph;
+use Data::Dumper;
 
 has 'gff_files'           => ( is => 'ro', isa => 'ArrayRef',  required => 1 );
 has 'analyse_groups_obj'  => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups',  required => 1 );
 has 'group_order'         => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_group_order');
 has 'group_graphs'        => ( is => 'ro', isa => 'Boost::Graph',  lazy => 1, builder => '_build_group_graphs');
 has 'groups_to_contigs'        => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_groups_to_contigs');
+has '_groups_to_file_contigs'  => ( is => 'ro', isa => 'ArrayRef',  lazy => 1, builder => '_build__groups_to_file_contigs');
 
 has '_groups'             => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_groups');
 
@@ -40,10 +42,11 @@ sub _build_groups
   return \%groups;
 }
 
-sub _build_group_order
+
+sub _build__groups_to_file_contigs
 {
   my ($self) = @_;
-  my %group_order;
+  my @groups_to_contigs;
   
   # Open each GFF file
   for my $filename (@{$self->gff_files})
@@ -62,15 +65,34 @@ sub _build_group_order
         next unless(defined($group_name));
         push(@groups_on_contig, $group_name);
       }
+      push(@groups_to_contigs,\@groups_on_contig);
+    }
+  }
       
-      for(my $i = 1; $i < @groups_on_contig; $i++)
-      {
-        my $group_from = $groups_on_contig[$i -1];
-        my $group_to = $groups_on_contig[$i];
-        $group_order{$group_from}{$group_to}++;
-        # TODO: remove because you only need half the matix
-        $group_order{$group_to}{$group_from}++;
-      }
+  return \@groups_to_contigs;
+  
+}
+
+sub _build_group_order
+{
+  my ($self) = @_;
+  my %group_order;
+  
+  for my $groups_on_contig (@{$self->_groups_to_file_contigs})
+  {
+    for(my $i = 1; $i < @{$groups_on_contig}; $i++)
+    {
+      my $group_from = $groups_on_contig->[$i -1];
+      my $group_to = $groups_on_contig->[$i];
+      $group_order{$group_from}{$group_to}++;
+      # TODO: remove because you only need half the matix
+      $group_order{$group_to}{$group_from}++;
+    }
+    if(@{$groups_on_contig} == 1)
+    {
+       my $group_from = $groups_on_contig->[0];
+       my $group_to = $groups_on_contig->[0];
+       $group_order{$group_from}{$group_to}++;
     }
   }
 
@@ -92,7 +114,7 @@ sub _add_groups_to_graph
   {
     for my $group_to (keys %{$self->group_order->{$current_group}})
     {
-      my $weight = 1.0/ ($self->group_order->{$current_group}->{$group_to}) ;
+      my $weight = 1.0/($self->group_order->{$current_group}->{$group_to} );
       $self->group_graphs->add_edge(node1=>$current_group, node2=>$group_to, weight=>$weight);
     }
   }
@@ -100,27 +122,6 @@ sub _add_groups_to_graph
 }
 
 
-sub _get_connected_nodes
-{
-  my($self) = @_;
-  
-  my @all_groups = keys %{$self->_groups};
-  my $search_group = $all_groups[0];
-  
-  my $connected_groups = $self->group_graphs->breadth_first_search($search_group);
-  for my $group_name (@{$connected_groups })
-  {
-    delete($self->_groups->{$group_name});
-  }
-  if(defined($self->_groups->{$search_group}))
-  {
-    delete($self->_groups->{$search_group});
-    push(@{$connected_groups},$search_group );
-  }
-
-  return  $connected_groups;
-}
-
 sub _build_groups_to_contigs
 {
   my($self) = @_;
@@ -128,25 +129,83 @@ sub _build_groups_to_contigs
 
   my %groups_to_contigs;
   my $counter = 1;
-  while((keys %{$self->_groups} ) > 0)
+  
+  # connected_components seg faults if you dont run breadth_first_search first
+  my @all_groups = keys %{$self->_groups};
+  $self->group_graphs->breadth_first_search($all_groups[0]);
+  my $groups_to_contigs = $self->group_graphs->connected_components;
+  
+  for my $groups_to_contig (@{ $groups_to_contigs})
   {
-    my $contig_groups = $self->_get_connected_nodes;
-    for my $group_name (@{$contig_groups})
+    my $contig_groups = $groups_to_contig;
+    my $order_counter = 1;
+    my $reordered_group = $self->_reorder_contig($contig_groups);
+    
+    for my $group_name (@{$reordered_group})
     {
       $groups_to_contigs{$group_name}{label} = $counter;
       $groups_to_contigs{$group_name}{comment} = '';
-      if(@{$contig_groups} == 1)
+      $groups_to_contigs{$group_name}{order} = $order_counter;
+      if(@{$contig_groups} <= 4)
       {
         $groups_to_contigs{$group_name}{comment} = 'Contamination';
       }
+      $order_counter++;
     }
     $counter++;
   }
+
+  return \%groups_to_contigs;
+}
+
+
+sub _reorder_contig
+{
+  my($self,$groups_to_contigs) = @_;
+
+  my $longest_path ;  
+  my %current_groups_to_contigs;
+  
+  for my $group (@{$groups_to_contigs})
+  {
+    $current_groups_to_contigs{$group}++;
+  }
+  
+  my @elements = keys(%current_groups_to_contigs);
+  my $starting_node  = $elements[rand @elements];
   
-  $self->group_graphs->connected_components;
+  for(my $i = 0; ($i < 1000 && $i < @elements) ; $i++)
+  {
+    my $ending_node    = $elements[rand @elements];
+    my $current_path   = $self->group_graphs->dijkstra_shortest_path($starting_node, $ending_node);
+    next if(! defined($current_path));
 
+    if(!defined($longest_path))
+    {
+      $longest_path = $current_path;
+    } 
+
+    
+    if(@{$current_path->{path}} > @{$longest_path->{path}})
+    {
+       $longest_path = $current_path;
+    }
+    print Dumper "$i ".@{$longest_path->{path}}."";
+    
+   }
+   
+   
+   my @output_order;
+   for my $group(@{$longest_path->{path}})
+   {
+     push( @output_order,$group);
+   }
+   for my $group (keys(%current_groups_to_contigs))
+   {
+      push( @output_order,$group);
+   }
+  return \@output_order;
 
-  return \%groups_to_contigs;
 }
 
 

From 41b539fa0a6bb0a6603ad95ed2e9db61f2f5fbce Mon Sep 17 00:00:00 2001
From: andrewjpage <andrewjpage@gmail.com>
Date: Fri, 15 Nov 2013 16:38:29 +0000
Subject: [PATCH 4/5] minimum spanning trees

---
 lib/Bio/PanGenome/GroupStatistics.pm |   9 +-
 lib/Bio/PanGenome/OrderGenes.pm      | 165 ++++++++++++++++++---------
 2 files changed, 116 insertions(+), 58 deletions(-)

diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm
index 2bf5491..d489ffd 100644
--- a/lib/Bio/PanGenome/GroupStatistics.pm
+++ b/lib/Bio/PanGenome/GroupStatistics.pm
@@ -50,7 +50,7 @@ sub _build__text_csv_obj {
 sub fixed_headers {
     my ($self) = @_;
     my @header =
-      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'QC' );
+      ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'Accessory Fragement','Accessory Order with Fragment', 'QC' );
     return \@header;
 }
 
@@ -135,16 +135,21 @@ sub _row {
     my $genome_number = '';
     my $qc_comment = '';
     my $order_within_fragement = '';
+    my $accessory_order_within_fragement = '';
+    my $accessory_genome_number = '';
     if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name}))
     {
       $genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label};
       $qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment};
       $order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{order};
+      
+      $accessory_genome_number = $self->groups_to_contigs->{$annotated_group_name}->{accessory_label};
+      $accessory_order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{accessory_order};
     }
     
     my @row = (
         $annotated_group_name,  $duplicate_gene_name,    $annotation,
-        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$qc_comment
+        $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$accessory_genome_number,$accessory_order_within_fragement,$qc_comment
     );
 
     for my $filename ( @{ $self->_sorted_file_names } ) {
diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm
index 849e117..30ef268 100644
--- a/lib/Bio/PanGenome/OrderGenes.pm
+++ b/lib/Bio/PanGenome/OrderGenes.pm
@@ -19,17 +19,24 @@ use Moose;
 use Bio::PanGenome::Exceptions;
 use Bio::PanGenome::AnalyseGroups;
 use Bio::PanGenome::ContigsToGeneIDsFromGFF;
-use Boost::Graph;
-use Data::Dumper;
+use Graph;
 
 has 'gff_files'           => ( is => 'ro', isa => 'ArrayRef',  required => 1 );
 has 'analyse_groups_obj'  => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups',  required => 1 );
 has 'group_order'         => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_group_order');
-has 'group_graphs'        => ( is => 'ro', isa => 'Boost::Graph',  lazy => 1, builder => '_build_group_graphs');
+has 'group_graphs'        => ( is => 'ro', isa => 'Graph',  lazy => 1, builder => '_build_group_graphs');
 has 'groups_to_contigs'        => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_groups_to_contigs');
 has '_groups_to_file_contigs'  => ( is => 'ro', isa => 'ArrayRef',  lazy => 1, builder => '_build__groups_to_file_contigs');
 
 has '_groups'             => ( is => 'ro', isa => 'HashRef',  lazy => 1, builder => '_build_groups');
+has 'number_of_files'     =>  => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files');
+
+
+sub _build_number_of_files
+{
+  my ($self) = @_;
+  return @{$self->gff_files};
+}
 
 sub _build_groups
 {
@@ -102,7 +109,7 @@ sub _build_group_order
 sub _build_group_graphs
 {
   my($self) = @_;
-  return  Boost::Graph->new(directed=>0);
+  return Graph->new(undirected => 1);
 }
 
 
@@ -115,13 +122,56 @@ sub _add_groups_to_graph
     for my $group_to (keys %{$self->group_order->{$current_group}})
     {
       my $weight = 1.0/($self->group_order->{$current_group}->{$group_to} );
-      $self->group_graphs->add_edge(node1=>$current_group, node2=>$group_to, weight=>$weight);
+      $self->group_graphs->add_weighted_edge($current_group,$group_to, $weight);
     }
   }
 
 }
 
 
+sub _reorder_connected_components
+{
+   my($self, $graph_groups) = @_;
+   
+   my @ordered_graph_groups;
+   
+   for my $graph_group( @{$graph_groups})
+   {
+     if(@{$graph_group} < 3)
+     {
+       push( @ordered_graph_groups,$graph_group );
+       next;
+     }
+     
+     my $graph = Graph->new(undirected => 1);
+     my %groups;
+     $groups{$_}++ for (@{$graph_group});
+     
+     for my $current_group (keys %groups)
+     {
+       for my $group_to (keys %{$self->group_order->{$current_group}})
+       {
+         next if(! defined($groups{$group_to}));
+         next if($graph->has_edge($group_to,$current_group));
+         my $current_weight = $self->group_order->{$current_group}->{$group_to} ;
+         $current_weight = $self->number_of_files if($current_weight > $self->number_of_files);
+         my $weight = ($self->number_of_files - $current_weight) +1;
+
+         $graph->add_weighted_edge($current_group,$group_to, $weight);
+       }
+     }
+
+     my $minimum_spanning_tree = $graph->minimum_spanning_tree;
+     my $dfs_obj = Graph::Traversal::DFS->new($minimum_spanning_tree);
+
+     my @reordered_dfs_groups = $dfs_obj->dfs;
+     
+     push(@ordered_graph_groups, \@reordered_dfs_groups);
+     
+   }
+   return \@ordered_graph_groups;
+}
+
 sub _build_groups_to_contigs
 {
   my($self) = @_;
@@ -130,85 +180,88 @@ sub _build_groups_to_contigs
   my %groups_to_contigs;
   my $counter = 1;
   
-  # connected_components seg faults if you dont run breadth_first_search first
-  my @all_groups = keys %{$self->_groups};
-  $self->group_graphs->breadth_first_search($all_groups[0]);
-  my $groups_to_contigs = $self->group_graphs->connected_components;
+  # Accessory
+  my $accessory_graph = $self->_create_accessory_graph;
+  my @group_graphs = $accessory_graph->connected_components();
+  my $reordered_graphs = $self->_reorder_connected_components(\@group_graphs);
   
-  for my $groups_to_contig (@{ $groups_to_contigs})
+  for my $contig_groups (@{$reordered_graphs})
   {
-    my $contig_groups = $groups_to_contig;
     my $order_counter = 1;
-    my $reordered_group = $self->_reorder_contig($contig_groups);
-    
-    for my $group_name (@{$reordered_group})
+  
+    for my $group_name (@{$contig_groups})
+    {
+      $groups_to_contigs{$group_name}{accessory_label} = $counter;
+      $groups_to_contigs{$group_name}{accessory_order} = $order_counter;
+      $order_counter++;
+    }
+    $counter++;
+  }
+  
+  # Core + accessory
+  my @group_graphs_all = $self->group_graphs->connected_components();
+  my $reordered_graphs_all = $self->_reorder_connected_components(\@group_graphs_all);
+  
+  $counter = 1;
+  for my $contig_groups (@{$reordered_graphs_all})
+  {
+    my $order_counter = 1;
+  
+    for my $group_name (@{$contig_groups})
     {
       $groups_to_contigs{$group_name}{label} = $counter;
       $groups_to_contigs{$group_name}{comment} = '';
       $groups_to_contigs{$group_name}{order} = $order_counter;
-      if(@{$contig_groups} <= 4)
+      if(@{$contig_groups} <= 2)
       {
-        $groups_to_contigs{$group_name}{comment} = 'Contamination';
+        $groups_to_contigs{$group_name}{comment} = 'Investigate';
       }
       $order_counter++;
     }
     $counter++;
   }
+  
 
   return \%groups_to_contigs;
 }
 
-
-sub _reorder_contig
+sub _create_accessory_graph
 {
-  my($self,$groups_to_contigs) = @_;
-
-  my $longest_path ;  
-  my %current_groups_to_contigs;
+  my($self) = @_;
+  my $graph = Graph->new(undirected => 1);
+  
+  my %core_groups;
   
-  for my $group (@{$groups_to_contigs})
+  for my $current_group (keys %{$self->group_order()})
   {
-    $current_groups_to_contigs{$group}++;
+    my $sum_of_weights = 0;
+    for my $group_to (keys %{$self->group_order->{$current_group}})
+    {
+      $sum_of_weights += $self->group_order->{$current_group}->{$group_to};
+    }
+    if($sum_of_weights >= $self->number_of_files )
+    {
+      $core_groups{$current_group}++;
+    }
   }
   
-  my @elements = keys(%current_groups_to_contigs);
-  my $starting_node  = $elements[rand @elements];
-  
-  for(my $i = 0; ($i < 1000 && $i < @elements) ; $i++)
+  for my $current_group (keys %{$self->group_order()})
   {
-    my $ending_node    = $elements[rand @elements];
-    my $current_path   = $self->group_graphs->dijkstra_shortest_path($starting_node, $ending_node);
-    next if(! defined($current_path));
-
-    if(!defined($longest_path))
-    {
-      $longest_path = $current_path;
-    } 
-
-    
-    if(@{$current_path->{path}} > @{$longest_path->{path}})
+    next if(defined($core_groups{$current_group}));
+    for my $group_to (keys %{$self->group_order->{$current_group}})
     {
-       $longest_path = $current_path;
+      next if(defined($core_groups{$group_to}));
+      my $weight = 1.0/($self->group_order->{$current_group}->{$group_to} );
+      $graph->add_weighted_edge($current_group,$group_to, $weight);
     }
-    print Dumper "$i ".@{$longest_path->{path}}."";
-    
-   }
-   
-   
-   my @output_order;
-   for my $group(@{$longest_path->{path}})
-   {
-     push( @output_order,$group);
-   }
-   for my $group (keys(%current_groups_to_contigs))
-   {
-      push( @output_order,$group);
-   }
-  return \@output_order;
-
+  }
+  
+  return $graph;
 }
 
 
+
+
 no Moose;
 __PACKAGE__->meta->make_immutable;
 

From 18d98565d0c971a6d7d6e437fedf75cf8e29c57b Mon Sep 17 00:00:00 2001
From: andrewjpage <andrewjpage@gmail.com>
Date: Mon, 18 Nov 2013 12:23:25 +0000
Subject: [PATCH 5/5] tests for gene ordering

---
 lib/Bio/PanGenome/OrderGenes.pm               |  11 +-
 lib/Bio/PanGenome/Output/EmblGroups.pm        | 106 ++++++++++++++++++
 lib/Bio/PanGenome/PostAnalysis.pm             |  24 +++-
 t/Bio/PanGenome/OrderGenes.t                  |   1 -
 t/Bio/PanGenome/ReorderSpreadsheet.t          |   6 +-
 t/data/expected_group_statitics.csv           |  16 +--
 ...expected_group_statitics_missing_genes.csv |  16 +--
 ...d_set_difference_common_set_statistics.csv |   8 +-
 ...t_difference_unique_set_one_statistics.csv |   4 +-
 ...t_difference_unique_set_two_statistics.csv |   8 +-
 t/data/overall_group_statisics.csv            |  30 ++---
 t/data/reorder_isolates_expected_output.csv   |  16 +--
 t/data/reorder_isolates_input.csv             |  16 +--
 13 files changed, 197 insertions(+), 65 deletions(-)
 create mode 100644 lib/Bio/PanGenome/Output/EmblGroups.pm

diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm
index 30ef268..571de90 100644
--- a/lib/Bio/PanGenome/OrderGenes.pm
+++ b/lib/Bio/PanGenome/OrderGenes.pm
@@ -179,13 +179,14 @@ sub _build_groups_to_contigs
 
   my %groups_to_contigs;
   my $counter = 1;
+  my $overall_counter = 1 ;
   
   # Accessory
   my $accessory_graph = $self->_create_accessory_graph;
   my @group_graphs = $accessory_graph->connected_components();
   my $reordered_graphs = $self->_reorder_connected_components(\@group_graphs);
   
-  for my $contig_groups (@{$reordered_graphs})
+  for my $contig_groups (sort { @{$b} <=> @{$a} }  @{$reordered_graphs})
   {
     my $order_counter = 1;
   
@@ -193,7 +194,10 @@ sub _build_groups_to_contigs
     {
       $groups_to_contigs{$group_name}{accessory_label} = $counter;
       $groups_to_contigs{$group_name}{accessory_order} = $order_counter;
+      $groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter;
+      
       $order_counter++;
+      $overall_counter++;
     }
     $counter++;
   }
@@ -202,8 +206,9 @@ sub _build_groups_to_contigs
   my @group_graphs_all = $self->group_graphs->connected_components();
   my $reordered_graphs_all = $self->_reorder_connected_components(\@group_graphs_all);
   
+  $overall_counter = 1;
   $counter = 1;
-  for my $contig_groups (@{$reordered_graphs_all})
+  for my $contig_groups (sort {@{$b} <=> @{$a} } @{$reordered_graphs_all})
   {
     my $order_counter = 1;
   
@@ -212,11 +217,13 @@ sub _build_groups_to_contigs
       $groups_to_contigs{$group_name}{label} = $counter;
       $groups_to_contigs{$group_name}{comment} = '';
       $groups_to_contigs{$group_name}{order} = $order_counter;
+      $groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter;
       if(@{$contig_groups} <= 2)
       {
         $groups_to_contigs{$group_name}{comment} = 'Investigate';
       }
       $order_counter++;
+      $overall_counter++;
     }
     $counter++;
   }
diff --git a/lib/Bio/PanGenome/Output/EmblGroups.pm b/lib/Bio/PanGenome/Output/EmblGroups.pm
new file mode 100644
index 0000000..3288096
--- /dev/null
+++ b/lib/Bio/PanGenome/Output/EmblGroups.pm
@@ -0,0 +1,106 @@
+package Bio::PanGenome::Output::EmblGroups;
+
+# ABSTRACT: Create a tab/embl file with the features for drawing pretty pictures
+
+=head1 SYNOPSIS
+
+reate a tab/embl file with the features for drawing pretty pictures
+   use Bio::PanGenome::Output::EmblGroups;
+   
+   my $obj = Bio::PanGenome::Output::EmblGroups->new(
+     output_filename => 'group_statitics.csv',
+     annotate_groups_obj => $annotate_groups_obj,
+     analyse_groups_obj  => $analyse_groups_obj
+   );
+   $obj->create_file;
+
+=cut
+
+use Moose;
+use POSIX;
+use Bio::PanGenome::Exceptions;
+use Bio::PanGenome::AnalyseGroups;
+use Bio::PanGenome::AnnotateGroups;
+
+has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnnotateGroups', required => 1 );
+has 'analyse_groups_obj'  => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups',  required => 1 );
+has 'output_filename'     => ( is => 'ro', isa => 'Str',                            default  => 'core_accessory.tab' );
+has 'groups_to_contigs'   => ( is => 'ro', isa => 'Maybe[HashRef]');
+has 'ordering_key'        => ( is => 'ro', isa => 'Str',                            default  => 'core_accessory_overall_order' );
+
+has '_output_fh'         => ( is => 'ro', lazy => 1,           builder => '_build__output_fh' );
+has '_sorted_file_names' => ( is => 'ro', isa  => 'ArrayRef',  lazy    => 1, builder => '_build__sorted_file_names' );
+has '_groups_to_files'   => ( is => 'ro', isa  => 'HashRef',   lazy    => 1, builder => '_build__groups_to_files' );
+
+sub _build__output_fh {
+    my ($self) = @_;
+    open( my $fh, '>', $self->output_filename )
+      or Bio::PanGenome::Exceptions::CouldntWriteToFile->throw(
+        error => "Couldnt write output file:" . $self->output_filename );
+    return $fh;
+}
+
+sub _build__sorted_file_names {
+    my ($self) = @_;
+    my @sorted_file_names = sort( @{ $self->analyse_groups_obj->fasta_files } );
+    return \@sorted_file_names;
+}
+
+sub _build__groups_to_files {
+    my ($self) = @_;
+    my %groups_to_files;
+    for my $group ( @{ $self->annotate_groups_obj->_groups } ) {
+        my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group};
+        my %filenames;
+        for my $gene_name ( @{$genes} ) {
+            my $filename = $self->analyse_groups_obj->_genes_to_file->{$gene_name};
+            push( @{ $filenames{$filename} }, $gene_name );
+        }
+        $groups_to_files{$group} = \%filenames;
+    }
+    return \%groups_to_files;
+}
+
+sub _block {
+    my ( $self, $group ) = @_;
+    my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group};
+    my @taxon_names_array;
+    my $annotated_group_name = $self->annotate_groups_obj->_groups_to_consensus_gene_names->{$group};
+    
+    return '' if(!(defined($self->groups_to_contigs->{$annotated_group_name}) &&  defined($self->groups_to_contigs->{$annotated_group_name}->{$self->ordering_key}) ));
+    
+    my $coordindates = $self->groups_to_contigs->{$annotated_group_name}->{$self->ordering_key};
+    
+    for my $filename ( @{ $self->_sorted_file_names } ) {
+        my $group_to_file_genes = $self->_groups_to_files->{$group}->{$filename};
+
+        if ( defined($group_to_file_genes) && @{$group_to_file_genes} > 0 ) {
+            my $filename_cpy = $filename;
+            $filename_cpy =~ s!\.gff\.proteome\.faa!!;
+            push( @taxon_names_array,  $filename_cpy );
+            next;
+        }
+    }
+
+    my $taxon_names = join(" ",@taxon_names_array);
+    my $tab_file_entry = "FT   variation       $coordindates\n";
+    $tab_file_entry   .= "FT                   /colour=4\n";
+    $tab_file_entry   .= "FT                   /taxa=\"$taxon_names\"\n";
+
+    return $tab_file_entry;
+}
+
+sub create_file {
+    my ($self) = @_;
+
+    for my $group ( @{ $self->annotate_groups_obj->_groups })
+    {
+       print { $self->_output_fh } $self->_block($group)
+    }
+    close( $self->_output_fh );
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm
index 51f6329..958dccf 100644
--- a/lib/Bio/PanGenome/PostAnalysis.pm
+++ b/lib/Bio/PanGenome/PostAnalysis.pm
@@ -18,6 +18,7 @@ use Bio::PanGenome::GroupStatistics;
 use Bio::PanGenome::Output::GroupsMultifastasNucleotide;
 use Bio::PanGenome::Output::NumberOfGroups;
 use Bio::PanGenome::OrderGenes;
+use Bio::PanGenome::Output::EmblGroups;
 
 has 'fasta_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
 has 'input_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
@@ -69,9 +70,8 @@ sub run {
       analyse_groups_obj => $analyse_groups_obj,
       gff_files => $self->input_files,
     );
-   $order_genes_obj->groups_to_contigs;
-    
-
+    $order_genes_obj->groups_to_contigs;
+  
     my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new(
         analyse_groups  => $analyse_groups_obj,
         output_filename => $self->output_pan_geneome_filename
@@ -90,6 +90,24 @@ sub run {
       group_statistics_obj => $group_statistics
     );
     $gene_pool_expansion->create_output_files;
+    
+    my $core_accessory_tab_obj = Bio::PanGenome::Output::EmblGroups->new(
+      output_filename     => 'core_accessory.tab',
+      annotate_groups_obj => $annotate_groups,
+      analyse_groups_obj  => $analyse_groups_obj,
+      ordering_key        => 'core_accessory_overall_order',
+      groups_to_contigs   => $order_genes_obj->groups_to_contigs
+    );
+    $core_accessory_tab_obj->create_file;
+    
+    my $accessory_tab_obj = Bio::PanGenome::Output::EmblGroups->new(
+      output_filename     => 'accessory.tab',
+      annotate_groups_obj => $annotate_groups,
+      analyse_groups_obj  => $analyse_groups_obj,
+      ordering_key        => 'accessory_overall_order',
+      groups_to_contigs   => $order_genes_obj->groups_to_contigs
+    );
+    $accessory_tab_obj->create_file;
 
     if($self->output_multifasta_files)
     {
diff --git a/t/Bio/PanGenome/OrderGenes.t b/t/Bio/PanGenome/OrderGenes.t
index 0206962..d6b6051 100644
--- a/t/Bio/PanGenome/OrderGenes.t
+++ b/t/Bio/PanGenome/OrderGenes.t
@@ -22,6 +22,5 @@ ok(my $obj = Bio::PanGenome::OrderGenes->new(
   gff_files   => ['t/data/query_1.gff','t/data/query_2.gff','t/data/query_3.gff'],
 ),'Initialise order genes object');
 
-print Dumper $obj->group_order;
 
 done_testing();
diff --git a/t/Bio/PanGenome/ReorderSpreadsheet.t b/t/Bio/PanGenome/ReorderSpreadsheet.t
index 67cd898..9e1dc3e 100644
--- a/t/Bio/PanGenome/ReorderSpreadsheet.t
+++ b/t/Bio/PanGenome/ReorderSpreadsheet.t
@@ -20,12 +20,14 @@ ok(
     'initialise reordering the spreadsheet'
 );
 
-is_deeply($obj->_column_mappings,[0,1,2,3,4,5,6,8,9,7],'Column mappings with fixed in same order and end columns ordered by tree file');
+
+
+        
+is_deeply($obj->_column_mappings,[0,1,2,3,4,5,6,7,8,9,10,11,13,12],'Column mappings with fixed in same order and end columns ordered by tree file');
 ok( $obj->reorder_spreadsheet(), 'run the reorder method' );
 ok( -e $obj->output_filename,    'check the output file exists' );
 
 
-
 is(
     read_file('t/data/reorder_isolates_expected_output.csv'),
     read_file('reorder_isolates_output.csv'),
diff --git a/t/data/expected_group_statitics.csv b/t/data/expected_group_statitics.csv
index 2bdead5..424ffde 100644
--- a/t/data/expected_group_statitics.csv
+++ b/t/data/expected_group_statitics.csv
@@ -1,8 +1,8 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa"
-"hly","","Alpha-toxin","3","3","1","1_1","2_1","3_1"
-"argF","","Ornithine carbamoyltransferase","2","2","1","1_3","","3_3"
-"group_4","","","2","2","1","","2_4","3_4"
-"speH","","hypothetical protein","2","2","1","1_2","2_2",""
-"group_7","","Gonococcal growth inhibitor III","1","1","1","","2_7",""
-"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","3_5"
-"group_6","","Gonococcal growth inhibitor III","1","1","1","1_6","",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa"
+"hly","","Alpha-toxin","3","3","1","","","","","","1_1","2_1","3_1"
+"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","","3_3"
+"group_4","","","2","2","1","","","","","","","2_4","3_4"
+"speH","","hypothetical protein","2","2","1","","","","","","1_2","2_2",""
+"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","2_7",""
+"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","","3_5"
+"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","",""
diff --git a/t/data/expected_group_statitics_missing_genes.csv b/t/data/expected_group_statitics_missing_genes.csv
index 138fb4b..949b607 100644
--- a/t/data/expected_group_statitics_missing_genes.csv
+++ b/t/data/expected_group_statitics_missing_genes.csv
@@ -1,8 +1,8 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa","t/data/query_4_missing_genes.fa"
-"hly","","Alpha-toxin","4","4","1","1_1","2_1","3_1","4_1"
-"argF","","Ornithine carbamoyltransferase","2","2","1","1_3","","3_3",""
-"group_4","","","2","2","1","","2_4","3_4",""
-"speH","","hypothetical protein","2","2","1","1_2","2_2","",""
-"group_7","","Gonococcal growth inhibitor III","1","1","1","","2_7","",""
-"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","3_5",""
-"group_6","","Gonococcal growth inhibitor III","1","1","1","1_6","","",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa","t/data/query_4_missing_genes.fa"
+"hly","","Alpha-toxin","4","4","1","","","","","","1_1","2_1","3_1","4_1"
+"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","","3_3",""
+"group_4","","","2","2","1","","","","","","","2_4","3_4",""
+"speH","","hypothetical protein","2","2","1","","","","","","1_2","2_2","",""
+"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","2_7","",""
+"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","","3_5",""
+"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","","",""
diff --git a/t/data/expected_set_difference_common_set_statistics.csv b/t/data/expected_set_difference_common_set_statistics.csv
index e171ced..c9e9fc8 100644
--- a/t/data/expected_set_difference_common_set_statistics.csv
+++ b/t/data/expected_set_difference_common_set_statistics.csv
@@ -1,4 +1,4 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa"
-"group_1","","","3","3","1","1_1","2_1","3_1"
-"group_3","","","2","2","1","1_3","","3_3"
-"group_2","","","2","2","1","1_2","2_2",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa"
+"group_1","","","3","3","1","","","","","","1_1","2_1","3_1"
+"group_3","","","2","2","1","","","","","","1_3","","3_3"
+"group_2","","","2","2","1","","","","","","1_2","2_2",""
diff --git a/t/data/expected_set_difference_unique_set_one_statistics.csv b/t/data/expected_set_difference_unique_set_one_statistics.csv
index 06be68e..e6eb346 100644
--- a/t/data/expected_set_difference_unique_set_one_statistics.csv
+++ b/t/data/expected_set_difference_unique_set_one_statistics.csv
@@ -1,2 +1,2 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa"
-"group_6","","","1","1","1","1_6","",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa"
+"group_6","","","1","1","1","","","","","","1_6","",""
diff --git a/t/data/expected_set_difference_unique_set_two_statistics.csv b/t/data/expected_set_difference_unique_set_two_statistics.csv
index 4c25fc4..1d06674 100644
--- a/t/data/expected_set_difference_unique_set_two_statistics.csv
+++ b/t/data/expected_set_difference_unique_set_two_statistics.csv
@@ -1,4 +1,4 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa"
-"group_4","","","2","2","1","","2_4","3_4"
-"group_5","","","1","1","1","","","3_5"
-"group_7","","","1","1","1","","2_7",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa"
+"group_4","","","2","2","1","","","","","","","2_4","3_4"
+"group_5","","","1","1","1","","","","","","","","3_5"
+"group_7","","","1","1","1","","","","","","","2_7",""
diff --git a/t/data/overall_group_statisics.csv b/t/data/overall_group_statisics.csv
index e776209..2881d7d 100644
--- a/t/data/overall_group_statisics.csv
+++ b/t/data/overall_group_statisics.csv
@@ -1,15 +1,15 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_6"
-"argF","","Ornithine carbamoyltransferase","2","2","1","1_3","2_3",""
-"group_5","","Gonococcal growth inhibitor III","2","2","1","1_6","2_7",""
-"hly","","Alpha-toxin","2","2","1","1_1","2_1",""
-"speH","","hypothetical protein","2","2","1","1_2","2_2",""
-"group_13","","hypothetical protein","1","2","2","","","abc_00002	abc_00002"
-"group_6","","","1","2","2","","abc_01705	abc_01705",""
-"group_14","","hypothetical protein","1","2","2","","abc_00003	abc_00003",""
-"group_9","","C4-dicarboxylate transporter/malic acid transport protein","1","2","2","","abc_00011	abc_00011",""
-"group_8","","hypothetical protein","1","2","2","","abc_00010	abc_00010",""
-"group_2","","superantigen-like protein","1","2","2","","abc_00004	abc_00004",""
-"group_3","","superantigen-like protein","1","2","2","","abc_00006	abc_00006",""
-"group_7","","","1","2","2","","abc_00013	abc_00013",""
-"yfnB","","Putative HAD-hydrolase yfnB","1","2","2","","abc_00016	abc_00016",""
-"arcC1","","Carbamate kinase 1","1","2","2","","abc_00008	abc_00008",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_2","query_6"
+"argF","","Ornithine carbamoyltransferase","2","2","1","1","8",,,"","1_3","2_3",""
+"group_5","","Gonococcal growth inhibitor III","2","2","1","1","3",,,"","1_6","2_7",""
+"hly","","Alpha-toxin","2","2","1","1","1",,,"","1_1","2_1",""
+"speH","","hypothetical protein","2","2","1","1","10",,,"","1_2","2_2",""
+"group_13","","hypothetical protein","1","2","2","1","13",,,"","","","abc_00002	abc_00002"
+"group_6","","","1","2","2","","","","","","","abc_01705	abc_01705",""
+"group_14","","hypothetical protein","1","2","2","1","12",,,"","","abc_00003	abc_00003",""
+"group_9","","C4-dicarboxylate transporter/malic acid transport protein","1","2","2","1","5",,,"","","abc_00011	abc_00011",""
+"group_8","","hypothetical protein","1","2","2","1","6",,,"","","abc_00010	abc_00010",""
+"group_2","","superantigen-like protein","1","2","2","1","11",,,"","","abc_00004	abc_00004",""
+"group_3","","superantigen-like protein","1","2","2","1","9",,,"","","abc_00006	abc_00006",""
+"group_7","","","1","2","2","1","4",,,"","","abc_00013	abc_00013",""
+"yfnB","","Putative HAD-hydrolase yfnB","1","2","2","1","2",,,"","","abc_00016	abc_00016",""
+"arcC1","","Carbamate kinase 1","1","2","2","1","7",,,"","","abc_00008	abc_00008",""
diff --git a/t/data/reorder_isolates_expected_output.csv b/t/data/reorder_isolates_expected_output.csv
index bccdd62..50af5b4 100644
--- a/t/data/reorder_isolates_expected_output.csv
+++ b/t/data/reorder_isolates_expected_output.csv
@@ -1,8 +1,8 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_3","query_4","query_2"
-"hly","","Alpha-toxin","4","4","1","P","P","P","P"
-"argF","","Ornithine carbamoyltransferase","2","2","1","P","P","",""
-"group_4","","","2","2","1","","P","","P"
-"speH","","hypothetical protein","2","2","1","P","","","P"
-"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","P"
-"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","P","",""
-"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_3","query_2"
+"hly","","Alpha-toxin","3","3","1","","","","","","1_1","3_1","2_1"
+"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","3_3",""
+"group_4","","","2","2","1","","","","","","","3_4","2_4"
+"speH","","hypothetical protein","2","2","1","","","","","","1_2","","2_2"
+"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","","2_7"
+"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","3_5",""
+"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","",""
diff --git a/t/data/reorder_isolates_input.csv b/t/data/reorder_isolates_input.csv
index db4212f..725f953 100644
--- a/t/data/reorder_isolates_input.csv
+++ b/t/data/reorder_isolates_input.csv
@@ -1,8 +1,8 @@
-"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_3","query_4"
-"hly","","Alpha-toxin","4","4","1","P","P","P","P"
-"argF","","Ornithine carbamoyltransferase","2","2","1","P","","P",""
-"group_4","","","2","2","1","","P","P",""
-"speH","","hypothetical protein","2","2","1","P","P","",""
-"group_7","","Gonococcal growth inhibitor III","1","1","1","","P","",""
-"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","P",""
-"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","",""
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_2","query_3"
+"hly","","Alpha-toxin","3","3","1","","","","","","1_1","2_1","3_1"
+"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","","3_3"
+"group_4","","","2","2","1","","","","","","","2_4","3_4"
+"speH","","hypothetical protein","2","2","1","","","","","","1_2","2_2",""
+"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","2_7",""
+"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","","3_5"
+"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","",""
\ No newline at end of file