From 10422bc979248541b0b956ce393e298528a2a4e4 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Wed, 13 Nov 2013 13:24:27 +0000 Subject: [PATCH 1/5] create the list of genes and their links --- lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm | 63 +++++++++++++++++ lib/Bio/PanGenome/GeneNamesFromGFF.pm | 22 +----- lib/Bio/PanGenome/OrderGenes.pm | 71 ++++++++++++++++++++ lib/Bio/PanGenome/ParseGFFAnnotationRole.pm | 32 +++++++++ lib/Bio/PanGenome/PostAnalysis.pm | 7 ++ t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t | 22 ++++++ t/Bio/PanGenome/OrderGenes.t | 27 ++++++++ 7 files changed, 223 insertions(+), 21 deletions(-) create mode 100644 lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm create mode 100644 lib/Bio/PanGenome/OrderGenes.pm create mode 100644 lib/Bio/PanGenome/ParseGFFAnnotationRole.pm create mode 100644 t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t create mode 100644 t/Bio/PanGenome/OrderGenes.t diff --git a/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm b/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm new file mode 100644 index 0000000..61b75ba --- /dev/null +++ b/lib/Bio/PanGenome/ContigsToGeneIDsFromGFF.pm @@ -0,0 +1,63 @@ +package Bio::PanGenome::ContigsToGeneIDsFromGFF; + +# ABSTRACT: Parse a GFF and efficiently and extract ordered gene ids on each contig + +=head1 SYNOPSIS + +Parse a GFF and efficiently and extract ordered gene ids on each contig + use Bio::PanGenome::ContigsToGeneIDsFromGFF; + + my $obj = Bio::PanGenome::ContigsToGeneIDsFromGFF->new( + gff_file => 'abc.gff' + ); + $obj->contig_to_ids; + +=cut + +use Moose; +use Bio::Tools::GFF; +with 'Bio::PanGenome::ParseGFFAnnotationRole'; + +has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids'); + +# Manually parse the GFF file because the BioPerl module is too slow +sub _build_contig_to_ids +{ + my ($self) = @_; + my %contigs_to_ids; + + open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file"; + while(<$fh>) + { + chomp; + my $line = $_; + my $id_name; + if($line =~/ID=([^;]+);/) + { + $id_name= $1; + } + else + { + next; + } + + my @annotation_elements = split(/\t/,$line); + # Map gene IDs to the contig + push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name); + } + close($fh); + return \%contigs_to_ids; +} + +sub _build__awk_filter { + my ($self) = @_; + return + 'awk \'BEGIN {FS="\t"};{ if ($3 ~/' + . $self->_tags_to_filter + . '/) print $1"\t"$9;}\' '; +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; diff --git a/lib/Bio/PanGenome/GeneNamesFromGFF.pm b/lib/Bio/PanGenome/GeneNamesFromGFF.pm index ad695ae..f5a38b3 100644 --- a/lib/Bio/PanGenome/GeneNamesFromGFF.pm +++ b/lib/Bio/PanGenome/GeneNamesFromGFF.pm @@ -16,12 +16,7 @@ Parse a GFF and efficiently extract ID -> Gene Name use Moose; use Bio::Tools::GFF; - -has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 ); - -has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => 'CDS' ); -has '_gff_parser' => ( is => 'ro', isa => 'Bio::Tools::GFF', lazy => 1, builder => '_build__gff_parser' ); -has '_awk_filter' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__awk_filter' ); +with 'Bio::PanGenome::ParseGFFAnnotationRole'; has 'ids_to_gene_name' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_ids_to_gene_name' ); has 'ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); @@ -65,21 +60,6 @@ sub _build_ids_to_gene_name { return \%id_to_gene_name; } - - -sub _gff_fh_input_string { - my ($self) = @_; - return 'sed -n \'/##gff-version 3/,/##FASTA/p\' '.$self->gff_file.'| grep -v \'##FASTA\''." | " . $self->_awk_filter; -} - -sub _build__awk_filter { - my ($self) = @_; - return - 'awk \'BEGIN {FS="\t"};{ if ($3 ~/' - . $self->_tags_to_filter - . '/) print $9;}\' '; -} - no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm new file mode 100644 index 0000000..0f98472 --- /dev/null +++ b/lib/Bio/PanGenome/OrderGenes.pm @@ -0,0 +1,71 @@ +package Bio::PanGenome::OrderGenes; + +# ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes + +=head1 SYNOPSIS + +Take in the analyse groups and create a matrix of what genes are beside what other genes + use Bio::PanGenome::OrderGenes; + + my $obj = Bio::PanGenome::OrderGenes->new( + analyse_groups_obj => $analyse_groups_obj, + gff_files => ['file1.gff','file2.gff'] + ); + $obj->create_matrix; + +=cut + +use Moose; +use Bio::PanGenome::Exceptions; +use Bio::PanGenome::AnalyseGroups; +use Bio::PanGenome::ContigsToGeneIDsFromGFF; + +has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); +has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups', required => 1 ); +has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order'); + +sub _build_group_order +{ + my ($self) = @_; + my %group_order; + + # Open each GFF file + for my $filename (@{$self->gff_files}) + { + my $contigs_to_ids_obj = Bio::PanGenome::ContigsToGeneIDsFromGFF->new(gff_file => $filename); + + # Loop over each contig in the GFF file + for my $contig_name (keys %{$contigs_to_ids_obj->contig_to_ids}) + { + my @groups_on_contig; + # loop over each gene in each contig in the GFF file + for my $gene_id (@{$contigs_to_ids_obj->contig_to_ids->{$contig_name}}) + { + # convert to group name + my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id}; + next unless(defined($group_name)); + push(@groups_on_contig, $group_name); + } + + for(my $i = 1; $i < @groups_on_contig; $i++) + { + my $group_from = $groups_on_contig[$i -1]; + my $group_to = $groups_on_contig[$i]; + $group_order{$group_from}{$group_to}++; + # TODO: remove because you only need half the matix + $group_order{$group_to}{$group_from}++; + } + } + } + return \%group_order; +} + +sub create_matrix +{ + my ($self) = @_; +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; diff --git a/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm new file mode 100644 index 0000000..f67161a --- /dev/null +++ b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm @@ -0,0 +1,32 @@ +package Bio::PanGenome::ParseGFFAnnotationRole; + +# ABSTRACT: A role for parsing a gff file efficiently + +=head1 SYNOPSIS + +with 'Bio::PanGenome::ParseGFFAnnotationRole'; + +=cut +use Moose::Role; +use Bio::Tools::GFF; + +has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 ); + +has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => 'CDS' ); +has '_gff_parser' => ( is => 'ro', isa => 'Bio::Tools::GFF', lazy => 1, builder => '_build__gff_parser' ); +has '_awk_filter' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__awk_filter' ); + +sub _gff_fh_input_string { + my ($self) = @_; + return 'sed -n \'/##gff-version 3/,/##FASTA/p\' '.$self->gff_file.'| grep -v \'##FASTA\''." | " . $self->_awk_filter; +} + +sub _build__awk_filter { + my ($self) = @_; + return + 'awk \'BEGIN {FS="\t"};{ if ($3 ~/' + . $self->_tags_to_filter + . '/) print $9;}\' '; +} + +1; diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm index 3a226d9..4661afb 100644 --- a/lib/Bio/PanGenome/PostAnalysis.pm +++ b/lib/Bio/PanGenome/PostAnalysis.pm @@ -17,6 +17,7 @@ use Bio::PanGenome::Output::OneGenePerGroupFasta; use Bio::PanGenome::GroupStatistics; use Bio::PanGenome::Output::GroupsMultifastasNucleotide; use Bio::PanGenome::Output::NumberOfGroups; +use Bio::PanGenome::OrderGenes; has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); has 'input_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); @@ -64,6 +65,12 @@ sub run { ); $analyse_groups_obj->create_plots(); + my $order_genes_obj = Bio::PanGenome::OrderGenes->new( + analyse_groups_obj => $analyse_groups_obj, + gff_files => $self->input_files, + ); + $order_genes_obj->group_order; + my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new( analyse_groups => $analyse_groups_obj, diff --git a/t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t b/t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t new file mode 100644 index 0000000..3b6273e --- /dev/null +++ b/t/Bio/PanGenome/ContigsToGeneIDsFromGFF.t @@ -0,0 +1,22 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Data::Dumper; +use File::Slurp; + +BEGIN { unshift( @INC, './lib' ) } + +BEGIN { + use Test::Most; + use_ok('Bio::PanGenome::ContigsToGeneIDsFromGFF'); +} + + +ok(my $obj = Bio::PanGenome::ContigsToGeneIDsFromGFF->new( + gff_file => 't/data/query_1.gff' +),'Initialise contigs to gene ids obj'); + +print Dumper $obj->contig_to_ids; + + +done_testing(); diff --git a/t/Bio/PanGenome/OrderGenes.t b/t/Bio/PanGenome/OrderGenes.t new file mode 100644 index 0000000..0206962 --- /dev/null +++ b/t/Bio/PanGenome/OrderGenes.t @@ -0,0 +1,27 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Data::Dumper; +use File::Slurp; + +BEGIN { unshift( @INC, './lib' ) } + +BEGIN { + use Test::Most; + use_ok('Bio::PanGenome::OrderGenes'); + use Bio::PanGenome::AnalyseGroups; +} + +my $analyse_groups = Bio::PanGenome::AnalyseGroups->new( + fasta_files => ['t/data/query_1.fa','t/data/query_2.fa','t/data/query_3.fa'], + groups_filename => 't/data/query_groups' +); + +ok(my $obj = Bio::PanGenome::OrderGenes->new( + analyse_groups_obj => $analyse_groups, + gff_files => ['t/data/query_1.gff','t/data/query_2.gff','t/data/query_3.gff'], +),'Initialise order genes object'); + +print Dumper $obj->group_order; + +done_testing(); From d5472dee24063b2c2a2047bca6593688acd23171 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Thu, 14 Nov 2013 13:51:56 +0000 Subject: [PATCH 2/5] breath first search --- .../PanGenome/CommandLine/CreatePanGenome.pm | 2 +- lib/Bio/PanGenome/External/PostAnalysis.pm | 1 + lib/Bio/PanGenome/GroupStatistics.pm | 15 ++- lib/Bio/PanGenome/OrderGenes.pm | 91 ++++++++++++++++++- lib/Bio/PanGenome/ParseGFFAnnotationRole.pm | 2 +- lib/Bio/PanGenome/PostAnalysis.pm | 27 +++--- 6 files changed, 117 insertions(+), 21 deletions(-) diff --git a/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm b/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm index aaf5958..f5b673b 100644 --- a/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm +++ b/lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm @@ -51,7 +51,7 @@ sub BUILD { ); if ( @{ $self->args } == 0 ) { - $self->_error_message("Error: You need to provide a FASTA file"); + $self->_error_message("Error: You need to provide a GFF file"); } $self->output_filename($output_filename) if ( defined($output_filename) ); diff --git a/lib/Bio/PanGenome/External/PostAnalysis.pm b/lib/Bio/PanGenome/External/PostAnalysis.pm index 8cd7b9a..68380a8 100644 --- a/lib/Bio/PanGenome/External/PostAnalysis.pm +++ b/lib/Bio/PanGenome/External/PostAnalysis.pm @@ -97,6 +97,7 @@ sub run { my ($self) = @_; my @commands_to_run; push( @commands_to_run, $self->_command_to_run ); + print $self->_command_to_run."\n"; my $job_runner_obj = $self->_job_runner_class->new( commands_to_run => \@commands_to_run, diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm index 5671c90..8d70704 100644 --- a/lib/Bio/PanGenome/GroupStatistics.pm +++ b/lib/Bio/PanGenome/GroupStatistics.pm @@ -26,6 +26,7 @@ use Bio::PanGenome::AnnotateGroups; has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnnotateGroups', required => 1 ); has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups', required => 1 ); has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'group_statitics.csv' ); +has 'groups_to_contigs' => ( is => 'ro', isa => 'Maybe[HashRef]'); has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' ); has '_text_csv_obj' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__text_csv_obj' ); @@ -49,7 +50,7 @@ sub _build__text_csv_obj { sub fixed_headers { my ($self) = @_; my @header = - ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate' ); + ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome', 'QC' ); return \@header; } @@ -130,10 +131,18 @@ sub _row { my $annotated_group_name = $self->annotate_groups_obj->_groups_to_consensus_gene_names->{$group}; my $duplicate_gene_name = $self->_non_unique_name_for_group($annotated_group_name); - + + my $genome_number = ''; + my $qc_comment = ''; + if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name})) + { + $genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label}; + $qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment}; + } + my @row = ( $annotated_group_name, $duplicate_gene_name, $annotation, - $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate + $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$qc_comment ); for my $filename ( @{ $self->_sorted_file_names } ) { diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm index 0f98472..cb28b4f 100644 --- a/lib/Bio/PanGenome/OrderGenes.pm +++ b/lib/Bio/PanGenome/OrderGenes.pm @@ -11,7 +11,7 @@ Take in the analyse groups and create a matrix of what genes are beside what oth analyse_groups_obj => $analyse_groups_obj, gff_files => ['file1.gff','file2.gff'] ); - $obj->create_matrix; + $obj->groups_to_contigs; =cut @@ -19,10 +19,26 @@ use Moose; use Bio::PanGenome::Exceptions; use Bio::PanGenome::AnalyseGroups; use Bio::PanGenome::ContigsToGeneIDsFromGFF; +use Boost::Graph; has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups', required => 1 ); has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order'); +has 'group_graphs' => ( is => 'ro', isa => 'Boost::Graph', lazy => 1, builder => '_build_group_graphs'); +has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs'); + +has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups'); + +sub _build_groups +{ + my ($self) = @_; + my %groups; + for my $group_name (@{$self->analyse_groups_obj->_groups}) + { + $groups{$group_name}++; + } + return \%groups; +} sub _build_group_order { @@ -57,14 +73,83 @@ sub _build_group_order } } } + return \%group_order; } -sub create_matrix +sub _build_group_graphs { - my ($self) = @_; + my($self) = @_; + return Boost::Graph->new(directed=>0); +} + + +sub _add_groups_to_graph +{ + my($self) = @_; + + for my $current_group (keys %{$self->group_order()}) + { + for my $group_to (keys %{$self->group_order->{$current_group}}) + { + my $weight = 1.0/ ($self->group_order->{$current_group}->{$group_to}) ; + $self->group_graphs->add_edge(node1=>$current_group, node2=>$group_to, weight=>$weight); + } + } + +} + + +sub _get_connected_nodes +{ + my($self) = @_; + + my @all_groups = keys %{$self->_groups}; + my $search_group = $all_groups[0]; + + my $connected_groups = $self->group_graphs->breadth_first_search($search_group); + for my $group_name (@{$connected_groups }) + { + delete($self->_groups->{$group_name}); + } + if(defined($self->_groups->{$search_group})) + { + delete($self->_groups->{$search_group}); + push(@{$connected_groups},$search_group ); + } + + return $connected_groups; +} + +sub _build_groups_to_contigs +{ + my($self) = @_; + $self->_add_groups_to_graph; + + my %groups_to_contigs; + my $counter = 1; + while((keys %{$self->_groups} ) > 0) + { + my $contig_groups = $self->_get_connected_nodes; + for my $group_name (@{$contig_groups}) + { + $groups_to_contigs{$group_name}{label} = $counter; + $groups_to_contigs{$group_name}{comment} = ''; + if(@{$contig_groups} == 1) + { + $groups_to_contigs{$group_name}{comment} = 'Contamination'; + } + } + $counter++; + } + + $self->group_graphs->connected_components; + + + return \%groups_to_contigs; } + no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm index f67161a..154ec92 100644 --- a/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm +++ b/lib/Bio/PanGenome/ParseGFFAnnotationRole.pm @@ -12,7 +12,7 @@ use Bio::Tools::GFF; has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 ); -has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => 'CDS' ); +has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS|ncRNA|tRNA|tmRNA)' ); has '_gff_parser' => ( is => 'ro', isa => 'Bio::Tools::GFF', lazy => 1, builder => '_build__gff_parser' ); has '_awk_filter' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__awk_filter' ); diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm index 4661afb..51f6329 100644 --- a/lib/Bio/PanGenome/PostAnalysis.pm +++ b/lib/Bio/PanGenome/PostAnalysis.pm @@ -69,7 +69,7 @@ sub run { analyse_groups_obj => $analyse_groups_obj, gff_files => $self->input_files, ); - $order_genes_obj->group_order; + $order_genes_obj->groups_to_contigs; my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new( @@ -81,7 +81,8 @@ sub run { my $group_statistics = Bio::PanGenome::GroupStatistics->new( output_filename => $self->output_statistics_filename, annotate_groups_obj => $annotate_groups, - analyse_groups_obj => $analyse_groups_obj + analyse_groups_obj => $analyse_groups_obj, + groups_to_contigs => $order_genes_obj->groups_to_contigs ); $group_statistics->create_spreadsheet; @@ -100,17 +101,17 @@ sub run { $group_multifastas_nucleotides->create_files(); } - unlink($output_mcl_filename); - unlink($output_inflate_clusters_filename); - unlink($output_group_labels_filename); - unlink($output_combined_filename); - unlink( $self->clusters_filename); - unlink( $self->clusters_filename . '.clstr' ); - unlink( $self->clusters_filename . '.bak.clstr' ); - unlink('_gff_files'); - unlink('_fasta_files'); - unlink('_clustered_filtered.fa'); - unlink($input_cd_hit_groups_file); + # unlink($output_mcl_filename); + # unlink($output_inflate_clusters_filename); + # unlink($output_group_labels_filename); + # unlink($output_combined_filename); + # unlink( $self->clusters_filename); + # unlink( $self->clusters_filename . '.clstr' ); + # unlink( $self->clusters_filename . '.bak.clstr' ); + # unlink('_gff_files'); + # unlink('_fasta_files'); + # unlink('_clustered_filtered.fa'); + # unlink($input_cd_hit_groups_file); } From 94e8819824383999f6347a41d623a6e6cc391bea Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Fri, 15 Nov 2013 09:25:44 +0000 Subject: [PATCH 3/5] boost --- lib/Bio/PanGenome/GroupStatistics.pm | 6 +- lib/Bio/PanGenome/OrderGenes.pm | 135 +++++++++++++++++++-------- 2 files changed, 101 insertions(+), 40 deletions(-) diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm index 8d70704..2bf5491 100644 --- a/lib/Bio/PanGenome/GroupStatistics.pm +++ b/lib/Bio/PanGenome/GroupStatistics.pm @@ -50,7 +50,7 @@ sub _build__text_csv_obj { sub fixed_headers { my ($self) = @_; my @header = - ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome', 'QC' ); + ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'QC' ); return \@header; } @@ -134,15 +134,17 @@ sub _row { my $genome_number = ''; my $qc_comment = ''; + my $order_within_fragement = ''; if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name})) { $genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label}; $qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment}; + $order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{order}; } my @row = ( $annotated_group_name, $duplicate_gene_name, $annotation, - $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$qc_comment + $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$qc_comment ); for my $filename ( @{ $self->_sorted_file_names } ) { diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm index cb28b4f..849e117 100644 --- a/lib/Bio/PanGenome/OrderGenes.pm +++ b/lib/Bio/PanGenome/OrderGenes.pm @@ -20,12 +20,14 @@ use Bio::PanGenome::Exceptions; use Bio::PanGenome::AnalyseGroups; use Bio::PanGenome::ContigsToGeneIDsFromGFF; use Boost::Graph; +use Data::Dumper; has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups', required => 1 ); has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order'); has 'group_graphs' => ( is => 'ro', isa => 'Boost::Graph', lazy => 1, builder => '_build_group_graphs'); has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs'); +has '_groups_to_file_contigs' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__groups_to_file_contigs'); has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups'); @@ -40,10 +42,11 @@ sub _build_groups return \%groups; } -sub _build_group_order + +sub _build__groups_to_file_contigs { my ($self) = @_; - my %group_order; + my @groups_to_contigs; # Open each GFF file for my $filename (@{$self->gff_files}) @@ -62,15 +65,34 @@ sub _build_group_order next unless(defined($group_name)); push(@groups_on_contig, $group_name); } + push(@groups_to_contigs,\@groups_on_contig); + } + } - for(my $i = 1; $i < @groups_on_contig; $i++) - { - my $group_from = $groups_on_contig[$i -1]; - my $group_to = $groups_on_contig[$i]; - $group_order{$group_from}{$group_to}++; - # TODO: remove because you only need half the matix - $group_order{$group_to}{$group_from}++; - } + return \@groups_to_contigs; + +} + +sub _build_group_order +{ + my ($self) = @_; + my %group_order; + + for my $groups_on_contig (@{$self->_groups_to_file_contigs}) + { + for(my $i = 1; $i < @{$groups_on_contig}; $i++) + { + my $group_from = $groups_on_contig->[$i -1]; + my $group_to = $groups_on_contig->[$i]; + $group_order{$group_from}{$group_to}++; + # TODO: remove because you only need half the matix + $group_order{$group_to}{$group_from}++; + } + if(@{$groups_on_contig} == 1) + { + my $group_from = $groups_on_contig->[0]; + my $group_to = $groups_on_contig->[0]; + $group_order{$group_from}{$group_to}++; } } @@ -92,7 +114,7 @@ sub _add_groups_to_graph { for my $group_to (keys %{$self->group_order->{$current_group}}) { - my $weight = 1.0/ ($self->group_order->{$current_group}->{$group_to}) ; + my $weight = 1.0/($self->group_order->{$current_group}->{$group_to} ); $self->group_graphs->add_edge(node1=>$current_group, node2=>$group_to, weight=>$weight); } } @@ -100,27 +122,6 @@ sub _add_groups_to_graph } -sub _get_connected_nodes -{ - my($self) = @_; - - my @all_groups = keys %{$self->_groups}; - my $search_group = $all_groups[0]; - - my $connected_groups = $self->group_graphs->breadth_first_search($search_group); - for my $group_name (@{$connected_groups }) - { - delete($self->_groups->{$group_name}); - } - if(defined($self->_groups->{$search_group})) - { - delete($self->_groups->{$search_group}); - push(@{$connected_groups},$search_group ); - } - - return $connected_groups; -} - sub _build_groups_to_contigs { my($self) = @_; @@ -128,25 +129,83 @@ sub _build_groups_to_contigs my %groups_to_contigs; my $counter = 1; - while((keys %{$self->_groups} ) > 0) + + # connected_components seg faults if you dont run breadth_first_search first + my @all_groups = keys %{$self->_groups}; + $self->group_graphs->breadth_first_search($all_groups[0]); + my $groups_to_contigs = $self->group_graphs->connected_components; + + for my $groups_to_contig (@{ $groups_to_contigs}) { - my $contig_groups = $self->_get_connected_nodes; - for my $group_name (@{$contig_groups}) + my $contig_groups = $groups_to_contig; + my $order_counter = 1; + my $reordered_group = $self->_reorder_contig($contig_groups); + + for my $group_name (@{$reordered_group}) { $groups_to_contigs{$group_name}{label} = $counter; $groups_to_contigs{$group_name}{comment} = ''; - if(@{$contig_groups} == 1) + $groups_to_contigs{$group_name}{order} = $order_counter; + if(@{$contig_groups} <= 4) { $groups_to_contigs{$group_name}{comment} = 'Contamination'; } + $order_counter++; } $counter++; } + + return \%groups_to_contigs; +} + + +sub _reorder_contig +{ + my($self,$groups_to_contigs) = @_; + + my $longest_path ; + my %current_groups_to_contigs; + + for my $group (@{$groups_to_contigs}) + { + $current_groups_to_contigs{$group}++; + } + + my @elements = keys(%current_groups_to_contigs); + my $starting_node = $elements[rand @elements]; - $self->group_graphs->connected_components; + for(my $i = 0; ($i < 1000 && $i < @elements) ; $i++) + { + my $ending_node = $elements[rand @elements]; + my $current_path = $self->group_graphs->dijkstra_shortest_path($starting_node, $ending_node); + next if(! defined($current_path)); + if(!defined($longest_path)) + { + $longest_path = $current_path; + } + + + if(@{$current_path->{path}} > @{$longest_path->{path}}) + { + $longest_path = $current_path; + } + print Dumper "$i ".@{$longest_path->{path}}.""; + + } + + + my @output_order; + for my $group(@{$longest_path->{path}}) + { + push( @output_order,$group); + } + for my $group (keys(%current_groups_to_contigs)) + { + push( @output_order,$group); + } + return \@output_order; - return \%groups_to_contigs; } From 41b539fa0a6bb0a6603ad95ed2e9db61f2f5fbce Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Fri, 15 Nov 2013 16:38:29 +0000 Subject: [PATCH 4/5] minimum spanning trees --- lib/Bio/PanGenome/GroupStatistics.pm | 9 +- lib/Bio/PanGenome/OrderGenes.pm | 165 ++++++++++++++++++--------- 2 files changed, 116 insertions(+), 58 deletions(-) diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm index 2bf5491..d489ffd 100644 --- a/lib/Bio/PanGenome/GroupStatistics.pm +++ b/lib/Bio/PanGenome/GroupStatistics.pm @@ -50,7 +50,7 @@ sub _build__text_csv_obj { sub fixed_headers { my ($self) = @_; my @header = - ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'QC' ); + ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'Accessory Fragement','Accessory Order with Fragment', 'QC' ); return \@header; } @@ -135,16 +135,21 @@ sub _row { my $genome_number = ''; my $qc_comment = ''; my $order_within_fragement = ''; + my $accessory_order_within_fragement = ''; + my $accessory_genome_number = ''; if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name})) { $genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label}; $qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment}; $order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{order}; + + $accessory_genome_number = $self->groups_to_contigs->{$annotated_group_name}->{accessory_label}; + $accessory_order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{accessory_order}; } my @row = ( $annotated_group_name, $duplicate_gene_name, $annotation, - $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$qc_comment + $num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$accessory_genome_number,$accessory_order_within_fragement,$qc_comment ); for my $filename ( @{ $self->_sorted_file_names } ) { diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm index 849e117..30ef268 100644 --- a/lib/Bio/PanGenome/OrderGenes.pm +++ b/lib/Bio/PanGenome/OrderGenes.pm @@ -19,17 +19,24 @@ use Moose; use Bio::PanGenome::Exceptions; use Bio::PanGenome::AnalyseGroups; use Bio::PanGenome::ContigsToGeneIDsFromGFF; -use Boost::Graph; -use Data::Dumper; +use Graph; has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups', required => 1 ); has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order'); -has 'group_graphs' => ( is => 'ro', isa => 'Boost::Graph', lazy => 1, builder => '_build_group_graphs'); +has 'group_graphs' => ( is => 'ro', isa => 'Graph', lazy => 1, builder => '_build_group_graphs'); has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs'); has '_groups_to_file_contigs' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__groups_to_file_contigs'); has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups'); +has 'number_of_files' => => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files'); + + +sub _build_number_of_files +{ + my ($self) = @_; + return @{$self->gff_files}; +} sub _build_groups { @@ -102,7 +109,7 @@ sub _build_group_order sub _build_group_graphs { my($self) = @_; - return Boost::Graph->new(directed=>0); + return Graph->new(undirected => 1); } @@ -115,13 +122,56 @@ sub _add_groups_to_graph for my $group_to (keys %{$self->group_order->{$current_group}}) { my $weight = 1.0/($self->group_order->{$current_group}->{$group_to} ); - $self->group_graphs->add_edge(node1=>$current_group, node2=>$group_to, weight=>$weight); + $self->group_graphs->add_weighted_edge($current_group,$group_to, $weight); } } } +sub _reorder_connected_components +{ + my($self, $graph_groups) = @_; + + my @ordered_graph_groups; + + for my $graph_group( @{$graph_groups}) + { + if(@{$graph_group} < 3) + { + push( @ordered_graph_groups,$graph_group ); + next; + } + + my $graph = Graph->new(undirected => 1); + my %groups; + $groups{$_}++ for (@{$graph_group}); + + for my $current_group (keys %groups) + { + for my $group_to (keys %{$self->group_order->{$current_group}}) + { + next if(! defined($groups{$group_to})); + next if($graph->has_edge($group_to,$current_group)); + my $current_weight = $self->group_order->{$current_group}->{$group_to} ; + $current_weight = $self->number_of_files if($current_weight > $self->number_of_files); + my $weight = ($self->number_of_files - $current_weight) +1; + + $graph->add_weighted_edge($current_group,$group_to, $weight); + } + } + + my $minimum_spanning_tree = $graph->minimum_spanning_tree; + my $dfs_obj = Graph::Traversal::DFS->new($minimum_spanning_tree); + + my @reordered_dfs_groups = $dfs_obj->dfs; + + push(@ordered_graph_groups, \@reordered_dfs_groups); + + } + return \@ordered_graph_groups; +} + sub _build_groups_to_contigs { my($self) = @_; @@ -130,85 +180,88 @@ sub _build_groups_to_contigs my %groups_to_contigs; my $counter = 1; - # connected_components seg faults if you dont run breadth_first_search first - my @all_groups = keys %{$self->_groups}; - $self->group_graphs->breadth_first_search($all_groups[0]); - my $groups_to_contigs = $self->group_graphs->connected_components; + # Accessory + my $accessory_graph = $self->_create_accessory_graph; + my @group_graphs = $accessory_graph->connected_components(); + my $reordered_graphs = $self->_reorder_connected_components(\@group_graphs); - for my $groups_to_contig (@{ $groups_to_contigs}) + for my $contig_groups (@{$reordered_graphs}) { - my $contig_groups = $groups_to_contig; my $order_counter = 1; - my $reordered_group = $self->_reorder_contig($contig_groups); - - for my $group_name (@{$reordered_group}) + + for my $group_name (@{$contig_groups}) + { + $groups_to_contigs{$group_name}{accessory_label} = $counter; + $groups_to_contigs{$group_name}{accessory_order} = $order_counter; + $order_counter++; + } + $counter++; + } + + # Core + accessory + my @group_graphs_all = $self->group_graphs->connected_components(); + my $reordered_graphs_all = $self->_reorder_connected_components(\@group_graphs_all); + + $counter = 1; + for my $contig_groups (@{$reordered_graphs_all}) + { + my $order_counter = 1; + + for my $group_name (@{$contig_groups}) { $groups_to_contigs{$group_name}{label} = $counter; $groups_to_contigs{$group_name}{comment} = ''; $groups_to_contigs{$group_name}{order} = $order_counter; - if(@{$contig_groups} <= 4) + if(@{$contig_groups} <= 2) { - $groups_to_contigs{$group_name}{comment} = 'Contamination'; + $groups_to_contigs{$group_name}{comment} = 'Investigate'; } $order_counter++; } $counter++; } + return \%groups_to_contigs; } - -sub _reorder_contig +sub _create_accessory_graph { - my($self,$groups_to_contigs) = @_; - - my $longest_path ; - my %current_groups_to_contigs; + my($self) = @_; + my $graph = Graph->new(undirected => 1); + + my %core_groups; - for my $group (@{$groups_to_contigs}) + for my $current_group (keys %{$self->group_order()}) { - $current_groups_to_contigs{$group}++; + my $sum_of_weights = 0; + for my $group_to (keys %{$self->group_order->{$current_group}}) + { + $sum_of_weights += $self->group_order->{$current_group}->{$group_to}; + } + if($sum_of_weights >= $self->number_of_files ) + { + $core_groups{$current_group}++; + } } - my @elements = keys(%current_groups_to_contigs); - my $starting_node = $elements[rand @elements]; - - for(my $i = 0; ($i < 1000 && $i < @elements) ; $i++) + for my $current_group (keys %{$self->group_order()}) { - my $ending_node = $elements[rand @elements]; - my $current_path = $self->group_graphs->dijkstra_shortest_path($starting_node, $ending_node); - next if(! defined($current_path)); - - if(!defined($longest_path)) - { - $longest_path = $current_path; - } - - - if(@{$current_path->{path}} > @{$longest_path->{path}}) + next if(defined($core_groups{$current_group})); + for my $group_to (keys %{$self->group_order->{$current_group}}) { - $longest_path = $current_path; + next if(defined($core_groups{$group_to})); + my $weight = 1.0/($self->group_order->{$current_group}->{$group_to} ); + $graph->add_weighted_edge($current_group,$group_to, $weight); } - print Dumper "$i ".@{$longest_path->{path}}.""; - - } - - - my @output_order; - for my $group(@{$longest_path->{path}}) - { - push( @output_order,$group); - } - for my $group (keys(%current_groups_to_contigs)) - { - push( @output_order,$group); - } - return \@output_order; - + } + + return $graph; } + + no Moose; __PACKAGE__->meta->make_immutable; From 18d98565d0c971a6d7d6e437fedf75cf8e29c57b Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Mon, 18 Nov 2013 12:23:25 +0000 Subject: [PATCH 5/5] tests for gene ordering --- lib/Bio/PanGenome/OrderGenes.pm | 11 +- lib/Bio/PanGenome/Output/EmblGroups.pm | 106 ++++++++++++++++++ lib/Bio/PanGenome/PostAnalysis.pm | 24 +++- t/Bio/PanGenome/OrderGenes.t | 1 - t/Bio/PanGenome/ReorderSpreadsheet.t | 6 +- t/data/expected_group_statitics.csv | 16 +-- ...expected_group_statitics_missing_genes.csv | 16 +-- ...d_set_difference_common_set_statistics.csv | 8 +- ...t_difference_unique_set_one_statistics.csv | 4 +- ...t_difference_unique_set_two_statistics.csv | 8 +- t/data/overall_group_statisics.csv | 30 ++--- t/data/reorder_isolates_expected_output.csv | 16 +-- t/data/reorder_isolates_input.csv | 16 +-- 13 files changed, 197 insertions(+), 65 deletions(-) create mode 100644 lib/Bio/PanGenome/Output/EmblGroups.pm diff --git a/lib/Bio/PanGenome/OrderGenes.pm b/lib/Bio/PanGenome/OrderGenes.pm index 30ef268..571de90 100644 --- a/lib/Bio/PanGenome/OrderGenes.pm +++ b/lib/Bio/PanGenome/OrderGenes.pm @@ -179,13 +179,14 @@ sub _build_groups_to_contigs my %groups_to_contigs; my $counter = 1; + my $overall_counter = 1 ; # Accessory my $accessory_graph = $self->_create_accessory_graph; my @group_graphs = $accessory_graph->connected_components(); my $reordered_graphs = $self->_reorder_connected_components(\@group_graphs); - for my $contig_groups (@{$reordered_graphs}) + for my $contig_groups (sort { @{$b} <=> @{$a} } @{$reordered_graphs}) { my $order_counter = 1; @@ -193,7 +194,10 @@ sub _build_groups_to_contigs { $groups_to_contigs{$group_name}{accessory_label} = $counter; $groups_to_contigs{$group_name}{accessory_order} = $order_counter; + $groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter; + $order_counter++; + $overall_counter++; } $counter++; } @@ -202,8 +206,9 @@ sub _build_groups_to_contigs my @group_graphs_all = $self->group_graphs->connected_components(); my $reordered_graphs_all = $self->_reorder_connected_components(\@group_graphs_all); + $overall_counter = 1; $counter = 1; - for my $contig_groups (@{$reordered_graphs_all}) + for my $contig_groups (sort {@{$b} <=> @{$a} } @{$reordered_graphs_all}) { my $order_counter = 1; @@ -212,11 +217,13 @@ sub _build_groups_to_contigs $groups_to_contigs{$group_name}{label} = $counter; $groups_to_contigs{$group_name}{comment} = ''; $groups_to_contigs{$group_name}{order} = $order_counter; + $groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter; if(@{$contig_groups} <= 2) { $groups_to_contigs{$group_name}{comment} = 'Investigate'; } $order_counter++; + $overall_counter++; } $counter++; } diff --git a/lib/Bio/PanGenome/Output/EmblGroups.pm b/lib/Bio/PanGenome/Output/EmblGroups.pm new file mode 100644 index 0000000..3288096 --- /dev/null +++ b/lib/Bio/PanGenome/Output/EmblGroups.pm @@ -0,0 +1,106 @@ +package Bio::PanGenome::Output::EmblGroups; + +# ABSTRACT: Create a tab/embl file with the features for drawing pretty pictures + +=head1 SYNOPSIS + +reate a tab/embl file with the features for drawing pretty pictures + use Bio::PanGenome::Output::EmblGroups; + + my $obj = Bio::PanGenome::Output::EmblGroups->new( + output_filename => 'group_statitics.csv', + annotate_groups_obj => $annotate_groups_obj, + analyse_groups_obj => $analyse_groups_obj + ); + $obj->create_file; + +=cut + +use Moose; +use POSIX; +use Bio::PanGenome::Exceptions; +use Bio::PanGenome::AnalyseGroups; +use Bio::PanGenome::AnnotateGroups; + +has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnnotateGroups', required => 1 ); +has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::PanGenome::AnalyseGroups', required => 1 ); +has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'core_accessory.tab' ); +has 'groups_to_contigs' => ( is => 'ro', isa => 'Maybe[HashRef]'); +has 'ordering_key' => ( is => 'ro', isa => 'Str', default => 'core_accessory_overall_order' ); + +has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' ); +has '_sorted_file_names' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_file_names' ); +has '_groups_to_files' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_files' ); + +sub _build__output_fh { + my ($self) = @_; + open( my $fh, '>', $self->output_filename ) + or Bio::PanGenome::Exceptions::CouldntWriteToFile->throw( + error => "Couldnt write output file:" . $self->output_filename ); + return $fh; +} + +sub _build__sorted_file_names { + my ($self) = @_; + my @sorted_file_names = sort( @{ $self->analyse_groups_obj->fasta_files } ); + return \@sorted_file_names; +} + +sub _build__groups_to_files { + my ($self) = @_; + my %groups_to_files; + for my $group ( @{ $self->annotate_groups_obj->_groups } ) { + my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group}; + my %filenames; + for my $gene_name ( @{$genes} ) { + my $filename = $self->analyse_groups_obj->_genes_to_file->{$gene_name}; + push( @{ $filenames{$filename} }, $gene_name ); + } + $groups_to_files{$group} = \%filenames; + } + return \%groups_to_files; +} + +sub _block { + my ( $self, $group ) = @_; + my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group}; + my @taxon_names_array; + my $annotated_group_name = $self->annotate_groups_obj->_groups_to_consensus_gene_names->{$group}; + + return '' if(!(defined($self->groups_to_contigs->{$annotated_group_name}) && defined($self->groups_to_contigs->{$annotated_group_name}->{$self->ordering_key}) )); + + my $coordindates = $self->groups_to_contigs->{$annotated_group_name}->{$self->ordering_key}; + + for my $filename ( @{ $self->_sorted_file_names } ) { + my $group_to_file_genes = $self->_groups_to_files->{$group}->{$filename}; + + if ( defined($group_to_file_genes) && @{$group_to_file_genes} > 0 ) { + my $filename_cpy = $filename; + $filename_cpy =~ s!\.gff\.proteome\.faa!!; + push( @taxon_names_array, $filename_cpy ); + next; + } + } + + my $taxon_names = join(" ",@taxon_names_array); + my $tab_file_entry = "FT variation $coordindates\n"; + $tab_file_entry .= "FT /colour=4\n"; + $tab_file_entry .= "FT /taxa=\"$taxon_names\"\n"; + + return $tab_file_entry; +} + +sub create_file { + my ($self) = @_; + + for my $group ( @{ $self->annotate_groups_obj->_groups }) + { + print { $self->_output_fh } $self->_block($group) + } + close( $self->_output_fh ); +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm index 51f6329..958dccf 100644 --- a/lib/Bio/PanGenome/PostAnalysis.pm +++ b/lib/Bio/PanGenome/PostAnalysis.pm @@ -18,6 +18,7 @@ use Bio::PanGenome::GroupStatistics; use Bio::PanGenome::Output::GroupsMultifastasNucleotide; use Bio::PanGenome::Output::NumberOfGroups; use Bio::PanGenome::OrderGenes; +use Bio::PanGenome::Output::EmblGroups; has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); has 'input_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); @@ -69,9 +70,8 @@ sub run { analyse_groups_obj => $analyse_groups_obj, gff_files => $self->input_files, ); - $order_genes_obj->groups_to_contigs; - - + $order_genes_obj->groups_to_contigs; + my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new( analyse_groups => $analyse_groups_obj, output_filename => $self->output_pan_geneome_filename @@ -90,6 +90,24 @@ sub run { group_statistics_obj => $group_statistics ); $gene_pool_expansion->create_output_files; + + my $core_accessory_tab_obj = Bio::PanGenome::Output::EmblGroups->new( + output_filename => 'core_accessory.tab', + annotate_groups_obj => $annotate_groups, + analyse_groups_obj => $analyse_groups_obj, + ordering_key => 'core_accessory_overall_order', + groups_to_contigs => $order_genes_obj->groups_to_contigs + ); + $core_accessory_tab_obj->create_file; + + my $accessory_tab_obj = Bio::PanGenome::Output::EmblGroups->new( + output_filename => 'accessory.tab', + annotate_groups_obj => $annotate_groups, + analyse_groups_obj => $analyse_groups_obj, + ordering_key => 'accessory_overall_order', + groups_to_contigs => $order_genes_obj->groups_to_contigs + ); + $accessory_tab_obj->create_file; if($self->output_multifasta_files) { diff --git a/t/Bio/PanGenome/OrderGenes.t b/t/Bio/PanGenome/OrderGenes.t index 0206962..d6b6051 100644 --- a/t/Bio/PanGenome/OrderGenes.t +++ b/t/Bio/PanGenome/OrderGenes.t @@ -22,6 +22,5 @@ ok(my $obj = Bio::PanGenome::OrderGenes->new( gff_files => ['t/data/query_1.gff','t/data/query_2.gff','t/data/query_3.gff'], ),'Initialise order genes object'); -print Dumper $obj->group_order; done_testing(); diff --git a/t/Bio/PanGenome/ReorderSpreadsheet.t b/t/Bio/PanGenome/ReorderSpreadsheet.t index 67cd898..9e1dc3e 100644 --- a/t/Bio/PanGenome/ReorderSpreadsheet.t +++ b/t/Bio/PanGenome/ReorderSpreadsheet.t @@ -20,12 +20,14 @@ ok( 'initialise reordering the spreadsheet' ); -is_deeply($obj->_column_mappings,[0,1,2,3,4,5,6,8,9,7],'Column mappings with fixed in same order and end columns ordered by tree file'); + + + +is_deeply($obj->_column_mappings,[0,1,2,3,4,5,6,7,8,9,10,11,13,12],'Column mappings with fixed in same order and end columns ordered by tree file'); ok( $obj->reorder_spreadsheet(), 'run the reorder method' ); ok( -e $obj->output_filename, 'check the output file exists' ); - is( read_file('t/data/reorder_isolates_expected_output.csv'), read_file('reorder_isolates_output.csv'), diff --git a/t/data/expected_group_statitics.csv b/t/data/expected_group_statitics.csv index 2bdead5..424ffde 100644 --- a/t/data/expected_group_statitics.csv +++ b/t/data/expected_group_statitics.csv @@ -1,8 +1,8 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa" -"hly","","Alpha-toxin","3","3","1","1_1","2_1","3_1" -"argF","","Ornithine carbamoyltransferase","2","2","1","1_3","","3_3" -"group_4","","","2","2","1","","2_4","3_4" -"speH","","hypothetical protein","2","2","1","1_2","2_2","" -"group_7","","Gonococcal growth inhibitor III","1","1","1","","2_7","" -"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","3_5" -"group_6","","Gonococcal growth inhibitor III","1","1","1","1_6","","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa" +"hly","","Alpha-toxin","3","3","1","","","","","","1_1","2_1","3_1" +"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","","3_3" +"group_4","","","2","2","1","","","","","","","2_4","3_4" +"speH","","hypothetical protein","2","2","1","","","","","","1_2","2_2","" +"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","2_7","" +"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","","3_5" +"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","","" diff --git a/t/data/expected_group_statitics_missing_genes.csv b/t/data/expected_group_statitics_missing_genes.csv index 138fb4b..949b607 100644 --- a/t/data/expected_group_statitics_missing_genes.csv +++ b/t/data/expected_group_statitics_missing_genes.csv @@ -1,8 +1,8 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa","t/data/query_4_missing_genes.fa" -"hly","","Alpha-toxin","4","4","1","1_1","2_1","3_1","4_1" -"argF","","Ornithine carbamoyltransferase","2","2","1","1_3","","3_3","" -"group_4","","","2","2","1","","2_4","3_4","" -"speH","","hypothetical protein","2","2","1","1_2","2_2","","" -"group_7","","Gonococcal growth inhibitor III","1","1","1","","2_7","","" -"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","3_5","" -"group_6","","Gonococcal growth inhibitor III","1","1","1","1_6","","","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","t/data/query_1.fa","t/data/query_2.fa","t/data/query_3.fa","t/data/query_4_missing_genes.fa" +"hly","","Alpha-toxin","4","4","1","","","","","","1_1","2_1","3_1","4_1" +"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","","3_3","" +"group_4","","","2","2","1","","","","","","","2_4","3_4","" +"speH","","hypothetical protein","2","2","1","","","","","","1_2","2_2","","" +"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","2_7","","" +"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","","3_5","" +"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","","","" diff --git a/t/data/expected_set_difference_common_set_statistics.csv b/t/data/expected_set_difference_common_set_statistics.csv index e171ced..c9e9fc8 100644 --- a/t/data/expected_set_difference_common_set_statistics.csv +++ b/t/data/expected_set_difference_common_set_statistics.csv @@ -1,4 +1,4 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa" -"group_1","","","3","3","1","1_1","2_1","3_1" -"group_3","","","2","2","1","1_3","","3_3" -"group_2","","","2","2","1","1_2","2_2","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa" +"group_1","","","3","3","1","","","","","","1_1","2_1","3_1" +"group_3","","","2","2","1","","","","","","1_3","","3_3" +"group_2","","","2","2","1","","","","","","1_2","2_2","" diff --git a/t/data/expected_set_difference_unique_set_one_statistics.csv b/t/data/expected_set_difference_unique_set_one_statistics.csv index 06be68e..e6eb346 100644 --- a/t/data/expected_set_difference_unique_set_one_statistics.csv +++ b/t/data/expected_set_difference_unique_set_one_statistics.csv @@ -1,2 +1,2 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa" -"group_6","","","1","1","1","1_6","","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa" +"group_6","","","1","1","1","","","","","","1_6","","" diff --git a/t/data/expected_set_difference_unique_set_two_statistics.csv b/t/data/expected_set_difference_unique_set_two_statistics.csv index 4c25fc4..1d06674 100644 --- a/t/data/expected_set_difference_unique_set_two_statistics.csv +++ b/t/data/expected_set_difference_unique_set_two_statistics.csv @@ -1,4 +1,4 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa" -"group_4","","","2","2","1","","2_4","3_4" -"group_5","","","1","1","1","","","3_5" -"group_7","","","1","1","1","","2_7","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1.fa.tmp.filtered.fa","query_2.fa.tmp.filtered.fa","query_3.fa.tmp.filtered.fa" +"group_4","","","2","2","1","","","","","","","2_4","3_4" +"group_5","","","1","1","1","","","","","","","","3_5" +"group_7","","","1","1","1","","","","","","","2_7","" diff --git a/t/data/overall_group_statisics.csv b/t/data/overall_group_statisics.csv index e776209..2881d7d 100644 --- a/t/data/overall_group_statisics.csv +++ b/t/data/overall_group_statisics.csv @@ -1,15 +1,15 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_6" -"argF","","Ornithine carbamoyltransferase","2","2","1","1_3","2_3","" -"group_5","","Gonococcal growth inhibitor III","2","2","1","1_6","2_7","" -"hly","","Alpha-toxin","2","2","1","1_1","2_1","" -"speH","","hypothetical protein","2","2","1","1_2","2_2","" -"group_13","","hypothetical protein","1","2","2","","","abc_00002 abc_00002" -"group_6","","","1","2","2","","abc_01705 abc_01705","" -"group_14","","hypothetical protein","1","2","2","","abc_00003 abc_00003","" -"group_9","","C4-dicarboxylate transporter/malic acid transport protein","1","2","2","","abc_00011 abc_00011","" -"group_8","","hypothetical protein","1","2","2","","abc_00010 abc_00010","" -"group_2","","superantigen-like protein","1","2","2","","abc_00004 abc_00004","" -"group_3","","superantigen-like protein","1","2","2","","abc_00006 abc_00006","" -"group_7","","","1","2","2","","abc_00013 abc_00013","" -"yfnB","","Putative HAD-hydrolase yfnB","1","2","2","","abc_00016 abc_00016","" -"arcC1","","Carbamate kinase 1","1","2","2","","abc_00008 abc_00008","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_2","query_6" +"argF","","Ornithine carbamoyltransferase","2","2","1","1","8",,,"","1_3","2_3","" +"group_5","","Gonococcal growth inhibitor III","2","2","1","1","3",,,"","1_6","2_7","" +"hly","","Alpha-toxin","2","2","1","1","1",,,"","1_1","2_1","" +"speH","","hypothetical protein","2","2","1","1","10",,,"","1_2","2_2","" +"group_13","","hypothetical protein","1","2","2","1","13",,,"","","","abc_00002 abc_00002" +"group_6","","","1","2","2","","","","","","","abc_01705 abc_01705","" +"group_14","","hypothetical protein","1","2","2","1","12",,,"","","abc_00003 abc_00003","" +"group_9","","C4-dicarboxylate transporter/malic acid transport protein","1","2","2","1","5",,,"","","abc_00011 abc_00011","" +"group_8","","hypothetical protein","1","2","2","1","6",,,"","","abc_00010 abc_00010","" +"group_2","","superantigen-like protein","1","2","2","1","11",,,"","","abc_00004 abc_00004","" +"group_3","","superantigen-like protein","1","2","2","1","9",,,"","","abc_00006 abc_00006","" +"group_7","","","1","2","2","1","4",,,"","","abc_00013 abc_00013","" +"yfnB","","Putative HAD-hydrolase yfnB","1","2","2","1","2",,,"","","abc_00016 abc_00016","" +"arcC1","","Carbamate kinase 1","1","2","2","1","7",,,"","","abc_00008 abc_00008","" diff --git a/t/data/reorder_isolates_expected_output.csv b/t/data/reorder_isolates_expected_output.csv index bccdd62..50af5b4 100644 --- a/t/data/reorder_isolates_expected_output.csv +++ b/t/data/reorder_isolates_expected_output.csv @@ -1,8 +1,8 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_3","query_4","query_2" -"hly","","Alpha-toxin","4","4","1","P","P","P","P" -"argF","","Ornithine carbamoyltransferase","2","2","1","P","P","","" -"group_4","","","2","2","1","","P","","P" -"speH","","hypothetical protein","2","2","1","P","","","P" -"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","P" -"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","P","","" -"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_3","query_2" +"hly","","Alpha-toxin","3","3","1","","","","","","1_1","3_1","2_1" +"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","3_3","" +"group_4","","","2","2","1","","","","","","","3_4","2_4" +"speH","","hypothetical protein","2","2","1","","","","","","1_2","","2_2" +"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","","2_7" +"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","3_5","" +"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","","" diff --git a/t/data/reorder_isolates_input.csv b/t/data/reorder_isolates_input.csv index db4212f..725f953 100644 --- a/t/data/reorder_isolates_input.csv +++ b/t/data/reorder_isolates_input.csv @@ -1,8 +1,8 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_3","query_4" -"hly","","Alpha-toxin","4","4","1","P","P","P","P" -"argF","","Ornithine carbamoyltransferase","2","2","1","P","","P","" -"group_4","","","2","2","1","","P","P","" -"speH","","hypothetical protein","2","2","1","P","P","","" -"group_7","","Gonococcal growth inhibitor III","1","1","1","","P","","" -"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","P","" -"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_2","query_3" +"hly","","Alpha-toxin","3","3","1","","","","","","1_1","2_1","3_1" +"argF","","Ornithine carbamoyltransferase","2","2","1","","","","","","1_3","","3_3" +"group_4","","","2","2","1","","","","","","","2_4","3_4" +"speH","","hypothetical protein","2","2","1","","","","","","1_2","2_2","" +"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","","","","2_7","" +"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","","","","","","3_5" +"group_6","","Gonococcal growth inhibitor III","1","1","1","","","","","","1_6","","" \ No newline at end of file