sanger-pathogens · andrewjpage · Mar 6, 2015 · Mar 6, 2015
diff --git a/lib/Bio/PanGenome.pm b/lib/Bio/PanGenome.pm
@@ -29,7 +29,7 @@ has 'fasta_files'                 => ( is => 'rw', isa => 'ArrayRef', required =
 has 'input_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
 has 'output_filename'             => ( is => 'rw', isa => 'Str',      default  => 'clustered_proteins' );
 has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str',      default  => 'pan_genome.fa' );
-has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',      default  => 'group_statisics.csv' );
+has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',      default  => 'gene_presence_absence.csv' );
 has 'job_runner'                  => ( is => 'rw', isa => 'Str',      default  => 'LSF' );
 has 'cpus'                        => ( is => 'ro', isa => 'Int',      default => 1 );
 has 'makeblastdb_exec'            => ( is => 'rw', isa => 'Str',      default  => 'makeblastdb' );

diff --git a/lib/Bio/PanGenome/CommandLine/PanGenomeCoreAlignment.pm b/lib/Bio/PanGenome/CommandLine/PanGenomeCoreAlignment.pm
@@ -20,7 +20,7 @@ has 'script_name' => ( is => 'ro', isa => 'Str',      required => 1 );
 has 'help'        => ( is => 'rw', isa => 'Bool',     default  => 0 );
 
 has 'multifasta_base_directory' => ( is => 'rw', isa => 'Str', default => 'pan_genome_sequences' );
-has 'spreadsheet_filename'      => ( is => 'rw', isa => 'Str', default => 'group_statisics.csv' );
+has 'spreadsheet_filename'      => ( is => 'rw', isa => 'Str', default => 'gene_presence_absence.csv' );
 has 'output_filename'           => ( is => 'rw', isa => 'Str', default => 'core_gene_alignment.aln' );
 has '_error_message'            => ( is => 'rw', isa => 'Str' );
 
@@ -94,7 +94,7 @@ sub usage_text {
     pan_genome_core_alignment
 
     # Specify the directory containing the multifastas (-m), the spreadsheet (-s) and an output file name (-o)
-    pan_genome_core_alignment -m pan_genome_sequences -s group_statisics.csv -o output_alignment.aln
+    pan_genome_core_alignment -m pan_genome_sequences -s gene_presence_absence.csv -o output_alignment.aln
 
     # This help message
     pan_genome_core_alignment -h

diff --git a/lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm b/lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm
@@ -24,7 +24,7 @@ has 'fasta_files'                 => ( is => 'rw', isa => 'Str' );
 has 'input_files'                 => ( is => 'rw', isa => 'Str');
 has 'output_filename'             => ( is => 'rw', isa => 'Str',  default  => 'clustered_proteins' );
 has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str',  default  => 'pan_genome.fa' );
-has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',  default  => 'group_statisics.csv' );
+has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',  default  => 'gene_presence_absence.csv' );
 has 'output_multifasta_files'     => ( is => 'rw', isa => 'Bool', default  => 0 );
 has 'clusters_filename'           => ( is => 'rw', isa => 'Str' );
 has 'job_runner'                  => ( is => 'rw', isa => 'Str',  default  => 'LSF' );

diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm
@@ -23,7 +23,7 @@ has 'fasta_files'                 => ( is => 'rw', isa => 'ArrayRef', required =
 has 'input_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
 has 'output_filename'             => ( is => 'rw', isa => 'Str',      default  => 'clustered_proteins' );
 has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str',      default  => 'pan_genome.fa' );
-has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',      default  => 'group_statisics.csv' );
+has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',      default  => 'gene_presence_absence.csv' );
 has 'output_multifasta_files'     => ( is => 'ro', isa => 'Bool',     default  => 0 );
 
 has 'clusters_filename'           => ( is => 'rw', isa => 'Str',      required => 1 );

diff --git a/t/Bio/PanGenome/CommandLine/CreatePanGenome.t b/t/Bio/PanGenome/CommandLine/CreatePanGenome.t
@@ -25,15 +25,15 @@ system('touch empty_file');
       ' -j Local   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
         [ 'clustered_proteins', 't/data/clustered_proteins_pan_genome' ],
       ' -j Local   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
-        [ 'group_statisics.csv', 't/data/overall_group_statisics.csv' ],     
+        [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],     
       ' -t 1 -j Local   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
-        [ 'group_statisics.csv', 't/data/overall_group_statisics.csv' ],
+        [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
       ' -j Parallel   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' =>
         [ 'clustered_proteins', 't/data/clustered_proteins_pan_genome' ],
       ' -j Parallel   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
-        [ 'group_statisics.csv', 't/data/overall_group_statisics.csv' ],     
+        [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],     
       ' -t 1 -j Parallel   t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff    ' =>
-        [ 'group_statisics.csv', 't/data/overall_group_statisics.csv' ],
+        [ 'gene_presence_absence.csv', 't/data/overall_gene_presence_absence.csv' ],
       '-h' =>
         [ 'empty_file', 't/data/empty_file' ],
 );
@@ -71,7 +71,7 @@ sub cleanup_files
   unlink('example_1.faa.tmp.filtered.fa');
   unlink('example_2.faa.tmp.filtered.fa');
   unlink('example_3.faa.tmp.filtered.fa');
-  unlink('group_statisics.csv');
+  unlink('gene_presence_absence.csv');
   unlink('query_1.gff.proteome.faa');
   unlink('query_2.gff.proteome.faa');
   unlink('query_3.gff.proteome.faa');

diff --git a/t/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.t b/t/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.t
@@ -21,7 +21,7 @@ local $ENV{PATH} = "$ENV{PATH}:./bin";
 system('cp t/data/post_analysis/* .');
 system('touch empty_file');
 my %scripts_and_expected_files = (
-       '-o clustered_proteins -p pan_genome.fa -s group_statisics.csv -c _clustered.clstr  -i _gff_files -f _fasta_files  -j Local --dont_create_rplots' =>
+       '-o clustered_proteins -p pan_genome.fa -s gene_presence_absence.csv -c _clustered.clstr  -i _gff_files -f _fasta_files  -j Local --dont_create_rplots' =>
        [ 'clustered_proteins', 't/data/clustered_proteins_post_analysis' ], 
        '-h' =>
          [ 'empty_file', 't/data/empty_file' ],   
@@ -33,7 +33,7 @@ ok( -e 'number_of_unique_genes.Rtab', 'number_of_unique_genes.Rtab exists');
 ok( -e 'number_of_new_genes.Rtab', 'number_of_new_genes exists');
 ok( -e 'number_of_genes_in_pan_genome.Rtab', 'number_of_genes_in_pan_genome exists');
 ok( -e 'number_of_conserved_genes.Rtab','number_of_conserved_genes');
-ok( -e 'group_statisics.csv', 'group_statisics exists');
+ok( -e 'gene_presence_absence.csv', 'gene_presence_absence exists');
 ok( -e 'core_accessory.tab', 'core_accessory.tab exists');
 ok( -e 'core_accessory.header.embl','core_accessory.header.embl exists');
 ok( -e 'accessory.tab','accessory.tab exists');
@@ -49,7 +49,7 @@ cleanup_files();
 system('cp t/data/post_analysis/* .');
 system('touch empty_file');
 %scripts_and_expected_files = (
-       '-t 1 -o clustered_proteins -p pan_genome.fa -s group_statisics.csv -c _clustered.clstr  -i _gff_files -f _fasta_files  -j Local --dont_create_rplots' =>
+       '-t 1 -o clustered_proteins -p pan_genome.fa -s gene_presence_absence.csv -c _clustered.clstr  -i _gff_files -f _fasta_files  -j Local --dont_create_rplots' =>
        [ 'clustered_proteins', 't/data/clustered_proteins_post_analysis' ], 
        '-h' =>
          [ 'empty_file', 't/data/empty_file' ],   
@@ -61,7 +61,7 @@ ok( -e 'number_of_unique_genes.Rtab', 'number_of_unique_genes.Rtab exists');
 ok( -e 'number_of_new_genes.Rtab', 'number_of_new_genes exists');
 ok( -e 'number_of_genes_in_pan_genome.Rtab', 'number_of_genes_in_pan_genome exists');
 ok( -e 'number_of_conserved_genes.Rtab','number_of_conserved_genes');
-ok( -e 'group_statisics.csv', 'group_statisics exists');
+ok( -e 'gene_presence_absence.csv', 'gene_presence_absence exists');
 ok( -e 'core_accessory.tab', 'core_accessory.tab exists');
 ok( -e 'core_accessory.header.embl','core_accessory.header.embl exists');
 ok( -e 'accessory.tab','accessory.tab exists');
@@ -93,7 +93,7 @@ sub cleanup_files
   unlink('accessory.tab');
   unlink('core_accessory.header.embl');
   unlink('core_accessory.tab');
-  unlink('group_statisics.csv');
+  unlink('gene_presence_absence.csv');
   unlink('number_of_unique_genes.Rtab');
   unlink('number_of_new_genes.Rtab');
   unlink('number_of_genes_in_pan_genome.Rtab');

diff --git a/t/Bio/PanGenome/CommandLine/QueryPanGenome.t b/t/Bio/PanGenome/CommandLine/QueryPanGenome.t
@@ -63,7 +63,7 @@ mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files
 unlink('set_difference_unique_set_two')                if ( -e 'set_difference_unique_set_two' );
 unlink('set_difference_common_set')                    if ( -e 'set_difference_common_set' );
 unlink('pan_genome_results_group_5.fa')                if ( -e 'pan_genome_results_group_5.fa' );
-unlink('group_statisics.csv')                          if ( -e 'group_statisics.csv' );
+unlink('gene_presence_absence.csv')                    if ( -e 'gene_presence_absence.csv' );
 unlink('set_difference_unique_set_two_statistics.csv') if ( -e 'set_difference_unique_set_two_statistics.csv' );
 unlink('set_difference_unique_set_one_statistics.csv') if ( -e 'set_difference_unique_set_one_statistics.csv' );
 unlink('set_difference_common_set_statistics.csv')     if ( -e 'set_difference_common_set_statistics.csv' );

diff --git a/t/data/overall_gene_presence_absence.csv b/t/data/overall_gene_presence_absence.csv
@@ -0,0 +1,22 @@
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_2","query_5"
+"hly","","Alpha-toxin","2","2","1","1","13",,,"","1_1","2_1",""
+"group_14","","Gonococcal growth inhibitor III","2","2","1","1","2",,,"","1_6","2_7",""
+"speH","","hypothetical protein","2","2","1","1","10",,,"","1_2","2_2",""
+"argF","","Ornithine carbamoyltransferase","2","2","1","1","8",,,"","1_3","2_3",""
+"group_10","","hypothetical protein","1","2","2","1","6",,,"","","abc_00010	abc_00010",""
+"group_11","","C4-dicarboxylate transporter/malic acid transport protein","1","2","2","1","5",,,"","","abc_00011	abc_00011",""
+"group_12","","hypothetical protein","1","2","2","1","4",,,"","","abc_00012	abc_00012",""
+"group_13","","Gonococcal growth inhibitor III","1","2","2","1","3",,,"","","abc_00014	abc_00014",""
+"yfnB","","Putative HAD-hydrolase yfnB","1","2","2","1","1",,,"","","abc_00016	abc_00016",""
+"group_16","","hypothetical protein","1","1","1","2","3","1","6","","","","abc_50002"
+"group_17","argF","Ornithine carbamoyltransferase","1","1","1","2","4","1","5","","","","3_3"
+"group_18","","hypothetical protein","1","1","1","2","5","1","4","","","","abc_50010"
+"group_19","","hypothetical protein","1","1","1","2","6","1","3","","","","abc_50012"
+"group_2","","hypothetical protein","1","2","2","1","14",,,"","","abc_00002	abc_00002",""
+"group_20","","Gonococcal growth inhibitor III","1","1","1","2","2","1","2","","","","abc_50014"
+"group_21","yfnB","Putative HAD-hydrolase yfnB","1","1","1","2","1","1","1","","","","3_5"
+"group_3","","hypothetical protein","1","2","2","1","12",,,"","","abc_00003	abc_00003",""
+"group_4","","superantigen-like protein","1","2","2","1","11",,,"","","abc_00004	abc_00004",""
+"group_6","","superantigen-like protein","1","2","2","1","9",,,"","","abc_00006	abc_00006",""
+"arcC1","","Carbamate kinase 1","1","2","2","1","7",,,"","","abc_00008	abc_00008",""
+"group_9","","","1","2","2","","","","","","","abc_01705	abc_01705",""
diff --git a/t/data/post_analysis_expected/gene_presence_absence.csv b/t/data/post_analysis_expected/gene_presence_absence.csv
@@ -0,0 +1,14 @@
+"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragement","Accessory Order with Fragment","QC","query_1","query_2","query_6"
+"group_5","","Gonococcal growth inhibitor III","2","2","1","1","2",,,"","1_6","2_7",""
+"hly","","Alpha-toxin","2","2","1","1","12",,,"","1_1","2_1",""
+"speH","","hypothetical protein","2","2","1","1","8",,,"","1_2","2_2",""
+"argF","","Ornithine carbamoyltransferase","2","2","1","1","6",,,"","1_3","2_3",""
+"group_12","","hypothetical protein","1","2","2","1","11",,,"","","","abc_00002	abc_00002"
+"group_13","","hypothetical protein","1","2","2","1","10",,,"","","abc_00003	abc_00003",""
+"group_6","","","1","2","2","","","","","","","abc_01705	abc_01705",""
+"group_8","","C4-dicarboxylate transporter/malic acid transport protein","1","2","2","1","3",,,"","","abc_00011	abc_00011",""
+"group_2","","superantigen-like protein","1","2","2","1","9",,,"","","abc_00004	abc_00004",""
+"group_3","","superantigen-like protein","1","2","2","1","7",,,"","","abc_00006	abc_00006",""
+"yfnB","","Putative HAD-hydrolase yfnB","1","2","2","1","1",,,"","","abc_00016	abc_00016",""
+"group_7","","hypothetical protein","1","2","2","1","4",,,"","","abc_00010	abc_00010",""
+"arcC1","","Carbamate kinase 1","1","2","2","1","5",,,"","","abc_00008	abc_00008",""