sanger-pathogens · andrewjpage · Nov 4, 2013 · Oct 31, 2013 · Oct 31, 2013 · Oct 31, 2013
diff --git a/lib/Bio/PanGenome.pm b/lib/Bio/PanGenome.pm
@@ -9,6 +9,7 @@ Create a pan genome
 =cut
 
 use Moose;
+use File::Copy;
 use Bio::PanGenome::ParallelAllAgainstAllBlast;
 use Bio::PanGenome::CombinedProteome;
 use Bio::PanGenome::External::Cdhit;
@@ -21,6 +22,7 @@ use Bio::PanGenome::Output::OneGenePerGroupFasta;
 use Bio::PanGenome::GroupStatistics;
 use Bio::PanGenome::Output::GroupsMultifastasNucleotide;
 use Bio::PanGenome::External::PostAnalysis;
+use Bio::PanGenome::FilterFullClusters;
 
 has 'fasta_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
 has 'input_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
@@ -42,17 +44,30 @@ sub run {
     my $output_cd_hit_filename        = '_clustered';
     my $output_blast_results_filename = '_blast_results';
     my $output_mcl_filename           = '_uninflated_mcl_groups';
+    my $output_filtered_clustered_fasta  = '_clustered_filtered.fa';
+    my $cdhit_groups = $output_combined_filename.'.groups';
+    unlink($cdhit_groups);
 
     my $combine_fasta_files = Bio::PanGenome::CombinedProteome->new(
         proteome_files  => $self->fasta_files,
         output_filename => $output_combined_filename,
     );
     $combine_fasta_files->create_combined_proteome_file;
 
+    my $number_of_input_files = @{$self->input_files};
+    $self->filter_complete_clusters($output_cd_hit_filename,1,    $output_combined_filename,$number_of_input_files,$output_filtered_clustered_fasta,1);
+
+    for( my $percent_match = 0.995; $percent_match > 0.90; $percent_match -= 0.005)
+    {
+      $self->filter_complete_clusters($output_cd_hit_filename,$percent_match,$output_combined_filename,$number_of_input_files,$output_filtered_clustered_fasta,0);
+    }
+
     my $cdhit_obj = Bio::PanGenome::External::Cdhit->new(
         input_file  => $output_combined_filename,
         job_runner  => $self->job_runner,
-        output_base => $output_cd_hit_filename
+        output_base => $output_cd_hit_filename,
+        _length_difference_cutoff    => 1,
+        _sequence_identity_threshold => 1 
     );
     $cdhit_obj->run();
 
@@ -92,6 +107,35 @@ sub run {
 
 }
 
+
+sub filter_complete_clusters
+{
+    my($self,$output_cd_hit_filename, $percentage_match,$output_combined_filename,$number_of_input_files,$output_filtered_clustered_fasta, $greater_than_or_equal) = @_;
+
+    my $cdhit_obj = Bio::PanGenome::External::Cdhit->new(
+        input_file  => $output_combined_filename,
+        job_runner  => $self->job_runner,
+        output_base => $output_cd_hit_filename,
+        _length_difference_cutoff => $percentage_match,
+        _sequence_identity_threshold => $percentage_match
+    );
+    $cdhit_obj->run();
+
+    my $filter_clusters = Bio::PanGenome::FilterFullClusters->new(
+        clusters_filename        => $cdhit_obj->clusters_filename,
+        fasta_file               =>  $output_cd_hit_filename,
+        number_of_input_files    => $number_of_input_files,
+        output_file => $output_filtered_clustered_fasta,
+        _greater_than_or_equal => $greater_than_or_equal,
+        cdhit_input_fasta_file => $output_combined_filename,
+        cdhit_output_fasta_file => $output_combined_filename.'.filtered',
+        output_groups_file      => $output_combined_filename.'.groups'
+      );
+
+    $filter_clusters->filter_complete_cluster_from_original_fasta();
+    move($filter_clusters->cdhit_output_fasta_file, $output_combined_filename);
+}
+
 no Moose;
 __PACKAGE__->meta->make_immutable;
 

diff --git a/lib/Bio/PanGenome/AnnotateGroups.pm b/lib/Bio/PanGenome/AnnotateGroups.pm
@@ -219,7 +219,7 @@ sub _split_groups_with_min_sub_group_size {
     my @groups = keys %{ $self->_groups_to_id_names };
     for my $group (@groups) {
         my $size_of_group = @{ $self->_groups_to_id_names->{$group} };
-        next if ( $size_of_group <= $self->_number_of_files );
+        next if ( $size_of_group <= ($self->_number_of_files) * 2);
         my $ids_grouped_by_gene_name = $self->_ids_grouped_by_gene_name_for_group($group);
 
         for my $gene_name ( keys %{$ids_grouped_by_gene_name} ) {
@@ -242,6 +242,8 @@ sub _split_groups_with_min_sub_group_size {
 
 sub _split_groups {
     my ($self) = @_;
+
+    $self->_split_groups_with_min_sub_group_size($self->_number_of_files);
 
     $self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
     $self->_ids_to_groups( $self->_generate__ids_to_groups );

diff --git a/lib/Bio/PanGenome/ClustersRole.pm b/lib/Bio/PanGenome/ClustersRole.pm
@@ -0,0 +1,70 @@
+package Bio::PanGenome::ClustersRole;
+# ABSTRACT: A role to read a clusters file from CD hit 
+
+=head1 SYNOPSIS
+
+A role to read a clusters file from CD hit 
+   with 'Bio::PanGenome::ClustersRole';
+
+=cut
+
+use Moose::Role;
+use Bio::PanGenome::Exceptions;
+
+has 'clusters_filename' => ( is => 'ro', isa => 'Str', required => 1 );
+has '_clustered_genes'  => ( is => 'ro',lazy => 1, builder => '_build__clustered_genes' );
+has '_clusters_fh'      => ( is => 'ro',lazy => 1, builder => '_build__clusters_fh' );
+
+sub _build__clusters_fh
+{
+  my($self) = @_;
+  open(my $fh, $self->clusters_filename) or Bio::PanGenome::Exceptions::FileNotFound->throw( error => 'Cant open file: ' . $self->clusters_filename );
+  return $fh;
+}
+
+sub _build__clustered_genes
+{
+  my($self) = @_;
+  my $fh = $self->_clusters_fh;
+  my %clustered_genes ;
+
+  my %raw_clusters;
+  my $current_cluster_name;
+  while(<$fh>)
+  {
+    my $line = $_;
+    if($line =~ /^>(.+)$/)
+    {
+      $current_cluster_name = $1;
+    }
+
+    #>Cluster 5
+    #0	4201aa, >6630_4#9_00008... *
+    #1	4201aa, >6631_1#23_00379... at 100.00%    
+
+    if($line =~ /[\d]+\t[\w]+, >(.+)\.\.\. (.+)$/)
+    {
+      my $gene_name = $1;
+      my $identity  = $2;
+
+      if($identity eq '*')
+      {
+        $raw_clusters{$current_cluster_name}{representative_gene_name} = $gene_name;
+      }
+      else
+      {
+        push(@{$raw_clusters{$current_cluster_name}{gene_names}}, $gene_name);
+      }
+    }
+  }
+
+  # iterate over the raw clusters and convert to a simple hash
+  for my $cluster_name (keys %raw_clusters)
+  {
+    $clustered_genes{$raw_clusters{$cluster_name}{representative_gene_name}} = $raw_clusters{$cluster_name}{gene_names};
+  }
+
+  return \%clustered_genes;
+}
+
+1;
diff --git a/lib/Bio/PanGenome/External/Blastp.pm b/lib/Bio/PanGenome/External/Blastp.pm
@@ -30,7 +30,7 @@ has 'blast_database'    => ( is => 'ro', isa => 'Str', required => 1 );
 has 'exec'              => ( is => 'ro', isa => 'Str', default  => 'blastp' );
 has '_evalue'           => ( is => 'ro', isa => 'Num', default  => 1E-6 );
 has '_num_threads'      => ( is => 'ro', isa => 'Int', default  => 1 );
-has '_max_target_seqs'  => ( is => 'ro', isa => 'Int', default  => 100 );
+has '_max_target_seqs'  => ( is => 'ro', isa => 'Int', default  => 1000 );
 has '_logging'          => ( is => 'ro', isa => 'Str', default  => '2> /dev/null' );
 has 'output_file'       => ( is => 'ro', isa => 'Str', default  => 'results.out' );
 
@@ -44,10 +44,10 @@ sub _command_to_run {
             '-db', $self->blast_database, 
             '-evalue', $self->_evalue,
             '-num_threads', $self->_num_threads,
-            '-out', $self->output_file,
             '-outfmt 6',
+            '-out', $self->output_file,
             '-max_target_seqs', $self->_max_target_seqs,
-            $self->_logging
+            $self->_logging,
         )
     );
 }

diff --git a/lib/Bio/PanGenome/External/Cdhit.pm b/lib/Bio/PanGenome/External/Cdhit.pm
@@ -25,8 +25,8 @@ has 'exec'                         => ( is => 'ro', isa => 'Str',  default  => '
 has '_number_of_threads'           => ( is => 'ro', isa => 'Int',  default  => 1 );
 has '_max_available_memory_in_mb'  => ( is => 'ro', isa => 'Int',  lazy => 1, builder => '_build__max_available_memory_in_mb' );
 has '_use_most_similar_clustering' => ( is => 'ro', isa => 'Bool', default  => 1 );
-has '_length_difference_cutoff'    => ( is => 'ro', isa => 'Num',  default  => 0.99 );
-has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num',  default  => 0.99 );
+has '_length_difference_cutoff'    => ( is => 'ro', isa => 'Num',  default  => 1 );
+has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num',  default  => 1 );
 has '_description_length'          => ( is => 'ro', isa => 'Int',  default  => 256 );
 has '_logging'          => ( is => 'ro', isa => 'Str', default  => '2> /dev/null' );
 

diff --git a/lib/Bio/PanGenome/External/Mcl.pm b/lib/Bio/PanGenome/External/Mcl.pm
@@ -27,6 +27,8 @@ has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default  => 'mcxdeblast' );
 has 'mcl_exec'        => ( is => 'ro', isa => 'Str', default  => 'mcl' );
 has 'output_file'     => ( is => 'ro', isa => 'Str', default  => 'output_groups' );
 
+has '_score'     => ( is => 'ro', isa => 'Str', default  => 'r' );
+
 has '_inflation_value' => ( is => 'ro', isa => 'Num', default => 1.5 );
 has '_logging'         => ( is => 'ro', isa => 'Str', default  => '2> /dev/null' );
 
@@ -61,7 +63,7 @@ sub _command_to_run {
     return join(
         " ",
         (
-            $self->mcxdeblast_exec, '-m9', 
+            $self->mcxdeblast_exec, '-m9', '--score='.$self->_score,
             '--line-mode=abc', $self->blast_results, 
             '|', $self->mcl_exec, '-', '--abc',
             '-I', $self->_inflation_value, '-o', $self->output_file, 

diff --git a/lib/Bio/PanGenome/FilterFullClusters.pm b/lib/Bio/PanGenome/FilterFullClusters.pm
@@ -0,0 +1,144 @@
+package Bio::PanGenome::FilterFullClusters;
+
+# ABSTRACT: Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
+
+=head1 SYNOPSIS
+
+Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
+   use Bio::PanGenome::FilterFullClusters;
+
+   my $obj = Bio::PanGenome::FilterFullClusters->new(
+       clusters_filename        => $cluster_file,
+       fasta_file           => $fasta_file,
+       number_of_input_files => 10,
+       output_file => 'filtered_file'
+     );
+   $obj->filter_full_clusters_from_fasta();
+
+=cut
+
+use Moose;
+use Bio::SeqIO;
+with 'Bio::PanGenome::ClustersRole';
+
+has 'number_of_input_files' => ( is => 'ro', isa => 'Int', required => 1 );
+has 'fasta_file'     => ( is => 'ro', isa => 'Str', required => 1 );
+has 'output_file'    => ( is => 'ro', isa => 'Str', required => 1 );
+has '_greater_than_or_equal' =>  ( is => 'ro', isa => 'Bool', default => 0 );
+has 'cdhit_input_fasta_file'    => ( is => 'ro', isa => 'Str', required => 1 );
+has 'cdhit_output_fasta_file'    => ( is => 'ro', isa => 'Str', required => 1 );
+
+has 'output_groups_file' => ( is => 'ro', isa => 'Str', required => 1 );
+
+has '_full_cluster_gene_names'    => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__full_cluster_gene_names' );
+has '_input_seqio'  => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
+has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' );
+
+has '_all_full_cluster_genes'    => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__all_full_cluster_genes' );
+
+sub _build__full_cluster_gene_names
+{
+  my($self) = @_;
+
+  my %full_cluster_gene_names ;
+
+  for my $gene_name (keys %{$self->_clustered_genes})
+  {
+
+    if($self->_greater_than_or_equal == 0)
+    {
+      if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} >= ($self->number_of_input_files -1))
+      {
+        $full_cluster_gene_names{$gene_name}++;
+      }
+    }
+    else
+    {
+      if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} == ($self->number_of_input_files -1))
+      {
+        $full_cluster_gene_names{$gene_name}++;
+      }
+    }
+  }
+
+  return \%full_cluster_gene_names;
+}
+
+sub _build__input_seqio {
+    my ($self) = @_;
+    return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
+}
+
+sub _build__output_seqio {
+    my ( $self, $chunk_number ) = @_;
+    return Bio::SeqIO->new( -file => ">".$self->output_file, -format => 'Fasta' );
+}
+
+sub _build__all_full_cluster_genes
+{
+   my ($self) = @_;
+   my %full_cluster_genes;
+
+   for my $gene_name (keys %{$self->_full_cluster_gene_names})
+   {
+     $full_cluster_genes{$gene_name}++;
+     for my $cluster_gene_name (@{$self->_clustered_genes->{$gene_name}})
+     {
+       $full_cluster_genes{$cluster_gene_name}++;
+     }
+   }
+   return \%full_cluster_genes;
+}
+
+
+sub _create_groups_file
+{
+  my ($self) = @_;
+  open(my $out_fh, '>>', $self->output_groups_file);
+
+  for my $gene_name (keys %{$self->_full_cluster_gene_names})
+  {
+    print {$out_fh} $gene_name."\t". join("\t", @{$self->_clustered_genes->{$gene_name}}). "\n";
+  }
+  close($out_fh);
+}
+
+
+
+sub filter_complete_cluster_from_original_fasta
+{
+  my ($self) = @_;
+
+  my $input_seq_io  = Bio::SeqIO->new( -file => $self->cdhit_input_fasta_file, -format => 'Fasta' );
+  my $output_seq_io = Bio::SeqIO->new( -file => ">".$self->cdhit_output_fasta_file, -format => 'Fasta' );
+
+  while ( my $input_seq = $input_seq_io->next_seq() ) 
+  {
+    unless(defined($self->_all_full_cluster_genes->{$input_seq->display_id}))
+    {
+      $output_seq_io->write_seq($input_seq);
+    }
+  }
+
+  $self->_create_groups_file;
+  return $self;
+}
+
+sub filter_full_clusters_from_fasta
+{
+    my ($self) = @_;
+
+    while ( my $input_seq = $self->_input_seqio->next_seq() ) {
+      unless(defined($self->_full_cluster_gene_names->{$input_seq->display_id}))
+      {
+        $self->_output_seqio->write_seq($input_seq);
+      }
+    }
+    return $self;
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
+