Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prefilter optimisation #47

Merged
46 changes: 45 additions & 1 deletion lib/Bio/PanGenome.pm
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Create a pan genome
=cut

use Moose;
use File::Copy;
use Bio::PanGenome::ParallelAllAgainstAllBlast;
use Bio::PanGenome::CombinedProteome;
use Bio::PanGenome::External::Cdhit;
Expand All @@ -21,6 +22,7 @@ use Bio::PanGenome::Output::OneGenePerGroupFasta;
use Bio::PanGenome::GroupStatistics;
use Bio::PanGenome::Output::GroupsMultifastasNucleotide;
use Bio::PanGenome::External::PostAnalysis;
use Bio::PanGenome::FilterFullClusters;

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 );
has 'input_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 );
Expand All @@ -42,17 +44,30 @@ sub run {
my $output_cd_hit_filename = '_clustered';
my $output_blast_results_filename = '_blast_results';
my $output_mcl_filename = '_uninflated_mcl_groups';
my $output_filtered_clustered_fasta = '_clustered_filtered.fa';
my $cdhit_groups = $output_combined_filename.'.groups';
unlink($cdhit_groups);

my $combine_fasta_files = Bio::PanGenome::CombinedProteome->new(
proteome_files => $self->fasta_files,
output_filename => $output_combined_filename,
);
$combine_fasta_files->create_combined_proteome_file;

my $number_of_input_files = @{$self->input_files};
$self->filter_complete_clusters($output_cd_hit_filename,1, $output_combined_filename,$number_of_input_files,$output_filtered_clustered_fasta,1);

for( my $percent_match = 0.995; $percent_match > 0.90; $percent_match -= 0.005)
{
$self->filter_complete_clusters($output_cd_hit_filename,$percent_match,$output_combined_filename,$number_of_input_files,$output_filtered_clustered_fasta,0);
}

my $cdhit_obj = Bio::PanGenome::External::Cdhit->new(
input_file => $output_combined_filename,
job_runner => $self->job_runner,
output_base => $output_cd_hit_filename
output_base => $output_cd_hit_filename,
_length_difference_cutoff => 1,
_sequence_identity_threshold => 1
);
$cdhit_obj->run();

Expand Down Expand Up @@ -92,6 +107,35 @@ sub run {

}


sub filter_complete_clusters
{
my($self,$output_cd_hit_filename, $percentage_match,$output_combined_filename,$number_of_input_files,$output_filtered_clustered_fasta, $greater_than_or_equal) = @_;

my $cdhit_obj = Bio::PanGenome::External::Cdhit->new(
input_file => $output_combined_filename,
job_runner => $self->job_runner,
output_base => $output_cd_hit_filename,
_length_difference_cutoff => $percentage_match,
_sequence_identity_threshold => $percentage_match
);
$cdhit_obj->run();

my $filter_clusters = Bio::PanGenome::FilterFullClusters->new(
clusters_filename => $cdhit_obj->clusters_filename,
fasta_file => $output_cd_hit_filename,
number_of_input_files => $number_of_input_files,
output_file => $output_filtered_clustered_fasta,
_greater_than_or_equal => $greater_than_or_equal,
cdhit_input_fasta_file => $output_combined_filename,
cdhit_output_fasta_file => $output_combined_filename.'.filtered',
output_groups_file => $output_combined_filename.'.groups'
);

$filter_clusters->filter_complete_cluster_from_original_fasta();
move($filter_clusters->cdhit_output_fasta_file, $output_combined_filename);
}

no Moose;
__PACKAGE__->meta->make_immutable;

Expand Down
4 changes: 3 additions & 1 deletion lib/Bio/PanGenome/AnnotateGroups.pm
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ sub _split_groups_with_min_sub_group_size {
my @groups = keys %{ $self->_groups_to_id_names };
for my $group (@groups) {
my $size_of_group = @{ $self->_groups_to_id_names->{$group} };
next if ( $size_of_group <= $self->_number_of_files );
next if ( $size_of_group <= ($self->_number_of_files) * 2);
my $ids_grouped_by_gene_name = $self->_ids_grouped_by_gene_name_for_group($group);

for my $gene_name ( keys %{$ids_grouped_by_gene_name} ) {
Expand All @@ -242,6 +242,8 @@ sub _split_groups_with_min_sub_group_size {

sub _split_groups {
my ($self) = @_;

$self->_split_groups_with_min_sub_group_size($self->_number_of_files);

$self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
$self->_ids_to_groups( $self->_generate__ids_to_groups );
Expand Down
70 changes: 70 additions & 0 deletions lib/Bio/PanGenome/ClustersRole.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package Bio::PanGenome::ClustersRole;
# ABSTRACT: A role to read a clusters file from CD hit

=head1 SYNOPSIS

A role to read a clusters file from CD hit
with 'Bio::PanGenome::ClustersRole';

=cut

use Moose::Role;
use Bio::PanGenome::Exceptions;

has 'clusters_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has '_clustered_genes' => ( is => 'ro',lazy => 1, builder => '_build__clustered_genes' );
has '_clusters_fh' => ( is => 'ro',lazy => 1, builder => '_build__clusters_fh' );

sub _build__clusters_fh
{
my($self) = @_;
open(my $fh, $self->clusters_filename) or Bio::PanGenome::Exceptions::FileNotFound->throw( error => 'Cant open file: ' . $self->clusters_filename );
return $fh;
}

sub _build__clustered_genes
{
my($self) = @_;
my $fh = $self->_clusters_fh;
my %clustered_genes ;

my %raw_clusters;
my $current_cluster_name;
while(<$fh>)
{
my $line = $_;
if($line =~ /^>(.+)$/)
{
$current_cluster_name = $1;
}

#>Cluster 5
#0 4201aa, >6630_4#9_00008... *
#1 4201aa, >6631_1#23_00379... at 100.00%

if($line =~ /[\d]+\t[\w]+, >(.+)\.\.\. (.+)$/)
{
my $gene_name = $1;
my $identity = $2;

if($identity eq '*')
{
$raw_clusters{$current_cluster_name}{representative_gene_name} = $gene_name;
}
else
{
push(@{$raw_clusters{$current_cluster_name}{gene_names}}, $gene_name);
}
}
}

# iterate over the raw clusters and convert to a simple hash
for my $cluster_name (keys %raw_clusters)
{
$clustered_genes{$raw_clusters{$cluster_name}{representative_gene_name}} = $raw_clusters{$cluster_name}{gene_names};
}

return \%clustered_genes;
}

1;
6 changes: 3 additions & 3 deletions lib/Bio/PanGenome/External/Blastp.pm
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ has 'blast_database' => ( is => 'ro', isa => 'Str', required => 1 );
has 'exec' => ( is => 'ro', isa => 'Str', default => 'blastp' );
has '_evalue' => ( is => 'ro', isa => 'Num', default => 1E-6 );
has '_num_threads' => ( is => 'ro', isa => 'Int', default => 1 );
has '_max_target_seqs' => ( is => 'ro', isa => 'Int', default => 100 );
has '_max_target_seqs' => ( is => 'ro', isa => 'Int', default => 1000 );
has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' );
has 'output_file' => ( is => 'ro', isa => 'Str', default => 'results.out' );

Expand All @@ -44,10 +44,10 @@ sub _command_to_run {
'-db', $self->blast_database,
'-evalue', $self->_evalue,
'-num_threads', $self->_num_threads,
'-out', $self->output_file,
'-outfmt 6',
'-out', $self->output_file,
'-max_target_seqs', $self->_max_target_seqs,
$self->_logging
$self->_logging,
)
);
}
Expand Down
4 changes: 2 additions & 2 deletions lib/Bio/PanGenome/External/Cdhit.pm
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ has 'exec' => ( is => 'ro', isa => 'Str', default => '
has '_number_of_threads' => ( is => 'ro', isa => 'Int', default => 1 );
has '_max_available_memory_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__max_available_memory_in_mb' );
has '_use_most_similar_clustering' => ( is => 'ro', isa => 'Bool', default => 1 );
has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 0.99 );
has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 0.99 );
has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 1 );
has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 1 );
has '_description_length' => ( is => 'ro', isa => 'Int', default => 256 );
has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' );

Expand Down
4 changes: 3 additions & 1 deletion lib/Bio/PanGenome/External/Mcl.pm
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'ro', isa => 'Str', default => 'mcl' );
has 'output_file' => ( is => 'ro', isa => 'Str', default => 'output_groups' );

has '_score' => ( is => 'ro', isa => 'Str', default => 'r' );

has '_inflation_value' => ( is => 'ro', isa => 'Num', default => 1.5 );
has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' );

Expand Down Expand Up @@ -61,7 +63,7 @@ sub _command_to_run {
return join(
" ",
(
$self->mcxdeblast_exec, '-m9',
$self->mcxdeblast_exec, '-m9', '--score='.$self->_score,
'--line-mode=abc', $self->blast_results,
'|', $self->mcl_exec, '-', '--abc',
'-I', $self->_inflation_value, '-o', $self->output_file,
Expand Down
144 changes: 144 additions & 0 deletions lib/Bio/PanGenome/FilterFullClusters.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
package Bio::PanGenome::FilterFullClusters;

# ABSTRACT: Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters

=head1 SYNOPSIS

Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
use Bio::PanGenome::FilterFullClusters;

my $obj = Bio::PanGenome::FilterFullClusters->new(
clusters_filename => $cluster_file,
fasta_file => $fasta_file,
number_of_input_files => 10,
output_file => 'filtered_file'
);
$obj->filter_full_clusters_from_fasta();

=cut

use Moose;
use Bio::SeqIO;
with 'Bio::PanGenome::ClustersRole';

has 'number_of_input_files' => ( is => 'ro', isa => 'Int', required => 1 );
has 'fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_file' => ( is => 'ro', isa => 'Str', required => 1 );
has '_greater_than_or_equal' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'cdhit_input_fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'cdhit_output_fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );

has 'output_groups_file' => ( is => 'ro', isa => 'Str', required => 1 );

has '_full_cluster_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__full_cluster_gene_names' );
has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' );

has '_all_full_cluster_genes' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__all_full_cluster_genes' );

sub _build__full_cluster_gene_names
{
my($self) = @_;

my %full_cluster_gene_names ;

for my $gene_name (keys %{$self->_clustered_genes})
{

if($self->_greater_than_or_equal == 0)
{
if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} >= ($self->number_of_input_files -1))
{
$full_cluster_gene_names{$gene_name}++;
}
}
else
{
if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} == ($self->number_of_input_files -1))
{
$full_cluster_gene_names{$gene_name}++;
}
}
}

return \%full_cluster_gene_names;
}

sub _build__input_seqio {
my ($self) = @_;
return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
}

sub _build__output_seqio {
my ( $self, $chunk_number ) = @_;
return Bio::SeqIO->new( -file => ">".$self->output_file, -format => 'Fasta' );
}

sub _build__all_full_cluster_genes
{
my ($self) = @_;
my %full_cluster_genes;

for my $gene_name (keys %{$self->_full_cluster_gene_names})
{
$full_cluster_genes{$gene_name}++;
for my $cluster_gene_name (@{$self->_clustered_genes->{$gene_name}})
{
$full_cluster_genes{$cluster_gene_name}++;
}
}
return \%full_cluster_genes;
}


sub _create_groups_file
{
my ($self) = @_;
open(my $out_fh, '>>', $self->output_groups_file);

for my $gene_name (keys %{$self->_full_cluster_gene_names})
{
print {$out_fh} $gene_name."\t". join("\t", @{$self->_clustered_genes->{$gene_name}}). "\n";
}
close($out_fh);
}



sub filter_complete_cluster_from_original_fasta
{
my ($self) = @_;

my $input_seq_io = Bio::SeqIO->new( -file => $self->cdhit_input_fasta_file, -format => 'Fasta' );
my $output_seq_io = Bio::SeqIO->new( -file => ">".$self->cdhit_output_fasta_file, -format => 'Fasta' );

while ( my $input_seq = $input_seq_io->next_seq() )
{
unless(defined($self->_all_full_cluster_genes->{$input_seq->display_id}))
{
$output_seq_io->write_seq($input_seq);
}
}

$self->_create_groups_file;
return $self;
}

sub filter_full_clusters_from_fasta
{
my ($self) = @_;

while ( my $input_seq = $self->_input_seqio->next_seq() ) {
unless(defined($self->_full_cluster_gene_names->{$input_seq->display_id}))
{
$self->_output_seqio->write_seq($input_seq);
}
}
return $self;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;

Loading