Skip to content

Commit

Permalink
output the sequence type instead of gene id
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewjpage committed Mar 8, 2013
1 parent e0f1247 commit 149f918
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 14 deletions.
3 changes: 0 additions & 3 deletions bin/get_sequence_type
Expand Up @@ -90,9 +90,6 @@ get_sequence_type -s "Clostridium difficile" -y my_assembly.fa
# Specify an output directory
get_sequence_type -s "Clostridium difficile" -o /path/to/results my_assembly.fa
# Instead of flagging multiple 100% matches as contamination, output the name of the alternative sequence
get_sequence_type --show_alternatives -s "Clostridium difficile" my_assembly.fa
# Match against multiple MLST databases
get_sequence_type -s "Clostridium botulinum, Clostridium difficile" my_assembly.fa
Expand Down
26 changes: 25 additions & 1 deletion lib/Bio/MLST/CompareAlleles.pm
Expand Up @@ -57,6 +57,7 @@ use Bio::Perl;
use Bio::MLST::Blast::Database;
use Bio::MLST::Blast::BlastN;
use Bio::MLST::Types;
use Bio::MLST::SequenceType;

has 'sequence_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
has 'allele_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
Expand All @@ -73,6 +74,7 @@ has 'contamination' => ( is => 'rw', isa => 'Bool', default => 0);
has 'contamination_sequence_names' => ( is => 'rw', isa => 'Maybe[ArrayRef]' );
has 'new_st' => ( is => 'rw', isa => 'Bool', default => 0);
has '_absent_loci' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__absent_loci' );
has 'profiles_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );

sub _build__blast_db_location
{
Expand Down Expand Up @@ -157,7 +159,7 @@ sub _build_matching_sequences
# more than 1 allele has 100% match
if(defined($top_blast_hit{contamination}))
{
$self->contamination(1);
$self->contamination(1);
$self->contamination_sequence_names($top_blast_hit{contamination});
}

Expand Down Expand Up @@ -191,6 +193,28 @@ sub _build_matching_sequences
return \%matching_sequence_names;
}

sub _translate_contamination_names_into_sequence_types
{
my ($self, $contamination_names) = @_;
my @contamination_sequence_types;

for my $allele_number (@{ $contamination_names})
{
my $st = Bio::MLST::SequenceType->new(
profiles_filename => $self->profiles_filename,
sequence_names => [$allele_number]
);

if(defined($st->sequence_type()))
{
push(@contamination_sequence_types, $st->sequence_type());
}
}

$self->contamination_sequence_names(\@contamination_sequence_types);
}


sub _get_blast_hit_sequence
{
my ($self, $contig_name, $start, $end, $word_size, $reverse_complement) = @_;
Expand Down
3 changes: 2 additions & 1 deletion lib/Bio/MLST/ProcessFasta.pm
Expand Up @@ -77,7 +77,8 @@ sub _build__compare_alleles
sequence_filename => $self->fasta_file,
allele_filenames => $self->_search_results->allele_filenames(),
makeblastdb_exec => $self->makeblastdb_exec,
blastn_exec => $self->blastn_exec
blastn_exec => $self->blastn_exec,
profiles_filename => $self->_search_results->profiles_filename()
);

my $output_fasta = Bio::MLST::OutputFasta->new(
Expand Down
12 changes: 8 additions & 4 deletions t/Output/SpreadsheetRow.t
Expand Up @@ -12,7 +12,8 @@ BEGIN {

my $compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs.fa',
allele_filenames => ['t/data/adk.tfa','t/data/purA.tfa','t/data/recA.tfa']
allele_filenames => ['t/data/adk.tfa','t/data/purA.tfa','t/data/recA.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
);
my $sequence_type_obj = Bio::MLST::SequenceType->new(
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
Expand Down Expand Up @@ -49,7 +50,8 @@ $compare_alleles->new_st(0);
# no match for adk
$compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs.fa',
allele_filenames => ['t/data/adk_less_than_95_percent.tfa','t/data/purA.tfa','t/data/recA.tfa']
allele_filenames => ['t/data/adk_less_than_95_percent.tfa','t/data/purA.tfa','t/data/recA.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
);
$sequence_type_obj = Bio::MLST::SequenceType->new(
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
Expand All @@ -65,7 +67,8 @@ is_deeply($spreadsheet_row_obj->genomic_row, ['contigs', 4,'Unknown','','U',
# near match
$compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs.fa',
allele_filenames => ['t/data/adk_contamination.tfa','t/data/purA.tfa','t/data/recA.tfa']
allele_filenames => ['t/data/adk_contamination.tfa','t/data/purA.tfa','t/data/recA.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
);
$sequence_type_obj = Bio::MLST::SequenceType->new(
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
Expand All @@ -80,7 +83,8 @@ is_deeply($spreadsheet_row_obj->genomic_row, ['contigs', 4,'Novel ST','Contamina

$compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs_novel.fa',
allele_filenames => ['t/data/adk.tfa','t/data/purA.tfa','t/data/recA.tfa']
allele_filenames => ['t/data/adk.tfa','t/data/purA.tfa','t/data/recA.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
);
$sequence_type_obj = Bio::MLST::SequenceType->new(
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
Expand Down
15 changes: 10 additions & 5 deletions t/SequenceTypes/CompareAlleles.t
Expand Up @@ -11,7 +11,8 @@ BEGIN {

ok((my $compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs.fa',
allele_filenames => ['t/data/adk.tfa','t/data/purA.tfa','t/data/recA.tfa']
allele_filenames => ['t/data/adk.tfa','t/data/purA.tfa','t/data/recA.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'initialise comparison');
is_deeply( $compare_alleles->found_sequence_names,sort(['adk-2','purA-3','recA-1']), 'correct sequences found');
is_deeply( $compare_alleles->non_matching_sequences, {}, 'no non matching sequences returned');
Expand All @@ -20,7 +21,8 @@ is($compare_alleles->contamination, 0, 'no contamination found');

ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs.fa',
allele_filenames => ['t/data/adk_top_hit_low_hit.tfa']
allele_filenames => ['t/data/adk_top_hit_low_hit.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'initialise comparison where there are multiple close matches');
is_deeply( $compare_alleles->found_sequence_names,sort(['adk-2']), 'correct sequences identified from closely related');
is_deeply( $compare_alleles->non_matching_sequences, {}, 'no non matching sequences returned');
Expand All @@ -29,7 +31,8 @@ is($compare_alleles->contamination, 0, 'contamination found');

ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs.fa',
allele_filenames => ['t/data/adk_contamination.tfa']
allele_filenames => ['t/data/adk_contamination.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'initialise comparison where there is contamination');
is_deeply( $compare_alleles->found_sequence_names,sort(['adk-3']), 'last top hit returned if more than 1 is 100%');
is($compare_alleles->new_st, 0, 'existing ST found');
Expand All @@ -39,7 +42,8 @@ is_deeply($compare_alleles->contamination_sequence_names, ['adk-3','adk-2'], 'Na

ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs.fa',
allele_filenames => ['t/data/adk_less_than_95_percent.tfa']
allele_filenames => ['t/data/adk_less_than_95_percent.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'initialise comparison where no hits are returned');
is_deeply( $compare_alleles->found_sequence_names, [], 'no matching sequences found');
is_deeply( $compare_alleles->non_matching_sequences, {'adk_less_than_95_percent' => 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'}, 'non matching sequences returned');
Expand All @@ -48,7 +52,8 @@ is($compare_alleles->contamination, 0, 'no contamination found');

ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
sequence_filename => 't/data/contigs_missing_locus.fa',
allele_filenames => ['t/data/Helicobacter_pylori/alleles/atpA.tfa',' t/data/Helicobacter_pylori/alleles/efp.tfa','t/data/Helicobacter_pylori/alleles/mutY.tfa']
allele_filenames => ['t/data/Helicobacter_pylori/alleles/atpA.tfa',' t/data/Helicobacter_pylori/alleles/efp.tfa','t/data/Helicobacter_pylori/alleles/mutY.tfa'],
profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'initialise comparison where profile has missing locus');
is_deeply( $compare_alleles->found_sequence_names,sort(['atpA-3','efp-9999','mutY-3']), 'correct sequences found');
is_deeply( $compare_alleles->non_matching_sequences, {}, 'no non matching sequences returned');
Expand Down

0 comments on commit 149f918

Please sign in to comment.