Skip to content

Commit

Permalink
Successful testing of Bio::AGP::LowLevel
Browse files Browse the repository at this point in the history
  • Loading branch information
Sheena Scroggins committed Mar 25, 2011
1 parent b5d533d commit 842f018
Show file tree
Hide file tree
Showing 6 changed files with 939 additions and 239 deletions.
244 changes: 6 additions & 238 deletions lib/Bio/AGP/LowLevel.pm
@@ -1,22 +1,18 @@
package CXGN::BioTools::AGP;
package Bio::AGP::LowLevel;
use strict;
use warnings;
use English;
use Carp;

use File::Basename;

use Bio::PrimarySeq;
use Bio::Seq::LargePrimarySeq;
use Bio::SeqFeature::Annotated;

use CXGN::Tools::List qw/str_in/;
use CXGN::Tools::Identifiers qw/identifier_namespace/;

use CXGN::Tools::File qw/is_filehandle/;

=head1 NAME
CXGN::BioTools::AGP - functions for dealing with AGP files
Bio::AGP::LowLevel - functions for dealing with AGP files
=head1 SYNOPSIS
Expand Down Expand Up @@ -47,17 +43,15 @@ BEGIN {
@EXPORT_OK = qw(
agp_parse
agp_to_seq
agp_to_seqs
agp_write
agp_format_part
agp_contigs
agp_contig_seq
agp_to_features
);
}

Expand Down Expand Up @@ -371,232 +365,6 @@ sub agp_contigs {
}


=head2 agp_contig_seq
Usage: my $seqs = agp_contig_seq($contig, %options);
Desc : given a contig arrayref like those returned by agp_contigs,
fetch the sequence
Args : a contig arrayref like [agp_file_line, agp_file_line, ...],
hash-style list of options like
fetch_<ns> => sub { }
# function for fetching sequences
# in namespace <ns> as returned by
# CXGN::Tools::Identifiers::identifier_namespace
# MANDATORY for each kind of identifier present
# in the agp file
# takes 1 arg, the identifier string, returns
# the sequence as a string
# if fetch_default
lowercase => 1, #< force lower-case sequence,
defaults to force uppercase
pad_short_sequences => 0, #< if true, N-pad sequences that are too
# short to cover a region in the
# AGP file. otherwise, die for
# seqs that are too short.
# default false.
Ret : a string sequence (not a Bio::SeqI object)
Side Effects: calls the fetch_* functions you provide
Example:
#get the contigs for a chromosome
my %contigs = named_contigs(4);
#and now make all the consensus sequences
my %sequences;
while( my ($contig_name,$contig) = each(%contigs) ) {
$sequences{$contig_name} = consensus_sequence($contig);
}
=cut

sub agp_contig_seq {
my ($members,%opt) = @_;

my $seq = '';
foreach my $member (sort {$a->{ostart} <=> $b->{ostart}} @$members) {
my $comp_type = identifier_namespace($member->{ident}) || 'default';
my $fetch_func = $opt{"fetch_$comp_type"}
or croak "cannot fetch sequence for $member->{ident} in namespace $comp_type, please provide a fetch_$comp_type, or check that this is the correct namespace";
my $comp_seq = $fetch_func->($member->{ident})
or croak "failed to fetch sequence for '$member->{ident}'";

unless( length($comp_seq) >= $member->{cstart}-1+$member->{length}) {
if( $opt{pad_short_sequences} ) {
my $pad_length = ($member->{cstart}-1+$member->{length}) - length($comp_seq);
$comp_seq .= 'N'x$pad_length;
} else {
die "making object $member->{objname}: invalid coordinates (start $member->{cstart},length $member->{length}) for $member->{ident} (".length($comp_seq)." bases)";
}
}
$comp_seq = substr($comp_seq,$member->{cstart}-1,$member->{length});
if($member->{orient} eq '-') {
$comp_seq = Bio::PrimarySeq->new( -seq => $comp_seq, -id => 'foo')->revcom->seq;
# warn "revcom for $member->{ident}\n";
}
$seq .= $opt{lowercase} ? lc $comp_seq : uc $comp_seq;
}

return $seq;
}


=head2 agp_to_seqs
Usage: my $seq = agp_to_seq( $filename, %options );
Desc : parse an AGP file (or filehandle) and make a pseudomolecule sequence with
it, fetching sequences using the given subroutine(s)
Args : the AGP filename, plus hash-style options as:
lowercase => 1, #< force lower-case sequence,
defaults to force uppercase
no_seq_char => 'N', #< character to use to fill gaps
gap_length => 0, #< override to make all gaps this
length, defaults to 0, meaning
make gaps as big as the file
says
no_large_seqs => 0, #< don't use Bio::Seq::LargePrimarySeq
objects to hold object sequences.
If there are many short objects,
using LargePrimarySeq objects for
them can sometimes use up all the
system's file descriptors.
fetch_<ns> => sub { }
#function for fetching sequences
#in namespace <ns> as returned by
#CXGN::Tools::Identifiers::identifier_namespace
pad_short_sequences => 0 # for sequences that are too
# short to cover the region
# specified in the AGP,
# pad with Ns if true,
# die if false. default
# false (die)
Ret : list of Bio::Seq::LargePrimarySeq objects
Side Effects: warns and dies on errors
=cut

sub agp_to_seqs {
my ($agpfile,%opt) = @_;
$opt{gap_length} ||= 0;
$opt{lowercase} ||= 0;
$opt{no_seq_char} ||= $opt{lowercase} ? 'n' : 'N';

my $agplines = agp_parse($agpfile)
or die "error parsing AGP file $agpfile\n";

my $seq_class =
$opt{no_large_seqs} ? 'Bio::PrimarySeq'
: 'Bio::Seq::LargePrimarySeq';

my @seqs;

for my $line (@$agplines) {
next if exists $line->{comment}; #< skip comments

push @seqs, $seq_class->new( -id => $line->{objname} )
unless $seqs[-1] && $seqs[-1]->id eq $line->{objname};

my $seq_obj = $seqs[-1];

if ( $line->{typedesc} =~ /^(un)?known_gap$/ ) {
my $gap_size_to_use = $opt{gap_length} || $line->{length};
_add_sequence_as_string( $seq_obj, $opt{no_seq_char} x $gap_size_to_use );
} else {
_add_sequence_as_string( $seq_obj, agp_contig_seq([$line], %opt) );
}
}

return @seqs;
}


sub _add_sequence_as_string {
my ( $seq_obj, $additional_seq ) = @_;

if( $seq_obj->can('add_sequence_as_string') ) {
$seq_obj->add_sequence_as_string( $additional_seq );
} else {
my $s = $seq_obj->seq || '';
$seq_obj->seq( $s . $additional_seq );
}
}

=head2 agp_to_seq
Like agp_to_seq, except only returns the first sequence in the AGP
file. Deprecated, but still exists for backward compatibility.
=cut

sub agp_to_seq {
( shift->agp_to_seq(@_) )[0]
}


=head2 agp_to_features
Usage: my @features = agp_to_features( $agp_file );
Desc : parse the AGP file (or filehandle) and return it as a list of
features located on the object that's being
built
Args : AGP file name,
gap_type => feature type for gaps, default: gap,
set undef to omit gap features
component_type => feature type for components, default: clone
set undef to omit component features
source_name => name to use for 'source' column,
default AGP,
Ret : list of Bio::SeqFeature::Annotated objects
Side Effects: none
=cut

sub agp_to_features {
my ( $agp_file, %opts ) = @_;

# set defaults for options
$opts{source_name} ||= 'AGP';
$opts{gap_type} = 'gap' unless exists $opts{gap_type};
$opts{component_type} = 'clone' unless exists $opts{component_type};

my $parsed_agp = agp_parse( $agp_file )
or die "could not parse AGP file '$agp_file'";

my @features;
foreach my $line ( @$parsed_agp ) {
next if $line->{comment};

if( $line->{is_gap} && $opts{gap_type} ) {
push @features,
Bio::SeqFeature::Annotated->new( -start => $line->{ostart},
-end => $line->{oend},
#-score => undef,
-type => $opts{gap_type},
-source => $opts{source_name},
-seq_id => $line->{objname},
#-annots => { ID => $self->_unique_bio_annotation_id("${hname}_${fwdrev}_alignment"),
# },
);

} elsif( $opts{component_type} ) {
push @features,
Bio::SeqFeature::Annotated->new( -start => $line->{ostart},
-end => $line->{oend},
#-score => undef,
-type => $opts{component_type},
-source => $opts{source_name},
-seq_id => $line->{objname},
-target => { -start => $line->{cstart},
-end => $line->{cend},
-target_id => $line->{ident},
},
#-annots => { ID => $self->_unique_bio_annotation_id("${hname}_${fwdrev}_alignment"),
# },
);
}
}

return @features;
}

=head1 AUTHOR(S)
Expand Down

0 comments on commit 842f018

Please sign in to comment.