From b5d533d98ad1cc69e3bc00f5240fbbf2d7f3d539 Mon Sep 17 00:00:00 2001
From: Sheena Scroggins <sheena.scroggins@gmail.com>
Date: Thu, 24 Mar 2011 23:14:10 -0700
Subject: [PATCH] initial import of CXGN::BioTools::AGP

---
 lib/Bio/AGP/LowLevel.pm | 609 ++++++++++++++++++++++++++++++++++++++++
 t/basic.t               | 172 ++++++++++++
 2 files changed, 781 insertions(+)
 create mode 100644 lib/Bio/AGP/LowLevel.pm
 create mode 100644 t/basic.t

diff --git a/lib/Bio/AGP/LowLevel.pm b/lib/Bio/AGP/LowLevel.pm
new file mode 100644
index 0000000..cf4ed1b
--- /dev/null
+++ b/lib/Bio/AGP/LowLevel.pm
@@ -0,0 +1,609 @@
+package CXGN::BioTools::AGP;
+use strict;
+use warnings;
+use English;
+use Carp;
+
+use File::Basename;
+
+use Bio::PrimarySeq;
+use Bio::Seq::LargePrimarySeq;
+use Bio::SeqFeature::Annotated;
+
+use CXGN::Tools::List qw/str_in/;
+use CXGN::Tools::Identifiers qw/identifier_namespace/;
+use CXGN::Tools::File qw/is_filehandle/;
+
+=head1 NAME
+
+CXGN::BioTools::AGP - functions for dealing with AGP files
+
+=head1 SYNOPSIS
+
+ $bio_large_seq = agp_to_seq($agp_file_name);
+
+ $lines_arrayref = agp_parse('my_agp_file.agp');
+
+ agp_write( $lines => 'my_agp_file.agp');
+
+ $seq = agp_contig_seq( $mycontig,
+                        fetch_bac_sequences => sub {...}
+                       )
+
+=head1 DESCRIPTION
+
+functions for working with AGP files.
+
+=head1 FUNCTIONS
+
+All functions below are EXPORT_OK.
+
+=cut
+
+use base qw/ Exporter /;
+
+our @EXPORT_OK;
+BEGIN {
+  @EXPORT_OK = qw(
+                  agp_parse
+
+                  agp_to_seq
+                  agp_to_seqs
+
+                  agp_write
+                  agp_format_part
+
+                  agp_contigs
+
+                  agp_contig_seq
+
+                  agp_to_features
+                 );
+}
+
+
+=head2 agp_parse
+
+  Usage: my $lines = agp_parse('~/myagp.agp',validate_syntax => 1, validate_identifiers => 1);
+  Desc : parse an agp file
+  Args : filename or filehandle, hash-style list of options as 
+                       validate_syntax => if true, error
+                           if there are any syntax errors,
+                       validate_identifiers => if true, error
+                          if there are any identifiers that
+                          CXGN::Tools::Identifiers doesn't recognize
+                          IMPLIES validate_syntax
+                       error_array => an arrayref.  if given, will push
+                          error descriptions onto this array instead of
+                          using warn to print them to stderr
+  Ret  : undef if error, otherwise return an
+         arrayref containing line records, each of which is like:
+         { comment => 'text' } if a comment,
+         or if a data line:
+         {  objname  => the name of the object being assembled
+                       (same for every record),
+            ostart   => start coordinate for this component (object),
+            oend     => end coordinate for this component   (object),
+            partnum  => the part number appearing in the 4th column,
+            linenum  => the line number in the file,
+            type     => letter type present in the file (/[ADFGNOPUW]/),
+            typedesc => description of the type, one of:
+                         - (A) active_finishing
+                         - (D) draft
+                         - (F) finished
+                         - (G) wgs_finishing
+                         - (N) known_gap
+                         - (O) other
+                         - (P) predraft
+                         - (U) unknown_gap
+                         - (W) wgs_contig
+            ident    => identifier of the component, if any,
+            length   => length of the component,
+            is_gap   => 1 if the line is some kind of gap, 0 if it
+                        is covered by a component,
+            gap_type => one of:
+                 fragment: gap between two sequence contigs (also
+                    called a "sequence gap"),
+                 clone: a gap between two clones that do not overlap.
+  		 contig: a gap between clone contigs (also called a
+  		    "layout gap").
+  		 centromere: a gap inserted for the centromere.
+  		 short_arm: a gap inserted at the start of an
+  		    acrocentric chromosome.
+  		 heterochromatin: a gap inserted for an especially
+     		    large region of heterochromatic sequence (may also
+  		    include the centromere).
+  		 telomere: a gap inserted for the telomere.
+  		 repeat: an unresolvable repeat.
+            cstart   => start coordinate relative to the component,
+            cend     => end coordinate relative to the component,
+            linkage  => 'yes' or 'no', only set for type of 'N',
+            orient   => '+', '-', 0, or 'na'
+                        orientation of the component
+                        relative to the object,
+         }
+
+  Side Effects: unless error_array is given, will print error
+                descriptions to STDERR with warn()
+  Example:
+
+=cut
+
+sub agp_parse {
+  my $agpfile = shift;
+  our %opt = @_;
+
+  $agpfile or croak 'must provide an AGP filename';
+
+  if($opt{validate_identifiers}) {
+    $opt{validate_syntax} = 1;
+  }
+
+  #if the argument is a filehandle, use it, otherwise try to use it as
+  #a filename
+  my $agp_in; #< filehandle for reading AGP
+  our $bn;    #< basename of file we're parsing
+  ($agp_in,$bn) = do {
+    if( is_filehandle($agpfile) ) {
+      ($agpfile,'<AGP>')
+    } else {
+      open my $f,$agpfile
+	or die "$! opening '$agpfile'\n";
+      ($f,$agpfile)
+    }
+  };
+
+  our $parse_error_flag = 0;
+  sub parse_error(@) {
+
+    return unless $opt{validate_syntax};
+
+    $parse_error_flag = 1;
+    my $errstr = "$bn:$.: ".join('',@_)."\n";
+    #if we're pushing errors onto an error_array, do that
+    if ($opt{error_array}) {
+      push @{$opt{error_array}},$errstr;
+    } else { # otherwise just warn
+      warn $errstr;
+    }
+  }
+
+  my @records;
+
+  my $last_end;
+  my $last_partnum;
+  my $last_objname;
+
+  my $assembled_sequence = '';
+  while (my $line = <$agp_in>) {
+    no warnings 'uninitialized';
+#    warn "parsing $line";
+    chomp $line;
+    $line =~ s/\r//g; #remove windows \r chars
+
+    #deal with comments
+    if($line =~ /#/) {
+      if( $line =~ s/^#// ) {
+	push @records, { comment => $line };
+	next;
+      }
+      parse_error("not a valid comment line, # must be first character on line");
+      next;
+    }
+
+    my @fields = split /\t/,$line,10;
+    @fields == 9
+      or parse_error "This line contains ".scalar(@fields)." columns.  All lines must have 9 columns.";
+
+    #if there just really aren't many columns, this probably isn't a valid AGP line
+    next unless @fields >= 5 && @fields <= 10;
+
+    my %r = (linenum => $.); #< the record we're building for this line, starting with line number
+
+    #parse and check the first 5 cols
+    @r{qw( objname ostart oend partnum type )} = splice @fields,0,5;
+    $r{objname}
+      or parse_error "'$r{obj_name}' is a valid object name";
+    #end
+    if ( defined $last_end && defined $last_objname && $r{objname} eq $last_objname ) {
+      $r{ostart} == $last_end+1
+	or parse_error "start coordinate not contiguous with previous line's end";
+    }
+    $last_end = $r{oend};
+    $last_objname = $r{objname};
+
+    #start
+    $r{oend} >= $r{ostart} or parse_error("end must be >= start");
+
+    #part num
+    $last_partnum ||= 0;
+    $r{partnum} == $last_partnum + 1
+      or parse_error("part numbers not sequential");
+
+    $last_partnum = $r{partnum};
+
+    #type
+    if ( $r{type} =~ /^[NU]$/ ) {
+      (@r{qw( length gap_type linkage)}, my $empty, my $undefined) = @fields;
+      @fields = ();
+      my %descmap = qw/ U unknown_gap N known_gap /;
+      $r{typedesc} = $descmap{$r{type}}
+	or parse_error("unregistered type $r{type}");
+      $r{is_gap}   = 1;
+
+      my $gap_size_to_use = $opt{gap_length} || $r{length};
+
+      $r{length} == $r{oend} - $r{ostart} + 1
+	or parse_error("gap size of '$r{length}' does not agree with ostart, oend of ($r{ostart},$r{oend})");
+
+      str_in($r{gap_type},qw/fragment clone contig centromere short_arm heterochromatin telomere repeat/)
+	or parse_error("invalid gap type '$r{gap_type}'");
+
+      str_in($r{linkage},qw/yes no/)
+	or parse_error("linkage (column 8) should be 'yes' or 'no'\n");
+
+      defined $empty && $empty eq ''
+	or parse_error("9th column should be present and empty\n");
+
+      push @records,\%r;
+
+  } elsif ( $r{type} =~ /^[ADFGOPW]$/ ) {
+      my %descmap = qw/A active_finishing D draft F finished G wgs_finishing N known_gap O other P predraft U unknown_gap W wgs_contig/;
+      $r{typedesc} = $descmap{$r{type}}
+	or parse_error("unregistered type $r{type}");
+      $r{is_gap} = 0;
+
+      @r{qw(ident cstart cend orient)} = @fields;
+      if($opt{validate_identifiers}) {
+	my $comp_type = identifier_namespace($r{ident})
+	  or parse_error("cannot guess type of '$r{ident}'");
+      } else {
+	$r{ident} or parse_error("invalid identifier '$r{ident}'");
+      }
+
+      str_in($r{orient},qw/+ - 0 na/)
+	or parse_error("orientation must be one of +,-,0,na");
+
+      $r{cstart} >= 1 && $r{cend} > $r{cstart}
+	or parse_error("invalid component start and/or end ($r{cstart},$r{cend})");
+
+      $r{length} = $r{cend}-$r{cstart}+1;
+
+      $r{length} == $r{oend} - $r{ostart} + 1
+	or parse_error("distance between object start, end ($r{ostart},$r{oend}) does not agree with distance between component start, end ($r{cstart},$r{cend})");
+
+      push @records, \%r;
+    } else {
+      parse_error("invalid component type '$r{type}', it should be one of {A D F G N O P U W}");
+    }
+  }
+
+  return if $parse_error_flag;
+
+  #otherwise, everything was well
+  return \@records;
+}
+
+
+=head2 agp_write
+
+  Usage: agp_write($lines,$file);
+  Desc : writes a properly formatted AGP file
+  Args : arrayref of line records to write, with the line records being
+             in the same format as those returned by agp_parse above,
+         filename or filehandle to write to,
+  Ret :  nothing meaningful
+
+  Side Effects: dies on failure.  if you gave it a filehandle, does
+                not close it
+  Example:
+
+=cut
+
+sub agp_write {
+  my ($lines,$file) = @_;
+  $file or confess "must provide file to write to!\n";
+
+  my $out_fh = is_filehandle($file) ? $file
+    : do {
+      open my $f,">$file" or croak "$! opening '$file' for writing";
+      $f
+    };
+
+  foreach my $line (@$lines) {
+      print $out_fh agp_format_part( $line );
+  }
+
+  return;
+}
+
+=head2 agp_format_part( $record )
+
+Format a single AGP part line (string terminated with a newline) from
+the given record hashref.
+
+=cut
+
+sub agp_format_part {
+    my ( $line ) = @_;
+
+    return "#$line->{comment}\n" if $line->{comment};
+
+    #and all other lines
+    my @fields = @{$line}{qw(objname ostart oend partnum type)};
+    if( $line->{type} =~ /^[NU]$/ ) {
+      push @fields, @{$line}{qw(length gap_type linkage)},'';
+    } else {
+      push @fields, @{$line}{qw(ident cstart cend orient)};
+    }
+
+    return join("\t", @fields)."\n";
+}
+
+
+=head2 agp_contigs
+
+  Usage: my @contigs = agp_contigs( agp_parse($agp_filename) );
+  Desc : extract and number contigs from a parsed AGP file
+  Args : arrayref of AGP lines, like those returned by agp_parse() above
+  Ret  : list of contigs, in the same order as they occur in the
+         file, formatted as:
+            [ agp_line_hashref, agp_line_hashref, ... ],
+            [ agp_line_hashref, agp_line_hashref, ... ],
+            ...
+
+=cut
+
+sub agp_contigs {
+  my $lines = shift;
+
+  my @contigs = ([]);
+  foreach my $l (@$lines) {
+    next if $l->{comment};
+    if( $l->{typedesc} =~ /_gap$/ ) {
+      push @contigs,[] if @{$contigs[-1]};
+    } else {
+      push @{$contigs[-1]},$l;
+    }
+  }
+  pop @contigs if @{$contigs[-1]} == 0;
+  return @contigs;
+}
+
+
+=head2 agp_contig_seq
+
+  Usage: my $seqs = agp_contig_seq($contig, %options);
+  Desc : given a contig arrayref like those returned by agp_contigs,
+         fetch the sequence
+  Args : a contig arrayref like [agp_file_line, agp_file_line, ...],
+         hash-style list of options like
+           fetch_<ns> => sub { }
+                          # function for fetching sequences
+                          # in namespace <ns> as returned by
+                          # CXGN::Tools::Identifiers::identifier_namespace
+                          # MANDATORY for each kind of identifier present
+                          #  in the agp file
+                          # takes 1 arg, the identifier string, returns
+                          # the sequence as a string
+                          # if fetch_default
+           lowercase   => 1,   #< force lower-case sequence,
+                                  defaults to force uppercase
+           pad_short_sequences => 0, #< if true, N-pad sequences that are too
+                                     #  short to cover a region in the
+                                     #  AGP file.  otherwise, die for
+                                     #  seqs that are too short.
+                                     #  default false.
+
+  Ret  : a string sequence (not a Bio::SeqI object)
+  Side Effects: calls the fetch_* functions you provide
+  Example:
+
+    #get the contigs for a chromosome
+    my %contigs = named_contigs(4);
+    #and now make all the consensus sequences
+    my %sequences;
+    while( my ($contig_name,$contig) = each(%contigs) ) {
+      $sequences{$contig_name} = consensus_sequence($contig);
+    }
+
+=cut
+
+sub agp_contig_seq {
+  my ($members,%opt) = @_;
+
+  my $seq = '';
+  foreach my $member (sort {$a->{ostart} <=> $b->{ostart}} @$members) {
+    my $comp_type = identifier_namespace($member->{ident}) || 'default';
+    my $fetch_func = $opt{"fetch_$comp_type"}
+      or croak "cannot fetch sequence for $member->{ident} in namespace $comp_type, please provide a fetch_$comp_type, or check that this is the correct namespace";
+    my $comp_seq = $fetch_func->($member->{ident})
+      or croak "failed to fetch sequence for '$member->{ident}'";
+
+    unless( length($comp_seq) >= $member->{cstart}-1+$member->{length}) {
+      if( $opt{pad_short_sequences} ) {
+	my $pad_length = ($member->{cstart}-1+$member->{length}) - length($comp_seq);
+	$comp_seq .= 'N'x$pad_length;
+      } else {
+	die "making object $member->{objname}: invalid coordinates (start $member->{cstart},length $member->{length}) for $member->{ident} (".length($comp_seq)." bases)";
+      }
+    }
+    $comp_seq = substr($comp_seq,$member->{cstart}-1,$member->{length});
+    if($member->{orient} eq '-') {
+      $comp_seq = Bio::PrimarySeq->new( -seq => $comp_seq, -id => 'foo')->revcom->seq;
+      # warn "revcom for $member->{ident}\n";
+    }
+    $seq .= $opt{lowercase} ? lc $comp_seq : uc $comp_seq;
+  }
+
+  return $seq;
+}
+
+
+=head2 agp_to_seqs
+
+  Usage: my $seq = agp_to_seq( $filename, %options );
+  Desc : parse an AGP file (or filehandle) and make a pseudomolecule sequence with
+         it, fetching sequences using the given subroutine(s)
+  Args : the AGP filename, plus hash-style options as:
+           lowercase   => 1,   #< force lower-case sequence,
+                                  defaults to force uppercase
+           no_seq_char => 'N', #< character to use to fill gaps
+           gap_length   => 0,  #< override to make all gaps this
+                                  length, defaults to 0, meaning
+                                  make gaps as big as the file
+                                  says
+           no_large_seqs => 0, #< don't use Bio::Seq::LargePrimarySeq
+                                  objects to hold object sequences.
+                                  If there are many short objects,
+                                  using LargePrimarySeq objects for
+                                  them can sometimes use up all the
+                                  system's file descriptors.
+           fetch_<ns> => sub { }
+                               #function for fetching sequences
+                               #in namespace <ns> as returned by
+                               #CXGN::Tools::Identifiers::identifier_namespace
+           pad_short_sequences => 0 # for sequences that are too
+                                    # short to cover the region
+                                    # specified in the AGP,
+                                    # pad with Ns if true,
+                                    # die if false.  default
+                                    # false (die)
+  Ret  : list of Bio::Seq::LargePrimarySeq objects
+  Side Effects: warns and dies on errors
+
+=cut
+
+sub agp_to_seqs {
+  my ($agpfile,%opt) = @_;
+  $opt{gap_length} ||= 0;
+  $opt{lowercase} ||= 0;
+  $opt{no_seq_char} ||= $opt{lowercase} ? 'n' : 'N';
+
+  my $agplines = agp_parse($agpfile)
+    or die "error parsing AGP file $agpfile\n";
+
+  my $seq_class =
+      $opt{no_large_seqs} ? 'Bio::PrimarySeq'
+                          : 'Bio::Seq::LargePrimarySeq';
+
+  my @seqs;
+
+  for my $line (@$agplines) {
+    next if exists $line->{comment}; #< skip comments
+
+    push @seqs, $seq_class->new( -id => $line->{objname} )
+        unless $seqs[-1] && $seqs[-1]->id eq $line->{objname};
+
+    my $seq_obj = $seqs[-1];
+
+    if ( $line->{typedesc} =~ /^(un)?known_gap$/ ) {
+        my $gap_size_to_use = $opt{gap_length} || $line->{length};
+        _add_sequence_as_string( $seq_obj, $opt{no_seq_char} x $gap_size_to_use );
+    } else {
+        _add_sequence_as_string( $seq_obj, agp_contig_seq([$line], %opt) );
+    }
+  }
+
+  return @seqs;
+}
+
+
+sub _add_sequence_as_string {
+    my ( $seq_obj, $additional_seq ) = @_;
+
+    if( $seq_obj->can('add_sequence_as_string') ) {
+        $seq_obj->add_sequence_as_string( $additional_seq );
+    } else {
+        my $s = $seq_obj->seq || '';
+        $seq_obj->seq( $s . $additional_seq );
+    }
+}
+
+=head2 agp_to_seq
+
+Like agp_to_seq, except only returns the first sequence in the AGP
+file.  Deprecated, but still exists for backward compatibility.
+
+=cut
+
+sub agp_to_seq {
+    ( shift->agp_to_seq(@_) )[0]
+}
+
+
+=head2 agp_to_features
+
+  Usage: my @features = agp_to_features( $agp_file );
+  Desc : parse the AGP file (or filehandle) and return it as a list of
+         features located on the object that's being
+         built
+  Args : AGP file name,
+         gap_type => feature type for gaps, default: gap,
+                     set undef to omit gap features
+         component_type => feature type for components, default: clone
+                        set undef to omit component features
+         source_name => name to use for 'source' column,
+                        default AGP,
+  Ret  : list of Bio::SeqFeature::Annotated objects
+  Side Effects: none
+
+=cut
+
+sub agp_to_features {
+  my ( $agp_file, %opts ) = @_;
+
+  # set defaults for options
+  $opts{source_name} ||= 'AGP';
+  $opts{gap_type}       = 'gap'   unless exists $opts{gap_type};
+  $opts{component_type} = 'clone' unless exists $opts{component_type};
+
+  my $parsed_agp = agp_parse( $agp_file )
+    or die "could not parse AGP file '$agp_file'";
+
+  my @features;
+  foreach my $line ( @$parsed_agp ) {
+    next if $line->{comment};
+
+    if( $line->{is_gap} && $opts{gap_type} ) {
+      push @features,
+	Bio::SeqFeature::Annotated->new( -start => $line->{ostart},
+					 -end   => $line->{oend},
+					 #-score => undef,
+					 -type  => $opts{gap_type},
+					 -source => $opts{source_name},
+					 -seq_id => $line->{objname},
+					 #-annots => { ID => $self->_unique_bio_annotation_id("${hname}_${fwdrev}_alignment"),
+					 #	   },
+				     );
+
+    } elsif( $opts{component_type} ) {
+      push @features,
+	Bio::SeqFeature::Annotated->new( -start => $line->{ostart},
+					 -end   => $line->{oend},
+					 #-score => undef,
+					 -type  => $opts{component_type},
+					 -source => $opts{source_name},
+					 -seq_id => $line->{objname},
+					 -target => { -start => $line->{cstart},
+						      -end   => $line->{cend},
+						      -target_id => $line->{ident},
+						    },
+					 #-annots => { ID => $self->_unique_bio_annotation_id("${hname}_${fwdrev}_alignment"),
+					 #	   },
+				       );
+    }
+  }
+
+  return @features;
+}
+
+=head1 AUTHOR(S)
+
+Robert Buels
+
+=cut
+
+###
+1;#do not remove
+###
diff --git a/t/basic.t b/t/basic.t
new file mode 100644
index 0000000..0925f35
--- /dev/null
+++ b/t/basic.t
@@ -0,0 +1,172 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+use FindBin;
+use File::Basename;
+use File::Spec;
+use File::Temp qw/tempfile/;
+
+my $testfiles_path = File::Spec->catdir($FindBin::RealBin,'data');
+my @agptests = (
+                { file => 'chr04.v3.agp',
+                  lines => 163,
+                  specs =>
+                  {
+                   5 => {qw(
+                            objname S.lycopersicum-chr4
+                            ostart  357124
+                            oend  407123
+                            partnum 5
+                            type N
+                            typedesc known_gap
+                            is_gap 1
+                            length 50000
+                            gap_type clone
+                            linkage yes
+                            linenum 5
+                           )},
+		       28 => {qw(
+				 objname S.lycopersicum-chr4
+				 ostart 1877407
+				 oend 1989558
+				 partnum 27
+				 linenum 28
+				 type F
+				 typedesc finished
+				 is_gap 0
+				 ident C04HBa0036C23.1
+				 cstart 2001
+				 cend 114152
+				 length 112152
+				 orient +
+				)},
+		       49 => {qw(
+				 objname S.lycopersicum-chr4
+				 ostart 3472176
+				 oend 3590122
+				 partnum 48
+				 linenum 49
+				 type F
+				 typedesc finished
+				 is_gap 0
+				 ident C04HBa0147F16.1
+				 cstart 1
+				 cend 117947
+				 orient -
+				 length 117947
+				)},
+		      },
+		      contigs => {
+				  7 => [
+					{
+					 'objname' => 'S.lycopersicum-chr4',
+					 'ostart' => 1737070,
+					 'cend' => 28696,
+					 'oend' => 1765765,
+					 is_gap => 0,
+					 'ident' => 'C04HBa0114G11.1',
+					 'length' => 28696,
+					 'typedesc' => 'finished',
+					 'orient' => '+',
+					 'linenum' => '26',
+					 'type' => 'F',
+					 'cstart' => 1,
+					 'partnum' => 25
+					},
+					{
+					 'objname' => 'S.lycopersicum-chr4',
+					 'ostart' => 1765766,
+					 'cend' => 113641,
+					 'oend' => 1877406,
+					 is_gap => 0,
+					 'ident' => 'C04HBa0050I18.1',
+					 'length' => 111641,
+					 'typedesc' => 'finished',
+					 'orient' => '+',
+					 'linenum' => '27',
+					 'type' => 'F',
+					 'cstart' => 2001,
+					 'partnum' => 26
+					},
+					{
+					 'objname' => 'S.lycopersicum-chr4',
+					 'ostart' => 1877407,
+					 'cend' => 114152,
+					 'oend' => 1989558,
+					 is_gap => 0,
+					 'ident' => 'C04HBa0036C23.1',
+					 'length' => 112152,
+					 'typedesc' => 'finished',
+					 'orient' => '+',
+					 'linenum' => '28',
+					 'type' => 'F',
+					 'cstart' => 2001,
+					 'partnum' => 27
+					},
+					{
+					 'objname' => 'S.lycopersicum-chr4',
+					 'ostart' => 1989559,
+					 'cend' => 81572,
+					 'oend' => 2069130,
+					 is_gap => 0,
+					 'ident' => 'C04HBa0008H22.1',
+					 'length' => 79572,
+					 'typedesc' => 'finished',
+					 'orient' => '+',
+					 'linenum' => '29',
+					 'type' => 'F',
+					 'cstart' => 2001,
+					 'partnum' => 28
+					}
+				       ],
+				 },
+		    },
+		    { file => 'chr09.v1.agp',
+		      lines => 155,
+		    },
+                    { file => 'test_seq_assembly.agp',
+                      lines => 5,
+                    },
+
+		  );
+$_->{file} = File::Spec->catfile($testfiles_path,$_->{file}) foreach @agptests;
+
+use Test::More;
+use File::Temp qw/tempfile/;
+
+use_ok(  'CXGN::BioTools::AGP' , qw/ agp_parse agp_write agp_contigs / )
+    or BAIL_OUT('could not include the module being tested');
+
+
+foreach my $test (@agptests) {
+  #diag "testing with $test->{file}\n";
+  my $lines = agp_parse($test->{file},validate_syntax => 1)
+    or die 'parse failed';
+  is(scalar(@$lines),$test->{lines},'correct line count');
+  while(my ($line,$spec) = each %{$test->{specs}}) {
+    is_deeply($lines->[$line-1],
+	      $spec,
+	      'check spec',
+	     );
+  }
+
+  my ($tfh,$tf) = tempfile(UNLINK => 1);
+  agp_write($lines,$tfh);
+  close $tfh;
+
+  #now do a parse of the new written file
+  my $morelines = agp_parse($tf,validate_syntax => 1);
+  is_deeply($lines,$morelines,'read-write-read round-trip is lossless');
+
+  #extract the contigs and check them
+  my @contigs = agp_contigs($morelines);
+  while( my ($idx,$ctg) = each %{$test->{contigs}}) {
+    is_deeply($contigs[$idx],$ctg,'contig '.($idx+1).' extracted OK');
+  }
+
+}
+
+done_testing;
+exit;
+