ray1729 committed Nov 6, 2010
1 parent 3e6d5d3 commit efa9ae5d2442677568729d3da56398535e90e52b
Showing with 7 additions and 5 deletions.
  1. +7 −5 src/Bio/sequence.clj
@@ -1,15 +1,17 @@
(ns Bio.sequence)
-;; The heuristic below is borrowed from BioPerl's Bio::PrimarySeq::_guess_alphabet
+;; The heuristic below is borrowed from BioPerl's Bio::PrimarySeq->_guess_alphabet.
+;; It suffers the same problem ("N" represents "aNy" in a DNA sequence or Asparagine
+;; in a protein sequence).
(defn guess-alphabet [sequence]
"Guess the alphabet (:dna, :rna or :protein) of sequence; returns nil for
empty sequence."
(when-let [s (seq (filter (comp not #{\. \-}) (seq sequence)))]
(let [total (count s)
- u (count (filter #{\U \u} s))
- acgt (count (filter #{\A \C \G \T \N \a \c \g \t \n} s))]
+ acgu (filter #{\A \C \G \U \N \a \c \g \u \n} s)
+ acgt (filter #{\A \C \G \T \N \a \c \g \t \n} s)]
- (> (/ acgt total) 0.85) :dna
- (> (/ (+ acgt u) total) 0.85) :rna
+ (> (/ (count acgt) total) 0.85) :dna
+ (> (/ (count acgu) total) 0.85) :rna
:else :protein))))

