Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Heng Li
committed
Jul 15, 2010
1 parent
dbdec8b
commit 07dc1c6
Showing
1 changed file
with
205 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
\documentclass[10pt]{article} | ||
|
||
\addtolength{\textwidth}{3.2cm} | ||
\addtolength{\hoffset}{-1.6cm} | ||
\addtolength{\textheight}{4cm} | ||
\addtolength{\voffset}{-2cm} | ||
|
||
\makeindex | ||
|
||
\title{The SAM Format Specification (v1.3 draft)} | ||
|
||
\begin{document} | ||
|
||
\maketitle | ||
|
||
\section{Terminology and Concepts} | ||
|
||
\begin{description} | ||
\item[Template] A DNA/RNA sequence part of which is sequenced on a | ||
sequencing machine. | ||
\item[Fragment] A (sub)sequence on a template which is | ||
sequenced. Fragments on a template are said to be \emph{ordered} if | ||
their relative positions on the template are known. In this case, | ||
the template is also said to be ordered. | ||
\item[Read] A raw sequence that comes off a sequencing machine. A read | ||
may consist of multiple fragments. | ||
\item[1-based coordinate system] A coordinate system where the first | ||
base of a sequence is one. In this coordinate system, a region is | ||
specified by a closed interval. For example, the region between the 3rd | ||
and the 7th bases inclusive is $[3,7]$. The SAM and GFF formats | ||
use the 1-based coordinate system. | ||
\item[0-based coordinate system] A coordinate system where the first | ||
base of a sequence is zero. In this coordinate system, a region is | ||
specified by a half-closed-half-open interval. For example, the region | ||
between the 3rd and the 7th bases inclusive is $[2,7)$. The BED, | ||
Wiggle and PSL formats use the 0-based coordinate system. | ||
\end{description} | ||
|
||
\section{The SAM Format Specification} | ||
\subsection{The header} | ||
The header section can be absent. | ||
\begin{center} | ||
\begin{tabular}{|l|l|p{13.5cm}|} | ||
\hline | ||
\multicolumn{2}{|l|}{\bf Tag} & {\bf Description} \\ | ||
\hline | ||
\multicolumn{2}{|l}{\tt @HD} & The header line. \\\cline{2-3} | ||
& {\tt VN}* & Format version. \emph{Accepted format}: {\tt /\char94[0-9]+\char92.[0-9]+\$/}.\\\cline{2-3} | ||
& {\tt SO} & Sorting order. \emph{Valid values}: {\tt unsorted}, {\tt queryname} and {\tt coordinate}. \\\hline | ||
\multicolumn{2}{|l}{\tt @SQ} & Reference sequence dictionary. \\\cline{2-3} | ||
& {\tt SN}* & Reference sequence name. Unique among all | ||
sequence records in the file. The value of this field is used in the | ||
alignment records. \\\cline{2-3} | ||
& {\tt LN}* & Reference sequence length. \emph{Range}: {\tt [1,2$^{29}$-1]}\\\cline{2-3} | ||
& {\tt AS} & Genome assembly identifier. \\\cline{2-3} | ||
& {\tt M5} & MD5 checksum of the sequence in uppercase, with gaps and spaces removed.\\\cline{2-3} | ||
& {\tt SP} & Species.\\\cline{2-3} | ||
& {\tt UR} & URI of the sequence.\\\hline | ||
\multicolumn{2}{|l}{\tt @RG} & Read group. \\\cline{2-3} | ||
& {\tt ID}* & Unique read group identifier. The value of ID | ||
is used in the RG tags of alignment records. \\\cline{2-3} | ||
& {\tt CN} & Name of sequencing center producing the read.\\\cline{2-3} | ||
& {\tt DS} & Description.\\\cline{2-3} | ||
& {\tt DT} & Date the run was produced (ISO8601 date or date/time).\\\cline{2-3} | ||
& {\tt LB} & Library.\\\cline{2-3} | ||
& {\tt PI} & Predicted median insert size.\\\cline{2-3} | ||
& {\tt PL} & Platform/technology used to produce the read. \emph{Valid values}: | ||
{\tt ILLUMINA}, {\tt SOLID}, {\tt LS454}, {\tt HELICOS} and {\tt PACBIO}.\\\cline{2-3} | ||
& {\tt PU} & Platform unit (e.g.\ lane for Illumina or slide for SOLiD). Unique identifier.\\\cline{2-3} | ||
& {\tt SM} & Sample. Use pool name where a pool is being sequenced.\\\hline | ||
\multicolumn{2}{|l}{\tt @PG} & Program. \\\cline{2-3} | ||
& {\tt ID}* & Program name \\\cline{2-3} | ||
& {\tt VN} & Program version \\\cline{2-3} | ||
& {\tt CL} & Command line \\\hline | ||
\multicolumn{2}{|l}{\tt @CO} & One-line text comment.\\ | ||
\hline | ||
\end{tabular} | ||
\end{center} | ||
|
||
\subsection{The mandatory fields} | ||
The following table gives an overview of the mandatory fields in | ||
the SAM format: | ||
\begin{center} | ||
\begin{tabular}{rllll} | ||
\hline | ||
{\bf Col} & {\bf Field} & {\bf Type} & {\bf Regexp/Range} & {\bf Brief description} \\ | ||
\hline | ||
1 & {\sf QNAME} & String & {\tt [!-?A-\char126]+} & Query template NAME\\ | ||
2 & {\sf FLAG} & Int/Chr & {\tt [0,2$^{16}$-1]}/{\tt [*pPuUrR12sfd]} & bitwise FLAG \\ | ||
3 & {\sf RNAME} & String & {\tt [!-\char126]+} & Reference sequence NAME\\ | ||
4 & {\sf POS} & Int & {\tt [0,2$^{29}$-1]} & 1-based leftmost mapping POSition \\ | ||
5 & {\sf MAPQ} & Int & {\tt [0,2$^8$-1]} & MAPping Quality \\ | ||
6 & {\sf CIGAR} & String & {\tt \char92*|([0-9]+[MIDNSHPX=])+} & CIGAR string \\ | ||
7 & {\sf RNEXT} & String & {\tt [!-\char126]+} & Ref. name of the mate/next fragment\\ | ||
8 & {\sf PNEXT} & Int & {\tt [0,2$^{29}$-1]} & Position of the mate/next fragment \\ | ||
9 & {\sf TLEN} & Int & {\tt [0,2$^{29}$-1]} & observed Template LENgth \\ | ||
10 & {\sf SEQ} & String & {\tt \char92*|[A-Za-z=]+} & fragment SEQuence\\ | ||
11 & {\sf QUAL} & String & {\tt [!-\char126]+} & ASCII of base QUALity+33 \\ | ||
\hline | ||
\end{tabular} | ||
\end{center} | ||
|
||
\begin{enumerate} | ||
\item {\sf QNAME}: Query template NAME. Each template has a unique name. | ||
\item {\sf FLAG}: bitwise FLAG. Each bit is explained in the following | ||
table: | ||
\begin{center} | ||
\begin{tabular}{rcl} | ||
\hline | ||
Bit & Char & Description\\ | ||
\hline | ||
0x1 & p & template having multiple fragments in sequencing \\ | ||
0x2 & P & each fragment properly aligned according to the aligner \\ | ||
0x4 & u & fragment unmapped \\ | ||
0x8 & U & next fragment in the template unmapped \\ | ||
0x10 & r & {\sf SEQ} being reverse complemented \\ | ||
0x20 & R & {\sf SEQ} of the next fragment in the template being reversed \\ | ||
0x40 & 1 & the first fragment in the template \\ | ||
0x80 & 2 & the last fragment in the template \\ | ||
0x100 & s & secondary alignment\\ | ||
0x200 & f & not passing quality controls \\ | ||
0x400 & d & PCR or optical duplicate \\ | ||
\hline | ||
\end{tabular} | ||
\end{center} | ||
\begin{itemize} | ||
\item Bit 0x4 is the only reliable place to tell whether the fragment is unmapped. | ||
\item If 0x40 and 0x80 are both set, the fragment is part of a linear | ||
template, but it is neither the first nor the last fragment. If both | ||
0x40 and 0x80 are unset, the index of the fragment in the template | ||
is unknown. This may happen for a non-linear template or when the index | ||
is lost in data processing. | ||
\item Bit 0x100 marks the alignment not to be used in certain analyses | ||
when the tools in use are aware of this bit. | ||
\item \emph{Implicit rules}: if 0x1 is unset, 0x2, 0x8, 0x20, 0x40, | ||
0x80 are all regarded to be unset; if 0x4 or 0x8 is set, 0x2 is | ||
regarded to be unset. | ||
\item Bits 0x10 and 0x20 only indicate the strand of the | ||
fragment. Unmapped reads may have these two bits set. | ||
\end{itemize} | ||
\item {\sf RNAME}: Reference sequence NAME of the alignment. An unmapped | ||
fragment without coordinate has a `*' at this field. However, an | ||
unmapped fragment may also have an ordinary coordinate such that it | ||
can be placed at a desired position after sorting. | ||
\item {\sf POS}: 1-based leftmost mapping POSition of the first matching | ||
base. The first base in a reference sequence has coordinate 1. {\sf | ||
POS} is set as 0 for an unmapped read without | ||
coordinate. \emph{Implicit rules}: if {\sf RNAME} is `*', {\sf POS} is | ||
regarded to be 0, and vice versa. | ||
\item {\sf MAPQ}: MAPping Quality. It equals | ||
$-10\log_{10}\Pr\{\mbox{mapping position is wrong}\}$, rounded to the | ||
nearest integer. | ||
\item {\sf CIGAR}: CIGAR string. The CIGAR operations are given in the | ||
following table: | ||
\begin{center} | ||
\begin{tabular}{cl} | ||
\hline | ||
Op & Description\\ | ||
\hline | ||
{\tt M} & alignment match (can be a sequence match or mismatch)\\ | ||
{\tt I} & insertion to the reference \\ | ||
{\tt D} & deletion from the reference \\ | ||
{\tt N} & skipped region from the reference \\ | ||
{\tt S} & soft clipping (clipped sequences present in {\sf SEQ})\\ | ||
{\tt H} & hard clipping (clipped sequences NOT present in {\sf SEQ})\\ | ||
{\tt P} & padding (silent deletion from padded reference)\\ | ||
{\tt =} & sequence match \\ | ||
{\tt X} & sequence mismatch \\ | ||
\hline | ||
\end{tabular} | ||
\end{center} | ||
\begin{itemize} | ||
\item S/H can only be the first or the last operation. | ||
\end{itemize} | ||
\item {\sf RNEXT}: Reference sequence name of the NEXT fragment in the | ||
template. This field is set as `*' when the information is | ||
unavailable. | ||
\item {\sf PNEXT}: Position of the NEXT fragment in the template. Set as | ||
0 when the information is unavailable. \emph{Implicit rules}: if {\sf | ||
RNEXT} is `*', {\sf PNEXT} is regarded to be 0, and vice versa. | ||
\item {\sf TLEN}: observed Template LENgth. It is set as 0 for | ||
a single-fragment template or when the information is unavailable. | ||
\item {\sf SEQ}: fragment SEQuence. This field can be a `*' when the | ||
sequence is not stored. If not a `*', the length of the sequence must | ||
equal the sum of lengths of M/I/S/=/X operations in {\sf CIGAR}. | ||
\item {\sf QUAL}: ASCII of base QUALity plus 33. A base quality equals | ||
$-10\log_{10}\Pr\{\mbox{base is wrong}\}$. This field can be a `*' | ||
when quality is not stored. If not a `*', {\sf SEQ} must not be a `*' and | ||
the length of the quality string must equal the length of {\sf SEQ}. | ||
\end{enumerate} | ||
|
||
\subsection{Optional fields} | ||
All optional fields can be absent. | ||
\begin{center} | ||
\begin{tabular}{ll} | ||
\hline | ||
{\bf Tag} & {\bf Description} \\ | ||
\hline | ||
\hline | ||
\end{tabular} | ||
\end{center} | ||
|
||
\section{The SAM Format Standards} | ||
|
||
\end{document} |