Stats improvements

Computes the percentage of properly paired reads. Includes '=' and 'X' CIGAR operators when counting mapped bases. Implements overlap remove logic for paired-end reads in coverage calculation and mapped base counting. Overlap removal can be turned on using the stats `-p` or `--remove-overlaps` option. When a target file is in use calculates and prints the number of bases inside the target, and the percentage of target bases with coverage over a given threshold. The coverage threshold can be set using the `-g` or `--cov-threshold` option. Adds more unit tests and augments some of the old ones.
samtools · Jun 20, 2018 · 22caad0 · 22caad0
1 parent 475464c
commit 22caad0
Show file tree

Hide file tree

Showing 24 changed files with 3,437 additions and 23 deletions.
diff --git a/stats.c b/stats.c
diff --git a/test/stat/1.stats.expected b/test/stat/1.stats.expected
@@ -39,6 +39,7 @@ SN	inward oriented pairs:	1
 SN	outward oriented pairs:	0
 SN	pairs with other orientation:	0
 SN	pairs on different chromosomes:	0
+SN	percentage of properly paired reads (%):	100.0
 # First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.
 # Columns correspond to qualities and rows to cycles. First column is the cycle number.
 FFQ	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1

diff --git a/test/stat/10.stats.expected b/test/stat/10.stats.expected
@@ -39,6 +39,7 @@ SN	inward oriented pairs:	2
 SN	outward oriented pairs:	0
 SN	pairs with other orientation:	0
 SN	pairs on different chromosomes:	0
+SN	percentage of properly paired reads (%):	100.0
 # First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.
 # Columns correspond to qualities and rows to cycles. First column is the cycle number.
 FFQ	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2

diff --git a/test/stat/10_map_cigar.sam_s1_a_1.expected.bamstat b/test/stat/10_map_cigar.sam_s1_a_1.expected.bamstat
@@ -39,6 +39,7 @@ SN	inward oriented pairs:	1
 SN	outward oriented pairs:	0
 SN	pairs with other orientation:	0
 SN	pairs on different chromosomes:	0
+SN	percentage of properly paired reads (%):	100.0
 # First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.
 # Columns correspond to qualities and rows to cycles. First column is the cycle number.
 FFQ	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1

diff --git a/test/stat/10_map_cigar.sam_s1_b_1.expected.bamstat b/test/stat/10_map_cigar.sam_s1_b_1.expected.bamstat
@@ -39,6 +39,7 @@ SN	inward oriented pairs:	1
 SN	outward oriented pairs:	0
 SN	pairs with other orientation:	0
 SN	pairs on different chromosomes:	0
+SN	percentage of properly paired reads (%):	100.0
 # First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.
 # Columns correspond to qualities and rows to cycles. First column is the cycle number.
 FFQ	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1

diff --git a/test/stat/11.stats.expected b/test/stat/11.stats.expected
@@ -39,6 +39,9 @@ SN	inward oriented pairs:	12
 SN	outward oriented pairs:	0
 SN	pairs with other orientation:	0
 SN	pairs on different chromosomes:	1
+SN	percentage of properly paired reads (%):	92.3
+SN	bases inside the target:	42
+SN	percentage of target genome with coverage > 0 (%):	100.00
 # First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.
 # Columns correspond to qualities and rows to cycles. First column is the cycle number.
 FFQ	1	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0

diff --git a/test/stat/11.stats.g4.expected b/test/stat/11.stats.g4.expected
@@ -0,0 +1,166 @@
+# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK	cb2d2d82	bcd83869	62ec814e
+# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
+SN	raw total sequences:	26
+SN	filtered sequences:	0
+SN	sequences:	26
+SN	is sorted:	1
+SN	1st fragments:	14
+SN	last fragments:	12
+SN	reads mapped:	26
+SN	reads mapped and paired:	26	# paired-end technology bit set + both mates mapped
+SN	reads unmapped:	0
+SN	reads properly paired:	24	# proper-pair bit set
+SN	reads paired:	26	# paired-end technology bit set
+SN	reads duplicated:	0	# PCR or optical duplicate bit set
+SN	reads MQ0:	1	# mapped and MQ=0
+SN	reads QC failed:	0
+SN	non-primary alignments:	0
+SN	total length:	260	# ignores clipping
+SN	total first fragment length:	140	# ignores clipping
+SN	total last fragment length:	120	# ignores clipping
+SN	bases mapped:	260	# ignores clipping
+SN	bases mapped (cigar):	206	# more accurate
+SN	bases trimmed:	0
+SN	bases duplicated:	0
+SN	mismatches:	0	# from NM fields
+SN	error rate:	0.000000e+00	# mismatches / bases mapped (cigar)
+SN	average length:	10
+SN	average first fragment length:	10
+SN	average last fragment length:	10
+SN	maximum length:	10
+SN	maximum first fragment length:	10
+SN	maximum last fragment length:	10
+SN	average quality:	13.1
+SN	insert size average:	34.0
+SN	insert size standard deviation:	0.0
+SN	inward oriented pairs:	12
+SN	outward oriented pairs:	0
+SN	pairs with other orientation:	0
+SN	pairs on different chromosomes:	1
+SN	percentage of properly paired reads (%):	92.3
+SN	bases inside the target:	42
+SN	percentage of target genome with coverage > 4 (%):	85.71
+# First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.
+# Columns correspond to qualities and rows to cycles. First column is the cycle number.
+FFQ	1	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	2	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	3	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	4	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	5	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	6	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	7	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	8	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	9	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+FFQ	10	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0
+# Last Fragment Qualities. Use `grep ^LFQ | cut -f 2-` to extract this part.
+# Columns correspond to qualities and rows to cycles. First column is the cycle number.
+LFQ	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	2	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	3	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	4	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	5	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	6	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	7	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	8	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	9	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+LFQ	10	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0
+# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.
+GCF	19.85	0
+GCF	44.72	1
+GCF	54.77	4
+GCF	64.82	2
+GCF	74.87	4
+GCF	84.92	3
+# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.
+GCL	19.85	0
+GCL	44.72	2
+GCL	59.80	3
+GCL	74.87	4
+# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%]; and N and O counts as a percentage of all A/C/G/T bases [%]
+GCC	1	19.23	42.31	15.38	23.08	0.00	0.00
+GCC	2	19.23	19.23	46.15	15.38	0.00	0.00
+GCC	3	23.08	38.46	19.23	19.23	0.00	0.00
+GCC	4	23.08	23.08	38.46	15.38	0.00	0.00
+GCC	5	11.54	38.46	26.92	23.08	0.00	0.00
+GCC	6	23.08	23.08	34.62	19.23	0.00	0.00
+GCC	7	15.38	34.62	26.92	23.08	0.00	0.00
+GCC	8	26.92	23.08	38.46	11.54	0.00	0.00
+GCC	9	23.08	26.92	26.92	23.08	0.00	0.00
+GCC	10	23.08	23.08	38.46	15.38	0.00	0.00
+# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%]; and N and O counts as a percentage of all A/C/G/T bases [%]
+FBC	1	21.43	42.86	21.43	14.29	0.00	0.00
+FBC	2	7.14	28.57	42.86	21.43	0.00	0.00
+FBC	3	28.57	35.71	21.43	14.29	0.00	0.00
+FBC	4	14.29	28.57	35.71	21.43	0.00	0.00
+FBC	5	14.29	35.71	35.71	14.29	0.00	0.00
+FBC	6	14.29	28.57	28.57	28.57	0.00	0.00
+FBC	7	14.29	28.57	35.71	21.43	0.00	0.00
+FBC	8	21.43	28.57	28.57	21.43	0.00	0.00
+FBC	9	21.43	21.43	35.71	21.43	0.00	0.00
+FBC	10	14.29	28.57	35.71	21.43	0.00	0.00
+# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%]; and N and O counts as a percentage of all A/C/G/T bases [%]
+LBC	1	16.67	41.67	8.33	33.33	0.00	0.00
+LBC	2	33.33	8.33	50.00	8.33	0.00	0.00
+LBC	3	16.67	41.67	16.67	25.00	0.00	0.00
+LBC	4	33.33	16.67	41.67	8.33	0.00	0.00
+LBC	5	8.33	41.67	16.67	33.33	0.00	0.00
+LBC	6	33.33	16.67	41.67	8.33	0.00	0.00
+LBC	7	16.67	41.67	16.67	25.00	0.00	0.00
+LBC	8	33.33	16.67	50.00	0.00	0.00	0.00
+LBC	9	25.00	33.33	16.67	25.00	0.00	0.00
+LBC	10	33.33	16.67	41.67	8.33	0.00	0.00
+# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs
+IS	0	0	0	0	0
+IS	1	0	0	0	0
+IS	2	0	0	0	0
+IS	3	0	0	0	0
+IS	4	0	0	0	0
+IS	5	0	0	0	0
+IS	6	0	0	0	0
+IS	7	0	0	0	0
+IS	8	0	0	0	0
+IS	9	0	0	0	0
+IS	10	0	0	0	0
+IS	11	0	0	0	0
+IS	12	0	0	0	0
+IS	13	0	0	0	0
+IS	14	0	0	0	0
+IS	15	0	0	0	0
+IS	16	0	0	0	0
+IS	17	0	0	0	0
+IS	18	0	0	0	0
+IS	19	0	0	0	0
+IS	20	0	0	0	0
+IS	21	0	0	0	0
+IS	22	0	0	0	0
+IS	23	0	0	0	0
+IS	24	0	0	0	0
+IS	25	0	0	0	0
+IS	26	0	0	0	0
+IS	27	0	0	0	0
+IS	28	0	0	0	0
+IS	29	0	0	0	0
+IS	30	0	0	0	0
+IS	31	0	0	0	0
+IS	32	0	0	0	0
+IS	33	0	0	0	0
+IS	34	12	12	0	0
+# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count
+RL	10	26
+# Read lengths - first fragments. Use `grep ^FRL | cut -f 2-` to extract this part. The columns are: read length, count
+FRL	10	14
+# Read lengths - last fragments. Use `grep ^LRL | cut -f 2-` to extract this part. The columns are: read length, count
+LRL	10	12
+# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions
+# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)
+# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.
+COV	[1-1]	1	1
+COV	[2-2]	2	1
+COV	[3-3]	3	2
+COV	[4-4]	4	2
+COV	[5-5]	5	23
+COV	[6-6]	6	13
+# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile
+GCD	0.0	100.000	0.000	0.000	0.000	0.000	0.000