Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jkbonfield vcf int64 #1003

Closed
wants to merge 9 commits into from
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ RANLIB = ranlib
htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl

CPPFLAGS =
# TODO: make the 64-bit support for VCF optional via configure, for now add -DVCF_ALLOW_INT64=1 to CFLAGS manually
# TODO: probably update cram code to make it compile cleanly with -Wc++-compat
# For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600
#CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600
Expand Down
4 changes: 3 additions & 1 deletion README.large_positions.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ which have, or are expected to have, chromosomes longer than two gigabases.
Currently 64 bit positions can only be stored in SAM and VCF format files.
Binary BAM, CRAM and BCF cannot be used due to limitations in the formats
themselves. As SAM and VCF are text formats, they have no limit on the
size of numeric values.
size of numeric values. Note that while 64 bit positions are supported by
default for SAM, for VCF they must be enabled explicitly at compile time
by editing Makefile and adding -DVCF_ALLOW_INT64=1 to CFLAGS.

# Compatibility issues to check

Expand Down
2 changes: 1 addition & 1 deletion htslib/vcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1411,7 +1411,7 @@ static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
*q = (uint8_t*)p + 4;
return le_to_i32(p);
} else if (type == BCF_BT_INT64) {
*q = (uint8_t*)p + 4;
*q = (uint8_t*)p + 8;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an additional bug to the one I found. I wonder why it wasn't breaking things. Is this function only used when decoding the BCF file itself, in which case it's not triggerable as it's not actually a valid thing to have in the file?

(But still worth fixing as clearly the code is incorrect.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was breaking things when reading BCF_BT_INT64 values from an unofficial 64-bit flavor of BCF.

return le_to_i64(p);
} else { // Invalid type.
return 0;
Expand Down
2 changes: 1 addition & 1 deletion test/longrefs/index.expected2.vcf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45
2 changes: 1 addition & 1 deletion test/longrefs/index.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,4 @@
1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29
1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29
1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45
25 changes: 13 additions & 12 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -660,18 +660,19 @@ sub test_view
testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110";
testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam";

# VCF round trip
unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index
testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf";
testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz";
testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_";

# Build index and compare with on-the-fly one made earlier.
test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1;

# test_view can't do indexed look-ups on vcf, but we can use tabix
test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1;
test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1;
# 64-bit positions are currently not compiled in by default for VCF
# # VCF round trip
# unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index
# testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf";
# testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz";
# testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_";
#
# # Build index and compare with on-the-fly one made earlier.
# test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1;
#
# # test_view can't do indexed look-ups on vcf, but we can use tabix
# test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1;
# test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1;

if ($test_view_failures == 0) {
passed($opts, "large position tests");
Expand Down
123 changes: 104 additions & 19 deletions vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,26 @@ HTSLIB_EXPORT
uint32_t bcf_float_vector_end = 0x7F800002;

HTSLIB_EXPORT
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };

/*
Partial support for 64-bit POS and Number=1 INFO tags.
Notes:
- the support for 64-bit values is motivated by POS and INFO/END for large genomes
- the use of 64-bit values does not conform to the specification
- cannot output 64-bit BCF and if it does, it is not compatible with anything
- experimental, use at your risk
*/
#ifdef VCF_ALLOW_INT64
#define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */
#define BCF_MIN_BT_INT64 -9223372036854775800 /* INT64_MIN + 8, for internal use only */
#endif

#define BCF_IS_64BIT (1<<30)


static const char *dump_char(char *buffer, char c)
{
switch (c) {
Expand Down Expand Up @@ -1251,6 +1267,12 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
if (end - p < 4) return -1;
*q = p + 4;
*val = le_to_i32(p);
#ifdef VCF_ALLOW_INT64
} else if (t == BCF_BT_INT64) {
if (end - p < 4) return -1;
*q = p + 4;
*val = le_to_i32(p);
#endif
} else {
return -1;
}
Expand Down Expand Up @@ -1290,6 +1312,9 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
uint32_t i, reports;
const uint32_t is_integer = ((1 << BCF_BT_INT8) |
(1 << BCF_BT_INT16) |
#ifdef VCF_ALLOW_INT64
(1 << BCF_BT_INT64) |
#endif
(1 << BCF_BT_INT32));
const uint32_t is_valid_type = (is_integer |
(1 << BCF_BT_NULL) |
Expand Down Expand Up @@ -1728,6 +1753,12 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
}
bcf1_sync(v); // check if the BCF record was modified

if ( v->unpacked & BCF_IS_64BIT )
{
hts_log_error("Data contains 64-bit values not representable in BCF. Please use VCF instead");
return -1;
}

BGZF *fp = hfp->fp.bgzf;
union {
uint32_t i;
Expand Down Expand Up @@ -2040,6 +2071,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
return 0; // FIXME: check for errs in this function
}

#ifdef VCF_ALLOW_INT64
static int bcf_enc_long1(kstring_t *s, int64_t x) {
uint32_t e = 0;
if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
Expand All @@ -2057,6 +2089,7 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) {
}
return e == 0 ? 0 : -1;
}
#endif

static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
uint8_t *p;
Expand Down Expand Up @@ -2169,6 +2202,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
{
if ( !bcf_hdr_nsamples(h) ) return 0;

static int extreme_int_warned = 0;
char *r, *t;
int j, l, m, g;
khint_t k;
Expand Down Expand Up @@ -2362,7 +2396,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
int32_t *x = (int32_t*)(z->buf + z->size * m);
for (l = 0;; ++t) {
if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
else x[l++] = strtol(t, &t, 10);
else // x[l++] = strtol(t, &t, 10);
{
errno = 0;
char *te;
long int tmp_val = strtol(t, &te, 10);
Copy link
Contributor

@jkbonfield jkbonfield Dec 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be

int64_t tmp_val = strtoll(t, &te, 10);

so 32-bit systems are still doing the check with64-bit quantities. This is how code elsewhere works. (Eg see the v->pos assignment).

(I can amend this myself though.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For FORMAT fields I kept and agree with the original @daviesrob's approach of not supporting 64-bit values. We try to express FORMAT fields using int8_t whenever possible because this is the size bottleneck for VCF, would not like to encourage 64-bit values there at all.

(Having said that, of course, there is the PS tag which can be any number but usually would refer to a genomic coordinate and therefore could be 64-bit in principle. However, this can wait as it is not in widespread use for human data, not speaking about the hypothetical large genomes.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok fair enough. I don't know what's supported and what isn't, but if it's just permitted as 32-bit only then it's fine. Thanks.

if ( te==t || errno!=0 || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
{
if ( !extreme_int_warned )
{
hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,h->id[BCF_DT_ID][fmt[j-1].key].key,bcf_seqname(h,v), v->pos+1);
extreme_int_warned = 1;
}
tmp_val = bcf_int32_missing;
}
x[l++] = tmp_val;
t = te;
}
if (*t != ',') break;
}
if ( !l ) x[l++] = bcf_int32_missing;
Expand Down Expand Up @@ -2469,6 +2519,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p

int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
{
static int extreme_int_warned = 0;
int i = 0;
char *p, *q, *r, *t;
kstring_t *str;
Expand Down Expand Up @@ -2526,6 +2577,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
} else {
v->pos -= 1;
}
if (v->pos >= INT32_MAX)
v->unpacked |= BCF_IS_64BIT;
} else if (i == 2) { // ID
if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
else bcf_enc_size(str, 0, BCF_BT_CHAR);
Expand Down Expand Up @@ -2672,31 +2725,61 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
val_a = z;
}
if ((y>>4&0xf) == BCF_HT_INT) {
// Allow first value only to be 64 bit
// (for large END value)
int64_t v64 = strtoll(val, &te, 10);
if ( te==val ) { // conversion failed
val_a[0] = bcf_int32_missing;
v64 = bcf_int64_missing;
} else {
val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? v64 : bcf_int32_missing;
i = 0, t = val;
#ifdef VCF_ALLOW_INT64
int64_t val1;
if ( n_val==1 )
{
errno = 0;
long long int tmp_val = strtoll(val, &te, 10);
if ( te==val ) tmp_val = bcf_int64_missing;
else if ( te==val || errno!=0 || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
{
if ( !extreme_int_warned )
{
hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1);
extreme_int_warned = 1;
}
tmp_val = bcf_int64_missing;
}
val1 = tmp_val;
t = te;
i = 1; // this is just to avoid adding another nested block...
}
for (t = te; *t && *t != ','; t++);
if (*t == ',') ++t;
for (i = 1; i < n_val; ++i, ++t)
#else
int32_t val1;
#endif
for (; i < n_val; ++i, ++t)
{
val_a[i] = strtol(t, &te, 10);
if ( te==t ) // conversion failed
val_a[i] = bcf_int32_missing;
errno = 0;
long int tmp_val = strtol(t, &te, 10);
if ( te==t ) tmp_val = bcf_int32_missing;
else if ( errno!=0 || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
{
if ( !extreme_int_warned )
{
hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1);
extreme_int_warned = 1;
}
tmp_val = bcf_int32_missing;
}
val_a[i] = tmp_val;
for (t = te; *t && *t != ','; t++);
}
if (n_val == 1) {
bcf_enc_long1(str, v64);
#ifdef VCF_ALLOW_INT64
if ( val1<INT32_MIN || val1>BCF_MAX_BT_INT32 )
v->unpacked |= BCF_IS_64BIT;
bcf_enc_long1(str, val1);
#else
val1 = val_a[0];
bcf_enc_int1(str, val1);
#endif
} else {
bcf_enc_vint(str, n_val, val_a, -1);
}
if (strcmp(key, "END") == 0)
v->rlen = v64 - v->pos;
if (n_val==1 && strcmp(key, "END") == 0)
v->rlen = val1 - v->pos;
} else if ((y>>4&0xf) == BCF_HT_REAL) {
float *val_f = (float *)val_a;
for (i = 0, t = val; i < n_val; ++i, ++t)
Expand Down Expand Up @@ -3835,6 +3918,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v
else
bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
}
#ifdef VCF_ALLOW_INT64
else if ( type==BCF_HT_LONG )
{
if (n != 1) {
Expand All @@ -3843,6 +3927,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v
}
bcf_enc_long1(&str, *(int64_t *) values);
}
#endif
else
{
hts_log_error("The type %d not implemented yet", type);
Expand Down