Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed a raft of integer overflows in VCF land. #1044

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions htslib/kstring.h
Expand Up @@ -38,7 +38,7 @@
#include "hts_defs.h"

#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x),(x)=(x)?(x):(uint32_t)-1)
#endif

#ifndef kroundup_size_t
Expand All @@ -49,7 +49,7 @@
(x)|=(x)>>(sizeof(size_t)), /* 4 or 8 */ \
(x)|=(x)>>(sizeof(size_t)*2), /* 8 or 16 */ \
(x)|=(x)>>(sizeof(size_t)*4), /* 16 or 32 */ \
++(x))
++(x),(x)=(x)?(x):(size_t)-1)
#endif

#if defined __GNUC__ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4))
Expand Down
52 changes: 37 additions & 15 deletions vcf.c
Expand Up @@ -2193,8 +2193,7 @@ static inline int align_mem(kstring_t *s)
int e = 0;
if (s->l&7) {
uint64_t zero = 0;
int l = ((s->l + 7)>>3<<3) - s->l;
e = kputsn((char*)&zero, l, s) < 0;
e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
}
return e == 0 ? 0 : -1;
}
Expand Down Expand Up @@ -2338,9 +2337,26 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
v->errcode |= BCF_ERR_TAG_INVALID;
return -1;
}
align_mem(mem);
if (align_mem(mem) < 0) {
hts_log_error("Memory allocation failure");
v->errcode |= BCF_ERR_LIMITS;
return -1;
}
f->offset = mem->l;
ks_resize(mem, mem->l + v->n_sample * f->size);

// Limit the total memory to ~2Gb per VCF row. This should mean
// malformed VCF data is less likely to take excessive memory and/or
// time.
if (v->n_sample * (uint64_t)f->size > INT_MAX) {
hts_log_error("Excessive memory required by FORMAT fields");
v->errcode |= BCF_ERR_LIMITS;
return -1;
}
if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
hts_log_error("Memory allocation failure");
v->errcode |= BCF_ERR_LIMITS;
return -1;
}
mem->l += v->n_sample * f->size;
}
for (j = 0; j < v->n_fmt; ++j)
Expand All @@ -2367,9 +2383,15 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
while ( t < end )
{
fmt_aux_t *z = &fmt[j++];
if (!z->buf) {
hts_log_error("Memory allocation failure for FORMAT field type %d",
z->y>>4&0xf);
v->errcode |= BCF_ERR_LIMITS;
return -1;
}
if ((z->y>>4&0xf) == BCF_HT_STR) {
if (z->is_gt) { // genotypes
int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * (size_t)m);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

m is a local variable. Rather than all this casting, can it be declared more accurately as size_t m instead? This might trickle down to fmt_aux_t::max_m and/or fmt_aux_t::size and suggest changing them to size_t too, but that doesn't seem like a bad thing…

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is so vast I lose track of the scope of these variables, so I just went with the minimalist change.

I'll take a look at changing the type for local var m. I don't wish to start changing data types though as that requires a deeper understanding of vcf which I don't currently have time to, nor the desire due to the cost of -10 sanity points!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm looking at it m is overloaded. The bit of code you're referring to uses m as sample id. In other places it's just listed as a vector (unspecified which). If I change type of m, should I also change type of g? of r? of l? or j? etc. I could create a new var sid say for sample id and use that throughout the specific block of code you refer to, but it's just as much code changing to read.

I think I'll keep this as is and request that @pd3 looks at changing types if he feels it is valid. I simply don't understand all the knock on implications of changing type here given the overloading of var names.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aarrgggghhhhh… fair enough

for (l = 0;; ++t) {
if (*t == '.') {
++t, x[l++] = is_phased;
Expand All @@ -2391,12 +2413,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
if ( !l ) x[l++] = 0; // An empty field, insert missing value
for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
} else {
char *x = (char*)z->buf + z->size * m;
char *x = (char*)z->buf + z->size * (size_t)m;
for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
for (; l < z->size; ++l) x[l] = 0;
}
} else if ((z->y>>4&0xf) == BCF_HT_INT) {
int32_t *x = (int32_t*)(z->buf + z->size * m);
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
for (l = 0;; ++t) {
if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
else
Expand All @@ -2421,7 +2443,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
if ( !l ) x[l++] = bcf_int32_missing;
for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
} else if ((z->y>>4&0xf) == BCF_HT_REAL) {
float *x = (float*)(z->buf + z->size * m);
float *x = (float*)(z->buf + z->size * (size_t)m);
for (l = 0;; ++t) {
if (*t == '.' && !isdigit_c(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
else x[l++] = strtod(t, &t);
Expand Down Expand Up @@ -2454,20 +2476,20 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
fmt_aux_t *z = &fmt[j];
if ((z->y>>4&0xf) == BCF_HT_STR) {
if (z->is_gt) {
int32_t *x = (int32_t*)(z->buf + z->size * m);
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
if (z->size) x[0] = bcf_int32_missing;
for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
} else {
char *x = (char*)z->buf + z->size * m;
char *x = (char*)z->buf + z->size * (size_t)m;
if ( z->size ) x[0] = '.';
for (l = 1; l < z->size; ++l) x[l] = 0;
}
} else if ((z->y>>4&0xf) == BCF_HT_INT) {
int32_t *x = (int32_t*)(z->buf + z->size * m);
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
x[0] = bcf_int32_missing;
for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
} else if ((z->y>>4&0xf) == BCF_HT_REAL) {
float *x = (float*)(z->buf + z->size * m);
float *x = (float*)(z->buf + z->size * (size_t)m);
bcf_float_set_missing(x[0]);
for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
}
Expand All @@ -2485,12 +2507,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
bcf_enc_int1(str, z->key);
if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
bcf_enc_size(str, z->size, BCF_BT_CHAR);
kputsn((char*)z->buf, z->size * v->n_sample, str);
kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
} else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
} else {
bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
if (serialize_float_array(str, (z->size>>2) * v->n_sample,
if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
(float *) z->buf) != 0) {
v->errcode |= BCF_ERR_LIMITS;
hts_log_error("Out of memory");
Expand Down Expand Up @@ -3038,7 +3060,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
if (gt_i == i)
bcf_format_gt(f,j,s);
else
bcf_fmt_array(s, f->n, f->type, f->p + j * f->size);
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
}
if ( first ) kputc('.', s);
}
Expand Down