Skip to content

Commit

Permalink
Warn if bgzf_getline() returned apparently UTF-16-encoded text
Browse files Browse the repository at this point in the history
Text files badly transferred from Windows may occasionally be
UTF-16-encoded, and this may not be easily noticed by the user.
HTSlib should not accept such encoding (as other tools surely don't,
hence doing so would cause interoperability problems), but it should
ideally emit a warning or error message identifying the problem.

Reading text from a htsFile/samFile/vcfFile will already have failed
with EFTYPE/ENOEXEC if the text file is UTF-16-encoded, as the encoding
will not have been recognised by hts_detect_format().

On the other hand, bgzf_getline() will return a UTF-16-encoded text line. Add a
suitable context-dependent diagnostic to the BGZF-based bgzf_getline()
calls in HTSlib: in hts_readlist()/hts_readlines(), emit a warning
(once, on the first line); in tbx.c, emit a more specific error message
if get_intv() parsing failure is due to UTF-16 encoding.

[TODO] If utf16_text_format were added to htsFormatCategory,
the new is_utf16_text() function is suitable for detecting it.
  • Loading branch information
jmarshall committed May 11, 2024
1 parent b204d55 commit 714fc63
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 2 deletions.
31 changes: 31 additions & 0 deletions hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,27 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim)
return 1;
}

// Check whether every second byte of [u, ulim), starting with *u itself,
// is a NUL byte.  Returns 1 if they all are (including when the range is
// empty), or 0 as soon as a non-NUL byte is found at one of those positions.
static inline int
alternate_zeros(const unsigned char *u, const unsigned char *ulim)
{
    for (; u < ulim; u += 2) {
        if (*u != '\0') return 0;
        // Stop before stepping: with fewer than two bytes remaining, u + 2
        // would point beyond one-past-the-end, which is undefined behaviour.
        if (ulim - u < 2) break;
    }
    return 1;
}

// Heuristically decide whether the bytes in [u, ulim) look like UTF-16 text.
// Returns 2 if the buffer starts with a byte order mark and every code unit
// following it has a zero byte in the position implied by that BOM;
// 1 if there is no such BOM match but the buffer is at least 8 bytes long
// and either its even-offset or its odd-offset bytes are all zero;
// 0 otherwise.
static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
{
    if (ulim - u >= 6) {
        int bom_fe_ff = (u[0] == 0xfe && u[1] == 0xff);
        int bom_ff_fe = (u[0] == 0xff && u[1] == 0xfe);

        // After FE FF the zero byte comes first in each pair, so check from
        // offset 2; after FF FE it comes second, so check from offset 3.
        if (bom_fe_ff && alternate_zeros(u + 2, ulim)) return 2;
        if (bom_ff_fe && alternate_zeros(u + 3, ulim)) return 2;
    }

    // Without a usable BOM, require a longer run before guessing.
    if (ulim - u < 8) return 0;

    if (alternate_zeros(u, ulim)) return 1;
    return alternate_zeros(u + 1, ulim) ? 1 : 0;
}

static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
{
const unsigned char *eol = memchr(u, '\n', ulim - u);
Expand Down Expand Up @@ -1961,6 +1982,12 @@ hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
return fp;
}

// Report whether the contents of a kstring appear to be UTF-16-encoded text.
// An empty string, or one with no buffer, yields 0; otherwise the result is
// whatever is_utf16_text() returns for the str->l bytes starting at str->s.
int hts_is_utf16_text(const kstring_t *str)
{
    if (str->l > 0 && str->s) {
        const unsigned char *start = (const unsigned char *) str->s;
        return is_utf16_text(start, start + str->l);
    }

    return 0;
}

// For VCF/BCF backward sweeper. Not exposing these functions because their
// future is uncertain. Things will probably have to change with hFILE...
BGZF *hts_get_bgzfp(htsFile *fp)
Expand Down Expand Up @@ -2030,6 +2057,8 @@ char **hts_readlist(const char *string, int is_file, int *_n)
while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
{
if (str.l == 0) continue;
if (n == 0 && hts_is_utf16_text(&str))
hts_log_warning("'%s' appears to be encoded as UTF-16", string);
if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
goto err;
s[n] = strdup(str.s);
Expand Down Expand Up @@ -2089,6 +2118,8 @@ char **hts_readlines(const char *fn, int *_n)
str.s = 0; str.l = str.m = 0;
while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
if (str.l == 0) continue;
if (n == 0 && hts_is_utf16_text(&str))
hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
goto err;
s[n] = strdup(str.s);
Expand Down
3 changes: 3 additions & 0 deletions hts_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ typedef struct hts_cram_idx_t {
struct cram_fd *cram;
} hts_cram_idx_t;

// Determine whether the string's contents appear to be UTF-16-encoded text.
// This is a heuristic based on a leading byte order mark and/or a zero byte
// in every second position of the str->l bytes at str->s.
// Returns 1 if they are, 2 if there is also a BOM, or 0 otherwise
// (including when the string is empty or has no buffer).
int hts_is_utf16_text(const kstring_t *str);

// Entry point to hFILE_multipart backend.
struct hFILE *hopen_htsget_redirect(struct hFILE *hfile, const char *mode);
Expand Down
7 changes: 5 additions & 2 deletions tbx.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,11 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
case TBX_UCSC: type = "TBX_UCSC"; break;
default: type = "TBX_GENERIC"; break;
}
hts_log_error("Failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"",
type, str->s);
if (hts_is_utf16_text(str))
hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type);
else
hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"",
type, str->s);
return -1;
}
}
Expand Down

0 comments on commit 714fc63

Please sign in to comment.