Skip to content

Commit

Permalink
bgzip text compression mode
Browse files Browse the repository at this point in the history
Co-authored-by: Mike Lin <dna@mlin.net>

Compressing text now promotes alignment of BGZF blocks with the
uncompressed text lines. BGZF blocks start at the beginning of an
input line and end after some subsequent newline (except when the
block's first line overflows the BGZF block size).

This ensures it's possible to specify byte ranges of a BGZF file that
decompress into complete text records -- useful for parallel
processing and "slicing" from remote servers.

To disable this feature, and to provide a way to produce output identical
to that of 1.15 and earlier, the --binary option forces text data to be
processed as if it were binary.

The idea and initial implementation were Mike Lin's; the current
revised implementation is by James Bonfield.
  • Loading branch information
jkbonfield authored and whitwham committed Sep 1, 2022
1 parent 2ff03d3 commit e495718
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 20 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ htsfile: htsfile.o libhts.a
tabix: tabix.o libhts.a
$(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread

bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h)
bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h)
htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h)

Expand Down
11 changes: 11 additions & 0 deletions bgzip.1
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ after decompression completes the input file will be removed.

.SH OPTIONS
.TP 10
.B "--binary"
By default, bgzip will attempt to ensure BGZF blocks end on a newline
when the input is a text file. The exception to this is where a single
line is larger than a BGZF block (64 KiB). This can aid tools that use
the index to perform random access on the compressed stream, as the start
of a block is likely to also be the start of a text record.

This option processes text files as if they were binary content,
ignoring the location of newlines. This restores the behaviour of
bgzip version 1.15 and earlier for text files.
.TP
.BI "-b, --offset " INT
Decompress to standard output from virtual file position (0-based uncompressed
offset).
Expand Down
130 changes: 111 additions & 19 deletions bgzip.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@
#include <inttypes.h>
#include "htslib/bgzf.h"
#include "htslib/hts.h"
#include "htslib/hfile.h"

#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif

static const int WINDOW_SIZE = 64 * 1024;
static const int WINDOW_SIZE = BGZF_BLOCK_SIZE;

static void error(const char *format, ...)
{
Expand Down Expand Up @@ -121,15 +122,16 @@ static int bgzip_main_usage(FILE *fp, int status)
fprintf(fp, " -r, --reindex (re)index compressed file\n");
fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n");
fprintf(fp, " -t, --test test integrity of compressed file\n");
fprintf(fp, " --binary Don't align blocks with text lines\n");
fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n");
return status;
}

int main(int argc, char **argv)
{
int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep;
int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep, binary;
BGZF *fp;
void *buffer;
char *buffer;
long start, end, size;
char *index_fname = NULL;
int threads = 1;
Expand All @@ -151,10 +153,11 @@ int main(int argc, char **argv)
{"test", no_argument, NULL, 't'},
{"version", no_argument, NULL, 1},
{"keep", no_argument, NULL, 'k'},
{"binary", no_argument, NULL, 2},
{NULL, 0, NULL, 0}
};

compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0;
compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; binary = 0;
while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){
switch(c){
case 'd': compress = 0; break;
Expand All @@ -175,6 +178,7 @@ int main(int argc, char **argv)
"bgzip (htslib) %s\n"
"Copyright (C) 2022 Genome Research Ltd.\n", hts_version());
return EXIT_SUCCESS;
case 2: binary = 1; break;
case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS);
case '?': return bgzip_main_usage(stderr, EXIT_FAILURE);
}
Expand All @@ -185,7 +189,7 @@ int main(int argc, char **argv)
return 1;
}
if (compress == 1) {
int f_src = fileno(stdin);
hFILE* f_src = NULL;
char out_mode[3] = "w\0";
char out_mode_exclusive[4] = "wx\0";

Expand All @@ -198,13 +202,13 @@ int main(int argc, char **argv)
out_mode_exclusive[2] = compress_level + '0';
}

if (!(f_src = hopen(argc > optind ? argv[optind] : "-", "r"))) {
fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
return 1;
}

if ( argc>optind )
{
if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
return 1;
}

if (pstdout)
fp = bgzf_open("-", out_mode);
else
Expand Down Expand Up @@ -250,18 +254,103 @@ int main(int argc, char **argv)
bgzf_mt(fp, threads, 256);

buffer = malloc(WINDOW_SIZE);
#ifdef _WIN32
_setmode(f_src, O_BINARY);
#endif
if (!buffer)
return 1;
if (rebgzip){
if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);

while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0)
if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
}
else {
while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
htsFormat fmt;
int textual = 0;
if (!binary
&& hts_detect_format(f_src, &fmt) == 0
&& fmt.compression == no_compression) {
switch(fmt.format) {
case text_format:
case sam:
case vcf:
case bed:
case fasta_format:
case fastq_format:
case fai_format:
case fqi_format:
textual = 1;
break;
default: break; // silence clang warnings
}
}

if (binary || !textual) {
// Binary data, either detected or explicit
while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0)
if (bgzf_write(fp, buffer, c) < 0)
error("Could not write %d bytes: Error %d\n",
c, fp->errcode);
} else {
/* Text mode, try a flush after a newline */
int in_header = 1, n = 0, long_line = 0;
while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) {
int c2 = c+n;
int flush = 0;
if (in_header &&
(long_line || buffer[0] == '@' || buffer[0] == '#')) {
// Scan forward to find the last header line.
int last_start = 0;
n = 0;
while (n < c2) {
if (buffer[n++] != '\n')
continue;

last_start = n;
if (n < c2 &&
!(buffer[n] == '@' || buffer[n] == '#')) {
in_header = 0;
break;
}
}
if (!last_start) {
n = c2;
long_line = 1;
} else {
n = last_start;
flush = 1;
long_line = 0;
}
} else {
// Scan backwards to find the last newline.
n += c; // c read plus previous n overflow
while (--n >= 0 && ((char *)buffer)[n] != '\n')
;

if (n >= 0) {
flush = 1;
n++;
} else {
n = c2;
}
}

// Pos n is either at the end of the buffer with flush==0,
// or the first byte after a newline and a flush point.
if (bgzf_write(fp, buffer, n) < 0)
error("Could not write %d bytes: Error %d\n",
n, fp->errcode);
if (flush)
if (bgzf_flush_try(fp, 65536) < 0) // force
return -1;

memmove(buffer, buffer+n, c2-n);
n = c2-n;
}

// Trailing data.
if (bgzf_write(fp, buffer, n) < 0)
error("Could not write %d bytes: Error %d\n",
n, fp->errcode);
}
}
if ( index )
{
Expand All @@ -270,13 +359,16 @@ int main(int argc, char **argv)
error("Could not write index to '%s'\n", index_fname);
} else {
if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0)
error("Could not write index to '%s.gz.gzi'", argv[optind]);
error("Could not write index to '%s.gz.gzi'\n",
argv[optind]);
}
}
if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
if (bgzf_close(fp) < 0)
error("Output close failed: Error %d\n", fp->errcode);
if (hclose(f_src) < 0)
error("Input close failed\n");
if (argc > optind && !pstdout && !keep) unlink(argv[optind]);
free(buffer);
close(f_src);
return 0;
}
else if ( reindex )
Expand Down
24 changes: 24 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,30 @@ sub test_bgzip {
}
passed($opts,$test);

# Round-trip test of text in binary mode
my $test = sprintf('%s %2s threads', 'bgzip text mode round-trip',
$threads ? $threads : 'no');
print "$test: ";
my $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'";
my ($ret, $out) = _cmd($c);
if ($ret) {
failed($opts, $test, "non-zero exit from $c");
return;
}
$c = "$$opts{bin}/bgzip $at -d < '$compressed' > '$uncompressed'";
($ret, $out) = _cmd($c);
if ($ret) {
failed($opts, $test, "non-zero exit from $c");
return;
}
$c = "cmp '$data' '$uncompressed'";
($ret, $out) = _cmd($c);
if ($ret) {
failed($opts, $test, $out ? $out : "'$data' '$uncompressed' differ");
return;
}
passed($opts,$test);

# Extract from an offset
$test = sprintf('%s %2s threads', 'bgzip -b',
$threads ? $threads : 'no');
Expand Down

0 comments on commit e495718

Please sign in to comment.