Skip to content

Commit

Permalink
[FIX] compression streams writing wrong format
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Mar 23, 2021
1 parent 1487246 commit c3cb612
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 17 deletions.
2 changes: 1 addition & 1 deletion include/seqan3/io/detail/misc_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include <string>
#include <tuple>


#ifdef SEQAN3_HAS_BZIP2
#include <seqan3/contrib/stream/bz2_istream.hpp>
#endif
Expand All @@ -31,6 +30,7 @@
#include <seqan3/contrib/stream/gz_istream.hpp>
#endif
#include <seqan3/io/detail/magic_header.hpp>
#include <seqan3/io/exception.hpp>
#include <seqan3/utility/detail/exposition_only_concept.hpp>

namespace seqan3::detail
Expand Down
33 changes: 21 additions & 12 deletions include/seqan3/io/detail/misc_output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,20 @@

#pragma once

#include <seqan3/std/filesystem>
#include <iostream>
#include <string>
#include <tuple>

#include <seqan3/utility/detail/exposition_only_concept.hpp>
#ifdef SEQAN3_HAS_BZIP2
#include <seqan3/contrib/stream/bz2_ostream.hpp>
#endif
#ifdef SEQAN3_HAS_ZLIB
#include <seqan3/contrib/stream/bgzf_ostream.hpp>
#include <seqan3/contrib/stream/gz_ostream.hpp>
#endif
#include <seqan3/std/filesystem>
#include <seqan3/io/exception.hpp>
#include <seqan3/utility/detail/exposition_only_concept.hpp>

namespace seqan3::detail
{
Expand All @@ -46,26 +47,34 @@ inline auto make_secondary_ostream(std::basic_ostream<char_t> & primary_stream,

std::string extension = filename.extension().string();

if ((extension == ".gz") || (extension == ".bgzf") || (extension == ".bam"))
if (extension == ".gz")
{
#ifdef SEQAN3_HAS_ZLIB
#ifdef SEQAN3_HAS_ZLIB
filename.replace_extension("");
return {new contrib::basic_gz_ostream<char_t>{primary_stream}, stream_deleter_default};
#else
throw file_open_error{"Trying to write a gzipped file, but no ZLIB available."};
#endif
}
else if ((extension == ".bgzf") || (extension == ".bam"))
{
#ifdef SEQAN3_HAS_ZLIB
if (extension != ".bam") // remove extension except for bam
filename.replace_extension("");

return {new contrib::basic_bgzf_ostream<char_t>{primary_stream},
stream_deleter_default};
#else
throw file_open_error{"Trying to write a gzipped file, but no ZLIB available."};
#endif
return {new contrib::basic_bgzf_ostream<char_t>{primary_stream}, stream_deleter_default};
#else
throw file_open_error{"Trying to write a bgzf'ed file, but no ZLIB available."};
#endif
}
else if (extension == ".bz2")
{
#ifdef SEQAN3_HAS_BZIP2
#ifdef SEQAN3_HAS_BZIP2
filename.replace_extension("");
return {new contrib::basic_bz2_ostream<char_t>{primary_stream}, stream_deleter_default};
#else
#else
throw file_open_error{"Trying to write a bzipped file, but no libbz2 available."};
#endif
#endif
}
else if (extension == ".zst")
{
Expand Down
1 change: 1 addition & 0 deletions test/unit/io/detail/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
seqan3_test(detail_record_test.cpp)
seqan3_test(in_file_iterator_test.cpp)
seqan3_test(misc_test.cpp)
seqan3_test(misc_output_test.cpp)
seqan3_test(out_file_iterator_test.cpp)
seqan3_test(ignore_output_iterator_test.cpp)
seqan3_test(record_like_test.cpp)
Expand Down
68 changes: 68 additions & 0 deletions test/unit/io/detail/misc_output_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

#include <gtest/gtest.h>

#include <fstream>
#include <vector>

#include <seqan3/core/debug_stream.hpp>

#include <seqan3/io/detail/misc_input.hpp>
#include <seqan3/io/detail/misc_output.hpp>
#include <seqan3/test/tmp_filename.hpp>

// Creates a temporary file named "io_misc_output_test.txt.<file_extension>", writes a short payload
// (eight 'a' characters followed by a newline) into it through the stream returned by
// `seqan3::detail::make_secondary_ostream`, and returns the handle of that temporary file.
//
// `file_extension` is the extension without the leading dot, e.g. "gz", "bgzf" or "bz2".
inline seqan3::test::tmp_filename tmp_compressed_file(std::string const & file_extension)
{
    std::string const tmp_file_name = "io_misc_output_test.txt." + file_extension;
    seqan3::test::tmp_filename tmp_file{tmp_file_name.c_str()};

    // We need a copy of the path because `make_secondary_ostream` will strip the compression extension.
    auto file_path = tmp_file.get_path();

    // The secondary stream emits compressed bytes, so the underlying file must be opened in binary mode;
    // a text-mode stream may translate newline bytes on some platforms and corrupt the output.
    std::ofstream filestream{file_path, std::ios::binary};

    // `stream_ptr` is declared after `filestream` and is therefore destroyed first, i.e. while the
    // file stream it writes into is still alive.
    auto stream_ptr = seqan3::detail::make_secondary_ostream(filestream, file_path);
    *stream_ptr << std::string(8, 'a') << '\n';

    return tmp_file;
}

// Reads the complete file at `path` and returns its content as a sequence of chars.
//
// The file is opened in binary mode so that the bytes are returned exactly as stored on disk; the callers
// compare the content against compression magic headers, which must not be subject to newline translation.
inline std::vector<char> read_file_content(std::filesystem::path const & path)
{
    std::ifstream filestream{path, std::ios::binary};
    using char_t = decltype(filestream)::char_type;
    // A default-constructed istreambuf_iterator is the end-of-stream sentinel.
    return {std::istreambuf_iterator<char_t>{filestream}, std::istreambuf_iterator<char_t>{}};
}

#ifdef SEQAN3_HAS_ZLIB
// Issue 2455: a file with the "gz" extension must start with the gz magic header and must not carry
// a valid bgzf header.
TEST(misc_output, issue2455_gz)
{
    // Keep the temporary file handle alive until the content has been read.
    auto const compressed_file = tmp_compressed_file("gz");
    auto const file_content = read_file_content(compressed_file.get_path());

    EXPECT_TRUE(seqan3::detail::starts_with(file_content, seqan3::detail::gz_compression::magic_header));
    // The gz header is a prefix of the bgzf header, hence the magic header check alone cannot tell the
    // two formats apart; the full bgzf header validation has to fail for plain gz output.
    EXPECT_FALSE(seqan3::detail::bgzf_compression::validate_header(std::span{file_content}));
}

// Issue 2455: a file with the "bgzf" extension must carry a valid bgzf header.
TEST(misc_output, issue2455_bgzf)
{
    // Keep the temporary file handle alive until the content has been read.
    auto const compressed_file = tmp_compressed_file("bgzf");
    auto const file_content = read_file_content(compressed_file.get_path());

    EXPECT_TRUE(seqan3::detail::bgzf_compression::validate_header(std::span{file_content}));
}
#endif

#ifdef SEQAN3_HAS_BZIP2
// Issue 2455: a file with the "bz2" extension must start with the bz2 magic header.
TEST(misc_output, issue2455_bz)
{
    // Keep the temporary file handle alive until the content has been read.
    auto const compressed_file = tmp_compressed_file("bz2");
    auto const file_content = read_file_content(compressed_file.get_path());

    EXPECT_TRUE(seqan3::detail::starts_with(file_content, seqan3::detail::bz2_compression::magic_header));
}
#endif
1 change: 0 additions & 1 deletion test/unit/io/detail/misc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

#include <seqan3/io/detail/magic_header.hpp>
#include <seqan3/io/detail/misc.hpp>
#include <seqan3/std/ranges>
#include <seqan3/test/tmp_filename.hpp>

struct dummy_file
Expand Down
2 changes: 1 addition & 1 deletion test/unit/io/sam_file/sam_file_output_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ TEST(compression, by_filename_gz)

std::string buffer = compression_by_filename_impl(filename);
buffer[9] = '\x00'; // zero out OS byte.
EXPECT_EQ(buffer, expected_bgzf);
EXPECT_EQ(buffer, expected_gz);
}

TEST(compression, by_stream_gz)
Expand Down
2 changes: 1 addition & 1 deletion test/unit/io/sequence_file/sequence_file_output_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ TEST(compression, by_filename_gz)

std::string buffer = compression_by_filename_impl(filename);
buffer[9] = '\x00'; // zero out OS byte
EXPECT_EQ(buffer, expected_bgzf);
EXPECT_EQ(buffer, expected_gz);
}

TEST(compression, by_stream_gz)
Expand Down
2 changes: 1 addition & 1 deletion test/unit/io/structure_file/structure_file_output_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ TEST_F(structure_file_output_compression, by_filename_gz)
seqan3::test::tmp_filename filename{"structure_file_output_test.dbn.gz"};
std::string buffer = compression_by_filename_impl(filename);
buffer[9] = '\x00'; // zero out OS byte
EXPECT_EQ(buffer, expected_bgzf);
EXPECT_EQ(buffer, expected_gz);
}

TEST_F(structure_file_output_compression, by_stream_gz)
Expand Down

0 comments on commit c3cb612

Please sign in to comment.