seqan · marehr · Mar 4, 2021 · Mar 3, 2021 · eseiler · Mar 3, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -80,6 +80,7 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
 
 * Requesting the alignment without also requesting the sequence for BAM files containing empty CIGAR strings does now
   not result in erroneous parsing ([\#2418](https://github.com/seqan/seqan3/pull/2418)).
+* BAM files with 64 references are now parsed correctly ([\#2423](https://github.com/seqan/seqan3/pull/2423)).
 
 ## API changes
 

diff --git a/include/seqan3/io/sam_file/format_sam_base.hpp b/include/seqan3/io/sam_file/format_sam_base.hpp
@@ -493,16 +493,40 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
                                          ref_seqs_type & /*ref_id_to_pos_map*/)
 {
     auto it = std::ranges::begin(stream_view);
+    auto end = std::ranges::end(stream_view);
+    std::vector<char> string_buffer{};
 
-    auto parse_tag_value = [&stream_view, &it, this] (auto & value) // helper function to parse the next tag value
+    auto take_until_predicate = [&it, &string_buffer] (auto const & predicate)
     {
-        detail::consume(stream_view | views::take_until_or_throw(is_char<':'>)); // skip tag name
-        std::ranges::next(std::ranges::begin(stream_view));                     // skip ':'
-        read_field(stream_view | views::take_until_or_throw(is_char<'\t'> || is_char<'\n'>), value);
-        it = std::ranges::begin(stream_view);                                    // make sure iterator is up2date
+        string_buffer.clear();
+        while (!predicate(*it))
+        {
+            string_buffer.push_back(*it);
+            ++it;
+        }
+    };
+
+    auto skip_tag_name_and_colon = [&it] ()
+    {
+        while (!is_char<':'>(*it))
+            ++it;
+        ++it;
+    };
+
+    auto parse_tag_value = [&] (auto & value) // helper function to parse the next tag value
+    {
+        skip_tag_name_and_colon();
+        take_until_predicate(is_char<'\t'> || is_char<'\n'>);
+        read_field(string_buffer, value);
+    };
+
+    auto parse_until_newline = [&] (auto & value)
+    {
+        take_until_predicate(is_char<'\n'>);
+        read_field(string_buffer, value);
     };
 
-    while (is_char<'@'>(*it))
+    while (it != end && is_char<'@'>(*it))
     {
         ++it;                                                           // skip @
 
@@ -552,7 +576,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
                 if (is_char<'\t'>(*it))                                 // read rest of the tags
                 {
                     ++it;                                               // skip tab
-                    read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), get<1>(info));
+                    parse_until_newline(get<1>(info));
                 }
                 ++it;                                                   // skip newline
 
@@ -594,7 +618,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
                 if (is_char<'\t'>(*it))                                 // read rest of the tags
                 {
                     ++it;
-                    read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), get<1>(tmp));
+                    parse_until_newline(get<1>(tmp));
                 }
                 ++it;                                                   // skip newline
 
@@ -653,7 +677,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
                 ++it;                                                   // skip C
                 ++it;                                                   // skip O
                 ++it;                                                   // skip :
-                read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), tmp);
+                parse_until_newline(tmp);
                 ++it;                                                   // skip newline
 
                 hdr.comments.emplace_back(std::move(tmp));

diff --git a/test/unit/io/sam_file/format_bam_test.cpp b/test/unit/io/sam_file/format_bam_test.cpp
@@ -180,6 +180,47 @@ struct sam_file_read<seqan3::format_bam> : public sam_file_data
         '\x66', '\x66', '\x66', '\x46', '\x40', '\x7A', '\x7A', '\x5A', '\x73', '\x74', '\x72', '\x00'
     };
 
+    std::string many_refs{[] ()
+    {
+        // X = non-printable character
+        std::array<char, 2> buffer;
+        //                   B       A       M       X      ----------- 1345 -----------     @       H       D
+        std::string result{'\x42', '\x41', '\x4D', '\x01', '\x41', '\x05', '\x00', '\x00', '\x40', '\x48', '\x44',
+        //                   \t      V       N       :       1       .       6       \n
+                           '\x09', '\x56', '\x4E', '\x3A', '\x31', '\x2E', '\x36', '\x0A'};
+        for (char i = '0'; i <= '9'; ++i)
+            //                      @       S       Q       \t      S       N       :       r       e       f
+            result += std::string{'\x40', '\x53', '\x51', '\x09', '\x53', '\x4E', '\x3A', '\x72', '\x65', '\x66',
+            //                      _          \t      L       N       :       1       0       0       \n
+                                  '\x5F', i, '\x09', '\x4C', '\x4E', '\x3A', '\x31', '\x30', '\x30', '\x0A'};
+        for (size_t i = 10; i < 64; ++i)
+        {
+            std::to_chars(buffer.data(), buffer.data() + buffer.size(), i);
+            //                      @       S       Q       \t      S       N       :       r       e       f
+            result += std::string{'\x40', '\x53', '\x51', '\x09', '\x53', '\x4E', '\x3A', '\x72', '\x65', '\x66',
+            //                      _                             \t      L       N       :       1       0       0
+                                  '\x5F', buffer[0], buffer[1], '\x09', '\x4C', '\x4E', '\x3A', '\x31', '\x30', '\x30',
+            //                      \n
+                                  '\x0A'};
+        }
+        //                     ------------ 64 ------------
+        result += std::string{'\x40', '\x00', '\x00', '\x00'};
+        for (char i = '0'; i <= '9'; ++i)
+            //                      X       X       X       X       r       e       f       _          X       d
+            result += std::string{'\x06', '\x00', '\x00', '\x00', '\x72', '\x65', '\x66', '\x5F', i, '\x00', '\x64',
+            //                      X       X       X
+                                  '\x00', '\x00', '\x00'};
+        for (size_t i = 10; i < 64; ++i)
+        {
+            std::to_chars(buffer.data(), buffer.data() + buffer.size(), i);
+            //                      X       X       X       X       r       e       f       _
+            result += std::string{'\x07', '\x00', '\x00', '\x00', '\x72', '\x65', '\x66', '\x5F', buffer[0], buffer[1],
+            //                      X       d       X       X       X
+                                  '\x00', '\x64', '\x00', '\x00', '\x00'};
+        }
+        return result;
+    }()};
+
     /* bytes were modified to a ref id of 8448: \x00 \x00 \x21 \x00*/
     std::string unknown_ref_header{
         '\x42', '\x41', '\x4D', '\x01', '\x1C', '\x00', '\x00', '\x00', '\x40', '\x48', '\x44', '\x09', '\x56',

diff --git a/test/unit/io/sam_file/format_sam_test.cpp b/test/unit/io/sam_file/format_sam_test.cpp
@@ -65,6 +65,14 @@ read3	43	ref	3	63	1S1M1P1M1I1M1I1D1M1S	ref	10	300	GGAGTATA	!!*+,-./
 
     std::string unknown_ref_header{"@HD\tVN:1.6\n@SQ\tSN:ref\tLN:34\n*\t0\tunknown_ref\t1\t0\t4M\t*\t0\t0\tAAAA\t*\n"};
 
+    std::string many_refs{[] ()
+    {
+        std::string result{"@HD\tVN:1.6\n"};
+        for (size_t i = 0; i < 64; ++i)
+            result += "@SQ\tSN:ref_" + std::to_string(i) + "\tLN:100\n";
+        return result;
+    }()};
+
     // -----------------------------------------------------------------------------------------------------------------
     // formatted output
     // -----------------------------------------------------------------------------------------------------------------

diff --git a/test/unit/io/sam_file/sam_file_format_test_template.hpp b/test/unit/io/sam_file/sam_file_format_test_template.hpp
@@ -423,6 +423,17 @@ TYPED_TEST_P(sam_file_read, format_error_uneven_hexadecimal_tag)
     EXPECT_THROW((fin.begin()), seqan3::format_error);
 }
 
+// https://github.com/seqan/seqan3/pull/2423
+TYPED_TEST_P(sam_file_read, issue2423)
+{
+    typename TestFixture::stream_type istream{this->many_refs};
+    seqan3::sam_file_input fin{istream, TypeParam{}};
+    ASSERT_NO_THROW(fin.begin());
+
+    EXPECT_EQ(fin.header().ref_id_info.size(), 64u);
+    EXPECT_EQ(fin.header().ref_dict.size(), 64u);
+}
+
 // ----------------------------------------------------------------------------
 // sam_file_write
 // ----------------------------------------------------------------------------
@@ -727,7 +738,8 @@ REGISTER_TYPED_TEST_SUITE_P(sam_file_read,
                             read_mate_but_not_ref_id_without_ref,
                             cigar_vector,
                             format_error_ref_id_not_in_reference_information,
-                            format_error_uneven_hexadecimal_tag);
+                            format_error_uneven_hexadecimal_tag,
+                            issue2423);
 
 REGISTER_TYPED_TEST_SUITE_P(sam_file_write,
                             write_empty_members,