seqan · marehr · Apr 21, 2021 · Apr 20, 2021 · Apr 21, 2021 · eseiler
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -96,6 +96,7 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
 * Requesting the alignment without also requesting the sequence for BAM files containing empty CIGAR strings does now
   not result in erroneous parsing ([\#2418](https://github.com/seqan/seqan3/pull/2418)).
 * BAM files with 64 references are now parsed correctly ([\#2423](https://github.com/seqan/seqan3/pull/2423)).
+* BAM files not containing a plain text header are now accepted ([\#2536](https://github.com/seqan/seqan3/pull/2536)).
 * Writing `gz`-compressed output no longer results in `bgzf`-compressed output. This change may have following effects
   ([\#2458](https://github.com/seqan/seqan3/pull/2458)):
   * A noticeable slowdown when writing `gz`-compressed content since, in contrast to `bgzf`, `gz` does not feature

diff --git a/include/seqan3/io/sam_file/format_bam.hpp b/include/seqan3/io/sam_file/format_bam.hpp
@@ -333,24 +333,43 @@ inline void format_bam::read_alignment_record(stream_type & stream,
         if (!std::ranges::equal(stream_view | views::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
             throw format_error{"File is not in BAM format."};
 
-        int32_t tmp32{};
-        read_field(stream_view, tmp32);
+        int32_t l_text{}; // length of header text including \0 character
+        int32_t n_ref{}; // number of reference sequences
+        int32_t l_name{}; // 1 + length of reference name including \0 character
+        int32_t l_ref{}; // length of reference sequence
 
-        if (tmp32 > 0) // header text is present
-            read_header(stream_view | views::take_exactly_or_throw(tmp32), header, ref_seqs);
+        read_field(stream_view, l_text);
+
+        if (l_text > 0) // header text is present
+            read_header(stream_view | views::take_exactly_or_throw(l_text), header, ref_seqs);
 
-        int32_t n_ref;
         read_field(stream_view, n_ref);
 
         for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
         {
-            read_field(stream_view, tmp32); // l_name (length of reference name including \0 character)
+            read_field(stream_view, l_name);
 
-            string_buffer.resize(tmp32 - 1);
-            std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.data()); // copy without \0 character
+            string_buffer.resize(l_name - 1);
+            std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.data()); // copy without \0 character
             std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
 
-            read_field(stream_view, tmp32); // l_ref (length of reference sequence)
+            read_field(stream_view, l_ref);
+
+            if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
+            {
+                // If there was no header text, we parse reference sequences block as header information
+                if (l_text == 0)
+                {
+                    auto & reference_ids = header.ref_ids();
+                    // put the length of the reference sequence into ref_id_info
+                    header.ref_id_info.emplace_back(l_ref, "");
+                    // put the reference name into reference_ids
+                    reference_ids.push_back(string_buffer);
+                    // assign the reference name an ascending reference id (starts at index 0).
+                    header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
+                    continue;
+                }
+            }
 
             auto id_it = header.ref_dict.find(string_buffer);
 
@@ -367,7 +386,7 @@ inline void format_bam::read_alignment_record(stream_type & stream,
                                                      " does not correspond to the position ", id_it->second,
                                                      " in the header (header.ref_ids():", header.ref_ids(), ").")};
             }
-            else if (std::get<0>(header.ref_id_info[id_it->second]) != tmp32) // [unlikely]
+            else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
             {
                 throw format_error{"Provided reference has unequal length as specified in the header."};
             }

diff --git a/test/unit/io/sam_file/format_bam_test.cpp b/test/unit/io/sam_file/format_bam_test.cpp
@@ -666,3 +666,23 @@ TEST_F(bam_format, issue2417)
 
     EXPECT_EQ(num_records, 1u);
 }
+
+// https://github.com/seqan/seqan3/issues/1201
+TEST_F(bam_format, issue1201)
+{
+    // A BAM file with an l_text of 0
+    std::string const input{
+        // read1	117	ref	1	0	*	=	1	0	ACGTA	IIIII
+        '\x42', '\x41', '\x4d', '\x01', '\x00', '\x00', '\x00', '\x00', '\x01', '\x00', '\x00', '\x00', '\x04', '\x00',
+        '\x00', '\x00', '\x72', '\x65', '\x66', '\x00', '\x70', '\x07', '\x00', '\x00', '\x2e', '\x00', '\x00', '\x00',
+        '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x06', '\x00', '\x49', '\x12', '\x00', '\x00',
+        '\x75', '\x00', '\x05', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+        '\x00', '\x00', '\x00', '\x00', '\x72', '\x65', '\x61', '\x64', '\x31', '\x00', '\x12', '\x48', '\x10', '\x28',
+        '\x28', '\x28', '\x28', '\x28'
+    };
+
+    std::istringstream stream{input};
+    seqan3::sam_file_input fin{stream, seqan3::format_bam{}};
+
+    fin.header().format_version;
+}