-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FIX] Parsing BAM headers with 64 reference sequences #2423
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -493,16 +493,40 @@ inline void format_sam_base::read_header(stream_view_type && stream_view, | |
ref_seqs_type & /*ref_id_to_pos_map*/) | ||
{ | ||
auto it = std::ranges::begin(stream_view); | ||
auto end = std::ranges::end(stream_view); | ||
std::vector<char> string_buffer{}; | ||
|
||
auto parse_tag_value = [&stream_view, &it, this] (auto & value) // helper function to parse the next tag value | ||
auto take_until_predicate = [&it, &string_buffer] (auto const & predicate) | ||
{ | ||
detail::consume(stream_view | views::take_until_or_throw(is_char<':'>)); // skip tag name | ||
std::ranges::next(std::ranges::begin(stream_view)); // skip ':' | ||
read_field(stream_view | views::take_until_or_throw(is_char<'\t'> || is_char<'\n'>), value); | ||
it = std::ranges::begin(stream_view); // make sure iterator is up2date | ||
string_buffer.clear(); | ||
while (!predicate(*it)) | ||
{ | ||
string_buffer.push_back(*it); | ||
++it; | ||
} | ||
}; | ||
|
||
auto skip_tag_name_and_colon = [&it] () | ||
{ | ||
while (!is_char<':'>(*it)) | ||
++it; | ||
++it; | ||
}; | ||
|
||
auto parse_tag_value = [&] (auto & value) // helper function to parse the next tag value | ||
{ | ||
skip_tag_name_and_colon(); | ||
take_until_predicate(is_char<'\t'> || is_char<'\n'>); | ||
read_field(string_buffer, value); | ||
}; | ||
|
||
auto parse_until_newline = [&] (auto & value) | ||
{ | ||
take_until_predicate(is_char<'\n'>); | ||
read_field(string_buffer, value); | ||
}; | ||
|
||
while (is_char<'@'>(*it)) | ||
while (it != end && is_char<'@'>(*it)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
{ | ||
++it; // skip @ | ||
|
||
|
@@ -552,7 +576,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view, | |
if (is_char<'\t'>(*it)) // read rest of the tags | ||
{ | ||
++it; // skip tab | ||
read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), get<1>(info)); | ||
parse_until_newline(get<1>(info)); | ||
} | ||
++it; // skip newline | ||
|
||
|
@@ -594,7 +618,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view, | |
if (is_char<'\t'>(*it)) // read rest of the tags | ||
{ | ||
++it; | ||
read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), get<1>(tmp)); | ||
parse_until_newline(get<1>(tmp)); | ||
} | ||
++it; // skip newline | ||
|
||
|
@@ -653,7 +677,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view, | |
++it; // skip C | ||
++it; // skip O | ||
++it; // skip : | ||
read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), tmp); | ||
parse_until_newline(tmp); | ||
++it; // skip newline | ||
|
||
hdr.comments.emplace_back(std::move(tmp)); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -180,6 +180,47 @@ struct sam_file_read<seqan3::format_bam> : public sam_file_data | |
'\x66', '\x66', '\x66', '\x46', '\x40', '\x7A', '\x7A', '\x5A', '\x73', '\x74', '\x72', '\x00' | ||
}; | ||
|
||
std::string many_refs{[] () | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The BAM would be multiple hundred lines long, that's why it is generated. The comments + BAM specification make it feasible to understand. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could have also generated a bam file and added it as file :D |
||
{ | ||
// X = non-printable character | ||
std::array<char, 2> buffer; | ||
// B A M X ----------- 1345 ----------- @ H D | ||
std::string result{'\x42', '\x41', '\x4D', '\x01', '\x41', '\x05', '\x00', '\x00', '\x40', '\x48', '\x44', | ||
// \t V N : 1 . 6 \n | ||
'\x09', '\x56', '\x4E', '\x3A', '\x31', '\x2E', '\x36', '\x0A'}; | ||
for (char i = '0'; i <= '9'; ++i) | ||
// @ S Q \t S N : r e f | ||
result += std::string{'\x40', '\x53', '\x51', '\x09', '\x53', '\x4E', '\x3A', '\x72', '\x65', '\x66', | ||
// _ \t L N : 1 0 0 \n | ||
'\x5F', i, '\x09', '\x4C', '\x4E', '\x3A', '\x31', '\x30', '\x30', '\x0A'}; | ||
for (size_t i = 10; i < 64; ++i) | ||
{ | ||
std::to_chars(buffer.data(), buffer.data() + buffer.size(), i); | ||
// @ S Q \t S N : r e f | ||
result += std::string{'\x40', '\x53', '\x51', '\x09', '\x53', '\x4E', '\x3A', '\x72', '\x65', '\x66', | ||
// _ \t L N : 1 0 0 | ||
'\x5F', buffer[0], buffer[1], '\x09', '\x4C', '\x4E', '\x3A', '\x31', '\x30', '\x30', | ||
// \n | ||
'\x0A'}; | ||
} | ||
// ------------ 64 ------------ | ||
result += std::string{'\x40', '\x00', '\x00', '\x00'}; | ||
for (char i = '0'; i <= '9'; ++i) | ||
// X X X X r e f _ X d | ||
result += std::string{'\x06', '\x00', '\x00', '\x00', '\x72', '\x65', '\x66', '\x5F', i, '\x00', '\x64', | ||
// X X X | ||
'\x00', '\x00', '\x00'}; | ||
for (size_t i = 10; i < 64; ++i) | ||
{ | ||
std::to_chars(buffer.data(), buffer.data() + buffer.size(), i); | ||
// X X X X r e f _ | ||
result += std::string{'\x07', '\x00', '\x00', '\x00', '\x72', '\x65', '\x66', '\x5F', buffer[0], buffer[1], | ||
// X d X X X | ||
'\x00', '\x64', '\x00', '\x00', '\x00'}; | ||
} | ||
return result; | ||
}()}; | ||
|
||
/* bytes were modified to a ref id of 8448: \x00 \x00 \x21 \x00*/ | ||
std::string unknown_ref_header{ | ||
'\x42', '\x41', '\x4D', '\x01', '\x1C', '\x00', '\x00', '\x00', '\x40', '\x48', '\x44', '\x09', '\x56', | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically it's every number of references whose last byte is
\x40
, e.g. it's also affecting320
, but I didn't come up with a concise way to describe this.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(mod 64) ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, it works properly for 128. The first/last(?) byte has to look like
01000000
or11000000
, i.e. the first least significant bit to be set is the seventh...There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you are right: (mod 256) == 64