Skip to content

Commit

Permalink
[MISC] Switch from uppercase seqan3::field names to lower case.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Dec 12, 2019
1 parent 752fad9 commit c582635
Show file tree
Hide file tree
Showing 87 changed files with 1,113 additions and 1,075 deletions.
8 changes: 5 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
#### I/O

* Asynchronous input (background file reading) supported via seqan3::view::async_input_buffer.
* Reading field::CIGAR into a vector over seqan3::cigar is supported via seqan3::alignment_file_input.
* Writing field::CIGAR into a vector over seqan3::cigar is supported via seqan3::alignment_file_output.
* Reading field::cigar into a vector over seqan3::cigar is supported via seqan3::alignment_file_input.
* Writing field::cigar into a vector over seqan3::cigar is supported via seqan3::alignment_file_output.

## API changes

Expand All @@ -64,7 +64,9 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.

* The field-based in- and output interface for structure files through std::get and std::tie has been removed.
Output can instead be achieved with seqan3::views::zip(); for input we will implement unzip() in the future.
* The `field::FLAG` of SAM/BAM input and output is now an **enum** instead of a simple integer (see seqan3::sam_flag).
* The `field::flag` of SAM/BAM input and output is now an **enum** instead of a simple integer (see seqan3::sam_flag).
* Uppercase seqan3::field names are deprecated. Use the lower case field names instead. You can easily find and replace
all occurrences by the following regex: find `field::([A-Z_]+)` replace `field::\L$1`.

#### Range

Expand Down
2 changes: 1 addition & 1 deletion doc/tutorial/alignment_file/alignment_file_read_cigar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ int main()
{
std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory

alignment_file_input fin{tmp_dir/"my.sam", fields<field::CIGAR>{}};
alignment_file_input fin{tmp_dir/"my.sam", fields<field::cigar>{}};

for (auto & [cigar] : fin)
debug_stream << cigar << std::endl;
Expand Down
8 changes: 4 additions & 4 deletions doc/tutorial/alignment_file/alignment_file_snippets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ int main()
//![writing]
auto filename = std::filesystem::temp_directory_path()/"out.sam";

alignment_file_output fout{filename, fields<field::FLAG, field::MAPQ>{}};
alignment_file_output fout{filename, fields<field::flag, field::mapq>{}};

size_t mymapq{0};
seqan3::sam_flag flag{seqan3::sam_flag::unmapped};
Expand Down Expand Up @@ -89,7 +89,7 @@ int main()
//![read_custom_fields]
auto filename = std::filesystem::temp_directory_path()/"example.sam";

alignment_file_input fin{filename, fields<field::ID, field::SEQ, field::FLAG>{}};
alignment_file_input fin{filename, fields<field::id, field::seq, field::flag>{}};

for (auto & [id, seq, flag /*order!*/] : fin)
{
Expand All @@ -104,7 +104,7 @@ int main()
//![alignments_without_ref]
auto filename = std::filesystem::temp_directory_path()/"example.sam";

alignment_file_input fin{filename, fields<field::ID, field::ALIGNMENT>{}};
alignment_file_input fin{filename, fields<field::id, field::alignment>{}};

for (auto & [ id, alignment ] : fin)
{
Expand All @@ -120,7 +120,7 @@ int main()
std::vector<std::string> ref_ids{"ref"}; // list of one reference name
std::vector<dna5_vector> ref_sequences{"AGAGTTCGAGATCGAGGACTAGCGACGAGGCAGCGAGCGATCGAT"_dna5};

alignment_file_input fin{filename, ref_ids, ref_sequences, fields<field::ALIGNMENT>{}};
alignment_file_input fin{filename, ref_ids, ref_sequences, fields<field::alignment>{}};

for (auto & [ alignment ] : fin)
{
Expand Down
4 changes: 2 additions & 2 deletions doc/tutorial/alignment_file/alignment_file_solution1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ int main()
{
std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory

alignment_file_input fin{tmp_dir/"my.sam", fields<field::MAPQ>{}};
alignment_file_input fin{tmp_dir/"my.sam", fields<field::mapq>{}};

double sum{};
size_t c{};

std::ranges::for_each(fin.begin(), fin.end(), [&sum, &c] (auto & rec) { sum += get<field::MAPQ>(rec); ++c; });
std::ranges::for_each(fin.begin(), fin.end(), [&sum, &c] (auto & rec) { sum += get<field::mapq>(rec); ++c; });

debug_stream << "Average: " << (sum/c) << std::endl;
}
Expand Down
8 changes: 4 additions & 4 deletions doc/tutorial/alignment_file/alignment_file_solution2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,15 @@ int main()

// read in reference information
sequence_file_input<my_traits> reference_file{tmp_dir/"reference.fasta"};
concatenated_sequences<std::string> ref_ids = get<field::ID>(reference_file);
std::vector<std::vector<dna5>> ref_seqs = get<field::SEQ>(reference_file);
concatenated_sequences<std::string> ref_ids = get<field::id>(reference_file);
std::vector<std::vector<dna5>> ref_seqs = get<field::seq>(reference_file);

alignment_file_input mapping_file{tmp_dir/"mapping.sam",
ref_ids,
ref_seqs,
fields<field::ID,field::REF_ID, field::MAPQ, field::ALIGNMENT>{}};
fields<field::id,field::ref_id, field::mapq, field::alignment>{}};

auto mapq_filter = std::views::filter([] (auto & rec) { return get<field::MAPQ>(rec) >= 30; });
auto mapq_filter = std::views::filter([] (auto & rec) { return get<field::mapq>(rec) >= 30; });

for (auto & [id, ref_id, mapq, alignment] : mapping_file | mapq_filter)
{
Expand Down
2 changes: 1 addition & 1 deletion doc/tutorial/alignment_file/alignment_file_solution3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ int main()
std::vector<std::vector<dna4>> seqs = {"ACGATCGACTAGCTACGATCAGCTAGCAG"_dna4, "AGAAAGAGCGAGGCTATTTTAGCGAGTTA"_dna4};

auto tmp_dir = std::filesystem::temp_directory_path();
alignment_file_output fout{tmp_dir/"my.sam", fields<field::ID, field::SEQ>{}};
alignment_file_output fout{tmp_dir/"my.sam", fields<field::id, field::seq>{}};

for (size_t i = 0; i < ids.size(); ++i)
{
Expand Down
60 changes: 30 additions & 30 deletions doc/tutorial/alignment_file/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,22 +58,22 @@ with the only difference that **the header is mandatory**.

The Alignment file abstraction supports writing the following fields:

1. field::SEQ
2. field::ID
3. field::OFFSET
4. field::REF_SEQ
5. field::REF_ID
6. field::REF_OFFSET
7. field::ALIGNMENT
8. field::MAPQ
9. field::FLAG
10. field::QUAL
11. field::MATE
12. field::TAGS
13. field::EVALUE
14. field::BIT_SCORE

There is an additional field called seqan3::field::HEADER_PTR.
1. field::seq
2. field::id
3. field::offset
4. field::ref_seq
5. field::ref_id
6. field::ref_offset
7. field::alignment
8. field::mapq
9. field::flag
10. field::qual
11. field::mate
12. field::tags
13. field::evalue
14. field::bit_score

There is an additional field called seqan3::field::header_ptr.
It is used to transfer header information from seqan3::alignment_file_input to seqan3::alignment_file_output,
but you needn't deal with this field manually.

Expand All @@ -82,17 +82,17 @@ To make things clearer, here is the table of SAM columns connected to the corres

| # | SAM Column ID | FIELD name |
|:--:|:--------------|:----------------------------------------------------------------------------------|
| 1 | QNAME | seqan3::field::ID |
| 2 | FLAG | seqan3::field::FLAG |
| 3 | RNAME | seqan3::field::REF_ID |
| 4 | POS | seqan3::field::REF_OFFSET |
| 5 | MAPQ | seqan3::field::MAPQ |
| 6 | CIGAR | implicitly stored in seqan3::field::ALIGNMENT or directly in seqan3::field::CIGAR |
| 7 | RNEXT | seqan3::field::MATE (tuple pos 0) |
| 8 | PNEXT | seqan3::field::MATE (tuple pos 1) |
| 9 | TLEN | seqan3::field::MATE (tuple pos 2) |
| 10 | SEQ | seqan3::field::SEQ |
| 11 | QUAL | seqan3::field::QUAL |
| 1 | QNAME | seqan3::field::id |
| 2 | FLAG | seqan3::field::flag |
| 3 | RNAME | seqan3::field::ref_id |
| 4 | POS | seqan3::field::ref_offset |
| 5 | MAPQ | seqan3::field::mapq |
| 6 | CIGAR | implicitly stored in seqan3::field::alignment or directly in seqan3::field::cigar |
| 7 | RNEXT | seqan3::field::mate (tuple pos 0) |
| 8 | PNEXT | seqan3::field::mate (tuple pos 1) |
| 9 | TLEN | seqan3::field::mate (tuple pos 2) |
| 10 | SEQ | seqan3::field::seq |
| 11 | QUAL | seqan3::field::qual |

## File extensions

Expand Down Expand Up @@ -149,7 +149,7 @@ Note that this is possible for all SeqAn file objects.
Let's assume we want to compute the average mapping quality of a SAM file.

For this purpose, write a small program that
* only reads the mapping quality (field::MAPQ) out of a SAM file and
* only reads the mapping quality (field::mapq) out of a SAM file and
* computes the average of all qualities.

Use the following file to test your program:
Expand Down Expand Up @@ -246,7 +246,7 @@ With those information do the following:
* For the resulting alignments, print which read was mapped against which reference id and
the number of seqan3::gap's in each sequence (aligned reference and read sequence).

\note reference ids (field::REF_ID) are given as an index of type `std::optional<int32_t>`
\note reference ids (field::ref_id) are given as an index of type `std::optional<int32_t>`
that denotes the position of the reference id in the `ref_ids` vector passed to the alignment file.

Your program should print the following:
Expand All @@ -268,7 +268,7 @@ r004 mapped against 1 with 0 gaps in the read sequence and 0 gaps in the referen
## Reading the CIGAR string

If you are accustomed to the raw CIGAR information, we also support reading the CIGAR information into a
`std::vector<seqan3::cigar>` if you specify the `seqan3::field::CIGAR`.
`std::vector<seqan3::cigar>` if you specify the `seqan3::field::cigar`.

\snippet doc/tutorial/alignment_file/alignment_file_read_cigar.cpp code

Expand Down
14 changes: 7 additions & 7 deletions doc/tutorial/read_mapper/read_mapper_step4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ void map_reads(std::filesystem::path const & query_path,
sequence_file_input query_in{query_path};

//! [alignment_file_output]
alignment_file_output sam_out{sam_path, fields<field::SEQ,
field::ID,
field::REF_ID,
field::REF_OFFSET,
field::ALIGNMENT,
field::QUAL,
field::MAPQ>{}};
alignment_file_output sam_out{sam_path, fields<field::seq,
field::id,
field::ref_id,
field::ref_offset,
field::alignment,
field::qual,
field::mapq>{}};
//! [alignment_file_output]

configuration const search_config = search_cfg::max_error{search_cfg::total{errors}} |
Expand Down
14 changes: 7 additions & 7 deletions doc/tutorial/sequence_file/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ hoping that it will make the following tutorial easier to understand.
As mentioned above, our file object is a range over records.
More specifically over objects of type seqan3::record which is basically just a std::tuple that holds the data.
To identify or specialise which data is read/written and contained in the records,
we use seqan3::field tags (e.g. seqan3::field::SEQ denotes sequence information).
we use seqan3::field tags (e.g. seqan3::field::seq denotes sequence information).
The seqan3::field tags are shared between file formats and allow for easy file conversion.

Output files can handle various types that fulfill the requirements of the format (e.g.
Expand Down Expand Up @@ -142,10 +142,10 @@ You can also customise this list if you want to allow different or additional fi

The Sequence file abstraction supports reading four different fields:

1. seqan3::field::SEQ
2. seqan3::field::ID
3. seqan3::field::QUAL
4. seqan3::field::SEQ_QUAL
1. seqan3::field::seq
2. seqan3::field::id
3. seqan3::field::qual
4. seqan3::field::seq_qual

The first three fields are retrieved by default (and in that order!).
The last field may be selected to directly store sequence and qualities in a more memory-efficient
Expand Down Expand Up @@ -377,8 +377,8 @@ These work similarly to how they work on an std::vector.

If you pass a tuple to `push_back()` or give arguments to `emplace_back()` the order of elements is assumed
to be the same as the one in the seqan3::sequence_file_output::selected_field_ids.
For the above example the default FASTA fields are first seqan3::field::SEQ,
second seqan3::field::ID and the third one seqan3::field::QUAL.
For the above example the default FASTA fields are first seqan3::field::seq,
second seqan3::field::id and the third one seqan3::field::qual.
You may give fewer fields than are selected if the actual format you are writing to can cope with fewer
(e.g. for FASTA it is sufficient to give sequence and name information).

Expand Down
8 changes: 4 additions & 4 deletions doc/tutorial/sequence_file/sequence_file_snippets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ sequence_file_input fin2{std::filesystem::temp_directory_path()/"my.fastq"}; //

for (auto && [rec1, rec2] : views::zip(fin1, fin2)) // && is important! because views::zip returns temporaries
{
if (get<field::ID>(rec1) != get<field::ID>(rec2))
if (get<field::id>(rec1) != get<field::id>(rec2))
throw std::runtime_error("Oh oh your pairs don't match.");
}
//![paired_reads]
Expand All @@ -144,7 +144,7 @@ for (auto && records : fin | ranges::view::chunk(10)) // && is important! becaus
{
// `records` contains 10 elements (or less at the end)
debug_stream << "Taking the next 10 sequences:\n";
debug_stream << "ID: " << get<field::ID>(*records.begin()) << '\n'; // prints first ID in batch
debug_stream << "ID: " << get<field::id>(*records.begin()) << '\n'; // prints first ID in batch
}
//![read_in_batches]
}
Expand All @@ -156,14 +156,14 @@ sequence_file_input fin{std::filesystem::temp_directory_path()/"my.fastq"};
// std::views::filter takes a function object (a lambda in this case) as input that returns a boolean
auto minimum_quality_filter = std::views::filter([] (auto const & rec)
{
auto qual = get<field::QUAL>(rec) | std::views::transform([] (auto q) { return q.to_phred(); });
auto qual = get<field::qual>(rec) | std::views::transform([] (auto q) { return q.to_phred(); });
double sum = ranges::accumulate(qual.begin(), qual.end(), 0);
return sum / std::ranges::size(qual) >= 40; // minimum average quality >= 40
});

for (auto & rec : fin | minimum_quality_filter)
{
debug_stream << "ID: " << get<field::ID>(rec) << '\n';
debug_stream << "ID: " << get<field::id>(rec) << '\n';
}
//![quality_filter]
}
Expand Down
6 changes: 3 additions & 3 deletions doc/tutorial/sequence_file/sequence_file_solution1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ int main()

for (auto & rec : fin)
{
debug_stream << "ID: " << get<field::ID>(rec) << '\n';
debug_stream << "SEQ: " << get<field::SEQ>(rec) << '\n';
debug_stream << "QUAL: " << get<field::QUAL>(rec) << '\n';
debug_stream << "ID: " << get<field::id>(rec) << '\n';
debug_stream << "SEQ: " << get<field::seq>(rec) << '\n';
debug_stream << "QUAL: " << get<field::qual>(rec) << '\n';
}
}
//![solution]
6 changes: 3 additions & 3 deletions doc/tutorial/sequence_file/sequence_file_solution3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,21 @@ int main()

auto length_filter = std::views::filter([] (auto const & rec)
{
return std::ranges::size(get<field::SEQ>(rec)) >= 5;
return std::ranges::size(get<field::seq>(rec)) >= 5;
});

// you can use a for loop

// for (auto & rec : fin | length_filter | std::views::take(2))
// {
// debug_stream << "ID: " << get<field::ID>(rec) << '\n';
// debug_stream << "ID: " << get<field::id>(rec) << '\n';
// }

// But you can also do this to retrieve all IDs into a vector:
std::vector<std::string> ids = fin
| length_filter // apply length filter
| std::views::take(2) // take first two records
| views::get<field::ID> // select only ID from record
| views::get<field::id> // select only ID from record
| views::convert<std::string &&> // mark ID to be moved out of record
| views::to<std::vector<std::string>>; // convert to container
// Note that you need to know the type of id (std::string)
Expand Down
2 changes: 1 addition & 1 deletion doc/tutorial/sequence_file/sequence_file_solution4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ int main()

auto length_filter = std::views::filter([] (auto const & rec)
{
return std::ranges::size(get<field::SEQ>(rec)) >= 5;
return std::ranges::size(get<field::seq>(rec)) >= 5;
});

for (auto & rec : fin | length_filter)
Expand Down
2 changes: 1 addition & 1 deletion doc/tutorial/sequence_file/sequence_file_solution5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ int main()

auto length_filter = std::views::filter([] (auto & rec)
{
return std::ranges::size(get<field::SEQ>(rec)) >= 5;
return std::ranges::size(get<field::seq>(rec)) >= 5;
});

fout = fin | length_filter;
Expand Down
Loading

0 comments on commit c582635

Please sign in to comment.