Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Modularisation clustering #49

Merged
merged 5 commits into from
Feb 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion include/breakend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,12 @@ inline bool operator<(const breakend & lhs, const breakend & rhs)
: lhs.orientation != rhs.orientation
? lhs.orientation < rhs.orientation
: lhs.position < rhs.position;
}
}

inline bool operator==(const breakend & lhs, const breakend & rhs)
{
return (lhs.seq_name == rhs.seq_name) &&
(lhs.position == rhs.position) &&
(lhs.orientation == rhs.orientation) &&
(lhs.seq_type == rhs.seq_type);
}
8 changes: 7 additions & 1 deletion include/detect_breakends/junction_detection.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "detect_breakends/bam_functions.hpp" // for hasFlag* functions
#include "junction.hpp" // for class junction
#include "detect_breakends/aligned_segment.hpp" // for struct aligned_segment
#include "method_enums.hpp" // for enum clustering_methods

/*! \brief Detects junctions between distant genomic positions by analyzing an alignment file (sam/bam). The detected
* junctions are printed on stdout and insertion alleles are stored in a fasta file.
Expand All @@ -16,6 +17,10 @@
* 2: split_read,
* 3: read_pairs,
* 4: read_depth)
* \param clustering_method method for clustering junctions (0: simple_clustering
* 1: hierarchical_clustering,
* 2: self-balancing_binary_tree,
* 3: candidate_selection_based_on_voting)
*
* \details Detects junctions from the CIGAR strings and supplementary alignment tags of read alignment records.
* We sort out unmapped alignments, secondary alignments, duplicates and alignments with low mapping quality.
Expand All @@ -24,4 +29,5 @@
*/
void detect_junctions_in_alignment_file(const std::filesystem::path & alignment_file_path,
const std::filesystem::path & insertion_file_path,
const std::vector<uint8_t> methods);
const std::vector<uint8_t> methods,
const clustering_methods clustering_method);
45 changes: 36 additions & 9 deletions include/junction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,28 @@

class junction
{
private:
breakend mate1{};
breakend mate2{};
std::string read_name{};

public:
uint64_t supporting_reads{1};

/*!\name Constructors, destructor and assignment
* \{
*/
constexpr junction() = default; //!< Defaulted.
junction(junction const &) = default; //!< Defaulted.
junction(junction &&) = default; //!< Defaulted.
junction & operator=(junction const &) = default; //!< Defaulted.
junction & operator=(junction &&) = default; //!< Defaulted.
~junction() = default; //!< Defaulted.
//!\}

junction(breakend mate1, breakend mate2, std::string read_name) : mate1{std::move(mate1)}, mate2{std::move(mate2)}, read_name{std::move(read_name)}
junction(breakend mate1, breakend mate2, std::string read_name) : mate1{std::move(mate1)},
mate2{std::move(mate2)},
read_name{std::move(read_name)}
{
if ((mate2.seq_type < mate1.seq_type) ||
(mate2.seq_type == mate1.seq_type && mate2.seq_name < mate1.seq_name) ||
Expand All @@ -30,18 +49,15 @@ class junction
{
return read_name;
}

private:
breakend mate1{};
breakend mate2{};
std::string read_name{};
};


template <typename stream_t>
inline stream_t operator<<(stream_t && stream, junction const & junc)
{
stream << junc.get_mate1() << '\t' << junc.get_mate2() << '\t' << junc.get_read_name();
stream << junc.get_mate1() << '\t'
<< junc.get_mate2() << '\t'
<< junc.get_read_name() << '\t'
<< junc.supporting_reads;
return stream;
}

Expand All @@ -52,4 +68,15 @@ inline bool operator<(const junction & lhs, const junction & rhs)
: rhs.get_mate1() < lhs.get_mate1()
? false
: lhs.get_mate2() < rhs.get_mate2();
}
}

/*! \brief A junction is equal to another, if their mates are equal to each other. The read_name and supporting_reads
* are allowed to be unequal, because more than one read could support the same junction.
*
* \param lhs left side junction
* \param rhs right side junction
*/
inline bool operator==(const junction & lhs, const junction & rhs)
{
return (lhs.get_mate1() == rhs.get_mate1()) && (lhs.get_mate2() == rhs.get_mate2());
}
10 changes: 10 additions & 0 deletions include/method_enums.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

//!\brief An enum for the different clustering methods.
enum clustering_methods
{
simple_clustering = 0,
hierarchical_clustering = 1,
self_balancing_binary_tree = 2,
candidate_selection_based_on_voting = 3
};
32 changes: 30 additions & 2 deletions src/detect_breakends.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,36 @@
#include <seqan3/argument_parser/all.hpp> // includes all necessary headers

#include <seqan3/range/views/all.hpp>
#include <seqan3/range/views/get.hpp>

#include "detect_breakends/junction_detection.hpp"

// Specialise a mapping from an identifying string to the respective value of your type clustering_methods. With the
// help of this function, you're able to call ./detect_breackends with -c 0 and -c simple_clustering and get the same
// result.
auto enumeration_names(clustering_methods)
{
return std::unordered_map<std::string_view,
clustering_methods>{{"0", clustering_methods::simple_clustering},
{"simple_clustering",
clustering_methods::simple_clustering},
{"1", clustering_methods::hierarchical_clustering},
{"hierarchical_clustering",
clustering_methods::hierarchical_clustering},
{"2", clustering_methods::self_balancing_binary_tree},
{"self_balancing_binary_tree",
clustering_methods::self_balancing_binary_tree},
{"3", clustering_methods::candidate_selection_based_on_voting},
{"candidate_selection_based_on_voting",
clustering_methods::candidate_selection_based_on_voting}};
}

struct cmd_arguments
{
std::filesystem::path alignment_file_path{};
std::filesystem::path insertion_file_path{};
std::vector<uint8_t> methods{1, 2, 3, 4}; // default is using all methods
std::vector<uint8_t> methods{1, 2, 3, 4}; // default is using all methods
clustering_methods clustering_method{simple_clustering}; // default is the simple clustering method
};

void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments & args)
Expand All @@ -28,6 +52,7 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
// "3", "read_pairs",
// "4", "read_depth"};
seqan3::arithmetic_range_validator method_validator{1, 4};
seqan3::value_list_validator clustering_method_validator {(seqan3::enumeration_names<clustering_methods> | seqan3::views::get<1>)};

// Options - Input / Output:
parser.add_positional_option(args.alignment_file_path, "Input read alignments in SAM or BAM format.",
Expand All @@ -38,6 +63,8 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
// Options - Methods:
parser.add_option(args.methods, 'm', "method", "Choose the method to be used.",
seqan3::option_spec::ADVANCED, method_validator);
parser.add_option(args.clustering_method, 'c', "clustering_method", "Choose the clustering method to be used.",
seqan3::option_spec::ADVANCED, clustering_method_validator);
}

int main(int argc, char ** argv)
Expand All @@ -59,7 +86,8 @@ int main(int argc, char ** argv)

detect_junctions_in_alignment_file(args.alignment_file_path,
args.insertion_file_path,
args.methods);
args.methods,
args.clustering_method);

return 0;
}
44 changes: 42 additions & 2 deletions src/detect_breakends/junction_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@ void analyze_cigar(std::string chromosome,
* 2: split_read,
* 3: read_pairs,
* 4: read_depth)
* \param clustering_method method for clustering junctions (0: simple_clustering
* 1: hierarchical_clustering,
* 2: self-balancing_binary_tree,
* 3: candidate_selection_based_on_voting)
* \endcond
*
* \details Detects junctions from the CIGAR strings and supplementary alignment tags of read alignment records.
Expand All @@ -237,7 +241,8 @@ void analyze_cigar(std::string chromosome,
*/
void detect_junctions_in_alignment_file(const std::filesystem::path & alignment_file_path,
const std::filesystem::path & insertion_file_path,
const std::vector<uint8_t> methods)
const std::vector<uint8_t> methods,
const clustering_methods clustering_method)
{
// Open input alignment file
using my_fields = seqan3::fields<seqan3::field::id,
Expand Down Expand Up @@ -309,9 +314,11 @@ void detect_junctions_in_alignment_file(const std::filesystem::path & alignment_
}
break;
case 3: // Detect junctions from read pair evidence
seqan3::debug_stream << "The read pair method is not yet implemented.\n";
break;
// continue;
case 4: // Detect junctions from read depth evidence
seqan3::debug_stream << "The read depth method is not yet implemented.\n";
break;
// continue;
}
Expand All @@ -325,9 +332,42 @@ void detect_junctions_in_alignment_file(const std::filesystem::path & alignment_
}
}
std::sort(junctions.begin(), junctions.end());

switch (clustering_method)
{
case 0: // simple_clustering
{
junction previous_elem = junctions[0];
int i = 1;
while (i < junctions.size())
{
if (junctions[i] == previous_elem)
{
junctions.erase(junctions.begin() + i);
junctions[i].supporting_reads ++;
}
else
{
previous_elem = junctions[i];
i++;
}
}
}
break;
case 1: // hierarchical clustering
seqan3::debug_stream << "The hierarchical clustering method is not yet implemented\n";
break;
case 2: // self-balancing_binary_tree,
seqan3::debug_stream << "The self-balancing binary tree clustering method is not yet implemented\n";
break;
case 3: // candidate_selection_based_on_voting
seqan3::debug_stream << "The candidate selection based on voting clustering method is not yet implemented\n";
break;
}

for (junction elem : junctions)
{
std::cout << elem << '\n';
}
seqan3::debug_stream << "Done. Found " << junctions.size() << " junctions." << '\n';
seqan3::debug_stream << "Done. Found " << junctions.size() << " junctions.\n";
}
47 changes: 27 additions & 20 deletions test/api/junction_detection_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,38 @@

// TEST(group1, fasta_out_empty)
// {
// std::string expected{"Reference\tchr9\t70103073\tForward\tReference\tchr9\t70103147\tForward\tm13802/6999/CCS\n"};
// std::string expected{"Reference\tchr9\t70103073\tForward\tReference\tchr9\t70103147\tForward\tm13802/6999/CCS\t1\n"};
// testing::internal::CaptureStdout();
// detect_junctions_in_alignment_file(DATADIR"simulated.minimap2.hg19.coordsorted_cutoff.sam", "");
// std::string std_cout = testing::internal::GetCapturedStdout();
// EXPECT_EQ(expected, std_cout);
// }

// Explanation for the stings:
// Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21
// INS from Primary Read - Sequence Type: Reference; Sequence Name: m2257/8161/CCS; Position: 41972616; Orientation: Reverse
// Sequence Type: Read; Sequence Name: 0; Position: 3975; Orientation: Reverse
// Chromosome: chr21
// Reference\tchr22\t17458417\tForward\tReference\tchr21\t41972615\tForward\tm41327/11677/CCS
// BND from SA Tag - Sequence Type: Reference; Chromosome: chr22; Position: 17458417; Orientation: Forward
// Sequence Type: Reference; Chromosome: chr21; Position: 41972615; Orientation: Forward
// Sequence Name: m41327/11677/CCS

Comment on lines +18 to +26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought I'd add an explanation of the string as I stumbled over it and was confused as to what the parts of the string were.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, makes sense :)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused by this string:

// Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21

It should consist of breakend1, breakend2 and the read name it was detected from. However, the read name here is chr21 and the reference chromosome of breakend1 is m2257/8161/CCS\t41972616. Those are swapped but I don't immediately see why 😕

TEST(junction_detection, fasta_out_not_empty)
{
std::string expected{
"Reference\tchr22\t17458417\tForward\tReference\tchr21\t41972615\tForward\tm41327/11677/CCS\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm21263/13017/CCS\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm38637/7161/CCS\n"
"Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21\n"
"Reference\tm2257/8161/CCS\t41972616\tReverse\tRead \t0\t3975\tReverse\tchr21\n"
"Reference\tchr22\t17458417\tForward\tReference\tchr21\t41972615\tForward\tm41327/11677/CCS\t1\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm21263/13017/CCS\t1\n"
"Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21\t2\n"
"Reference\tm2257/8161/CCS\t41972616\tReverse\tRead \t0\t3975\tReverse\tchr21\t1\n"
eldariont marked this conversation as resolved.
Show resolved Hide resolved
};

std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory
std::filesystem::remove(tmp_dir/"detect_breakends_out_short.fasta"); // remove old output if existent

testing::internal::CaptureStdout();
detect_junctions_in_alignment_file(DATADIR"simulated.minimap2.hg19.coordsorted_cutoff.sam",
tmp_dir/"detect_breakends_out_short.fasta", {1, 2, 3, 4});
tmp_dir/"detect_breakends_out_short.fasta", {1, 2, 3, 4}, simple_clustering);

std::string std_cout = testing::internal::GetCapturedStdout();
seqan3::debug_stream << "std_out:\n" << std_cout << '\n';
Expand All @@ -42,16 +51,16 @@ TEST(junction_detection, fasta_out_not_empty)
TEST(junction_detection, method_1_only)
{
std::string expected{
"Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21\n"
"Reference\tm2257/8161/CCS\t41972616\tReverse\tRead \t0\t3975\tReverse\tchr21\n"
"Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21\t1\n"
"Reference\tm2257/8161/CCS\t41972616\tReverse\tRead \t0\t3975\tReverse\tchr21\t1\n"
};

std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory
std::filesystem::remove(tmp_dir/"detect_breakends_out_short.fasta"); // remove old output if existent

testing::internal::CaptureStdout();
detect_junctions_in_alignment_file(DATADIR"simulated.minimap2.hg19.coordsorted_cutoff.sam",
tmp_dir/"detect_breakends_out_short.fasta", {1});
tmp_dir/"detect_breakends_out_short.fasta", {1}, simple_clustering);

std::string std_cout = testing::internal::GetCapturedStdout();
seqan3::debug_stream << "std_out:\n" << std_cout << '\n';
Expand All @@ -64,17 +73,16 @@ TEST(junction_detection, method_1_only)
TEST(junction_detection, method_2_only)
{
std::string expected{
"Reference\tchr22\t17458417\tForward\tReference\tchr21\t41972615\tForward\tm41327/11677/CCS\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm21263/13017/CCS\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm38637/7161/CCS\n"
"Reference\tchr22\t17458417\tForward\tReference\tchr21\t41972615\tForward\tm41327/11677/CCS\t1\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm21263/13017/CCS\t1\n"
};

std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory
std::filesystem::remove(tmp_dir/"detect_breakends_out_short.fasta"); // remove old output if existent

testing::internal::CaptureStdout();
detect_junctions_in_alignment_file(DATADIR"simulated.minimap2.hg19.coordsorted_cutoff.sam",
tmp_dir/"detect_breakends_out_short.fasta", {2});
tmp_dir/"detect_breakends_out_short.fasta", {2}, simple_clustering);

std::string std_cout = testing::internal::GetCapturedStdout();
seqan3::debug_stream << "std_out:\n" << std_cout << '\n';
Expand All @@ -87,19 +95,18 @@ TEST(junction_detection, method_2_only)
TEST(junction_detection, method_1_and_2)
{
std::string expected{
"Reference\tchr22\t17458417\tForward\tReference\tchr21\t41972615\tForward\tm41327/11677/CCS\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm21263/13017/CCS\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm38637/7161/CCS\n"
"Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21\n"
"Reference\tm2257/8161/CCS\t41972616\tReverse\tRead \t0\t3975\tReverse\tchr21\n"
"Reference\tchr22\t17458417\tForward\tReference\tchr21\t41972615\tForward\tm41327/11677/CCS\t1\n"
"Reference\tchr22\t17458418\tForward\tReference\tchr21\t41972616\tForward\tm21263/13017/CCS\t1\n"
"Reference\tm2257/8161/CCS\t41972616\tForward\tRead \t0\t2294\tForward\tchr21\t2\n"
"Reference\tm2257/8161/CCS\t41972616\tReverse\tRead \t0\t3975\tReverse\tchr21\t1\n"
};

std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory
std::filesystem::remove(tmp_dir/"detect_breakends_out_short.fasta"); // remove old output if existent

testing::internal::CaptureStdout();
detect_junctions_in_alignment_file(DATADIR"simulated.minimap2.hg19.coordsorted_cutoff.sam",
tmp_dir/"detect_breakends_out_short.fasta", {1, 2});
tmp_dir/"detect_breakends_out_short.fasta", {1, 2}, simple_clustering);

std::string std_cout = testing::internal::GetCapturedStdout();
seqan3::debug_stream << "std_out:\n" << std_cout << '\n';
Expand Down
Loading