diff --git a/.gitignore b/.gitignore index 5b66b76..0733e31 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,6 @@ # Executables *.exe -*.out *.app example_data* diff --git a/include/compare.h b/include/compare.h index bdd1757..c5c1e56 100644 --- a/include/compare.h +++ b/include/compare.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,15 @@ struct range_arguments : minimiser_arguments, strobemer_arguments uint8_t k_size; }; +struct accuracy_arguments : range_arguments +{ + std::vector input_file{}; + uint64_t ibfsize{}; + size_t number_hashes{1}; + std::filesystem::path search_file{}; + float threshold{0.5}; +}; + //!\brief Use dna4 instead of default dna5 struct my_traits : seqan3::sequence_file_input_default_traits_dna { @@ -62,6 +72,36 @@ struct my_traits2 : seqan3::sequence_file_input_default_traits_dna using sequence_container = std::string; }; +/*! \brief Function, loading compressed and uncompressed ibfs + * \param ibf ibf to load + * \param ipath Path, where the ibf can be found. + */ +template +void load_ibf(IBFType & ibf, std::filesystem::path ipath) +{ + std::ifstream is{ipath, std::ios::binary}; + cereal::BinaryInputArchive iarchive{is}; + iarchive(ibf); +} + +/*! \brief Function, which stored compressed and uncompressed ibfs + * \param ibf The IBF to store. + * \param opath Path, where the IBF should be stored. + */ +template +void store_ibf(IBFType const & ibf, + std::filesystem::path opath) +{ + std::ofstream os{opath, std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(seqan3::interleaved_bloom_filter(ibf)); +} + +/*! \brief Function, comparing the methods in regard of their coverage. + * \param args The arguments about the view to be used. + */ +void do_accuracy(accuracy_arguments & args); + /*! \brief Function, comparing the methods. * \param sequence_files A vector of sequence files. * \param args The arguments about the view to be used. diff --git a/src/compare.cpp b/src/compare.cpp index f380bf0..d47facc 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -47,6 +47,79 @@ std::vector +void accuracy(urng_t input_view, + std::string method_name, + accuracy_arguments & args) +{ + // Loading/Creating the ibf. + seqan3::interleaved_bloom_filter ibf; + if ((std::filesystem::path{args.input_file[0]}.extension() == ".ibf") & (args.input_file.size() == 1)) + { + load_ibf(ibf, args.input_file[0]); + } + else if (std::filesystem::path{args.input_file[0]}.extension() == ".out") + { + seqan3::interleaved_bloom_filter ibf_create{seqan3::bin_count{args.input_file.size()}, + seqan3::bin_size{args.ibfsize}, + seqan3::hash_function_count{args.number_hashes}}; + + uint64_t minimiser; + uint16_t minimiser_count; + for(size_t i = 0; i < args.input_file.size(); i++) + { + std::ifstream infile{args.input_file[i], std::ios::binary}; + while(infile.read((char*)&minimiser, sizeof(minimiser))) + { + infile.read((char*)&minimiser_count, sizeof(minimiser_count)); + ibf_create.emplace(minimiser, seqan3::bin_index{i}); + } + } + store_ibf(ibf_create, std::string{args.path_out} + method_name + ".ibf"); + load_ibf(ibf, std::string{args.path_out} + method_name + ".ibf"); + } + + // Search through the ibf with a given threshold. + + // Load search sequence file. + seqan3::sequence_file_input> fin{args.search_file}; + std::vector ids; + std::vector seqs; + for (auto & [id, seq] : fin) + { + ids.push_back(id); + seqs.push_back(seq); + } + + std::ofstream outfile; + outfile.open(std::string{args.path_out} + method_name + "_" + std::string{args.search_file.stem()} + ".search_out"); + // Go over the sequences in the search file. + for (int i = 0; i < seqs.size(); ++i) + { + std::vector counter; + counter.assign(ibf.bin_count(), 0); + uint64_t length = 0; + for (auto && hash : seqs[i] | input_view) + { + auto agent = ibf.membership_agent(); + std::transform (counter.begin(), counter.end(), agent.bulk_contains(hash).begin(), counter.begin(), + std::plus()); + ++length; + } + + outfile << ids[i] << "\t"; + for (int j = 0; j < ibf.bin_count(); ++j) + { + if (counter[j] >= (length * args.threshold)) + outfile << j << ","; + outfile << "\n"; + } + } + + outfile.close(); + +} + /*! \brief Function, comparing the methods. * \param sequence_files A vector of sequence files. * \param input_view View that should be tested. @@ -276,6 +349,21 @@ void compare_cov2(std::filesystem::path sequence_file, urng_t distance_view, std outfile.close(); } +void do_accuracy(accuracy_arguments & args) +{ + switch(args.name) + { + case kmer: accuracy(seqan3::views::kmer_hash(args.shape), "kmer_hash_" + std::to_string(args.k_size), args); + break; + case minimiser: accuracy(seqan3::views::minimiser_hash(args.shape, + args.w_size, args.seed_se), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); + break; + case modmers: accuracy(modmer_hash(args.shape, + args.w_size.get(), args.seed_se), "modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); + break; + } +} + void do_comparison(std::vector sequence_files, range_arguments & args) { switch(args.name) diff --git a/src/main.cpp b/src/main.cpp index 713d8a6..1b3b3f3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -49,15 +49,62 @@ void parsing(range_arguments & args) args.seed_se = seqan3::seed{adjust_seed(args.k_size, se)}; } +int accuracy(seqan3::argument_parser & parser) +{ + accuracy_arguments args{}; + parser.add_positional_option(args.input_file, "Either one input file containg the" + " ibf with the file extension" + "'.ibf' or multiple preprocessed " + "binary files ending with '.out'."); + parser.add_option(args.path_out, 'o', "out", + "Directory, where output files should be saved."); + parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size."); + std::string method{}; + parser.add_option(method, '\0', "method", "Pick your method.", + seqan3::option_spec::required, + seqan3::value_list_validator{"kmer", "minimiser", "modmer"}); + parser.add_option(args.search_file, '\0', "searchfile", "A sequence files with sequences to search for.", + seqan3::option_spec::required); + parser.add_option(args.ibfsize, '\0', "ibfsize", "The size of the ibf.", + seqan3::option_spec::advanced); + parser.add_option(args.number_hashes, '\0', "number_hashes", + "The number of hashes to use.", + seqan3::option_spec::advanced); + parser.add_option(args.threshold, '\0', "threshold", + "The threshold to use for the search.", + seqan3::option_spec::advanced); + + read_range_arguments_minimiser(parser, args); + + try + { + parser.parse(); + parsing(args); + } + catch (seqan3::argument_parser_error const & ext) // catch user errors + { + seqan3::debug_stream << "Error. Incorrect command line input for accuracy. " << ext.what() << "\n"; + return -1; + } + + string_to_methods(method, args.name); + do_accuracy(args); + + return 0; +} + int coverage(seqan3::argument_parser & parser) { range_arguments args{}; std::filesystem::path sequence_file{}; - parser.add_positional_option(sequence_file, "Please provide at least one sequence file."); - parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); + parser.add_positional_option(sequence_file, "Please provide one sequence file."); + parser.add_option(args.path_out, 'o', "out", + "Directory, where output files should be saved."); parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size."); std::string method{}; - parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer"}); + parser.add_option(method, '\0', "method", "Pick your method.", + seqan3::option_spec::required, + seqan3::value_list_validator{"kmer", "minimiser", "modmer"}); read_range_arguments_minimiser(parser, args); @@ -82,11 +129,14 @@ int speed(seqan3::argument_parser & parser) { range_arguments args{}; std::vector sequence_files{}; - parser.add_positional_option(sequence_files, "Please provide at least one sequence file."); - parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); + parser.add_positional_option(sequence_files, + "Please provide at least one sequence file."); + parser.add_option(args.path_out, 'o', "out", + "Directory, where output files should be saved."); parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size."); std::string method{}; - parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"}); + parser.add_option(method, '\0', "method", "Pick your method.", + seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"}); read_range_arguments_minimiser(parser, args); read_range_arguments_strobemers(parser, args); @@ -111,8 +161,9 @@ int speed(seqan3::argument_parser & parser) int main(int argc, char ** argv) { - seqan3::argument_parser top_level_parser{"minions", argc, argv, seqan3::update_notifications::on, - {"coverage", "speed"}}; + seqan3::argument_parser top_level_parser{"minions", argc, argv, + seqan3::update_notifications::on, + {"accuracy", "coverage", "speed"}}; // Parser top_level_parser.info.author = "Mitra Darvish"; // give parser some infos @@ -120,9 +171,9 @@ int main(int argc, char ** argv) try { - top_level_parser.parse(); // trigger command line parsing + top_level_parser.parse(); // trigger command line parsing } - catch (seqan3::argument_parser_error const & ext) // catch user errors + catch (seqan3::argument_parser_error const & ext) // catch user errors { seqan3::debug_stream << "Parsing error. " << ext.what() << "\n"; // give error message return -1; @@ -130,7 +181,9 @@ int main(int argc, char ** argv) seqan3::argument_parser & sub_parser = top_level_parser.get_sub_parser(); // hold a reference to the sub_parser - if (sub_parser.info.app_name == std::string_view{"minions-coverage"}) + if (sub_parser.info.app_name == std::string_view{"minions-accuracy"}) + accuracy(sub_parser); + else if (sub_parser.info.app_name == std::string_view{"minions-coverage"}) coverage(sub_parser); else if (sub_parser.info.app_name == std::string_view{"minions-speed"}) speed(sub_parser); diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt index 73ac436..c6705a0 100644 --- a/test/api/CMakeLists.txt +++ b/test/api/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required (VERSION 3.8) add_api_test (comparison_test.cpp) -target_use_datasources (comparison_test FILES example1.fasta) +target_use_datasources (comparison_test FILES example1.fasta example.ibf minimiser_hash_19_19_example1.out search.fasta) add_api_test (minimiser_distance_test.cpp) diff --git a/test/api/comparison_test.cpp b/test/api/comparison_test.cpp index 98a2593..ec2547d 100644 --- a/test/api/comparison_test.cpp +++ b/test/api/comparison_test.cpp @@ -30,3 +30,68 @@ TEST(minions, small_example) } infile.close(); } + +TEST(minions, accuracy_binary_file) +{ + accuracy_arguments args{}; + args.name = minimiser; + args.k_size = 19; + args.w_size = seqan3::window_size{19}; + args.shape = seqan3::ungapped{19}; + args.seed_se = seqan3::seed{adjust_seed(args.k_size)}; + args.input_file = {DATADIR"minimiser_hash_19_19_example1.out"}; + args.ibfsize = 1000000; + args.path_out = std::filesystem::path{std::string{std::filesystem::temp_directory_path()} + "/bin_"}; + args.search_file = DATADIR"search.fasta"; + do_accuracy(args); + + seqan3::interleaved_bloom_filter ibf{}; + load_ibf(ibf, std::string{args.path_out} + "minimiser_hash_19_19.ibf"); + auto agent = ibf.membership_agent(); + + // Check if ibf was created correctly + std::vector expected_result(1, 1); + auto & res = agent.bulk_contains(39030638997); + EXPECT_RANGE_EQ(expected_result, res); + + std::vector expected{"test 0,", "test2 0,"}; + int i{0}; + std::ifstream infile{std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out"}; + std::string line; + while (std::getline(infile, line)) + { + std::istringstream iss(line); + EXPECT_EQ(expected[i], line); + i++; + } + std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19.ibf"); + std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out"); +} + +TEST(minions, accuracy_existing_ibf) +{ + accuracy_arguments args{}; + args.name = minimiser; + args.k_size = 19; + args.w_size = seqan3::window_size{19}; + args.shape = seqan3::ungapped{19}; + args.seed_se = seqan3::seed{adjust_seed(args.k_size)}; + args.input_file = {DATADIR"example.ibf"}; + args.path_out = std::filesystem::path{std::string{std::filesystem::temp_directory_path()} + "/"}; + args.search_file = DATADIR"search.fasta"; + args.threshold = 0.5; + do_accuracy(args); + + std::vector expected{"test 0,", "test2 0,"}; + int i{0}; + std::ifstream infile{std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out"}; + std::string line; + while (std::getline(infile, line)) + { + std::istringstream iss(line); + EXPECT_EQ(expected[i], line); + i++; + } + std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19.ibf"); + std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out"); +} diff --git a/test/cli/CMakeLists.txt b/test/cli/CMakeLists.txt index 8492e84..10839bf 100644 --- a/test/cli/CMakeLists.txt +++ b/test/cli/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required (VERSION 3.8) add_cli_test (minions_options_test.cpp) +add_cli_test (minions_accuracy_test.cpp FILES example.ibf minimiser_hash_19_19_example1.out) add_cli_test (minions_coverage_test.cpp FILES example1.fasta) add_cli_test (minions_speed_test.cpp FILES example1.fasta) diff --git a/test/cli/minions_accuracy_test.cpp b/test/cli/minions_accuracy_test.cpp new file mode 100644 index 0000000..2543c59 --- /dev/null +++ b/test/cli/minions_accuracy_test.cpp @@ -0,0 +1,69 @@ +#include "cli_test.hpp" + +TEST_F(cli_test, no_options) +{ + cli_test_result result = execute_app("minions accuracy"); + std::string expected + { + "minions-accuracy\n" + "================\n" + " Try -h or --help for more information.\n" + }; + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, expected); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, kmer) +{ + cli_test_result result = execute_app("minions accuracy --method kmer -k 19 ", data("example.ibf"), "--searchfile", data("search.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, minimiser) +{ + cli_test_result result = execute_app("minions accuracy --method minimiser -k 19 -w 19 ", data("example.ibf"), "--searchfile", data("search.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, gapped_minimiser) +{ + cli_test_result result = execute_app("minions accuracy --method minimiser -k 19 -w 19 --shape 524223 ", data("example.ibf"), "--searchfile", data("search.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, modmer) +{ + cli_test_result result = execute_app("minions accuracy --method modmer -k 19 -w 2 ", data("example.ibf"), "--searchfile", data("search.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, no_ibf_yet) +{ + cli_test_result result = execute_app("minions accuracy --method minimiser -k 19 -w 19 --ibfsize 10000 ", data("minimiser_hash_19_19_example1.out"), "--searchfile", data("search.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, wrong_method) +{ + cli_test_result result = execute_app("minions accuracy --method submer -k 19 ", data("example.ibf"), "--searchfile", data("search.fasta")); + std::string expected + { + "Error. Incorrect command line input for accuracy. Validation failed " + "for option --method: Value submer is not one of [kmer,minimiser,modmer].\n" + }; + + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.err, expected); + EXPECT_EQ(result.out, std::string{}); +} diff --git a/test/cli/minions_options_test.cpp b/test/cli/minions_options_test.cpp index abf2b20..49a50e7 100644 --- a/test/cli/minions_options_test.cpp +++ b/test/cli/minions_options_test.cpp @@ -19,8 +19,9 @@ TEST_F(cli_test, fail_no_argument) cli_test_result result = execute_app("minions", "-v"); std::string expected { - "Parsing error. You either forgot or misspelled the subcommand! Please specify which sub-program you want to " - "use: one of [coverage,speed]. Use -h/--help for more information.\n" + "Parsing error. You either forgot or misspelled the subcommand! Please " + "specify which sub-program you want to use: one of [accuracy,coverage," + "speed]. Use -h/--help for more information.\n" }; EXPECT_NE(result.exit_code, 0); diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index 56a22f3..af9ba96 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -3,6 +3,15 @@ cmake_minimum_required (VERSION 3.8) include (cmake/app_datasources.cmake) # copies file to /data/* +declare_datasource (FILE example.ibf + URL ${CMAKE_SOURCE_DIR}/test/data/example.ibf + URL_HASH SHA256=8d18ce55fdbb78acbd4f44d5414de1b55ac9964e00e391a5fd12bcc1622b1c6c) +declare_datasource (FILE minimiser_hash_19_19_example1.out + URL ${CMAKE_SOURCE_DIR}/test/data/minimiser_hash_19_19_example1.out + URL_HASH SHA256=8086779dc7fb37a81f20a3d202d2f9c5a39c4611f67f2c3881e4ba394deef9e6) declare_datasource (FILE example1.fasta URL ${CMAKE_SOURCE_DIR}/test/data/example1.fasta URL_HASH SHA256=e7236e7b86303d84a86a4454044125005433857416183cdaac0f4cdf7ac34e06) +declare_datasource (FILE search.fasta + URL ${CMAKE_SOURCE_DIR}/test/data/search.fasta + URL_HASH SHA256=abed0af7e29a07f5964239be77b46c427369f88cbd2e3c677f763cd7d94f9e4a) diff --git a/test/data/example.ibf b/test/data/example.ibf new file mode 100644 index 0000000..2de81fe Binary files /dev/null and b/test/data/example.ibf differ diff --git a/test/data/minimiser_hash_19_19_example1.out b/test/data/minimiser_hash_19_19_example1.out new file mode 100644 index 0000000..050bbb8 Binary files /dev/null and b/test/data/minimiser_hash_19_19_example1.out differ diff --git a/test/data/search.fasta b/test/data/search.fasta new file mode 100644 index 0000000..293c828 --- /dev/null +++ b/test/data/search.fasta @@ -0,0 +1,4 @@ +> test +ATTTCTATAATATCACATGAATATACTCGACGACCAGCATTTCCTGTGATTCACCTAGAGGATA +> test2 +AAAAAAAAAAAAAAAAAAAAAAA