Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add accuracy #16

Merged
merged 8 commits into from
Feb 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

# Executables
*.exe
*.out
*.app

example_data*
40 changes: 40 additions & 0 deletions include/compare.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <seqan3/alphabet/nucleotide/dna5.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/io/sequence_file/all.hpp>
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
#include <seqan3/search/views/kmer_hash.hpp>
#include <seqan3/search/views/minimiser_hash.hpp>

Expand Down Expand Up @@ -47,6 +48,15 @@ struct range_arguments : minimiser_arguments, strobemer_arguments
uint8_t k_size;
};

struct accuracy_arguments : range_arguments
{
std::vector<std::filesystem::path> input_file{};
uint64_t ibfsize{};
size_t number_hashes{1};
std::filesystem::path search_file{};
float threshold{0.5};
};

//!\brief Use dna4 instead of default dna5
struct my_traits : seqan3::sequence_file_input_default_traits_dna
{
Expand All @@ -62,6 +72,36 @@ struct my_traits2 : seqan3::sequence_file_input_default_traits_dna
using sequence_container = std::string;
};

/*! \brief Function, loading compressed and uncompressed ibfs
* \param ibf ibf to load
* \param ipath Path, where the ibf can be found.
*/
template <class IBFType>
void load_ibf(IBFType & ibf, std::filesystem::path ipath)
{
std::ifstream is{ipath, std::ios::binary};
cereal::BinaryInputArchive iarchive{is};
iarchive(ibf);
}

/*! \brief Function, which stored compressed and uncompressed ibfs
* \param ibf The IBF to store.
* \param opath Path, where the IBF should be stored.
*/
template <class IBFType>
void store_ibf(IBFType const & ibf,
std::filesystem::path opath)
{
std::ofstream os{opath, std::ios::binary};
cereal::BinaryOutputArchive oarchive{os};
oarchive(seqan3::interleaved_bloom_filter(ibf));
}

/*! \brief Function, comparing the methods in regard of their coverage.
* \param args The arguments about the view to be used.
*/
void do_accuracy(accuracy_arguments & args);

/*! \brief Function, comparing the methods.
* \param sequence_files A vector of sequence files.
* \param args The arguments about the view to be used.
Expand Down
88 changes: 88 additions & 0 deletions src/compare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,79 @@ std::vector<std::tuple<uint64_t, unsigned int, unsigned int, unsigned int, unsig
strobes_vector = seq_to_minstrobes2(args.order, args.k_size, args.w_min, args.w_max, seq, 0);
}

template <typename urng_t>
void accuracy(urng_t input_view,
std::string method_name,
accuracy_arguments & args)
{
// Loading/Creating the ibf.
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed> ibf;
if ((std::filesystem::path{args.input_file[0]}.extension() == ".ibf") & (args.input_file.size() == 1))
{
load_ibf(ibf, args.input_file[0]);
}
else if (std::filesystem::path{args.input_file[0]}.extension() == ".out")
{
seqan3::interleaved_bloom_filter ibf_create{seqan3::bin_count{args.input_file.size()},
seqan3::bin_size{args.ibfsize},
seqan3::hash_function_count{args.number_hashes}};

uint64_t minimiser;
uint16_t minimiser_count;
for(size_t i = 0; i < args.input_file.size(); i++)
{
std::ifstream infile{args.input_file[i], std::ios::binary};
while(infile.read((char*)&minimiser, sizeof(minimiser)))
{
infile.read((char*)&minimiser_count, sizeof(minimiser_count));
ibf_create.emplace(minimiser, seqan3::bin_index{i});
}
}
store_ibf(ibf_create, std::string{args.path_out} + method_name + ".ibf");
load_ibf(ibf, std::string{args.path_out} + method_name + ".ibf");
}

// Search through the ibf with a given threshold.

// Load search sequence file.
seqan3::sequence_file_input<my_traits, seqan3::fields<seqan3::field::id, seqan3::field::seq>> fin{args.search_file};
std::vector<std::string> ids;
std::vector<seqan3::dna4_vector> seqs;
for (auto & [id, seq] : fin)
{
ids.push_back(id);
seqs.push_back(seq);
}

std::ofstream outfile;
outfile.open(std::string{args.path_out} + method_name + "_" + std::string{args.search_file.stem()} + ".search_out");
// Go over the sequences in the search file.
for (int i = 0; i < seqs.size(); ++i)
{
std::vector<uint32_t> counter;
counter.assign(ibf.bin_count(), 0);
uint64_t length = 0;
for (auto && hash : seqs[i] | input_view)
{
auto agent = ibf.membership_agent();
std::transform (counter.begin(), counter.end(), agent.bulk_contains(hash).begin(), counter.begin(),
std::plus<int>());
++length;
}

outfile << ids[i] << "\t";
for (int j = 0; j < ibf.bin_count(); ++j)
{
if (counter[j] >= (length * args.threshold))
outfile << j << ",";
outfile << "\n";
}
}

outfile.close();

}

/*! \brief Function, comparing the methods.
* \param sequence_files A vector of sequence files.
* \param input_view View that should be tested.
Expand Down Expand Up @@ -276,6 +349,21 @@ void compare_cov2(std::filesystem::path sequence_file, urng_t distance_view, std
outfile.close();
}

void do_accuracy(accuracy_arguments & args)
{
switch(args.name)
{
case kmer: accuracy(seqan3::views::kmer_hash(args.shape), "kmer_hash_" + std::to_string(args.k_size), args);
break;
case minimiser: accuracy(seqan3::views::minimiser_hash(args.shape,
args.w_size, args.seed_se), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args);
break;
case modmers: accuracy(modmer_hash(args.shape,
args.w_size.get(), args.seed_se), "modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args);
break;
}
}

void do_comparison(std::vector<std::filesystem::path> sequence_files, range_arguments & args)
{
switch(args.name)
Expand Down
75 changes: 64 additions & 11 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,62 @@ void parsing(range_arguments & args)
args.seed_se = seqan3::seed{adjust_seed(args.k_size, se)};
}

int accuracy(seqan3::argument_parser & parser)
{
accuracy_arguments args{};
parser.add_positional_option(args.input_file, "Either one input file containg the"
" ibf with the file extension"
"'.ibf' or multiple preprocessed "
"binary files ending with '.out'.");
parser.add_option(args.path_out, 'o', "out",
"Directory, where output files should be saved.");
parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size.");
std::string method{};
parser.add_option(method, '\0', "method", "Pick your method.",
seqan3::option_spec::required,
seqan3::value_list_validator{"kmer", "minimiser", "modmer"});
parser.add_option(args.search_file, '\0', "searchfile", "A sequence files with sequences to search for.",
seqan3::option_spec::required);
parser.add_option(args.ibfsize, '\0', "ibfsize", "The size of the ibf.",
seqan3::option_spec::advanced);
parser.add_option(args.number_hashes, '\0', "number_hashes",
"The number of hashes to use.",
seqan3::option_spec::advanced);
parser.add_option(args.threshold, '\0', "threshold",
"The threshold to use for the search.",
seqan3::option_spec::advanced);

read_range_arguments_minimiser(parser, args);

try
{
parser.parse();
parsing(args);
}
catch (seqan3::argument_parser_error const & ext) // catch user errors
{
seqan3::debug_stream << "Error. Incorrect command line input for accuracy. " << ext.what() << "\n";
return -1;
}

string_to_methods(method, args.name);
do_accuracy(args);

return 0;
}

int coverage(seqan3::argument_parser & parser)
{
range_arguments args{};
std::filesystem::path sequence_file{};
parser.add_positional_option(sequence_file, "Please provide at least one sequence file.");
parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved.");
parser.add_positional_option(sequence_file, "Please provide one sequence file.");
parser.add_option(args.path_out, 'o', "out",
"Directory, where output files should be saved.");
parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size.");
std::string method{};
parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer"});
parser.add_option(method, '\0', "method", "Pick your method.",
seqan3::option_spec::required,
seqan3::value_list_validator{"kmer", "minimiser", "modmer"});

read_range_arguments_minimiser(parser, args);

Expand All @@ -82,11 +129,14 @@ int speed(seqan3::argument_parser & parser)
{
range_arguments args{};
std::vector<std::filesystem::path> sequence_files{};
parser.add_positional_option(sequence_files, "Please provide at least one sequence file.");
parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved.");
parser.add_positional_option(sequence_files,
"Please provide at least one sequence file.");
parser.add_option(args.path_out, 'o', "out",
"Directory, where output files should be saved.");
parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size.");
std::string method{};
parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"});
parser.add_option(method, '\0', "method", "Pick your method.",
seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"});

read_range_arguments_minimiser(parser, args);
read_range_arguments_strobemers(parser, args);
Expand All @@ -111,26 +161,29 @@ int speed(seqan3::argument_parser & parser)

int main(int argc, char ** argv)
{
seqan3::argument_parser top_level_parser{"minions", argc, argv, seqan3::update_notifications::on,
{"coverage", "speed"}};
seqan3::argument_parser top_level_parser{"minions", argc, argv,
seqan3::update_notifications::on,
{"accuracy", "coverage", "speed"}};

// Parser
top_level_parser.info.author = "Mitra Darvish"; // give parser some infos
top_level_parser.info.version = "0.1.0";

try
{
top_level_parser.parse(); // trigger command line parsing
top_level_parser.parse(); // trigger command line parsing
}
catch (seqan3::argument_parser_error const & ext) // catch user errors
catch (seqan3::argument_parser_error const & ext) // catch user errors
{
seqan3::debug_stream << "Parsing error. " << ext.what() << "\n"; // give error message
return -1;
}

seqan3::argument_parser & sub_parser = top_level_parser.get_sub_parser(); // hold a reference to the sub_parser

if (sub_parser.info.app_name == std::string_view{"minions-coverage"})
if (sub_parser.info.app_name == std::string_view{"minions-accuracy"})
accuracy(sub_parser);
else if (sub_parser.info.app_name == std::string_view{"minions-coverage"})
coverage(sub_parser);
else if (sub_parser.info.app_name == std::string_view{"minions-speed"})
speed(sub_parser);
Expand Down
2 changes: 1 addition & 1 deletion test/api/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required (VERSION 3.8)

add_api_test (comparison_test.cpp)
target_use_datasources (comparison_test FILES example1.fasta)
target_use_datasources (comparison_test FILES example1.fasta example.ibf minimiser_hash_19_19_example1.out search.fasta)

add_api_test (minimiser_distance_test.cpp)

Expand Down
65 changes: 65 additions & 0 deletions test/api/comparison_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,68 @@ TEST(minions, small_example)
}
infile.close();
}

TEST(minions, accuracy_binary_file)
{
accuracy_arguments args{};
args.name = minimiser;
args.k_size = 19;
args.w_size = seqan3::window_size{19};
args.shape = seqan3::ungapped{19};
args.seed_se = seqan3::seed{adjust_seed(args.k_size)};
args.input_file = {DATADIR"minimiser_hash_19_19_example1.out"};
args.ibfsize = 1000000;
args.path_out = std::filesystem::path{std::string{std::filesystem::temp_directory_path()} + "/bin_"};
args.search_file = DATADIR"search.fasta";
do_accuracy(args);

seqan3::interleaved_bloom_filter ibf{};
load_ibf(ibf, std::string{args.path_out} + "minimiser_hash_19_19.ibf");
auto agent = ibf.membership_agent();

// Check if ibf was created correctly
std::vector<bool> expected_result(1, 1);
auto & res = agent.bulk_contains(39030638997);
EXPECT_RANGE_EQ(expected_result, res);

std::vector<std::string> expected{"test 0,", "test2 0,"};
int i{0};
std::ifstream infile{std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out"};
std::string line;
while (std::getline(infile, line))
{
std::istringstream iss(line);
EXPECT_EQ(expected[i], line);
i++;
}
std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19.ibf");
std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out");
}

TEST(minions, accuracy_existing_ibf)
{
accuracy_arguments args{};
args.name = minimiser;
args.k_size = 19;
args.w_size = seqan3::window_size{19};
args.shape = seqan3::ungapped{19};
args.seed_se = seqan3::seed{adjust_seed(args.k_size)};
args.input_file = {DATADIR"example.ibf"};
args.path_out = std::filesystem::path{std::string{std::filesystem::temp_directory_path()} + "/"};
args.search_file = DATADIR"search.fasta";
args.threshold = 0.5;
do_accuracy(args);

std::vector<std::string> expected{"test 0,", "test2 0,"};
int i{0};
std::ifstream infile{std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out"};
std::string line;
while (std::getline(infile, line))
{
std::istringstream iss(line);
EXPECT_EQ(expected[i], line);
i++;
}
std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19.ibf");
std::filesystem::remove(std::string{args.path_out} + "minimiser_hash_19_19_" + std::string{args.search_file.stem()} + ".search_out");
}
1 change: 1 addition & 0 deletions test/cli/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
cmake_minimum_required (VERSION 3.8)

add_cli_test (minions_options_test.cpp)
add_cli_test (minions_accuracy_test.cpp FILES example.ibf minimiser_hash_19_19_example1.out)
add_cli_test (minions_coverage_test.cpp FILES example1.fasta)
add_cli_test (minions_speed_test.cpp FILES example1.fasta)
Loading