Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix] alignment bug #222

Merged
merged 7 commits into from
Nov 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/mkindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,21 @@ void realMain(LambdaIndexerOptions const & options);
int mkindexMain(int const argc, char const ** argv)
{
LambdaIndexerOptions options;

#ifdef NDEBUG
try
{
parseCommandLine(options, argc, argv);
}
catch (sharg::parser_error const & ext) // catch user errors
{
std::cerr << "\n\nERROR: during command line parsing\n"
<< " \"" << ext.what() << "\"\n";
return -1;
}
#else
parseCommandLine(options, argc, argv);
#endif

#ifdef NDEBUG
try
Expand Down
14 changes: 14 additions & 0 deletions src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,21 @@ void realMain(LambdaOptions const & options);
int searchMain(int const argc, char const ** argv)
{
LambdaOptions options;

#ifdef NDEBUG
try
{
parseCommandLine(options, argc, argv);
}
catch (sharg::parser_error const & ext) // catch user errors
{
std::cerr << "\n\nERROR: during command line parsing\n"
<< " \"" << ext.what() << "\"\n";
return -1;
}
#else
parseCommandLine(options, argc, argv);
#endif

#ifdef _OPENMP
omp_set_num_threads(options.threads - options.lazyQryFile); // reserve one thread for I/O when lazy-loading
Expand Down
166 changes: 100 additions & 66 deletions src/search_algo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
return std::tie(m1._n_sId, m1.qStart, m1.qEnd, m1.sStart, m1.sEnd, m1.qFrameShift, m1.sFrameShift) ==
std::tie(m2._n_sId, m2.qStart, m2.qEnd, m2.sStart, m2.sEnd, m2.qFrameShift, m2.sFrameShift);
});
lH.stats.hitsDuplicate += before - record.matches.size();
lH.stats.hitsDuplicate2 += before - record.matches.size();

// sort by evalue before writing
record.matches.sort([](auto const & m1, auto const & m2) { return m1.bitScore > m2.bitScore; });
Expand All @@ -873,6 +873,14 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
}
lH.stats.hitsFinal += record.matches.size();

/* count uniq qry-subj-pairs */
lH.uniqSubjIds.clear();
lH.uniqSubjIds.reserve(record.matches.size());
for (auto const & bm : record.matches)
lH.uniqSubjIds.insert(bm._n_sId);

lH.stats.pairs += lH.uniqSubjIds.size();

// compute LCA
if (lH.options.computeLCA)
{
Expand Down Expand Up @@ -908,32 +916,25 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
// Function computeBlastMatch()
// --------------------------------------------------------------------------

template <typename TBlastMatch, typename TLocalHolder>
inline void _setupAlignInfix(TBlastMatch & bm, typename TLocalHolder::TMatch const & m, TLocalHolder & lH)
template <typename TLocalHolder>
inline void _widenMatch(Match & m, TLocalHolder const & lH)
{
int64_t startMod = (int64_t)m.subjStart - (int64_t)m.qryStart;
// move sStart as far left as needed to cover the part of query before qryStart
m.subjStart = (m.subjStart < m.qryStart) ? 0 : m.subjStart - m.qryStart;

bm.qEnd = lH.transQrySeqs[m.qryId].size();
decltype(bm.qEnd) band = _bandSize(bm.qEnd);
if (startMod >= 0)
{
bm.sStart = startMod;
bm.qStart = 0;
}
else
{
bm.sStart = 0;
bm.qStart = -startMod;
}
bm.sEnd = std::min<size_t>(bm.sStart + bm.qEnd - bm.qStart + band, lH.gH.transSbjSeqs[m.subjId].size());
/* always align full query independent of hit-region */
m.qryStart = 0;
m.qryEnd = lH.transQrySeqs[m.qryId].size();

if (bm.sStart >= band)
bm.sStart -= band;
else
bm.sStart = 0;
// there is no band in computation but this value extends begin and end of Subj to account for gaps
uint64_t band = _bandSize(lH.transQrySeqs[m.qryId].size());

// end on subject is beginning plus full query length plus band
m.subjEnd =
std::min<size_t>(m.subjStart + lH.transQrySeqs[m.qryId].size() + band, lH.gH.transSbjSeqs[m.subjId].size());

seqan::assignSource(bm.alignRow0, lH.transQrySeqs[m.qryId] | bio::views::slice(bm.qStart, bm.qEnd));
seqan::assignSource(bm.alignRow1, lH.gH.transSbjSeqs[m.subjId] | bio::views::slice(bm.sStart, bm.sEnd));
// account for band in subj start
m.subjStart = (band < m.subjStart) ? m.subjStart - band : 0;
}

template <typename TBlastMatch, typename TLocalHolder>
Expand Down Expand Up @@ -1133,7 +1134,48 @@ inline void _performAlignment(TDepSetH & depSetH,
}

template <typename TLocalHolder>
inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bsDirection::fwd)
inline void _widenAndPreprocessMatches(std::span<Match> & matches, TLocalHolder & lH)
{
auto before = matches.size();

for (Match & m : matches)
_widenMatch<TLocalHolder>(m, lH);

std::ranges::sort(matches);

if (matches.size() > 1)
{
// pairwise merge from left to right
for (auto it = matches.begin(); it < matches.end() - 1; ++it)
{
Match & l = *it;
Match & r = *(it + 1);
if ((std::tie(l.qryId, l.subjId) == std::tie(r.qryId, r.subjId)) && (l.subjEnd >= r.subjStart))
{
l.subjEnd = r.subjEnd;
r.subjStart = l.subjStart;
}
}

// pairwise "swallow" from right to left
for (auto it = matches.rbegin(); it < matches.rend() - 1; ++it)
{
Match & r = *it;
Match & l = *(it + 1);
if ((std::tie(r.qryId, r.subjId) == std::tie(l.qryId, l.subjId)) && (r.subjStart < l.subjEnd))
{
l = r;
}
}

auto [new_end, old_end] = std::ranges::unique(matches); // move non-uniq to the end
matches = std::span<Match>{matches.begin(), new_end}; // "resize" of the span
lH.stats.hitsDuplicate += (before - matches.size());
}
}

template <typename TLocalHolder>
inline void iterateMatchesFullSimd(std::span<Match> lambdaMatches, TLocalHolder & lH, bsDirection const dir)
{
using TGlobalHolder = typename TLocalHolder::TGlobalHolder;
using TBlastMatch = typename TLocalHolder::TBlastMatch;
Expand All @@ -1143,7 +1185,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
// statistics
#ifdef LAMBDA_MICRO_STATS
++lH.stats.numQueryWithExt;
lH.stats.numExtScore += seqan::length(lH.matches);
lH.stats.numExtScore += seqan::length(lambdaMatches);

double start = sysTime();
#endif
Expand All @@ -1152,58 +1194,37 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow0>::Type> depSetH;
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow1>::Type> depSetV;

// create blast matches
// pre-sort and filter
_widenAndPreprocessMatches(lambdaMatches, lH);

// create blast matches from Lambda matches
std::list<TBlastMatch> blastMatches;
for (auto it = lH.matches.begin(), itEnd = lH.matches.end(); it != itEnd; ++it)
for (Match const & m : lambdaMatches)
{
// In BS-mode, skip those results that have wrong orientation
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
{
if ((dir == bsDirection::fwd && (it->subjId % 2)) || (dir == bsDirection::rev && !(it->subjId % 2)))
continue;
}
// create blastmatch in list without copy or move
blastMatches.emplace_back(lH.qryIds[it->qryId / TGlobalHolder::qryNumFrames],
const_gH.indexFile.ids[it->subjId / TGlobalHolder::sbjNumFrames]);
blastMatches.emplace_back(lH.qryIds[m.qryId / TGlobalHolder::qryNumFrames],
const_gH.indexFile.ids[m.subjId / TGlobalHolder::sbjNumFrames]);

TBlastMatch & bm = blastMatches.back();

bm._n_qId = it->qryId / TGlobalHolder::qryNumFrames;
bm._n_sId = it->subjId / TGlobalHolder::sbjNumFrames;
bm._n_qId = m.qryId / TGlobalHolder::qryNumFrames;
bm._n_sId = m.subjId / TGlobalHolder::sbjNumFrames;

bm.qLength = //std::ranges::size(lH.transQrySeqs[it->qryId]);
std::ranges::size(lH.qrySeqs[bm._n_qId]);
bm.qLength = std::ranges::size(lH.qrySeqs[bm._n_qId]);
bm.sLength = std::ranges::size(lH.gH.indexFile.seqs[bm._n_sId]);

bm.sLength = // std::ranges::size(lH.gH.transSbjSeqs[it->subjId]);
std::ranges::size(lH.gH.indexFile.seqs[bm._n_sId]);
bm.qStart = m.qryStart;
bm.qEnd = m.qryEnd;
bm.sStart = m.subjStart;
bm.sEnd = m.subjEnd;
seqan::assignSource(bm.alignRow0, lH.transQrySeqs[m.qryId] | bio::views::slice(bm.qStart, bm.qEnd));
seqan::assignSource(bm.alignRow1, lH.gH.transSbjSeqs[m.subjId] | bio::views::slice(bm.sStart, bm.sEnd));

_setupAlignInfix(bm, *it, lH);

_setFrames(bm, *it, lH);
_setFrames(bm, m, lH);

if (lH.options.hasSTaxIds)
bm.sTaxIds = lH.gH.indexFile.sTaxIds[bm._n_sId];
}
#ifdef LAMBDA_MICRO_STATS
lH.stats.timeExtend += sysTime() - start;

// filter out duplicates
start = sysTime();
#endif
auto before = seqan::length(blastMatches);
blastMatches.sort(
[](auto const & l, auto const & r)
{
return std::tie(l._n_qId, l._n_sId, l.sStart, l.sEnd, l.qStart, l.qEnd, l.qFrameShift, l.sFrameShift) <
std::tie(r._n_qId, r._n_sId, r.sStart, r.sEnd, r.qStart, r.qEnd, r.qFrameShift, r.sFrameShift);
});
blastMatches.unique(
[](auto const & l, auto const & r)
{
return std::tie(l._n_qId, l._n_sId, l.sStart, l.sEnd, l.qStart, l.qEnd, l.qFrameShift, l.sFrameShift) ==
std::tie(r._n_qId, r._n_sId, r.sStart, r.sEnd, r.qStart, r.qEnd, r.qFrameShift, r.sFrameShift);
});
lH.stats.hitsDuplicate += (before - seqan::length(blastMatches));

// sort by lengths to minimize padding in SIMD
blastMatches.sort(
Expand All @@ -1217,6 +1238,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs

start = sysTime();
#endif

// fill batches
_setupDepSets(depSetH, depSetV, blastMatches);

Expand Down Expand Up @@ -1342,12 +1364,24 @@ inline void writeRecords(TLocalHolder & lH)
template <typename TLocalHolder>
inline void iterateMatches(TLocalHolder & lH)
{
iterateMatchesFullSimd(lH, bsDirection::fwd);
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
{
iterateMatchesFullSimd(lH, bsDirection::rev);
std::ranges::sort(lH.matches,
[](Match const & l, Match const & r) {
return std::tuple<bool, Match const &>{l.subjId % 2, l} <
std::tuple<bool, Match const &>{r.subjId % 2, r};
});

auto it = std::ranges::find_if(lH.matches, [](Match const & m) { return m.subjId % 2; });

iterateMatchesFullSimd(std::span{lH.matches.begin(), it}, lH, bsDirection::fwd);
iterateMatchesFullSimd(std::span{it, lH.matches.end()}, lH, bsDirection::rev);
lH.blastMatches.sort([](auto const & lhs, auto const & rhs) { return lhs._n_qId < rhs._n_qId; });
}
else
{
iterateMatchesFullSimd(lH.matches, lH, bsDirection::fwd);
}
}

//-----------------------------------------------------------------------
Expand Down
25 changes: 13 additions & 12 deletions src/search_datastructures.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,7 @@ struct Match
TPos subjStart;
TPos subjEnd;

inline bool operator==(Match const & m2) const
{
return std::tie(qryId, subjId, qryStart, subjStart, qryEnd, subjEnd) ==
std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart, m2.qryEnd, m2.subjEnd);
}
inline bool operator<(Match const & m2) const
{
return std::tie(qryId, subjId, qryStart, subjStart, qryEnd, subjEnd) <
std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart, m2.qryEnd, m2.subjEnd);
}
constexpr friend auto operator<=>(Match const &, Match const &) = default;
};

// template <typename TAlph>
Expand Down Expand Up @@ -117,10 +108,12 @@ struct StatsHolder
uint64_t hitsFailedExtendEValueTest;
uint64_t hitsAbundant;
uint64_t hitsDuplicate;
uint64_t hitsDuplicate2;

// final
uint64_t hitsFinal;
uint64_t qrysWithHit;
uint64_t pairs;

#ifdef LAMBDA_MICRO_STATS
// times
Expand Down Expand Up @@ -155,9 +148,11 @@ struct StatsHolder
hitsFailedExtendEValueTest = 0;
hitsAbundant = 0;
hitsDuplicate = 0;
hitsDuplicate2 = 0;

hitsFinal = 0;
qrysWithHit = 0;
pairs = 0;

#ifdef LAMBDA_MICRO_STATS
seedLengths.clear();
Expand Down Expand Up @@ -187,9 +182,11 @@ struct StatsHolder
hitsFailedExtendEValueTest += rhs.hitsFailedExtendEValueTest;
hitsAbundant += rhs.hitsAbundant;
hitsDuplicate += rhs.hitsDuplicate;
hitsDuplicate2 += rhs.hitsDuplicate2;

hitsFinal += rhs.hitsFinal;
qrysWithHit += rhs.qrysWithHit;
pairs += rhs.pairs;

#ifdef LAMBDA_MICRO_STATS
seqan::append(seedLengths, rhs.seedLengths);
Expand Down Expand Up @@ -248,6 +245,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options)
std::cout << "\n - failed %-identity test " << R << stats.hitsFailedExtendPercentIdentTest << RR
<< (rem -= stats.hitsFailedExtendPercentIdentTest);
std::cout << "\n - duplicates " << R << stats.hitsDuplicate << RR << (rem -= stats.hitsDuplicate);
std::cout << "\n - late duplicates " << R << stats.hitsDuplicate2 << RR
<< (rem -= stats.hitsDuplicate2);
std::cout << "\n - abundant " << R << stats.hitsAbundant << "\033[1m" << RR
<< (rem -= stats.hitsAbundant) << "\033[0m\n\n";

Expand Down Expand Up @@ -289,7 +288,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options)
if (options.verbosity >= 1)
{
auto const w = seqan::_numberOfDigits(stats.hitsFinal);
std::cout << "Number of valid hits: " << std::setw(w) << stats.hitsFinal
std::cout << "Number of total hits: " << std::setw(w) << stats.hitsFinal
<< "\nNumber of Query-Subject pairs: " << std::setw(w) << stats.pairs
<< "\nNumber of Queries with at least one valid hit: " << std::setw(w) << stats.qrysWithHit << "\n";
}
}
Expand Down Expand Up @@ -481,7 +481,8 @@ class LocalDataHolder
std::vector<std::string>, // not used
std::string_view,
uint32_t>;
std::list<TBlastMatch> blastMatches;
std::list<TBlastMatch> blastMatches;
std::unordered_set<uint64_t> uniqSubjIds;

// regarding the gathering of stats
StatsHolder stats{};
Expand Down
4 changes: 2 additions & 2 deletions src/search_misc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ struct QueryException : public std::runtime_error
// Alignment-related
// ============================================================================

inline int _bandSize(uint64_t const seqLength)
inline int64_t _bandSize(uint64_t const seqLength)
{
// Currently only used for padding of the subject sequences
return 63ull - std::countl_zero(std::bit_ceil(seqLength));
return static_cast<int64_t>(std::sqrt(seqLength)) + 1;
}

// ----------------------------------------------------------------------------
Expand Down
Loading
Loading