Skip to content

Commit

Permalink
Merge pull request #222 from h-2/align_bug
Browse files Browse the repository at this point in the history
[fix] alignment bug
  • Loading branch information
sarahet committed Nov 26, 2023
2 parents 73f370d + 01b7d01 commit d995cb5
Show file tree
Hide file tree
Showing 9 changed files with 599 additions and 315 deletions.
14 changes: 14 additions & 0 deletions src/mkindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,21 @@ void realMain(LambdaIndexerOptions const & options);
int mkindexMain(int const argc, char const ** argv)
{
LambdaIndexerOptions options;

#ifdef NDEBUG
try
{
parseCommandLine(options, argc, argv);
}
catch (sharg::parser_error const & ext) // catch user errors
{
std::cerr << "\n\nERROR: during command line parsing\n"
<< " \"" << ext.what() << "\"\n";
return -1;
}
#else
parseCommandLine(options, argc, argv);
#endif

#ifdef NDEBUG
try
Expand Down
14 changes: 14 additions & 0 deletions src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,21 @@ void realMain(LambdaOptions const & options);
int searchMain(int const argc, char const ** argv)
{
LambdaOptions options;

#ifdef NDEBUG
try
{
parseCommandLine(options, argc, argv);
}
catch (sharg::parser_error const & ext) // catch user errors
{
std::cerr << "\n\nERROR: during command line parsing\n"
<< " \"" << ext.what() << "\"\n";
return -1;
}
#else
parseCommandLine(options, argc, argv);
#endif

#ifdef _OPENMP
omp_set_num_threads(options.threads - options.lazyQryFile); // reserve one thread for I/O when lazy-loading
Expand Down
166 changes: 100 additions & 66 deletions src/search_algo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
return std::tie(m1._n_sId, m1.qStart, m1.qEnd, m1.sStart, m1.sEnd, m1.qFrameShift, m1.sFrameShift) ==
std::tie(m2._n_sId, m2.qStart, m2.qEnd, m2.sStart, m2.sEnd, m2.qFrameShift, m2.sFrameShift);
});
lH.stats.hitsDuplicate += before - record.matches.size();
lH.stats.hitsDuplicate2 += before - record.matches.size();

// sort by evalue before writing
record.matches.sort([](auto const & m1, auto const & m2) { return m1.bitScore > m2.bitScore; });
Expand All @@ -873,6 +873,14 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
}
lH.stats.hitsFinal += record.matches.size();

/* count uniq qry-subj-pairs */
lH.uniqSubjIds.clear();
lH.uniqSubjIds.reserve(record.matches.size());
for (auto const & bm : record.matches)
lH.uniqSubjIds.insert(bm._n_sId);

lH.stats.pairs += lH.uniqSubjIds.size();

// compute LCA
if (lH.options.computeLCA)
{
Expand Down Expand Up @@ -908,32 +916,25 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
// Function computeBlastMatch()
// --------------------------------------------------------------------------

template <typename TBlastMatch, typename TLocalHolder>
inline void _setupAlignInfix(TBlastMatch & bm, typename TLocalHolder::TMatch const & m, TLocalHolder & lH)
template <typename TLocalHolder>
inline void _widenMatch(Match & m, TLocalHolder const & lH)
{
int64_t startMod = (int64_t)m.subjStart - (int64_t)m.qryStart;
// move sStart as far left as needed to cover the part of query before qryStart
m.subjStart = (m.subjStart < m.qryStart) ? 0 : m.subjStart - m.qryStart;

bm.qEnd = lH.transQrySeqs[m.qryId].size();
decltype(bm.qEnd) band = _bandSize(bm.qEnd);
if (startMod >= 0)
{
bm.sStart = startMod;
bm.qStart = 0;
}
else
{
bm.sStart = 0;
bm.qStart = -startMod;
}
bm.sEnd = std::min<size_t>(bm.sStart + bm.qEnd - bm.qStart + band, lH.gH.transSbjSeqs[m.subjId].size());
/* always align full query independent of hit-region */
m.qryStart = 0;
m.qryEnd = lH.transQrySeqs[m.qryId].size();

if (bm.sStart >= band)
bm.sStart -= band;
else
bm.sStart = 0;
// there is no band in computation but this value extends begin and end of Subj to account for gaps
uint64_t band = _bandSize(lH.transQrySeqs[m.qryId].size());

// end on subject is beginning plus full query length plus band
m.subjEnd =
std::min<size_t>(m.subjStart + lH.transQrySeqs[m.qryId].size() + band, lH.gH.transSbjSeqs[m.subjId].size());

seqan::assignSource(bm.alignRow0, lH.transQrySeqs[m.qryId] | bio::views::slice(bm.qStart, bm.qEnd));
seqan::assignSource(bm.alignRow1, lH.gH.transSbjSeqs[m.subjId] | bio::views::slice(bm.sStart, bm.sEnd));
// account for band in subj start
m.subjStart = (band < m.subjStart) ? m.subjStart - band : 0;
}

template <typename TBlastMatch, typename TLocalHolder>
Expand Down Expand Up @@ -1133,7 +1134,48 @@ inline void _performAlignment(TDepSetH & depSetH,
}

template <typename TLocalHolder>
inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bsDirection::fwd)
inline void _widenAndPreprocessMatches(std::span<Match> & matches, TLocalHolder & lH)
{
auto before = matches.size();

for (Match & m : matches)
_widenMatch<TLocalHolder>(m, lH);

std::ranges::sort(matches);

if (matches.size() > 1)
{
// pairwise merge from left to right
for (auto it = matches.begin(); it < matches.end() - 1; ++it)
{
Match & l = *it;
Match & r = *(it + 1);
if ((std::tie(l.qryId, l.subjId) == std::tie(r.qryId, r.subjId)) && (l.subjEnd >= r.subjStart))
{
l.subjEnd = r.subjEnd;
r.subjStart = l.subjStart;
}
}

// pairwise "swallow" from right to left
for (auto it = matches.rbegin(); it < matches.rend() - 1; ++it)
{
Match & r = *it;
Match & l = *(it + 1);
if ((std::tie(r.qryId, r.subjId) == std::tie(l.qryId, l.subjId)) && (r.subjStart < l.subjEnd))
{
l = r;
}
}

auto [new_end, old_end] = std::ranges::unique(matches); // move non-uniq to the end
matches = std::span<Match>{matches.begin(), new_end}; // "resize" of the span
lH.stats.hitsDuplicate += (before - matches.size());
}
}

template <typename TLocalHolder>
inline void iterateMatchesFullSimd(std::span<Match> lambdaMatches, TLocalHolder & lH, bsDirection const dir)
{
using TGlobalHolder = typename TLocalHolder::TGlobalHolder;
using TBlastMatch = typename TLocalHolder::TBlastMatch;
Expand All @@ -1143,7 +1185,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
// statistics
#ifdef LAMBDA_MICRO_STATS
++lH.stats.numQueryWithExt;
lH.stats.numExtScore += seqan::length(lH.matches);
lH.stats.numExtScore += seqan::length(lambdaMatches);

double start = sysTime();
#endif
Expand All @@ -1152,58 +1194,37 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow0>::Type> depSetH;
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow1>::Type> depSetV;

// create blast matches
// pre-sort and filter
_widenAndPreprocessMatches(lambdaMatches, lH);

// create blast matches from Lambda matches
std::list<TBlastMatch> blastMatches;
for (auto it = lH.matches.begin(), itEnd = lH.matches.end(); it != itEnd; ++it)
for (Match const & m : lambdaMatches)
{
// In BS-mode, skip those results that have wrong orientation
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
{
if ((dir == bsDirection::fwd && (it->subjId % 2)) || (dir == bsDirection::rev && !(it->subjId % 2)))
continue;
}
// create blastmatch in list without copy or move
blastMatches.emplace_back(lH.qryIds[it->qryId / TGlobalHolder::qryNumFrames],
const_gH.indexFile.ids[it->subjId / TGlobalHolder::sbjNumFrames]);
blastMatches.emplace_back(lH.qryIds[m.qryId / TGlobalHolder::qryNumFrames],
const_gH.indexFile.ids[m.subjId / TGlobalHolder::sbjNumFrames]);

TBlastMatch & bm = blastMatches.back();

bm._n_qId = it->qryId / TGlobalHolder::qryNumFrames;
bm._n_sId = it->subjId / TGlobalHolder::sbjNumFrames;
bm._n_qId = m.qryId / TGlobalHolder::qryNumFrames;
bm._n_sId = m.subjId / TGlobalHolder::sbjNumFrames;

bm.qLength = //std::ranges::size(lH.transQrySeqs[it->qryId]);
std::ranges::size(lH.qrySeqs[bm._n_qId]);
bm.qLength = std::ranges::size(lH.qrySeqs[bm._n_qId]);
bm.sLength = std::ranges::size(lH.gH.indexFile.seqs[bm._n_sId]);

bm.sLength = // std::ranges::size(lH.gH.transSbjSeqs[it->subjId]);
std::ranges::size(lH.gH.indexFile.seqs[bm._n_sId]);
bm.qStart = m.qryStart;
bm.qEnd = m.qryEnd;
bm.sStart = m.subjStart;
bm.sEnd = m.subjEnd;
seqan::assignSource(bm.alignRow0, lH.transQrySeqs[m.qryId] | bio::views::slice(bm.qStart, bm.qEnd));
seqan::assignSource(bm.alignRow1, lH.gH.transSbjSeqs[m.subjId] | bio::views::slice(bm.sStart, bm.sEnd));

_setupAlignInfix(bm, *it, lH);

_setFrames(bm, *it, lH);
_setFrames(bm, m, lH);

if (lH.options.hasSTaxIds)
bm.sTaxIds = lH.gH.indexFile.sTaxIds[bm._n_sId];
}
#ifdef LAMBDA_MICRO_STATS
lH.stats.timeExtend += sysTime() - start;

// filter out duplicates
start = sysTime();
#endif
auto before = seqan::length(blastMatches);
blastMatches.sort(
[](auto const & l, auto const & r)
{
return std::tie(l._n_qId, l._n_sId, l.sStart, l.sEnd, l.qStart, l.qEnd, l.qFrameShift, l.sFrameShift) <
std::tie(r._n_qId, r._n_sId, r.sStart, r.sEnd, r.qStart, r.qEnd, r.qFrameShift, r.sFrameShift);
});
blastMatches.unique(
[](auto const & l, auto const & r)
{
return std::tie(l._n_qId, l._n_sId, l.sStart, l.sEnd, l.qStart, l.qEnd, l.qFrameShift, l.sFrameShift) ==
std::tie(r._n_qId, r._n_sId, r.sStart, r.sEnd, r.qStart, r.qEnd, r.qFrameShift, r.sFrameShift);
});
lH.stats.hitsDuplicate += (before - seqan::length(blastMatches));

// sort by lengths to minimize padding in SIMD
blastMatches.sort(
Expand All @@ -1217,6 +1238,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs

start = sysTime();
#endif

// fill batches
_setupDepSets(depSetH, depSetV, blastMatches);

Expand Down Expand Up @@ -1342,12 +1364,24 @@ inline void writeRecords(TLocalHolder & lH)
template <typename TLocalHolder>
inline void iterateMatches(TLocalHolder & lH)
{
iterateMatchesFullSimd(lH, bsDirection::fwd);
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
{
iterateMatchesFullSimd(lH, bsDirection::rev);
std::ranges::sort(lH.matches,
[](Match const & l, Match const & r) {
return std::tuple<bool, Match const &>{l.subjId % 2, l} <
std::tuple<bool, Match const &>{r.subjId % 2, r};
});

auto it = std::ranges::find_if(lH.matches, [](Match const & m) { return m.subjId % 2; });

iterateMatchesFullSimd(std::span{lH.matches.begin(), it}, lH, bsDirection::fwd);
iterateMatchesFullSimd(std::span{it, lH.matches.end()}, lH, bsDirection::rev);
lH.blastMatches.sort([](auto const & lhs, auto const & rhs) { return lhs._n_qId < rhs._n_qId; });
}
else
{
iterateMatchesFullSimd(lH.matches, lH, bsDirection::fwd);
}
}

//-----------------------------------------------------------------------
Expand Down
25 changes: 13 additions & 12 deletions src/search_datastructures.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,7 @@ struct Match
TPos subjStart;
TPos subjEnd;

inline bool operator==(Match const & m2) const
{
return std::tie(qryId, subjId, qryStart, subjStart, qryEnd, subjEnd) ==
std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart, m2.qryEnd, m2.subjEnd);
}
inline bool operator<(Match const & m2) const
{
return std::tie(qryId, subjId, qryStart, subjStart, qryEnd, subjEnd) <
std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart, m2.qryEnd, m2.subjEnd);
}
constexpr friend auto operator<=>(Match const &, Match const &) = default;
};

// template <typename TAlph>
Expand Down Expand Up @@ -117,10 +108,12 @@ struct StatsHolder
uint64_t hitsFailedExtendEValueTest;
uint64_t hitsAbundant;
uint64_t hitsDuplicate;
uint64_t hitsDuplicate2;

// final
uint64_t hitsFinal;
uint64_t qrysWithHit;
uint64_t pairs;

#ifdef LAMBDA_MICRO_STATS
// times
Expand Down Expand Up @@ -155,9 +148,11 @@ struct StatsHolder
hitsFailedExtendEValueTest = 0;
hitsAbundant = 0;
hitsDuplicate = 0;
hitsDuplicate2 = 0;

hitsFinal = 0;
qrysWithHit = 0;
pairs = 0;

#ifdef LAMBDA_MICRO_STATS
seedLengths.clear();
Expand Down Expand Up @@ -187,9 +182,11 @@ struct StatsHolder
hitsFailedExtendEValueTest += rhs.hitsFailedExtendEValueTest;
hitsAbundant += rhs.hitsAbundant;
hitsDuplicate += rhs.hitsDuplicate;
hitsDuplicate2 += rhs.hitsDuplicate2;

hitsFinal += rhs.hitsFinal;
qrysWithHit += rhs.qrysWithHit;
pairs += rhs.pairs;

#ifdef LAMBDA_MICRO_STATS
seqan::append(seedLengths, rhs.seedLengths);
Expand Down Expand Up @@ -248,6 +245,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options)
std::cout << "\n - failed %-identity test " << R << stats.hitsFailedExtendPercentIdentTest << RR
<< (rem -= stats.hitsFailedExtendPercentIdentTest);
std::cout << "\n - duplicates " << R << stats.hitsDuplicate << RR << (rem -= stats.hitsDuplicate);
std::cout << "\n - late duplicates " << R << stats.hitsDuplicate2 << RR
<< (rem -= stats.hitsDuplicate2);
std::cout << "\n - abundant " << R << stats.hitsAbundant << "\033[1m" << RR
<< (rem -= stats.hitsAbundant) << "\033[0m\n\n";

Expand Down Expand Up @@ -289,7 +288,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options)
if (options.verbosity >= 1)
{
auto const w = seqan::_numberOfDigits(stats.hitsFinal);
std::cout << "Number of valid hits: " << std::setw(w) << stats.hitsFinal
std::cout << "Number of total hits: " << std::setw(w) << stats.hitsFinal
<< "\nNumber of Query-Subject pairs: " << std::setw(w) << stats.pairs
<< "\nNumber of Queries with at least one valid hit: " << std::setw(w) << stats.qrysWithHit << "\n";
}
}
Expand Down Expand Up @@ -481,7 +481,8 @@ class LocalDataHolder
std::vector<std::string>, // not used
std::string_view,
uint32_t>;
std::list<TBlastMatch> blastMatches;
std::list<TBlastMatch> blastMatches;
std::unordered_set<uint64_t> uniqSubjIds;

// regarding the gathering of stats
StatsHolder stats{};
Expand Down
4 changes: 2 additions & 2 deletions src/search_misc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ struct QueryException : public std::runtime_error
// Alignment-related
// ============================================================================

inline int _bandSize(uint64_t const seqLength)
inline int64_t _bandSize(uint64_t const seqLength)
{
// Currently only used for padding of the subject sequences
return 63ull - std::countl_zero(std::bit_ceil(seqLength));
return static_cast<int64_t>(std::sqrt(seqLength)) + 1;
}

// ----------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit d995cb5

Please sign in to comment.