Skip to content

Commit

Permalink
[FEATURE] staxid support complete [albeit a little slow]
Browse files Browse the repository at this point in the history
  • Loading branch information
h-2 committed Nov 21, 2016
1 parent 7d5c66b commit 2d508a1
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 30 deletions.
52 changes: 32 additions & 20 deletions src/lambda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,32 +399,33 @@ loadDbIndexFromDisk(TGlobalHolder & globalHolder,
template <typename TGlobalHolder>
inline int
loadSTaxIds(TGlobalHolder & globalHolder,
LambdaOptions & options)
LambdaOptions const & options)
{
std::string path = toCString(options.indexDir);
path += "/staxids";
if (options.hasSTaxIds)
{
std::string path = toCString(options.indexDir);
path += "/staxids";

//TODO in the future check all index things together
if (!fileExists((path + ".concat").c_str()))
return 0;
//TODO in the future check all index things together

std::string strIdent = "Loading Subject Taxonomy IDs...";
myPrint(options, 1, strIdent);
double start = sysTime();
std::string strIdent = "Loading Subject Taxonomy IDs...";
myPrint(options, 1, strIdent);
double start = sysTime();


int ret = open(globalHolder.sTaxIds, path.c_str(), OPEN_RDONLY);
if (ret != true)
{
std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
<< " failed.\n";
return 1;
}
int ret = open(globalHolder.sTaxIds, path.c_str(), OPEN_RDONLY);
if (ret != true)
{
std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
<< " failed.\n"
<< "Your index does not provide them. Did you forget to specify a map file while indexing?\n";
return 1;
}

double finish = sysTime() - start;
myPrint(options, 1, " done.\n");
myPrint(options, 2, "Runtime: ", finish, "s \n\n");
options.hasSTaxIds = true;
double finish = sysTime() - start;
myPrint(options, 1, " done.\n");
myPrint(options, 2, "Runtime: ", finish, "s \n\n");
}
return 0;
}

Expand Down Expand Up @@ -606,6 +607,7 @@ loadQuery(GlobalDataHolder<TRedAlph, TIndexSpec, TOutFormat, p, h> & global
return -1;
}

// TODO: after changing this, make options const again
if (options.extensionMode == LambdaOptions::ExtensionMode::AUTO)
{
if (maxLen <= 100)
Expand Down Expand Up @@ -1938,6 +1940,10 @@ iterateMatchesExtend(TLocalHolder & lH)
bm._n_qId = it->qryId / qNumFrames(TGlobalHolder::blastProgram);
bm._n_sId = it->subjId / sNumFrames(TGlobalHolder::blastProgram);
}

if (lH.options.hasSTaxIds)
bm.sTaxIds = lH.gH.sTaxIds[it->subjId / sNumFrames(TGlobalHolder::blastProgram)];

break;
case EVALUE:
++lH.stats.hitsFailedExtendEValueTest;
Expand Down Expand Up @@ -2155,6 +2161,9 @@ iterateMatchesFullSimd(TLocalHolder & lH)

bm._n_qId = it->qryId / qNumFrames(TGlobalHolder::blastProgram);
bm._n_sId = it->subjId / sNumFrames(TGlobalHolder::blastProgram);

if (lH.options.hasSTaxIds)
bm.sTaxIds = lH.gH.sTaxIds[it->subjId / sNumFrames(TGlobalHolder::blastProgram)];
}

// fill up last batch
Expand Down Expand Up @@ -2339,6 +2348,9 @@ iterateMatchesFullSerial(TLocalHolder & lH)

bm._n_qId = it->qryId / qNumFrames(TGlobalHolder::blastProgram);
bm._n_sId = it->subjId / sNumFrames(TGlobalHolder::blastProgram);

if (lH.options.hasSTaxIds)
bm.sTaxIds = lH.gH.sTaxIds[it->subjId / sNumFrames(TGlobalHolder::blastProgram)];
}

_writeRecord(record, lH);
Expand Down
18 changes: 11 additions & 7 deletions src/lambda_indexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ mapAndDumpTaxIDs(std::unordered_map<std::string, uint64_t> const & accToId
// stream iterator
auto fit = directionIterator(vfin, Input());

std::cout << "Parsing acc-to-tax-map file... " << std::flush;
myPrint(options, 1, "Parsing acc-to-tax-map file... ");

// skip line with headers
skipLine(fit);
Expand Down Expand Up @@ -376,7 +376,9 @@ mapAndDumpTaxIDs(std::unordered_map<std::string, uint64_t> const & accToId
skipLine(fit);
}

std::cout << "done. Took " << sysTime() - start << "s\n";
myPrint(options, 1, "done.\n");

myPrint(options, 2, "Runtime: ", sysTime() - start, "s \n");

// TODO do something with the subjects that have no (valid) taxid?

Expand All @@ -391,15 +393,17 @@ mapAndDumpTaxIDs(std::unordered_map<std::string, uint64_t> const & accToId
++multi;
}

std::cout << "Subjects without tax IDs: " << nomap << "\n";
std::cout << "Subjects with more than one tax ID: " << multi << "\n";

myPrint(options, 2, "Subjects without tax IDs: ", nomap, "\n",
"Subjects with more than one tax ID: ", multi, "\n\n");
if (numSubjects / nomap < 10)
myPrint(options, 1, "WARNING: ", double(nomap) * 100 / numSubjects, "% of subjects have no taxID.\n"
" Maybe you specified the wrong map file?\n\n");

std::cout << "Dumping Subject Taxonomy IDs... ";
myPrint(options, 1,"Dumping Subject Taxonomy IDs... ");
// concat direct so that it's easier to read/write
StringSet<String<uint64_t>, Owner<ConcatDirect<>>> outSTaxIds = sTaxIds;
save(outSTaxIds, std::string(options.indexDir + "/staxids").c_str());
std::cout << "done.\n";
myPrint(options, 1, "done.\n");

return 0;
}
Expand Down
7 changes: 6 additions & 1 deletion src/options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ struct SharedOptions
unsigned terminalCols = 80;

unsigned threads = 1;
bool hasSTaxIds;
bool hasSTaxIds = false;

SharedOptions()
{
Expand Down Expand Up @@ -939,6 +939,8 @@ parseCommandLine(LambdaOptions & options, int argc, char const ** argv)
{
appendValue(options.columns, static_cast<BlastMatchField<>::Enum>(i));
resolved = true;
if (static_cast<BlastMatchField<>::Enum>(i) == BlastMatchField<>::Enum::S_TAX_IDS)
options.hasSTaxIds = true;
break;
}
}
Expand Down Expand Up @@ -992,6 +994,9 @@ parseCommandLine(LambdaOptions & options, int argc, char const ** argv)
(!options.samBamTags[SamBamExtraTags<>::Q_AA_CIGAR]))
options.samBamSeq = 0;

if (options.samBamTags[SamBamExtraTags<>::S_TAX_IDS])
options.hasSTaxIds = true;

getOptionValue(buffer, parser, "version-to-outputfile");
options.versionInformationToOutputFile = (buffer == "on");

Expand Down
27 changes: 25 additions & 2 deletions src/output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,14 @@ struct SamBamExtraTags
P_POS,
Q_FRAME,
S_FRAME,
S_TAX_IDS,
Q_AA_SEQ,
Q_AA_CIGAR,
EDIT_DISTANCE,
MATCH_COUNT
};

static constexpr const std::array<std::pair<const char*, const char*>, 11> keyDescPairs
static constexpr const std::array<std::pair<const char*, const char*>, 12> keyDescPairs
{
{
// { "ZS", "query start (in DNA if original was DNA)" }, // Q_START,
Expand All @@ -59,6 +60,7 @@ struct SamBamExtraTags
{ "ZP", "% positive (in protein space unless BLASTN)"}, // P_POS,
{ "ZF", "query frame" }, // Q_FRAME,
{ "YF", "subject frame" }, // S_FRAME,
{ "YT", "subject taxonomy IDs (* if n/a)" }, // S_TAX_IDS,
{ "ZQ", "query protein sequence (* for BLASTN)"}, // Q_AA_SEQ,
{ "OC", "query protein cigar (* for BLASTN)"}, // Q_AA_CIGAR,
{ "NM", "edit distance (in protein space unless BLASTN)"}, // EDIT_DISTANCE
Expand All @@ -69,7 +71,7 @@ struct SamBamExtraTags
};

template <typename TVoidSpec>
constexpr const std::array<std::pair<const char*, const char*>, 11> SamBamExtraTags<TVoidSpec>::keyDescPairs;
constexpr const std::array<std::pair<const char*, const char*>, 12> SamBamExtraTags<TVoidSpec>::keyDescPairs;

// ----------------------------------------------------------------------------
// Function _untranslatedClipPositions()
Expand Down Expand Up @@ -572,6 +574,27 @@ myWriteRecord(TLH & lH, TRecord const & record)
appendTagValue(bamR.tags,
std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::S_FRAME]),
int8_t(mIt->sFrameShift), 'c');
if (lH.options.samBamTags[SamBamExtraTags<>::S_TAX_IDS])
{
//TODO append integer array, instead of transforming to string
CharString buf;
auto it = begin(buf);
if (length(mIt->sTaxIds) == 0)
{
buf = "*";
} else
{
appendNumber(it, mIt->sTaxIds[0]);
for (unsigned i = 1; i < length(mIt->sTaxIds); ++i)
{
write(it, ";");
appendNumber(it, mIt->sTaxIds[i]);
}
}
appendTagValue(bamR.tags,
std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::Q_AA_SEQ]),
buf, 'Z');
}
if (lH.options.samBamTags[SamBamExtraTags<>::Q_AA_SEQ])
{
if ((TGH::blastProgram == BlastProgram::BLASTN) || (!writeSeq))
Expand Down

0 comments on commit 2d508a1

Please sign in to comment.