Skip to content

Commit

Permalink
adding the option to search for ATG before first exon
Browse files Browse the repository at this point in the history
  • Loading branch information
elileka committed Apr 30, 2024
1 parent 66430bf commit c1ea8d1
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 9 deletions.
11 changes: 10 additions & 1 deletion src/commons/LocalParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ class LocalParameters : public Parameters {
PARAMETER(PARAM_WRITE_FRAG_COORDS)
int writeFragCoords;

PARAMETER(PARAM_LEN_SEARCH_START)
int lenSearchStart;

private:
LocalParameters() :
Parameters(),
Expand All @@ -83,7 +86,9 @@ class LocalParameters : public Parameters {
PARAM_SHOULD_TRANSLATE(PARAM_SHOULD_TRANSLATE_ID,"--protein", "translate codons to AAs", "translate the joint exons coding sequence to amino acids [0,1]", typeid(int), (void *) &shouldTranslate, "^[0-1]{1}$"),
PARAM_ALLOW_OVERLAP(PARAM_ALLOW_OVERLAP_ID,"--overlap", "allow same-strand overlaps", "allow predictions to overlap another on the same strand. when not allowed (default), only the prediction with better E-value will be retained [0,1]", typeid(int), (void *) &overlapAllowed, "^[0-1]{1}$"),
PARAM_WRITE_TKEY(PARAM_WRITE_TKEY_ID,"--target-key", "write target key instead of accession", "write the target key (internal DB identifier) instead of its accession. By default (0) target accession will be written [0,1]", typeid(int), (void *) &writeTargetKey, "^[0-1]{1}$"),
PARAM_WRITE_FRAG_COORDS(PARAM_WRITE_FRAG_COORDS_ID,"--write-frag-coords", "write fragment contig coords", "write the contig coords of the stop-to-stop fragment in which putative exon lies. By default (0) only putative exon coords will be written [0,1]", typeid(int), (void *) &writeFragCoords, "^[0-1]{1}$")
PARAM_WRITE_FRAG_COORDS(PARAM_WRITE_FRAG_COORDS_ID,"--write-frag-coords", "write fragment contig coords", "write the contig coords of the stop-to-stop fragment in which putative exon lies. By default (0) only putative exon coords will be written [0,1]", typeid(int), (void *) &writeFragCoords, "^[0-1]{1}$"),
PARAM_LEN_SEARCH_START(PARAM_LEN_SEARCH_START_ID,"--len-search-start", "length to search for start codon", "length to search for a start codon before the first exon and in the same frame. By default (0) no search", typeid(int), (void *) &lenSearchStart, "^[0-9]+$")

{
collectoptimalset.push_back(&PARAM_METAEUK_EVAL_THR);
collectoptimalset.push_back(&PARAM_METAEUK_TARGET_COV_THR);
Expand Down Expand Up @@ -114,6 +119,7 @@ class LocalParameters : public Parameters {
unitesetstofasta.push_back(&PARAM_TRANSLATION_TABLE);
unitesetstofasta.push_back(&PARAM_WRITE_TKEY);
unitesetstofasta.push_back(&PARAM_WRITE_FRAG_COORDS);
unitesetstofasta.push_back(&PARAM_LEN_SEARCH_START);
unitesetstofasta.push_back(&PARAM_MAX_SEQ_LEN);
unitesetstofasta.push_back(&PARAM_THREADS);
unitesetstofasta.push_back(&PARAM_V);
Expand Down Expand Up @@ -149,6 +155,9 @@ class LocalParameters : public Parameters {
// default value 0 means only coords of putative exon are written
writeFragCoords = 0;

// default value 0 means no searching before the first exon
lenSearchStart = 0;

citations.emplace(CITATION_METAEUK, "Levy Karin E, Mirdita M, Soeding J: MetaEuk – sensitive, high-throughput gene discovery and annotation for large-scale eukaryotic metagenomics. biorxiv, 851964 (2019).");
}
LocalParameters(LocalParameters const&);
Expand Down
119 changes: 111 additions & 8 deletions src/exonpredictor/unitesetstofasta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,99 @@ void reverseComplement (const std::string & seq, std::string & revCompSeq) {
}
}

int findStartInString (const std::string & seq) {
int lastPosOfClosestStart = 0;
// In a case like this: ATGcccATGccc
// The returned index will be 8 (the last char of the last ATG in the string)
for (size_t i = 0; i < seq.size(); i += 3) {
if ((seq.substr(i,3).compare("ATG") == 0) or (seq.substr(i,3).compare("atg") == 0)) {
lastPosOfClosestStart = i + 2;
}
}
return(lastPosOfClosestStart);
}

int searchForStartBeforeFirstExon(Prediction & pred, const char* contigData, std::ostringstream & joinedExonsStream, const int searchLen) {
// check if first exon's first codon is start and if so return 0 - already a start codon
if (pred.strand == PLUS) {
std::string firstCodon(&contigData[pred.lowContigCoord], 3);
int lastPosOfStart = findStartInString(firstCodon);
if (lastPosOfStart > 0) {
return 0;
}
}
if (pred.strand == MINUS) {
std::string firstCodon(&contigData[pred.highContigCoord - 2], 3);
std::string firstCodonRevCompSeq(firstCodon);
reverseComplement (firstCodon, firstCodonRevCompSeq);
int lastPosOfStart = findStartInString(firstCodonRevCompSeq);
if (lastPosOfStart > 0) {
return 0;
}
}

int seachLenLegal = searchLen - (searchLen % 3);

// case PLUS
int coordToStartSearch = pred.lowContigCoord - seachLenLegal;
int posAfterStopCodon = pred.optimalExonSet[0].potentialExonContigStartBeforeTrim;
// case MINUS
if (pred.strand == MINUS) {
coordToStartSearch = pred.highContigCoord + 1;
posAfterStopCodon = pred.optimalExonSet[0].potentialExonContigEndBeforeTrim;
}

// be careful at the edge of the first exon - don't go over stop
// case PLUS
if ((pred.strand == PLUS) && (coordToStartSearch < posAfterStopCodon)) {
coordToStartSearch = posAfterStopCodon;
seachLenLegal = pred.lowContigCoord - coordToStartSearch;
}
// case MINUS
if ((pred.strand == MINUS) && ((posAfterStopCodon - pred.highContigCoord) < (size_t)seachLenLegal)) {
seachLenLegal = (posAfterStopCodon - pred.highContigCoord);
if (seachLenLegal % 3 != 0) {
Debug(Debug::ERROR) << "ERROR: seachLenLegal mod 3 is not 0.\n";
EXIT(EXIT_FAILURE);
}
}

// extract the segment from the contig:
std::string beforeFirstExonSeq(&contigData[coordToStartSearch], (size_t)seachLenLegal);
std::string beforeFirstExonSeqRevCompSeq(beforeFirstExonSeq);
if (pred.strand == MINUS) {
reverseComplement (beforeFirstExonSeq, beforeFirstExonSeqRevCompSeq);
}

size_t posLastCharStart = 0;
// if not found, findStartInString will return 0, not changing anything
if (pred.strand == PLUS) {
posLastCharStart = findStartInString(beforeFirstExonSeq);
if (posLastCharStart > 0) {
std::string seqToAdd(beforeFirstExonSeq.substr(posLastCharStart - 2));
int numNucsAdded = seqToAdd.size();
pred.lowContigCoord = pred.lowContigCoord - numNucsAdded;
joinedExonsStream << seqToAdd;
return numNucsAdded;
}
} else {
posLastCharStart = findStartInString(beforeFirstExonSeqRevCompSeq);
if (posLastCharStart > 0) {
std::string seqToAdd(beforeFirstExonSeqRevCompSeq.substr(posLastCharStart - 2));
int numNucsAdded = seqToAdd.size();
pred.highContigCoord = pred.highContigCoord + numNucsAdded;
joinedExonsStream << seqToAdd;
return numNucsAdded;
}
}

// start codon not found - not changing anything, returning 0
return 0;
}

void preparePredDataAndHeader (Prediction & pred, const std::string & targetHeaderAcc, const std::string & contigHeaderAcc,
const char* contigData, std::ostringstream & joinedHeaderStream, std::ostringstream & joinedExonsStream,
const int writeFragCoords, const size_t contigLen) {
const int writeFragCoords, const int lenSearchStart, const size_t contigLen) {

// clear streams:
joinedHeaderStream.str("");
Expand All @@ -44,7 +134,20 @@ void preparePredDataAndHeader (Prediction & pred, const std::string & targetHead
joinedHeaderStream << "-|";
}
joinedHeaderStream << pred.totalBitscore << "|" << pred.combinedEvalue << "|" << pred.numExons << "|";
joinedHeaderStream << pred.lowContigCoord << "|" << pred.highContigCoord;
if (lenSearchStart == 0) {
// default case: user doesn't want to search for start codon before the first exon:
joinedHeaderStream << pred.lowContigCoord << "|" << pred.highContigCoord;
} else {
// special case: user wants to search for start codon before the first exon:
// if found - padd it up/downstream and indicate the padding != 0 in []
int numNucsAdded = searchForStartBeforeFirstExon(pred, contigData, joinedExonsStream, lenSearchStart);
if (pred.strand == PLUS) {
joinedHeaderStream << pred.lowContigCoord << "[" << numNucsAdded << "]" << "|" << pred.highContigCoord;
} else {
joinedHeaderStream << pred.lowContigCoord << "|" << pred.highContigCoord << "[" << numNucsAdded << "]";
}
}


// add all exons:
int lastTargetPosMatched = -1;
Expand All @@ -71,13 +174,13 @@ void preparePredDataAndHeader (Prediction & pred, const std::string & targetHead
exonAdjustedContigStart += (3 * diffInAAs);
exonAdjustedNucleotideLen -= (3 * diffInAAs);
}
int lowContigCoord = (pred.strand == PLUS) ? exonAdjustedContigStart : (-1 * exonContigEnd);
int exonLowContigCoord = (pred.strand == PLUS) ? exonAdjustedContigStart : (-1 * exonContigEnd);

// update the field:
pred.optimalExonSet[i].adjustedContigStart = exonAdjustedContigStart;

// extract the segment from the contig:
std::string exonContigSeq(&contigData[lowContigCoord], (size_t)exonAdjustedNucleotideLen);
std::string exonContigSeq(&contigData[exonLowContigCoord], (size_t)exonAdjustedNucleotideLen);
// update the last AA of the target that was matched:
lastTargetPosMatched = targetMatchEnd;

Expand Down Expand Up @@ -369,7 +472,7 @@ int unitesetstofasta(int argn, const char **argv, const Command& command) {
}

if (plusPred.optimalExonSet.size() > 0) {
preparePredDataAndHeader(plusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, contigLen);
preparePredDataAndHeader(plusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, par.lenSearchStart, contigLen);
std::string result = ">" + joinedHeaderStream.str();
fastaAaWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
fastaCodonWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
Expand Down Expand Up @@ -401,7 +504,7 @@ int unitesetstofasta(int argn, const char **argv, const Command& command) {
fastaCodonWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
}
if (minusPred.optimalExonSet.size() > 0) {
preparePredDataAndHeader(minusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, contigLen);
preparePredDataAndHeader(minusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, par.lenSearchStart, contigLen);
std::string result = ">" + joinedHeaderStream.str();
fastaAaWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
fastaCodonWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
Expand Down Expand Up @@ -463,7 +566,7 @@ int unitesetstofasta(int argn, const char **argv, const Command& command) {
}

if (plusPred.optimalExonSet.size() > 0) {
preparePredDataAndHeader(plusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, contigLen);
preparePredDataAndHeader(plusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, par.lenSearchStart, contigLen);
std::string result = ">" + joinedHeaderStream.str();
fastaAaWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
fastaCodonWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
Expand Down Expand Up @@ -495,7 +598,7 @@ int unitesetstofasta(int argn, const char **argv, const Command& command) {
fastaCodonWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
}
if (minusPred.optimalExonSet.size() > 0) {
preparePredDataAndHeader(minusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, contigLen);
preparePredDataAndHeader(minusPred, targetHeaderAcc, contigHeaderAcc, contigData, joinedHeaderStream, joinedExonsStream, par.writeFragCoords, par.lenSearchStart, contigLen);
std::string result = ">" + joinedHeaderStream.str();
fastaAaWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
fastaCodonWriter.writeData(result.c_str(), result.size(), 0, thread_idx, false, false);
Expand Down

0 comments on commit c1ea8d1

Please sign in to comment.