Permalink
Browse files

Store profiles without pseudo counts

  • Loading branch information...
martin-steinegger committed Dec 7, 2017
1 parent 1ffdfd1 commit f00600c28b6d727cad0f627c3ed58fbd1d95cd33
@@ -141,7 +141,7 @@ Alignment::Alignment(const std::string &querySeqDB, const std::string &querySeqD
}
if (realign == true) {
realign_m = new SubstitutionMatrix(scoringMatrixFile.c_str(), 2.0, -0.4f);
realign_m = new SubstitutionMatrix(scoringMatrixFile.c_str(), 2.0, -0.2f);
} else {
realign_m = NULL;
}
@@ -261,8 +261,8 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
std::string alnResultsOutString;
alnResultsOutString.reserve(1024*1024);
char buffer[1024+32768];
Sequence qSeq(maxSeqLen, m->aa2int, m->int2aa, querySeqType, 0, false, compBiasCorrection);
Sequence dbSeq(maxSeqLen, m->aa2int, m->int2aa, targetSeqType, 0, false, compBiasCorrection);
Sequence qSeq(maxSeqLen, querySeqType, m, 0, false, compBiasCorrection);
Sequence dbSeq(maxSeqLen, targetSeqType, m, 0, false, compBiasCorrection);
Matcher matcher(maxSeqLen, m, &evaluer, compBiasCorrection);
Matcher *realigner = NULL;
if (realign == true) {
@@ -20,13 +20,7 @@ PSSMCalculator::PSSMCalculator(SubstitutionMatrix *subMat, size_t maxSeqLength,
this->maxSeqLength = maxSeqLength;
this->matchWeight = (float *) malloc_simd_float(Sequence::PROFILE_AA_SIZE * maxSeqLength * sizeof(float));
this->pseudocountsWeight = (float *) malloc_simd_float(Sequence::PROFILE_AA_SIZE * maxSeqLength * sizeof(float));
this->R = (float *) malloc_simd_float(Sequence::PROFILE_AA_SIZE * Sequence::PROFILE_AA_SIZE * sizeof(float));
this->nseqs = new int[maxSeqLength];
for (size_t aa_i = 0; aa_i < Sequence::PROFILE_AA_SIZE; ++aa_i){
for (size_t aa_j = 0; aa_j < Sequence::PROFILE_AA_SIZE; ++aa_j){
this->R[aa_i * Sequence::PROFILE_AA_SIZE + aa_j] = subMat->subMatrixPseudoCounts[aa_i][aa_j];
}
}
const unsigned int NAA_VECSIZE = ((MultipleAlignment::NAA+ 3 + VECSIZE_INT - 1) / VECSIZE_INT) * VECSIZE_INT;
this->w_contrib = new float*[maxSeqLength];
for (size_t j = 0; j < maxSeqLength; j++) {
@@ -47,7 +41,6 @@ PSSMCalculator::~PSSMCalculator() {
delete [] nseqs;
free(matchWeight);
free(pseudocountsWeight);
free(R);
for (size_t j = 0; j < maxSeqLength; j++) {
free(w_contrib[j]);
}
@@ -59,7 +52,7 @@ PSSMCalculator::~PSSMCalculator() {
PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize,
size_t queryLength,
const char **msaSeqs,
bool wg) {
bool wg, float bitFactor) {
for (size_t pos = 0; pos < queryLength; pos++) {
if (msaSeqs[0][pos] == MultipleAlignment::GAP) {
Debug(Debug::ERROR) <<
@@ -82,12 +75,14 @@ PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize,
// compute consensus sequence
std::string consensusSequence = computeConsensusSequence(matchWeight, queryLength, subMat->pBack, subMat->int2aa);
// add pseudocounts (compute the scalar product between matchWeight and substitution matrix with pseudo counts)
preparePseudoCounts(matchWeight, pseudocountsWeight, queryLength, (const float *) R);
preparePseudoCounts(matchWeight, pseudocountsWeight, Sequence::PROFILE_AA_SIZE, queryLength, (const float **) subMat->subMatrixPseudoCounts);
// SubstitutionMatrix::print(subMat->subMatrixPseudoCounts, subMat->int2aa, 20 );
computePseudoCounts(profile, matchWeight, pseudocountsWeight, queryLength, pca, pcb);
// create final Matrix
computeLogPSSM(pssm, profile, queryLength, 0.0);
computePseudoCounts(profile, matchWeight, pseudocountsWeight, Sequence::PROFILE_AA_SIZE, Neff_M, queryLength, pca, pcb);
// PSSMCalculator::printProfile(queryLength);
// create final Matrix
computeLogPSSM(pssm, profile, bitFactor, queryLength, 0.0);
// PSSMCalculator::printPSSM(queryLength);
return Profile(pssm, profile, Neff_M, consensusSequence);
}
@@ -100,7 +95,7 @@ void PSSMCalculator::printProfile(size_t queryLength){
for(size_t i = 0; i < queryLength; i++){
printf("%3zu ", i);
for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
printf("%03.2f ", profile[i * Sequence::PROFILE_AA_SIZE + aa] );
printf("%03.4f ", profile[i * Sequence::PROFILE_AA_SIZE + aa] );
}
printf("\n");
}
@@ -115,56 +110,42 @@ void PSSMCalculator::printPSSM(size_t queryLength){
for(size_t i = 0; i < queryLength; i++) {
printf("%3zu ", i);
for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++){
printf("%3d ", pssm[i * Sequence::PROFILE_AA_SIZE + aa] );
// char pssmVal = (pssm[i * Sequence::PROFILE_AA_SIZE + aa] == -128) ? 0 : pssm[i * Sequence::PROFILE_AA_SIZE + aa] ;
char pssmVal = pssm[i * Sequence::PROFILE_AA_SIZE + aa];
printf("%3d ", pssmVal);
}
printf("\n");
}
}
void PSSMCalculator::computeLogPSSM(char *pssm, float *profile,
void PSSMCalculator::computeLogPSSM(char *pssm, float *profile, float bitFactor,
size_t queryLength, float scoreBias) {
// calculate the substitution matrix
for(size_t pos = 0; pos < queryLength; pos++) {
for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
const float aaProb = profile[pos * Sequence::PROFILE_AA_SIZE + aa];
const unsigned int idx = pos * Sequence::PROFILE_AA_SIZE + aa;
profile[idx] = subMat->getBitFactor() * MathUtil::flog2(aaProb / subMat->pBack[aa]) + scoreBias;
const float pssmVal = profile[pos * Sequence::PROFILE_AA_SIZE + aa];
float logProb = MathUtil::flog2(aaProb / subMat->pBack[aa]);
const float pssmVal = bitFactor * logProb + scoreBias;
pssm[idx] = static_cast<char>((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5);
float truncPssmVal = std::min(pssmVal, 127.0f);
truncPssmVal = std::max(-128.0f, truncPssmVal);
pssm[idx] = static_cast<char>((truncPssmVal < 0.0) ? truncPssmVal - 0.5 : truncPssmVal + 0.5);
}
}
}
void PSSMCalculator::preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts,
size_t queryLength, float const *R) {
void PSSMCalculator::preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts, size_t entrySize,
size_t queryLength, float const ** R) {
for (size_t pos = 0; pos < queryLength; pos++) {
for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
frequency_with_pseudocounts[pos * Sequence::PROFILE_AA_SIZE + aa] = ScalarProd20(&R[aa * Sequence::PROFILE_AA_SIZE],
&frequency[pos * Sequence::PROFILE_AA_SIZE]);
frequency_with_pseudocounts[pos * entrySize + aa] = ScalarProd20(R[aa],
&frequency[pos * entrySize]);
}
}
}
// Normalize a float array such that it sums to one
// If it sums to 0 then assign def_array elements to array (optional)
inline float PSSMCalculator::NormalizeTo1(float* array, int length, const double* def_array = NULL) {
float sum = 0.0f;
for (int k = 0; k < length; k++){
sum += array[k];
}
if (sum != 0.0f) {
float fac = 1.0 / sum;
for (int i = 0; i < length; i++) {
array[i] *= fac;
}
} else if (def_array) {
for (int i = 0; i < length; i++) {
array[i] = def_array[i];
}
}
return sum;
}
void PSSMCalculator::computeNeff_M(float *frequency, float *seqWeight, float *Neff_M,
size_t queryLength, size_t setSize, char const **msaSeqs) {
float Neff_HMM = 0.0f;
@@ -252,24 +233,30 @@ void PSSMCalculator::computeSequenceWeights(float *seqWeight, size_t queryLength
}
}
// std::cout << setSize << std::endl;
NormalizeTo1(seqWeight, setSize);
MathUtil::NormalizeTo1(seqWeight, setSize);
// std::cout << " Seq. Weight: " << std::endl;
// for (size_t k = 0; k < setSize; ++k) {
// std::cout << " k="<< k << "\t" << seqWeight[k] << std::endl;
// }
delete [] number_res;
}
void PSSMCalculator::computePseudoCounts(float *profile, float *frequency, float *frequency_with_pseudocounts, size_t queryLength, float pca, float pcb) {
void PSSMCalculator::computePseudoCounts(float *profile, float *frequency,
float *frequency_with_pseudocounts, size_t entrySize,
float * Neff_M, size_t queryLength,
float pca, float pcb) {
for (size_t pos = 0; pos < queryLength; pos++) {
float tau = fmin(1.0, pca / (1.0 + Neff_M[pos] / pcb));
//float tau = fmin(1.0, pca * (1.0 + pcb)/ (Neff_M[pos] + pcb));
//std::cout<< "Tau: "<< tau << ", Neff: " << Neff_M[pos] <<std::endl;
for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; ++aa) {
// compute proportion of pseudo counts and signal
float pseudoCounts = tau * frequency_with_pseudocounts[pos * Sequence::PROFILE_AA_SIZE + aa];
float frequencySignal = (1.0 - tau) * frequency[pos * Sequence::PROFILE_AA_SIZE + aa];
profile[pos * Sequence::PROFILE_AA_SIZE + aa] = frequencySignal + pseudoCounts;
float pseudoCounts = tau * frequency_with_pseudocounts[pos * entrySize + aa];
float frequencySignal = (1.0 - tau) * frequency[pos * entrySize + aa];
profile[pos * entrySize + aa] = frequencySignal + pseudoCounts;
// printf("%f %f %f %f\n", tau, frequencySignal, pseudoCounts, profile[pos * Sequence::PROFILE_AA_SIZE + aa]);
}
}
@@ -287,7 +274,7 @@ void PSSMCalculator::computeMatchWeights(float * matchWeight, float * seqWeight,
}
}
}
NormalizeTo1(&matchWeight[pos * Sequence::PROFILE_AA_SIZE], Sequence::PROFILE_AA_SIZE, subMat->pBack);
MathUtil::NormalizeTo1(&matchWeight[pos * Sequence::PROFILE_AA_SIZE], Sequence::PROFILE_AA_SIZE, subMat->pBack);
}
}
@@ -427,7 +414,7 @@ void PSSMCalculator::computeContextSpecificWeights(float * matchWeight, float *w
// Add contributions to Neff[i]
for (int j = jmin; j <= jmax; ++j) {
NormalizeTo1(f[j], MultipleAlignment::NAA);
MathUtil::NormalizeTo1(f[j], MultipleAlignment::NAA);
for (int a = 0; a < 20; ++a)
if (f[j][a] > 1E-10)
Neff_M[i] -= f[j][a]
@@ -456,8 +443,7 @@ void PSSMCalculator::computeContextSpecificWeights(float * matchWeight, float *w
matchWeight[i * Sequence::PROFILE_AA_SIZE + a] = 0.0;
for (size_t k = 0; k < setSize; ++k)
matchWeight[i * Sequence::PROFILE_AA_SIZE + (int) X[k][i]] += wi[k];
NormalizeTo1((matchWeight+ i * Sequence::PROFILE_AA_SIZE), MultipleAlignment::NAA, subMat->pBack);
MathUtil::NormalizeTo1((matchWeight+ i * Sequence::PROFILE_AA_SIZE), MultipleAlignment::NAA, subMat->pBack);
}
// remove end gaps
for (size_t k = 0; k < setSize; ++k) {
@@ -23,11 +23,17 @@ class PSSMCalculator {
~PSSMCalculator();
Profile computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs,
bool wg);
bool wg, float bitFactor);
void printProfile(size_t queryLength);
void printPSSM(size_t queryLength);
// prepare pseudocounts
static void preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts, size_t entrySize, size_t queryLength, const float **R);
// compute pseudocounts from Neff_M -p log(p) per column
static void computePseudoCounts(float *profile, float *frequency, float *frequency_with_pseudocounts, size_t entrySize, float *Neff_M, size_t length,float pca, float pcb);
private:
SubstitutionMatrix * subMat;
@@ -49,9 +55,6 @@ class PSSMCalculator {
// PSSM contains log odds PSSM values
char * pssm;
// pseudocount matrix (mem aligned)
float * R;
// number of sequences in subalignment i (only for DEBUGGING)
int *nseqs;
@@ -71,24 +74,18 @@ class PSSMCalculator {
// Both PPMs assume statistical independence between positions in the pattern
// 2.) PSSM Log odds score
// M_{aa,pos}={log(M_{aa,pos} / b_{aa}).
void computeLogPSSM(char *pssm, float *profile, size_t queryLength, float scoreBias);
void computeLogPSSM(char *pssm, float *profile, float bitFactor, size_t queryLength, float scoreBias);
// normalize a fector to 1.0
float NormalizeTo1(float *array, int length, double const *def_array);
// prepare pseudocounts
void preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts, size_t queryLength, const float *R);
// compute the Neff_M per column -p log(p)
void computeNeff_M(float *frequency, float *seqWeight, float *Neff_M, size_t queryLength, size_t setSize, char const **msaSeqs);
// Compute weight for sequence based on "Position-based Sequence Weights' (1994)
void computeSequenceWeights(float *seqWeight, size_t queryLength, size_t setSize, const char **msaSeqs);
// compute pseudocounts from Neff_M -p log(p) per column
void computePseudoCounts(float *profile, float *frequency, float *frequency_with_pseudocounts, size_t length,float pca, float pcb);
void computeMatchWeights(float * matchWeight, float * seqWeight, size_t setSize, size_t queryLength, const char **msaSeqs);
void computeContextSpecificWeights(float * matchWeight, float *seqWeight, float * Neff_M, size_t queryLength, size_t setSize, const char **msaSeqs);
@@ -1,8 +1,10 @@
#include <simd/simd.h>
#include "BaseMatrix.h"
#include "Debug.h"
#include "Util.h"
#include "MathUtil.h"
#include "Sequence.h"
const double BaseMatrix::ANY_BACK = 1E-5;
@@ -53,7 +55,7 @@ BaseMatrix::BaseMatrix(){
probMatrix[i] = new double[alphabetSize];
subMatrix[i] = new short[alphabetSize];
subMatrix2Bit[i] = new short[alphabetSize];
subMatrixPseudoCounts[i] = new float[alphabetSize];
subMatrixPseudoCounts[i] = (float *) malloc_simd_float(alphabetSize * sizeof(float));
for (int j = 0; j < alphabetSize; j++){
probMatrix[i][j] = 0.0;
subMatrix2Bit[i][j] = 0.0;
@@ -71,7 +73,7 @@ BaseMatrix::~BaseMatrix(){
delete[] probMatrix[i];
delete[] subMatrix[i];
delete[] subMatrix2Bit[i];
delete[] subMatrixPseudoCounts[i];
free(subMatrixPseudoCounts[i]);
}
delete[] probMatrix;
delete[] subMatrix2Bit;
View
@@ -166,13 +166,18 @@ class MathUtil {
return exp > 0 ? (exp << MANTISSA_BITS) | (mantissa & ~HIDDEN_BIT) :
(mantissa >> (1 - exp)) & MANTISSA_MAX;
}
#pragma GCC diagnostic pop
static char convertNeffToChar(const float neff) {
unsigned char retVal = static_cast<char>((neff - 1 )/19 * MINIFLOAT_MAX);
return std::max(static_cast<unsigned char>(1), retVal);
float retVal = std::min(255.0f, 1.0f+64.0f*flog2(neff) );
return std::max(static_cast<unsigned char>(1), static_cast<unsigned char>(retVal + 0.5) );
}
static float convertNeffToFloat(unsigned char neffToScale) {
float retNeff = fpow2((static_cast<float>(neffToScale)-1.0f)/64.0f);;
return retNeff;
}
#pragma GCC diagnostic pop
// compute look up table based on stirling approximation
static void computeFactorial(double *output, const size_t range) {
@@ -185,6 +190,29 @@ class MathUtil {
output[score] = log(S_fact);
}
}
// Normalize a float array such that it sums to one
// If it sums to 0 then assign def_array elements to array (optional)
static inline float NormalizeTo1(float* array, int length, const double* def_array = NULL) {
float sum = 0.0f;
for (int k = 0; k < length; k++){
sum += array[k];
}
if (sum != 0.0f) {
float fac = 1.0 / sum;
for (int i = 0; i < length; i++) {
array[i] *= fac;
}
} else if (def_array) {
for (int i = 0; i < length; i++) {
array[i] = def_array[i];
}
}
return sum;
}
};
#endif
@@ -175,6 +175,8 @@ Parameters::Parameters():
align.push_back(PARAM_INCLUDE_IDENTITY);
align.push_back(PARAM_NO_PRELOAD);
align.push_back(PARAM_EARLY_EXIT);
align.push_back(PARAM_PCA);
align.push_back(PARAM_PCB);
align.push_back(PARAM_THREADS);
align.push_back(PARAM_V);
@@ -199,6 +201,8 @@ Parameters::Parameters():
prefilter.push_back(PARAM_SPACED_KMER_MODE);
prefilter.push_back(PARAM_NO_PRELOAD);
prefilter.push_back(PARAM_EARLY_EXIT);
prefilter.push_back(PARAM_PCA);
prefilter.push_back(PARAM_PCB);
prefilter.push_back(PARAM_THREADS);
prefilter.push_back(PARAM_V);
Oops, something went wrong.

0 comments on commit f00600c

Please sign in to comment.