Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
@stamatak
13353 lines (10722 sloc) 383.212 kb
/* RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees
* Copyright August 2006 by Alexandros Stamatakis
*
* Partially derived from
* fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
*
* and
*
* Programs of the PHYLIP package by Joe Felsenstein.
*
* This program is free software; you may redistribute it and/or modify its
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
*
* For any other enquiries send an Email to Alexandros Stamatakis
* Alexandros.Stamatakis@epfl.ch
*
* When publishing work that is based on the results from RAxML-VI-HPC please cite:
*
* Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models".
* Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
*/
#ifdef WIN32
#include <direct.h>
#endif
#ifndef WIN32
#include <sys/times.h>
#include <sys/types.h>
#include <sys/time.h>
#include <unistd.h>
#endif
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <inttypes.h>
#include <getopt.h>
#if (defined(_WAYNE_MPI) || defined (_QUARTET_MPI))
#include <mpi.h>
#endif
#ifdef _USE_PTHREADS
#include <pthread.h>
#endif
#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
#include <xmmintrin.h>
/*
special bug fix, enforces denormalized numbers to be flushed to zero,
without this program is a tiny bit faster though.
#include <emmintrin.h>
#define MM_DAZ_MASK 0x0040
#define MM_DAZ_ON 0x0040
#define MM_DAZ_OFF 0x0000
*/
#endif
#include "axml.h"
#include "globalVariables.h"
#define _PORTABLE_PTHREADS
/***************** UTILITY FUNCTIONS **************************/
double FABS(double x)
{
/* if(x < -1.0E-10)
assert(0);*/
/* if(x < 0.0)
printf("%1.40f\n", x); */
return fabs(x);
}
FILE *getNumberOfTrees(tree *tr, char *fileName, analdef *adef)
{
FILE
*f = myfopen(fileName, "r");
int
trees = 0,
ch;
while((ch = fgetc(f)) != EOF)
if(ch == ';')
trees++;
assert(trees > 0);
tr->numberOfTrees = trees;
if(!adef->allInOne)
printBothOpen("\n\nFound %d trees in File %s\n\n", trees, fileName);
rewind(f);
return f;
}
static void printBoth(FILE *f, const char* format, ... )
{
va_list args;
va_start(args, format);
vfprintf(f, format, args );
va_end(args);
va_start(args, format);
vprintf(format, args );
va_end(args);
}
void printBothOpen(const char* format, ... )
{
#ifdef _QUARTET_MPI
if(processID == 0)
#endif
{
FILE *f = myfopen(infoFileName, "ab");
va_list args;
va_start(args, format);
vfprintf(f, format, args );
va_end(args);
va_start(args, format);
vprintf(format, args );
va_end(args);
fclose(f);
}
}
void printBothOpenMPI(const char* format, ... )
{
#ifdef _WAYNE_MPI
if(processID == 0)
#endif
{
FILE *f = myfopen(infoFileName, "ab");
va_list args;
va_start(args, format);
vfprintf(f, format, args );
va_end(args);
va_start(args, format);
vprintf(format, args );
va_end(args);
fclose(f);
}
}
boolean getSmoothFreqs(int dataType)
{
assert(MIN_MODEL < dataType && dataType < MAX_MODEL);
return pLengths[dataType].smoothFrequencies;
}
const unsigned int *getBitVector(int dataType)
{
assert(MIN_MODEL < dataType && dataType < MAX_MODEL);
return pLengths[dataType].bitVector;
}
int getStates(int dataType)
{
assert(MIN_MODEL < dataType && dataType < MAX_MODEL);
return pLengths[dataType].states;
}
unsigned char getUndetermined(int dataType)
{
assert(MIN_MODEL < dataType && dataType < MAX_MODEL);
return pLengths[dataType].undetermined;
}
char getInverseMeaning(int dataType, unsigned char state)
{
assert(MIN_MODEL < dataType && dataType < MAX_MODEL);
return pLengths[dataType].inverseMeaning[state];
}
partitionLengths *getPartitionLengths(pInfo *p)
{
int
dataType = p->dataType,
states = p->states,
tipLength = p->maxTipStates;
assert(states != -1 && tipLength != -1);
assert(MIN_MODEL < dataType && dataType < MAX_MODEL);
pLength.leftLength = pLength.rightLength = states * states;
pLength.eignLength = states -1;
pLength.evLength = states * states;
pLength.eiLength = states * states - states;
pLength.substRatesLength = (states * states - states) / 2;
pLength.frequenciesLength = states;
pLength.tipVectorLength = tipLength * states;
pLength.symmetryVectorLength = (states * states - states) / 2;
pLength.frequencyGroupingLength = states;
pLength.nonGTR = FALSE;
//pLength.optimizeBaseFrequencies = FALSE;
return (&pLengths[dataType]);
}
static boolean isCat(analdef *adef)
{
if(adef->model == M_PROTCAT || adef->model == M_GTRCAT || adef->model == M_BINCAT || adef->model == M_32CAT || adef->model == M_64CAT)
return TRUE;
else
return FALSE;
}
static boolean isGamma(analdef *adef)
{
if(adef->model == M_PROTGAMMA || adef->model == M_GTRGAMMA || adef->model == M_BINGAMMA ||
adef->model == M_32GAMMA || adef->model == M_64GAMMA)
return TRUE;
else
return FALSE;
}
static int stateAnalyzer(tree *tr, int model, int maxStates)
{
boolean
counter[256],
previous,
inputError = FALSE;
int
lower = tr->partitionData[model].lower,
upper = tr->partitionData[model].upper,
i,
j,
states = 0;
for(i = 0; i < 256; i++)
counter[i] = FALSE;
for(i = 0; i < tr->rdta->numsp; i++)
{
unsigned char *yptr = &(tr->rdta->y0[((size_t)i) * ((size_t)tr->originalCrunchedLength)]);
for(j = lower; j < upper; j++)
if(yptr[j] != getUndetermined(GENERIC_32))
counter[yptr[j]] = TRUE;
}
for(i = 0; i < maxStates; i++)
{
if(counter[i])
states++;
}
previous = counter[0];
for(i = 1; i < 256; i++)
{
if(previous == FALSE && counter[i] == TRUE)
{
inputError = TRUE;
break;
}
else
{
if(previous == TRUE && counter[i] == FALSE)
previous = FALSE;
}
}
if(inputError)
{
printf("Multi State Error, characters must be used in the order they are available, i.e.\n");
printf("0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V\n");
printf("You are using the following characters: \n");
for(i = 0; i < 256; i++)
if(counter[i])
printf("%c ", inverseMeaningGeneric32[i]);
printf("\n");
exit(-1);
}
return states;
}
static void setRateHetAndDataIncrement(tree *tr, analdef *adef)
{
int model;
if(isCat(adef))
tr->rateHetModel = CAT;
else
{
if(adef->useInvariant)
tr->rateHetModel = GAMMA_I;
else
tr->rateHetModel = GAMMA;
}
switch(tr->rateHetModel)
{
case GAMMA:
case GAMMA_I:
tr->discreteRateCategories = 4;
break;
case CAT:
if((adef->boot && !adef->bootstrapBranchLengths) || (adef->mode == CLASSIFY_ML) || (tr->catOnly))
tr->discreteRateCategories = 1;
else
tr->discreteRateCategories = 4;
break;
default:
assert(0);
}
if(adef->bootstrapBranchLengths)
assert(tr->discreteRateCategories == 4);
for(model = 0; model < tr->NumberOfModels; model++)
{
int
states = -1,
maxTipStates = getUndetermined(tr->partitionData[model].dataType) + 1;
switch(tr->partitionData[model].dataType)
{
case BINARY_DATA:
case DNA_DATA:
case AA_DATA:
case SECONDARY_DATA:
case SECONDARY_DATA_6:
case SECONDARY_DATA_7:
states = getStates(tr->partitionData[model].dataType);
break;
case GENERIC_32:
case GENERIC_64:
states = stateAnalyzer(tr, model, getStates(tr->partitionData[model].dataType));
break;
default:
assert(0);
}
tr->partitionData[model].states = states;
tr->partitionData[model].maxTipStates = maxTipStates;
}
}
double gettime(void)
{
#ifdef WIN32
time_t tp;
struct tm localtm;
tp = time(NULL);
localtm = *localtime(&tp);
return 60.0*localtm.tm_min + localtm.tm_sec;
#else
struct timeval ttime;
gettimeofday(&ttime , NULL);
return ttime.tv_sec + ttime.tv_usec * 0.000001;
#endif
}
double randum (int64_t *seed)
{
int64_t sum, mult0, mult1, seed0, seed1, seed2, newseed0, newseed1, newseed2;
double res;
mult0 = 1549;
seed0 = *seed & 4095;
sum = mult0 * seed0;
newseed0 = sum & 4095;
sum >>= 12;
seed1 = (*seed >> 12) & 4095;
mult1 = 406;
sum += mult0 * seed1 + mult1 * seed0;
newseed1 = sum & 4095;
sum >>= 12;
seed2 = (*seed >> 24) & 255;
sum += mult0 * seed2 + mult1 * seed1;
newseed2 = sum & 255;
*seed = newseed2 << 24 | newseed1 << 12 | newseed0;
res = 0.00390625 * (newseed2 + 0.000244140625 * (newseed1 + 0.000244140625 * newseed0));
return res;
}
int filexists(char *filename)
{
FILE *fp;
int res;
fp = fopen(filename,"rb");
if(fp)
{
res = 1;
fclose(fp);
}
else
res = 0;
return res;
}
FILE *myfopen(const char *path, const char *mode)
{
FILE *fp = fopen(path, mode);
if(strcmp(mode,"r") == 0 || strcmp(mode,"rb") == 0)
{
if(fp)
return fp;
else
{
if(processID == 0)
printf("The file %s you want to open for reading does not exist, exiting ...\n", path);
errorExit(-1);
return (FILE *)NULL;
}
}
else
{
if(fp)
return fp;
else
{
if(processID == 0)
printf("The file %s RAxML wants to open for writing or appending can not be opened [mode: %s], exiting ...\n",
path, mode);
errorExit(-1);
return (FILE *)NULL;
}
}
}
/********************* END UTILITY FUNCTIONS ********************/
/******************************some functions for the likelihood computation ****************************/
boolean isTip(int number, int maxTips)
{
assert(number > 0);
if(number <= maxTips)
return TRUE;
else
return FALSE;
}
void getxnode (nodeptr p)
{
nodeptr s;
if ((s = p->next)->x || (s = s->next)->x)
{
p->x = s->x;
s->x = 0;
}
assert(p->x);
}
void hookup (nodeptr p, nodeptr q, double *z, int numBranches)
{
int i;
p->back = q;
q->back = p;
for(i = 0; i < numBranches; i++)
p->z[i] = q->z[i] = z[i];
#ifdef _BASTIEN
for(i = 0; i < numBranches; i++)
p->secondDerivativeValid[i] = q->secondDerivativeValid[i] = FALSE;
#endif
}
void hookupDefault (nodeptr p, nodeptr q, int numBranches)
{
int i;
p->back = q;
q->back = p;
for(i = 0; i < numBranches; i++)
p->z[i] = q->z[i] = defaultz;
#ifdef _BASTIEN
for(i = 0; i < numBranches; i++)
p->secondDerivativeValid[i] = q->secondDerivativeValid[i] = FALSE;
#endif
}
/***********************reading and initializing input ******************/
static void rax_getline_insptr_valid(char **lineptr, size_t *n, size_t ins_ptr )
{
const size_t
n_inc = 1024;
if(ins_ptr >= *n)
{
assert( *n <= (SSIZE_MAX - n_inc));
*n += n_inc;
*lineptr = (char*)rax_realloc((void*)(*lineptr), *n * sizeof(char), FALSE);
assert(*lineptr != 0);
}
}
ssize_t rax_getline(char **lineptr, size_t *n, FILE *h)
{
size_t
ins_ptr = 0;
/* this implementation does not conform to the standard regarding error checking (i.e., asserts on errors ) */
assert(h != (FILE*)NULL);
if(*lineptr == (char *)NULL)
*n = 0;
while(1)
{
int
c = fgetc(h);
/* handle EOF: if no character has been read on the current line throw an error.
Otherwise treat as end-of-line. Don't know if this is correct,
as I don't have the POSIX standard and the linux manpage is unclear. */
if(c == EOF)
{
if(ins_ptr == 0)
return -1;
else
c = '\n';
//break;
}
if(c == '\r')
{
//this is the original GNU implementation
/* windows line-end: must be followed by a '\n'. Don't tolerate anything else. */
//c = fgetc(h);
//assert(c == '\n');
//fixed to essentialy replace windows line endings by '\n'
c = '\n';
}
/* insert character (including '\n') into buffer */
rax_getline_insptr_valid(lineptr, n, ins_ptr);
(*lineptr)[ins_ptr] = c;
++ins_ptr;
if(c == '\n')
break;
}
/* null-terminate */
rax_getline_insptr_valid( lineptr, n, ins_ptr );
(*lineptr)[ins_ptr] = 0;
return ((ssize_t)ins_ptr);
}
static void getnums (rawdata *rdta, analdef *adef)
{
if(fscanf(INFILE, "%d %d", & rdta->numsp, & rdta->sites) != 2)
{
char
*line = NULL;
size_t
len = 0;
ssize_t
read;
int
sequenceLength = 0,
sequences = 0,
taxa = 0,
sites =0;
if(processID == 0)
{
printf("\nRAxML can't, parse the alignment file as phylip file \n");
printf("it will now try to parse it as FASTA file\n\n");
}
while((read = rax_getline(&line, &len, INFILE)) != -1)
{
ssize_t
i = 0;
while((i < read - 1) && (line[i] == ' ' || line[i] == '\t'))
i++;
if(line[i] == '>')
{
if(taxa == 1)
sequenceLength = sites;
if(taxa > 0)
{
if(sites == 0 && processID == 0)
{
printf("Fasta parsing error, RAxML was expecting sequence data before: %s\n", line);
errorExit(-1);
}
assert(sites > 0);
sequences++;
}
if(taxa > 0)
{
if(sequenceLength != sites && processID == 0)
{
printf("Fasta parsing error, RAxML expects an alignment.\n");
printf("the sequence before taxon %s: seems to have a different length\n", line);
errorExit(-1);
}
assert(sequenceLength == sites);
}
taxa++;
sites = 0;
}
else
{
while(i < read - 1)
{
if(!(line[i] == ' ' || line[i] == '\t'))
{
sites++;
}
i++;
}
//printf("sites %d %s\n", sites, line);
}
}
if(sites > 0)
sequences++;
if(taxa != sequences && processID == 0)
{
printf("Fasta parsing error, the number of taxa %d and sequences %d are not equal!\n", taxa, sequences);
errorExit(-1);
}
assert(taxa == sequences);
if(sequenceLength != sites && processID == 0)
{
printf("Fasta parsing error, RAxML expects an alignment.\n");
printf("the last sequence in the alignment seems to have a different length\n");
errorExit(-1);
}
assert(sites == sequenceLength);
if(line)
rax_free(line);
rewind(INFILE);
adef->alignmentFileType = FASTA;
rdta->numsp = taxa;
rdta->sites = sites;
}
if (rdta->numsp < 4)
{
if(processID == 0)
printf("TOO FEW SPECIES\n");
errorExit(-1);
}
if (rdta->sites < 1)
{
if(processID == 0)
printf("TOO FEW SITES\n");
errorExit(-1);
}
return;
}
boolean whitechar (int ch)
{
return (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r');
}
static void uppercase (int *chptr)
{
int ch;
ch = *chptr;
if ((ch >= 'a' && ch <= 'i') || (ch >= 'j' && ch <= 'r')
|| (ch >= 's' && ch <= 'z'))
*chptr = ch + 'A' - 'a';
}
static void getyspace (rawdata *rdta)
{
size_t size = 4 * ((size_t)(rdta->sites / 4 + 1));
int i;
unsigned char *y0;
rdta->y = (unsigned char **) rax_malloc((rdta->numsp + 1) * sizeof(unsigned char *));
assert(rdta->y);
y0 = (unsigned char *) rax_malloc(((size_t)(rdta->numsp + 1)) * size * sizeof(unsigned char));
assert(y0);
rdta->y0 = y0;
for (i = 0; i <= rdta->numsp; i++)
{
rdta->y[i] = y0;
y0 += size;
}
return;
}
static unsigned int KISS32(void)
{
static unsigned int
x = 123456789,
y = 362436069,
z = 21288629,
w = 14921776,
c = 0;
unsigned int t;
x += 545925293;
y ^= (y<<13);
y ^= (y>>17);
y ^= (y<<5);
t = z + w + c;
z = w;
c = (t>>31);
w = t & 2147483647;
return (x+y+w);
}
static boolean setupTree (tree *tr, analdef *adef)
{
nodeptr p0, p, q;
int
i,
j,
tips,
inter;
tr->storedBrLens = (double*)NULL;
if(!adef->readTaxaOnly)
{
tr->bigCutoff = FALSE;
tr->patternPosition = (int*)NULL;
tr->columnPosition = (int*)NULL;
tr->maxCategories = MAX(4, adef->categories);
tr->partitionContributions = (double *)rax_malloc(sizeof(double) * tr->NumberOfModels);
for(i = 0; i < tr->NumberOfModels; i++)
tr->partitionContributions[i] = -1.0;
tr->perPartitionLH = (double *)rax_malloc(sizeof(double) * tr->NumberOfModels);
tr->storedPerPartitionLH = (double *)rax_malloc(sizeof(double) * tr->NumberOfModels);
for(i = 0; i < tr->NumberOfModels; i++)
{
tr->perPartitionLH[i] = 0.0;
tr->storedPerPartitionLH[i] = 0.0;
}
if(adef->grouping)
tr->grouped = TRUE;
else
tr->grouped = FALSE;
if(adef->constraint)
tr->constrained = TRUE;
else
tr->constrained = FALSE;
tr->treeID = 0;
}
tips = tr->mxtips;
inter = tr->mxtips - 1;
if(!adef->readTaxaOnly)
{
tr->yVector = (unsigned char **) rax_malloc((tr->mxtips + 1) * sizeof(unsigned char *));
tr->fracchanges = (double *)rax_malloc(tr->NumberOfModels * sizeof(double));
tr->rawFracchanges = (double *)rax_malloc(tr->NumberOfModels * sizeof(double));
#ifdef _HET
tr->fracchanges_TIP = (double *)rax_malloc(tr->NumberOfModels * sizeof(double));
tr->rawFracchanges_TIP = (double *)rax_malloc(tr->NumberOfModels * sizeof(double));
#endif
tr->likelihoods = (double *)rax_malloc(adef->multipleRuns * sizeof(double));
}
tr->numberOfTrees = -1;
tr->treeStringLength =
2 * (size_t)tr->mxtips + //parentheses
2 * (size_t)tr->mxtips * 64 + //branche lengths with : and . and branch labels and
(size_t)tr->mxtips + //commas
1 + //closing semicolon
(size_t)tr->mxtips * nmlngth; //taxon names
//tr->treeStringLength = tr->mxtips * (nmlngth+128) + 256 + tr->mxtips * 2;
//printf("tips %d Tree String Length %d old length %d\n", tr->mxtips, tr->treeStringLength,tr->mxtips * (nmlngth+128) + 256 + tr->mxtips * 2 );
tr->tree_string = (char*)rax_calloc(tr->treeStringLength, sizeof(char));
if(!adef->readTaxaOnly)
{
tr->td[0].count = 0;
tr->td[0].ti = (traversalInfo *)rax_malloc(sizeof(traversalInfo) * tr->mxtips);
for(i = 0; i < tr->NumberOfModels; i++)
{
tr->fracchanges[i] = -1.0;
tr->rawFracchanges[i] = -1.0;
#ifdef _HET
tr->fracchanges_TIP[i] = -1.0;
tr->rawFracchanges_TIP[i] = -1.0;
#endif
}
tr->fracchange = -1.0;
tr->rawFracchange = -1.0;
#ifdef _HET
tr->fracchange_TIP = -1.0;
tr->rawFracchange_TIP = -1.0;
#endif
tr->constraintVector = (int *)rax_malloc((2 * tr->mxtips) * sizeof(int));
tr->nameList = (char **)rax_malloc(sizeof(char *) * (tips + 1));
}
if (!(p0 = (nodeptr) rax_malloc((tips + 3*inter) * sizeof(node))))
{
printf("ERROR: Unable to obtain sufficient tree memory\n");
return FALSE;
}
if (!(tr->nodep = (nodeptr *) rax_malloc((2*tr->mxtips) * sizeof(nodeptr))))
{
printf("ERROR: Unable to obtain sufficient tree memory, too\n");
return FALSE;
}
tr->nodep[0] = (node *) NULL; /* Use as 1-based array */
for (i = 1; i <= tips; i++)
{
p = p0++;
p->hash = KISS32(); /* hast table stuff */
p->x = 0;
p->number = i;
p->next = p;
p->back = (node *)NULL;
p->bInf = (branchInfo *)NULL;
tr->nodep[i] = p;
}
for (i = tips + 1; i <= tips + inter; i++)
{
q = (node *) NULL;
for (j = 1; j <= 3; j++)
{
p = p0++;
if(j == 1)
p->x = 1;
else
p->x = 0;
p->number = i;
p->next = q;
p->bInf = (branchInfo *)NULL;
p->back = (node *) NULL;
p->hash = 0;
q = p;
}
p->next->next->next = p;
tr->nodep[i] = p;
}
tr->likelihood = unlikely;
tr->start = (node *) NULL;
tr->ntips = 0;
tr->nextnode = 0;
if(!adef->readTaxaOnly)
{
for(i = 0; i < tr->numBranches; i++)
tr->partitionSmoothed[i] = FALSE;
}
return TRUE;
}
static void checkTaxonName(char *buffer, int len)
{
int i;
//printf("checking taxon name\n");
for(i = 0; i < len - 1; i++)
{
boolean valid;
switch(buffer[i])
{
case '\0':
case '\t':
case '\n':
case '\r':
case ' ':
case ':':
case ',':
case '(':
case ')':
case ';':
case '[':
case ']':
case '\'':
valid = FALSE;
break;
default:
valid = TRUE;
}
if(!valid)
{
printf("ERROR: Taxon Name \"%s\" is invalid at position %d, it contains illegal character %c\n", buffer, i, buffer[i]);
printf("Illegal characters in taxon-names are: tabulators, carriage returns, spaces, \":\", \",\", \")\", \"(\", \";\", \"]\", \"[\", \"\'\" \n");
printf("Exiting\n");
exit(-1);
}
}
assert(buffer[len - 1] == '\0');
}
static void printParsingErrorContext(FILE *f)
{
const int64_t
contextWidth = 20;
int64_t
i,
currentPos = ftell(f),
contextPos = MAX(currentPos - contextWidth, 0);
fseek(f, MAX(currentPos - contextWidth, 0), SEEK_SET);
printf("Printing error context:\n\n");
for(i = contextPos; i < currentPos + contextWidth; i++)
{
int
ch = getc(f);
if(ch != EOF)
printf("%c", ch);
else
break;
}
printf("\n\n");
}
static boolean getdata(analdef *adef, rawdata *rdta, tree *tr)
{
int
i,
j,
basesread,
basesnew,
ch, my_i, meaning,
len,
meaningAA[256],
meaningDNA[256],
meaningBINARY[256],
meaningGeneric32[256],
meaningGeneric64[256];
boolean
allread,
firstpass;
char
buffer[nmlngth + 2];
unsigned char
genericChars32[32] = {'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V'};
uint64_t
total = 0,
gaps = 0;
for (i = 0; i < 256; i++)
{
meaningAA[i] = -1;
meaningDNA[i] = -1;
meaningBINARY[i] = -1;
meaningGeneric32[i] = -1;
meaningGeneric64[i] = -1;
}
/* generic 32 data */
for(i = 0; i < 32; i++)
meaningGeneric32[genericChars32[i]] = i;
meaningGeneric32['-'] = getUndetermined(GENERIC_32);
meaningGeneric32['?'] = getUndetermined(GENERIC_32);
/* AA data */
meaningAA['A'] = 0; /* alanine */
meaningAA['R'] = 1; /* arginine */
meaningAA['N'] = 2; /* asparagine*/
meaningAA['D'] = 3; /* aspartic */
meaningAA['C'] = 4; /* cysteine */
meaningAA['Q'] = 5; /* glutamine */
meaningAA['E'] = 6; /* glutamic */
meaningAA['G'] = 7; /* glycine */
meaningAA['H'] = 8; /* histidine */
meaningAA['I'] = 9; /* isoleucine */
meaningAA['L'] = 10; /* leucine */
meaningAA['K'] = 11; /* lysine */
meaningAA['M'] = 12; /* methionine */
meaningAA['F'] = 13; /* phenylalanine */
meaningAA['P'] = 14; /* proline */
meaningAA['S'] = 15; /* serine */
meaningAA['T'] = 16; /* threonine */
meaningAA['W'] = 17; /* tryptophan */
meaningAA['Y'] = 18; /* tyrosine */
meaningAA['V'] = 19; /* valine */
meaningAA['B'] = 20; /* asparagine, aspartic 2 and 3*/
meaningAA['Z'] = 21; /*21 glutamine glutamic 5 and 6*/
meaningAA['X'] =
meaningAA['?'] =
meaningAA['*'] =
meaningAA['-'] =
getUndetermined(AA_DATA);
/* DNA data */
meaningDNA['A'] = 1;
meaningDNA['B'] = 14;
meaningDNA['C'] = 2;
meaningDNA['D'] = 13;
meaningDNA['G'] = 4;
meaningDNA['H'] = 11;
meaningDNA['K'] = 12;
meaningDNA['M'] = 3;
meaningDNA['R'] = 5;
meaningDNA['S'] = 6;
meaningDNA['T'] = 8;
meaningDNA['U'] = 8;
meaningDNA['V'] = 7;
meaningDNA['W'] = 9;
meaningDNA['Y'] = 10;
meaningDNA['N'] =
meaningDNA['O'] =
meaningDNA['X'] =
meaningDNA['-'] =
meaningDNA['?'] =
getUndetermined(DNA_DATA);
/* BINARY DATA */
meaningBINARY['0'] = 1;
meaningBINARY['1'] = 2;
meaningBINARY['-'] =
meaningBINARY['?'] =
getUndetermined(BINARY_DATA);
/*******************************************************************/
basesread = basesnew = 0;
allread = FALSE;
firstpass = TRUE;
ch = ' ';
while (! allread)
{
for(i = 1; i <= tr->mxtips; i++)
{
if(firstpass)
{
ch = getc(INFILE);
while(ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r')
ch = getc(INFILE);
my_i = 0;
do
{
buffer[my_i] = ch;
ch = getc(INFILE);
my_i++;
if(my_i >= nmlngth)
{
if(processID == 0)
{
printf("Taxon Name too long at taxon %d, adapt constant nmlngth in\n", i);
printf("axml.h, current setting %d\n", nmlngth);
}
errorExit(-1);
}
}
while(ch != ' ' && ch != '\n' && ch != '\t' && ch != '\r');
buffer[my_i] = '\0';
len = strlen(buffer) + 1;
checkTaxonName(buffer, len);
tr->nameList[i] = (char *)rax_malloc(sizeof(char) * len);
strcpy(tr->nameList[i], buffer);
while(ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r')
ch = getc(INFILE);
ungetc(ch, INFILE);
}
j = basesread;
while((j < rdta->sites) && ((ch = getc(INFILE)) != EOF) && (ch != '\n') && (ch != '\r'))
{
uppercase(& ch);
assert(tr->dataVector[j + 1] != -1);
switch(tr->dataVector[j + 1])
{
case BINARY_DATA:
meaning = meaningBINARY[ch];
break;
case DNA_DATA:
case SECONDARY_DATA:
case SECONDARY_DATA_6:
case SECONDARY_DATA_7:
/*
still dealing with DNA/RNA here, hence just act if as they where DNA characters
corresponding column merging for sec struct models will take place later
*/
meaning = meaningDNA[ch];
break;
case AA_DATA:
meaning = meaningAA[ch];
break;
case GENERIC_32:
meaning = meaningGeneric32[ch];
break;
case GENERIC_64:
meaning = meaningGeneric64[ch];
break;
default:
assert(0);
}
if (meaning != -1)
{
j++;
rdta->y[i][j] = ch;
}
else
{
if(!whitechar(ch))
{
printf("ERROR: Bad base (%c) at site %d of sequence %d\n",
ch, j + 1, i);
printParsingErrorContext(INFILE);
return FALSE;
}
}
}
if (ch == EOF)
{
printf("ERROR: End-of-file at site %d of sequence %d\n", j + 1, i);
printParsingErrorContext(INFILE);
return FALSE;
}
if (! firstpass && (j == basesread))
i--;
else
{
if (i == 1)
basesnew = j;
else
if (j != basesnew)
{
printf("ERROR: Sequences out of alignment\n");
printf("%d (instead of %d) residues read in sequence %d %s\n",
j - basesread, basesnew - basesread, i, tr->nameList[i]);
printParsingErrorContext(INFILE);
return FALSE;
}
}
while (ch != '\n' && ch != EOF && ch != '\r')
ch = getc(INFILE); /* flush line *//* PC-LINEBREAK*/
}
firstpass = FALSE;
basesread = basesnew;
allread = (basesread >= rdta->sites);
}
if(ch != EOF)
{
int
garbageCount = 0;
do
{
if(!whitechar(ch))
garbageCount++;
if(garbageCount > 0)
{
if(garbageCount == 1)
printf("\nOups, there is garbage at the end of your file:\n\n");
if(garbageCount < 1000)
printf("%c", ch);
if(garbageCount == 1000)
printf("\n .... and so on\n");
}
}
while((ch = getc(INFILE)) != EOF);
if(garbageCount > 0)
{
printf("\n\nRAxML correctly finished parsing your PHYLIP file,\n");
printf("but there seems to be garbage at the end of the file, i.e., non-whitespace characters!\n");
printf("RAxML will exit now\n\n");
errorExit(-1);
}
}
for(j = 1; j <= tr->mxtips; j++)
for(i = 1; i <= rdta->sites; i++)
{
assert(tr->dataVector[i] != -1);
switch(tr->dataVector[i])
{
case BINARY_DATA:
meaning = meaningBINARY[rdta->y[j][i]];
if(meaning == getUndetermined(BINARY_DATA))
gaps++;
break;
case SECONDARY_DATA:
case SECONDARY_DATA_6:
case SECONDARY_DATA_7:
assert(tr->secondaryStructurePairs[i - 1] != -1);
assert(i - 1 == tr->secondaryStructurePairs[tr->secondaryStructurePairs[i - 1]]);
/*
don't worry too much about undetermined column count here for sec-struct, just count
DNA/RNA gaps here and worry about the rest later-on, falling through to DNA again :-)
*/
case DNA_DATA:
meaning = meaningDNA[rdta->y[j][i]];
if(meaning == getUndetermined(DNA_DATA))
gaps++;
break;
case AA_DATA:
meaning = meaningAA[rdta->y[j][i]];
if(meaning == getUndetermined(AA_DATA))
gaps++;
break;
case GENERIC_32:
meaning = meaningGeneric32[rdta->y[j][i]];
if(meaning == getUndetermined(GENERIC_32))
gaps++;
break;
case GENERIC_64:
meaning = meaningGeneric64[rdta->y[j][i]];
if(meaning == getUndetermined(GENERIC_64))
gaps++;
break;
default:
assert(0);
}
total++;
rdta->y[j][i] = meaning;
}
adef->gapyness = (double)gaps / (double)total;
return TRUE;
}
static void parseFasta(analdef *adef, rawdata *rdta, tree *tr)
{
int
index,
meaning,
meaningAA[256],
meaningDNA[256],
meaningBINARY[256],
meaningGeneric32[256],
meaningGeneric64[256];
char
buffer[nmlngth + 2];
unsigned char
genericChars32[32] = {'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V'};
uint64_t
total = 0,
gaps = 0;
for(index = 0; index < 256; index++)
{
meaningAA[index] = -1;
meaningDNA[index] = -1;
meaningBINARY[index] = -1;
meaningGeneric32[index] = -1;
meaningGeneric64[index] = -1;
}
/* generic 32 data */
for(index = 0; index < 32; index++)
meaningGeneric32[genericChars32[index]] = index;
meaningGeneric32['-'] = getUndetermined(GENERIC_32);
meaningGeneric32['?'] = getUndetermined(GENERIC_32);
/* AA data */
meaningAA['A'] = 0; /* alanine */
meaningAA['R'] = 1; /* arginine */
meaningAA['N'] = 2; /* asparagine*/
meaningAA['D'] = 3; /* aspartic */
meaningAA['C'] = 4; /* cysteine */
meaningAA['Q'] = 5; /* glutamine */
meaningAA['E'] = 6; /* glutamic */
meaningAA['G'] = 7; /* glycine */
meaningAA['H'] = 8; /* histidine */
meaningAA['I'] = 9; /* isoleucine */
meaningAA['L'] = 10; /* leucine */
meaningAA['K'] = 11; /* lysine */
meaningAA['M'] = 12; /* methionine */
meaningAA['F'] = 13; /* phenylalanine */
meaningAA['P'] = 14; /* proline */
meaningAA['S'] = 15; /* serine */
meaningAA['T'] = 16; /* threonine */
meaningAA['W'] = 17; /* tryptophan */
meaningAA['Y'] = 18; /* tyrosine */
meaningAA['V'] = 19; /* valine */
meaningAA['B'] = 20; /* asparagine, aspartic 2 and 3*/
meaningAA['Z'] = 21; /*21 glutamine glutamic 5 and 6*/
meaningAA['X'] =
meaningAA['?'] =
meaningAA['*'] =
meaningAA['-'] =
getUndetermined(AA_DATA);
/* DNA data */
meaningDNA['A'] = 1;
meaningDNA['B'] = 14;
meaningDNA['C'] = 2;
meaningDNA['D'] = 13;
meaningDNA['G'] = 4;
meaningDNA['H'] = 11;
meaningDNA['K'] = 12;
meaningDNA['M'] = 3;
meaningDNA['R'] = 5;
meaningDNA['S'] = 6;
meaningDNA['T'] = 8;
meaningDNA['U'] = 8;
meaningDNA['V'] = 7;
meaningDNA['W'] = 9;
meaningDNA['Y'] = 10;
meaningDNA['N'] =
meaningDNA['O'] =
meaningDNA['X'] =
meaningDNA['-'] =
meaningDNA['?'] =
getUndetermined(DNA_DATA);
/* BINARY DATA */
meaningBINARY['0'] = 1;
meaningBINARY['1'] = 2;
meaningBINARY['-'] =
meaningBINARY['?'] =
getUndetermined(BINARY_DATA);
/*******************************************************************/
{
char
*line = NULL;
size_t
len = 0;
ssize_t
read;
int
sequenceLength = 0,
sequences = 0,
taxa = 0,
sites = 0;
while((read = rax_getline(&line, &len, INFILE)) != -1)
{
ssize_t
i = 0;
while((i < read - 1) && (line[i] == ' ' || line[i] == '\t'))
i++;
if(line[i] == '>')
{
int
nameCount = 0,
nameLength;
if(taxa == 1)
sequenceLength = sites;
if(taxa > 0)
{
assert(sites > 0);
sequences++;
}
if(taxa > 0)
assert(sequenceLength == sites);
taxa++;
i++;
while((i < read - 1) && (line[i] == ' ' || line[i] == '\t'))
i++;
while((i < read - 1) && !(line[i] == ' ' || line[i] == '\t'))
{
buffer[nameCount] = line[i];
nameCount++;
i++;
}
if(nameCount >= nmlngth)
{
if(processID == 0)
{
printf("Taxon Name too long at taxon %d, adapt constant nmlngth in\n", taxa);
printf("axml.h, current setting %d\n", nmlngth);
}
errorExit(-1);
}
buffer[nameCount] = '\0';
nameLength = strlen(buffer) + 1;
checkTaxonName(buffer, nameLength);
tr->nameList[taxa] = (char *)rax_malloc(sizeof(char) * nameLength);
strcpy(tr->nameList[taxa], buffer);
sites = 0;
}
else
{
while(i < read - 1)
{
if(!(line[i] == ' ' || line[i] == '\t'))
{
int
ch = line[i];
uppercase(&ch);
assert(tr->dataVector[sites + 1] != -1);
switch(tr->dataVector[sites + 1])
{
case BINARY_DATA:
meaning = meaningBINARY[ch];
break;
case DNA_DATA:
case SECONDARY_DATA:
case SECONDARY_DATA_6:
case SECONDARY_DATA_7:
meaning = meaningDNA[ch];
break;
case AA_DATA:
meaning = meaningAA[ch];
break;
case GENERIC_32:
meaning = meaningGeneric32[ch];
break;
case GENERIC_64:
meaning = meaningGeneric64[ch];
break;
default:
assert(0);
}
if (meaning != -1)
rdta->y[taxa][sites + 1] = ch;
else
{
if(processID == 0)
{
printf("ERROR: Bad base (%c) at site %d of sequence %d\n",
ch, sites + 1, taxa);
}
errorExit(-1);
}
sites++;
}
i++;
}
}
}
if(sites > 0)
sequences++;
/* the assertions below should never fail, the have already been checked in getNums */
assert(taxa == sequences);
assert(sites == sequenceLength);
if(line)
rax_free(line);
}
{
int
i,
j;
for(j = 1; j <= tr->mxtips; j++)
for(i = 1; i <= rdta->sites; i++)
{
assert(tr->dataVector[i] != -1);
switch(tr->dataVector[i])
{
case BINARY_DATA:
meaning = meaningBINARY[rdta->y[j][i]];
if(meaning == getUndetermined(BINARY_DATA))
gaps++;
break;
case SECONDARY_DATA:
case SECONDARY_DATA_6:
case SECONDARY_DATA_7:
assert(tr->secondaryStructurePairs[i - 1] != -1);
assert(i - 1 == tr->secondaryStructurePairs[tr->secondaryStructurePairs[i - 1]]);
/*
don't worry too much about undetermined column count here for sec-struct, just count
DNA/RNA gaps here and worry about the rest later-on, falling through to DNA again :-)
*/
case DNA_DATA:
meaning = meaningDNA[rdta->y[j][i]];
if(meaning == getUndetermined(DNA_DATA))
gaps++;
break;
case AA_DATA:
meaning = meaningAA[rdta->y[j][i]];
if(meaning == getUndetermined(AA_DATA))
gaps++;
break;
case GENERIC_32:
meaning = meaningGeneric32[rdta->y[j][i]];
if(meaning == getUndetermined(GENERIC_32))
gaps++;
break;
case GENERIC_64:
meaning = meaningGeneric64[rdta->y[j][i]];
if(meaning == getUndetermined(GENERIC_64))
gaps++;
break;
default:
assert(0);
}
total++;
rdta->y[j][i] = meaning;
}
}
adef->gapyness = (double)gaps / (double)total;
return;
}
static void inputweights (rawdata *rdta)
{
int i, w, fres;
FILE *weightFile;
int *wv = (int *)rax_malloc(sizeof(int) * rdta->sites);
weightFile = myfopen(weightFileName, "rb");
i = 0;
while((fres = fscanf(weightFile,"%d", &w)) != EOF)
{
if(!fres)
{
if(processID == 0)
printf("error reading weight file probably encountered a non-integer weight value\n");
errorExit(-1);
}
wv[i] = w;
i++;
}
if(i != rdta->sites)
{
if(processID == 0)
printf("number %d of weights not equal to number %d of alignment columns\n", i, rdta->sites);
errorExit(-1);
}
for(i = 1; i <= rdta->sites; i++)
rdta->wgt[i] = wv[i - 1];
fclose(weightFile);
rax_free(wv);
}
static void getinput(analdef *adef, rawdata *rdta, cruncheddata *cdta, tree *tr)
{
int i;
if(!adef->readTaxaOnly)
{
INFILE = myfopen(seq_file, "rb");
getnums(rdta, adef);
}
tr->mxtips = rdta->numsp;
if(!adef->readTaxaOnly)
{
rdta->wgt = (int *) rax_malloc((rdta->sites + 1) * sizeof(int));
cdta->alias = (int *) rax_malloc((rdta->sites + 1) * sizeof(int));
cdta->aliaswgt = (int *) rax_malloc((rdta->sites + 1) * sizeof(int));
cdta->rateCategory = (int *) rax_malloc((rdta->sites + 1) * sizeof(int));
tr->model = (int *) rax_calloc((rdta->sites + 1), sizeof(int));
tr->initialDataVector = (int *) rax_malloc((rdta->sites + 1) * sizeof(int));
tr->extendedDataVector = (int *) rax_malloc((rdta->sites + 1) * sizeof(int));
cdta->patrat = (double *) rax_malloc((rdta->sites + 1) * sizeof(double));
cdta->patratStored = (double *) rax_malloc((rdta->sites + 1) * sizeof(double));
if(!adef->useWeightFile)
{
for (i = 1; i <= rdta->sites; i++)
rdta->wgt[i] = 1;
}
else
{
assert(!adef->useSecondaryStructure);
inputweights(rdta);
}
}
else
tr->NumberOfModels = 0;
tr->multiBranch = 0;
tr->numBranches = 1;
if(!adef->readTaxaOnly)
{
if(adef->useMultipleModel)
{
int ref;
parsePartitions(adef, rdta, tr);
for(i = 1; i <= rdta->sites; i++)
{
ref = tr->model[i];
tr->initialDataVector[i] = tr->initialPartitionData[ref].dataType;
}
}
else
{
int
dataType = -1;
tr->initialPartitionData = (pInfo*)rax_malloc(sizeof(pInfo));
tr->initialPartitionData[0].partitionName = (char*)rax_malloc(128 * sizeof(char));
strcpy(tr->initialPartitionData[0].partitionName, "No Name Provided");
tr->initialPartitionData[0].protModels = adef->proteinMatrix;
if(adef->protEmpiricalFreqs)
tr->initialPartitionData[0].usePredefinedProtFreqs = FALSE;
else
tr->initialPartitionData[0].usePredefinedProtFreqs = TRUE;
if(adef->optimizeBaseFrequencies)
{
tr->initialPartitionData[0].optimizeBaseFrequencies = TRUE;
tr->initialPartitionData[0].usePredefinedProtFreqs = FALSE;
}
else
tr->initialPartitionData[0].optimizeBaseFrequencies = FALSE;
if(adef->ascertainmentBias)
tr->initialPartitionData[0].ascBias = TRUE;
else
tr->initialPartitionData[0].ascBias = FALSE;
tr->NumberOfModels = 1;
if(adef->model == M_PROTCAT || adef->model == M_PROTGAMMA)
dataType = AA_DATA;
if(adef->model == M_GTRCAT || adef->model == M_GTRGAMMA)
dataType = DNA_DATA;
if(adef->model == M_BINCAT || adef->model == M_BINGAMMA)
dataType = BINARY_DATA;
if(adef->model == M_32CAT || adef->model == M_32GAMMA)
dataType = GENERIC_32;
if(adef->model == M_64CAT || adef->model == M_64GAMMA)
dataType = GENERIC_64;
assert(dataType == BINARY_DATA || dataType == DNA_DATA || dataType == AA_DATA ||
dataType == GENERIC_32 || dataType == GENERIC_64);
tr->initialPartitionData[0].dataType = dataType;
if(dataType == AA_DATA && adef->userProteinModel)
{
tr->initialPartitionData[0].protModels = PROT_FILE;
tr->initialPartitionData[0].usePredefinedProtFreqs = TRUE;
tr->initialPartitionData[0].optimizeBaseFrequencies = FALSE;
strcpy(tr->initialPartitionData[0].proteinSubstitutionFileName, proteinModelFileName);
}
for(i = 0; i <= rdta->sites; i++)
{
tr->initialDataVector[i] = dataType;
tr->model[i] = 0;
}
}
if(adef->useSecondaryStructure)
{
memcpy(tr->extendedDataVector, tr->initialDataVector, (rdta->sites + 1) * sizeof(int));
tr->extendedPartitionData =(pInfo*)rax_malloc(sizeof(pInfo) * tr->NumberOfModels);
for(i = 0; i < tr->NumberOfModels; i++)
{
tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(tr->initialPartitionData[i].partitionName) + 1) * sizeof(char));
strcpy(tr->extendedPartitionData[i].partitionName, tr->initialPartitionData[i].partitionName);
strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, tr->initialPartitionData[i].proteinSubstitutionFileName);
strcpy(tr->extendedPartitionData[i].ascFileName, tr->initialPartitionData[i].ascFileName);
tr->extendedPartitionData[i].dataType = tr->initialPartitionData[i].dataType;
tr->extendedPartitionData[i].protModels = tr->initialPartitionData[i].protModels;
tr->extendedPartitionData[i].usePredefinedProtFreqs = tr->initialPartitionData[i].usePredefinedProtFreqs;
tr->extendedPartitionData[i].optimizeBaseFrequencies = tr->initialPartitionData[i].optimizeBaseFrequencies;
tr->extendedPartitionData[i].ascBias = tr->initialPartitionData[i].ascBias;
}
parseSecondaryStructure(tr, adef, rdta->sites);
tr->dataVector = tr->extendedDataVector;
tr->partitionData = tr->extendedPartitionData;
}
else
{
tr->dataVector = tr->initialDataVector;
tr->partitionData = tr->initialPartitionData;
}
for(i = 0; i < tr->NumberOfModels; i++)
if(tr->partitionData[i].dataType == AA_DATA && tr->partitionData[i].protModels == PROT_FILE)
parseProteinModel(tr->partitionData[i].externalAAMatrix, tr->partitionData[i].proteinSubstitutionFileName);
tr->executeModel = (boolean *)rax_malloc(sizeof(boolean) * tr->NumberOfModels);
for(i = 0; i < tr->NumberOfModels; i++)
tr->executeModel[i] = TRUE;
getyspace(rdta);
}
setupTree(tr, adef);
if(!adef->readTaxaOnly)
{
switch(adef->alignmentFileType)
{
case PHYLIP:
if(!getdata(adef, rdta, tr))
{
printf("Problem reading alignment file \n");
errorExit(1);
}
break;
case FASTA:
parseFasta(adef, rdta, tr);
break;
default:
assert(0);
}
tr->nameHash = initStringHashTable(10 * tr->mxtips);
for(i = 1; i <= tr->mxtips; i++)
{
addword(tr->nameList[i], tr->nameHash, i);
}
fclose(INFILE);
}
}
static unsigned char buildStates(int secModel, unsigned char v1, unsigned char v2)
{
unsigned char new = 0;
switch(secModel)
{
case SECONDARY_DATA:
new = v1;
new = new << 4;
new = new | v2;
break;
case SECONDARY_DATA_6:
{
int
meaningDNA[256],
i;
const unsigned char
allowedStates[6][2] = {{'A','T'}, {'C', 'G'}, {'G', 'C'}, {'G','T'}, {'T', 'A'}, {'T', 'G'}};
const unsigned char
finalBinaryStates[6] = {1, 2, 4, 8, 16, 32};
unsigned char
intermediateBinaryStates[6];
int length = 6;
for(i = 0; i < 256; i++)
meaningDNA[i] = -1;
meaningDNA['A'] = 1;
meaningDNA['B'] = 14;
meaningDNA['C'] = 2;
meaningDNA['D'] = 13;
meaningDNA['G'] = 4;
meaningDNA['H'] = 11;
meaningDNA['K'] = 12;
meaningDNA['M'] = 3;
meaningDNA['N'] = 15;
meaningDNA['O'] = 15;
meaningDNA['R'] = 5;
meaningDNA['S'] = 6;
meaningDNA['T'] = 8;
meaningDNA['U'] = 8;
meaningDNA['V'] = 7;
meaningDNA['W'] = 9;
meaningDNA['X'] = 15;
meaningDNA['Y'] = 10;
meaningDNA['-'] = 15;
meaningDNA['?'] = 15;
for(i = 0; i < length; i++)
{
unsigned char n1 = meaningDNA[allowedStates[i][0]];
unsigned char n2 = meaningDNA[allowedStates[i][1]];
new = n1;
new = new << 4;
new = new | n2;
intermediateBinaryStates[i] = new;
}
new = v1;
new = new << 4;
new = new | v2;
for(i = 0; i < length; i++)
{
if(new == intermediateBinaryStates[i])
break;
}
if(i < length)
new = finalBinaryStates[i];
else
{
new = 0;
for(i = 0; i < length; i++)
{
if(v1 & meaningDNA[allowedStates[i][0]])
{
/*printf("Adding %c%c\n", allowedStates[i][0], allowedStates[i][1]);*/
new |= finalBinaryStates[i];
}
if(v2 & meaningDNA[allowedStates[i][1]])
{
/*printf("Adding %c%c\n", allowedStates[i][0], allowedStates[i][1]);*/
new |= finalBinaryStates[i];
}
}
}
}
break;
case SECONDARY_DATA_7:
{
int
meaningDNA[256],
i;
const unsigned char
allowedStates[6][2] = {{'A','T'}, {'C', 'G'}, {'G', 'C'}, {'G','T'}, {'T', 'A'}, {'T', 'G'}};
const unsigned char
finalBinaryStates[7] = {1, 2, 4, 8, 16, 32, 64};
unsigned char
intermediateBinaryStates[7];
for(i = 0; i < 256; i++)
meaningDNA[i] = -1;
meaningDNA['A'] = 1;
meaningDNA['B'] = 14;
meaningDNA['C'] = 2;
meaningDNA['D'] = 13;
meaningDNA['G'] = 4;
meaningDNA['H'] = 11;
meaningDNA['K'] = 12;
meaningDNA['M'] = 3;
meaningDNA['N'] = 15;
meaningDNA['O'] = 15;
meaningDNA['R'] = 5;
meaningDNA['S'] = 6;
meaningDNA['T'] = 8;
meaningDNA['U'] = 8;
meaningDNA['V'] = 7;
meaningDNA['W'] = 9;
meaningDNA['X'] = 15;
meaningDNA['Y'] = 10;
meaningDNA['-'] = 15;
meaningDNA['?'] = 15;
for(i = 0; i < 6; i++)
{
unsigned char n1 = meaningDNA[allowedStates[i][0]];
unsigned char n2 = meaningDNA[allowedStates[i][1]];
new = n1;
new = new << 4;
new = new | n2;
intermediateBinaryStates[i] = new;
}
new = v1;
new = new << 4;
new = new | v2;
for(i = 0; i < 6; i++)
{
/* exact match */
if(new == intermediateBinaryStates[i])
break;
}
if(i < 6)
new = finalBinaryStates[i];
else
{
/* distinguish between exact mismatches and partial mismatches */
for(i = 0; i < 6; i++)
if((v1 & meaningDNA[allowedStates[i][0]]) && (v2 & meaningDNA[allowedStates[i][1]]))
break;
if(i < 6)
{
/* printf("partial mismatch\n"); */
new = 0;
for(i = 0; i < 6; i++)
{
if((v1 & meaningDNA[allowedStates[i][0]]) && (v2 & meaningDNA[allowedStates[i][1]]))
{
/*printf("Adding %c%c\n", allowedStates[i][0], allowedStates[i][1]);*/
new |= finalBinaryStates[i];
}
else
new |= finalBinaryStates[6];
}
}
else
new = finalBinaryStates[6];
}
}
break;
default:
assert(0);
}
return new;
}
static void adaptRdataToSecondary(tree *tr, rawdata *rdta)
{
int *alias = (int*)rax_calloc(rdta->sites, sizeof(int));
int i, j, realPosition;
for(i = 0; i < rdta->sites; i++)
alias[i] = -1;
for(i = 0, realPosition = 0; i < rdta->sites; i++)
{
int partner = tr->secondaryStructurePairs[i];
if(partner != -1)
{
assert(tr->dataVector[i+1] == SECONDARY_DATA || tr->dataVector[i+1] == SECONDARY_DATA_6 || tr->dataVector[i+1] == SECONDARY_DATA_7);
if(i < partner)
{
for(j = 1; j <= rdta->numsp; j++)
{
unsigned char v1 = rdta->y[j][i+1];
unsigned char v2 = rdta->y[j][partner+1];
assert(i+1 < partner+1);
rdta->y[j][i+1] = buildStates(tr->dataVector[i+1], v1, v2);
}
alias[realPosition] = i;
realPosition++;
}
}
else
{
alias[realPosition] = i;
realPosition++;
}
}
assert(rdta->sites - realPosition == tr->numberOfSecondaryColumns / 2);
rdta->sites = realPosition;
for(i = 0; i < rdta->sites; i++)
{
assert(alias[i] != -1);
tr->model[i+1] = tr->model[alias[i]+1];
tr->dataVector[i+1] = tr->dataVector[alias[i]+1];
rdta->wgt[i+1] = rdta->wgt[alias[i]+1];
for(j = 1; j <= rdta->numsp; j++)
rdta->y[j][i+1] = rdta->y[j][alias[i]+1];
}
rax_free(alias);
}
static void sitesort(rawdata *rdta, cruncheddata *cdta, tree *tr, analdef *adef)
{
int
gap,
i,
j,
jj,
jg,
k,
n,
nsp,
*index,
*category = (int*)NULL;
boolean
flip,
tied;
unsigned char
**data;
if(adef->useSecondaryStructure)
{
assert(tr->NumberOfModels > 1 && adef->useMultipleModel);
adaptRdataToSecondary(tr, rdta);
}
if(adef->useMultipleModel)
category = tr->model;
index = cdta->alias;
data = rdta->y;
n = rdta->sites;
nsp = rdta->numsp;
index[0] = -1;
if(adef->compressPatterns)
{
for (gap = n / 2; gap > 0; gap /= 2)
{
for (i = gap + 1; i <= n; i++)
{
j = i - gap;
do
{
jj = index[j];
jg = index[j+gap];
if(adef->useMultipleModel)
{
assert(category[jj] != -1 &&
category[jg] != -1);
flip = (category[jj] > category[jg]);
tied = (category[jj] == category[jg]);
}
else
{
flip = 0;
tied = 1;
}
for (k = 1; (k <= nsp) && tied; k++)
{
flip = (data[k][jj] > data[k][jg]);
tied = (data[k][jj] == data[k][jg]);
}
if (flip)
{
index[j] = jg;
index[j+gap] = jj;
j -= gap;
}
}
while (flip && (j > 0));
}
}
}
}
static void sitecombcrunch (rawdata *rdta, cruncheddata *cdta, tree *tr, analdef *adef, int countAscBias)
{
boolean
tied;
int
i,
sitei,
j,
sitej,
k,
*aliasModel = (int*)NULL,
*aliasSuperModel = (int*)NULL,
undeterminedSites = 0;
if(adef->useMultipleModel)
{
aliasSuperModel = (int*)rax_malloc(sizeof(int) * (rdta->sites + 1));
aliasModel = (int*)rax_malloc(sizeof(int) * (rdta->sites + 1));
}
i = 0;
cdta->alias[0] = cdta->alias[1];
cdta->aliaswgt[0] = 0;
//if(adef->mode == PER_SITE_LL || adef->mode == ANCESTRAL_STATES)
{
int i;
tr->patternPosition = (int*)rax_malloc(sizeof(int) * rdta->sites);
tr->columnPosition = (int*)rax_malloc(sizeof(int) * rdta->sites);
for(i = 0; i < rdta->sites; i++)
{
tr->patternPosition[i] = -1;
tr->columnPosition[i] = -1;
}
}
i = 0;
for (j = 1; j <= rdta->sites; j++)
{
int
allGap = TRUE;
unsigned char
undetermined;
sitei = cdta->alias[i];
sitej = cdta->alias[j];
undetermined = getUndetermined(tr->dataVector[sitej]);
for(k = 1; k <= rdta->numsp; k++)
{
if(rdta->y[k][sitej] != undetermined)
{
allGap = FALSE;
break;
}
}
if(allGap)
undeterminedSites++;
if(!adef->compressPatterns)
tied = 0;
else
{
if(adef->useMultipleModel)
{
tied = (tr->model[sitei] == tr->model[sitej]);
if(tied)
assert(tr->dataVector[sitei] == tr->dataVector[sitej]);
}
else
tied = 1;
}
for (k = 1; tied && (k <= rdta->numsp); k++)
tied = (rdta->y[k][sitei] == rdta->y[k][sitej]);
assert(!(tied && allGap));
if(tied && !allGap)
{
tr->patternPosition[j - 1] = i;
tr->columnPosition[j - 1] = sitej;
cdta->aliaswgt[i] += rdta->wgt[sitej];
if(adef->useMultipleModel)
{
aliasModel[i] = tr->model[sitej];
aliasSuperModel[i] = tr->dataVector[sitej];
}
}
else
{
if(!allGap)
{
if(cdta->aliaswgt[i] > 0)
i++;
tr->patternPosition[j - 1] = i;
tr->columnPosition[j - 1] = sitej;
cdta->aliaswgt[i] = rdta->wgt[sitej];
cdta->alias[i] = sitej;
if(adef->useMultipleModel)
{
aliasModel[i] = tr->model[sitej];
aliasSuperModel[i] = tr->dataVector[sitej];
}
}
}
}
cdta->endsite = i;
if (cdta->aliaswgt[i] > 0)
cdta->endsite++;
if(adef->mode == PER_SITE_LL || adef->mode == ANCESTRAL_STATES || (countAscBias > 0))
{
if(undeterminedSites > 0)
{
printBothOpen("You are trying to infer per site likelihoods or ancestral states or\n");
printBothOpen("do calculations with an ascertainment bias correction\n");
printBothOpen("on an alignment containing %d sites consisting only of undetermined\n", undeterminedSites);
printBothOpen("characters. Please remove them first and then re-run RAxML!\n");
errorExit(-1);
}
for(i = 0; i < rdta->sites; i++)
{
int
p = tr->patternPosition[i],
c = tr->columnPosition[i];
assert(p >= 0 && p < cdta->endsite);
assert(c >= 1 && c <= rdta->sites);
}
}
if(adef->useMultipleModel)
{
for(i = 0; i <= rdta->sites; i++)
{
tr->model[i] = aliasModel[i];
tr->dataVector[i] = aliasSuperModel[i];
}
}
if(adef->useMultipleModel)
{
rax_free(aliasModel);
rax_free(aliasSuperModel);
}
if(undeterminedSites > 0)
printBothOpen("\nAlignment has %d completely undetermined sites that will be automatically removed from the input data\n\n", undeterminedSites);
//exit(-1);
}
static boolean makeweights (analdef *adef, rawdata *rdta, cruncheddata *cdta, tree *tr, int countAscBias)
{
int i;
for (i = 1; i <= rdta->sites; i++)
cdta->alias[i] = i;
sitesort(rdta, cdta, tr, adef);
sitecombcrunch(rdta, cdta, tr, adef, countAscBias);
return TRUE;
}
static boolean makevalues(rawdata *rdta, cruncheddata *cdta, tree *tr, analdef *adef)
{
int i, j, model, fullSites = 0, modelCounter;
unsigned char
*y = (unsigned char *)rax_malloc(((size_t)rdta->numsp) * ((size_t)cdta->endsite) * sizeof(unsigned char)),
*yBUF = (unsigned char *)rax_malloc( ((size_t)rdta->numsp) * ((size_t)cdta->endsite) * sizeof(unsigned char));
for (i = 1; i <= rdta->numsp; i++)
for (j = 0; j < cdta->endsite; j++)
y[(((size_t)(i - 1)) * ((size_t)cdta->endsite)) + j] = rdta->y[i][cdta->alias[j]];
rax_free(rdta->y0);
rax_free(rdta->y);
rdta->y0 = y;
memcpy(yBUF, y, ((size_t)rdta->numsp) * ((size_t)cdta->endsite) * sizeof(unsigned char));
rdta->yBUF = yBUF;
if(!adef->useMultipleModel)
tr->NumberOfModels = 1;
if(adef->useMultipleModel)
{
tr->partitionData[0].lower = 0;
model = tr->model[0];
modelCounter = 0;
i = 1;
while(i < cdta->endsite)
{
if(tr->model[i] != model)
{
tr->partitionData[modelCounter].upper = i;
tr->partitionData[modelCounter + 1].lower = i;
model = tr->model[i];
modelCounter++;
}
i++;
}
if(modelCounter < tr->NumberOfModels - 1)
{
printf("\nYou specified %d partitions, but after parsing and pre-processing ExaML only found %d partitions\n", tr->NumberOfModels, modelCounter + 1);
printf("Presumably one or more partitions vanished because they consisted entirely of undetermined characters.\n");
printf("Please fix your data!\n\n");
exit(-1);
}
tr->partitionData[tr->NumberOfModels - 1].upper = cdta->endsite;
for(i = 0; i < tr->NumberOfModels; i++)
tr->partitionData[i].width = tr->partitionData[i].upper - tr->partitionData[i].lower;
model = tr->model[0];
modelCounter = 0;
tr->model[0] = modelCounter;
i = 1;
while(i < cdta->endsite)
{
if(tr->model[i] != model)
{
model = tr->model[i];
modelCounter++;
tr->model[i] = modelCounter;
}
else
tr->model[i] = modelCounter;
i++;
}
}
else
{
tr->partitionData[0].lower = 0;
tr->partitionData[0].upper = cdta->endsite;
tr->partitionData[0].width = tr->partitionData[0].upper - tr->partitionData[0].lower;
}
tr->rdta = rdta;
tr->cdta = cdta;
tr->invariant = (int *)rax_malloc(cdta->endsite * sizeof(int));
tr->originalDataVector = (int *)rax_malloc(cdta->endsite * sizeof(int));
tr->originalModel = (int *)rax_malloc(cdta->endsite * sizeof(int));
tr->originalWeights = (int *)rax_malloc(cdta->endsite * sizeof(int));
memcpy(tr->originalModel, tr->model, cdta->endsite * sizeof(int));
memcpy(tr->originalDataVector, tr->dataVector, cdta->endsite * sizeof(int));
memcpy(tr->originalWeights, tr->cdta->aliaswgt, cdta->endsite * sizeof(int));
tr->originalCrunchedLength = tr->cdta->endsite;
for(i = 0; i < tr->cdta->endsite; i++)
fullSites += tr->cdta->aliaswgt[i];
tr->fullSites = fullSites;
for(i = 0; i < rdta->numsp; i++)
tr->yVector[i + 1] = &(rdta->y0[((size_t)tr->originalCrunchedLength) * ((size_t)i)]);
return TRUE;
}
static int sequenceSimilarity(unsigned char *tipJ, unsigned char *tipK, int n)
{
int i;
for(i = 0; i < n; i++)
if(*tipJ++ != *tipK++)
return 0;
return 1;
}
static void checkSequences(tree *tr, rawdata *rdta, analdef *adef)
{
int n = tr->mxtips + 1;
int i, j;
int *omissionList = (int *)rax_calloc(n, sizeof(int));
int *undeterminedList = (int *)rax_calloc((rdta->sites + 1), sizeof(int));
int *modelList = (int *)rax_malloc((rdta->sites + 1)* sizeof(int));
int count = 0;
int countNameDuplicates = 0;
int countUndeterminedColumns = 0;
int countOnlyGaps = 0;
int modelCounter = 1;
unsigned char *tipI, *tipJ;
for(i = 1; i < n; i++)
{
for(j = i + 1; j < n; j++)
if(strcmp(tr->nameList[i], tr->nameList[j]) == 0)
{
countNameDuplicates++;
if(processID == 0)
printBothOpen("Sequence names of taxon %d and %d are identical, they are both called %s\n", i, j, tr->nameList[i]);
}
}
if(countNameDuplicates > 0)
{
if(processID == 0)
printBothOpen("ERROR: Found %d taxa that had equal names in the alignment, exiting...\n", countNameDuplicates);
errorExit(-1);
}
if(adef->checkForUndeterminedSequences)
{
for(i = 1; i < n; i++)
{
j = 1;
while(j <= rdta->sites)
{
if(rdta->y[i][j] != getUndetermined(tr->dataVector[j]))
break;
j++;
}
if(j == (rdta->sites + 1))
{
if(processID == 0)
printBothOpen("ERROR: Sequence %s consists entirely of undetermined values which will be treated as missing data\n",
tr->nameList[i]);
countOnlyGaps++;
}
}
if(countOnlyGaps > 0)
{
if(processID == 0)
printBothOpen("ERROR: Found %d sequences that consist entirely of undetermined values, exiting...\n", countOnlyGaps);
errorExit(-1);
}
}
for(i = 0; i <= rdta->sites; i++)
modelList[i] = -1;
for(i = 1; i <= rdta->sites; i++)
{
j = 1;
while(j < n)
{
if(rdta->y[j][i] != getUndetermined(tr->dataVector[i]))
break;
j++;
}
if(j == n)
{
undeterminedList[i] = 1;
if(processID == 0 && !adef->silent)
printBothOpen("IMPORTANT WARNING: Alignment column %d contains only undetermined values which will be treated as missing data\n", i);
countUndeterminedColumns++;
}
else
{
if(adef->useMultipleModel)
{
modelList[modelCounter] = tr->model[i];
modelCounter++;
}
}
}
for(i = 1; i < n; i++)
{
if(omissionList[i] == 0)
{
tipI = &(rdta->y[i][1]);
for(j = i + 1; j < n; j++)
{
if(omissionList[j] == 0)
{
tipJ = &(rdta->y[j][1]);
if(sequenceSimilarity(tipI, tipJ, rdta->sites))
{
if(processID == 0 && !adef->silent)
printBothOpen("\n\nIMPORTANT WARNING: Sequences %s and %s are exactly identical\n", tr->nameList[i], tr->nameList[j]);
omissionList[j] = 1;
count++;
}
}
}
}
}
if(count > 0 || countUndeterminedColumns > 0)
{
char noDupFile[2048];
char noDupModels[2048];
char noDupSecondary[2048];
if(count > 0 && processID == 0 && !adef->silent)
{
printBothOpen("\nIMPORTANT WARNING\n");
printBothOpen("Found %d %s that %s exactly identical to other sequences in the alignment.\n", count, (count == 1)?"sequence":"sequences", (count == 1)?"is":"are");
printBothOpen("Normally they should be excluded from the analysis.\n\n");
}
if(countUndeterminedColumns > 0 && processID == 0 && !adef->silent)
{
printBothOpen("\nIMPORTANT WARNING\n");
printBothOpen("Found %d %s that %s only undetermined values which will be treated as missing data.\n",
countUndeterminedColumns, (countUndeterminedColumns == 1)?"column":"columns", (countUndeterminedColumns == 1)?"contains":"contain");
printBothOpen("Normally these columns should be excluded from the analysis.\n\n");
}
strcpy(noDupFile, seq_file);
strcat(noDupFile, ".reduced");
strcpy(noDupModels, modelFileName);
strcat(noDupModels, ".reduced");
strcpy(noDupSecondary, secondaryStructureFileName);
strcat(noDupSecondary, ".reduced");
if(processID == 0)
{
if(adef->useSecondaryStructure)
{
if(countUndeterminedColumns && !filexists(noDupSecondary))
{
FILE *newFile = myfopen(noDupSecondary, "wb");
int count;
printBothOpen("\nJust in case you might need it, a secondary structure file with \n");
printBothOpen("structure assignments for undetermined columns removed is printed to file %s\n",noDupSecondary);
for(i = 1, count = 0; i <= rdta->sites; i++)
{
if(undeterminedList[i] == 0)
fprintf(newFile, "%c", tr->secondaryStructureInput[i - 1]);
else
count++;
}
assert(count == countUndeterminedColumns);
fprintf(newFile,"\n");
fclose(newFile);
}
else
{
if(countUndeterminedColumns)
{
printBothOpen("\nA secondary structure file with model assignments for undetermined\n");
printBothOpen("columns removed has already been printed to file %s\n",noDupSecondary);
}
}
}
if(adef->useMultipleModel && !filexists(noDupModels) && countUndeterminedColumns)
{
FILE *newFile = myfopen(noDupModels, "wb");
printBothOpen("\nJust in case you might need it, a mixed model file with \n");
printBothOpen("model assignments for undetermined columns removed is printed to file %s\n",noDupModels);
for(i = 0; i < tr->NumberOfModels; i++)
{
boolean modelStillExists = FALSE;
for(j = 1; (j <= rdta->sites) && (!modelStillExists); j++)
{
if(modelList[j] == i)
modelStillExists = TRUE;
}
if(modelStillExists)
{
int k = 1;
int lower, upper;
int parts = 0;
switch(tr->partitionData[i].dataType)
{
case AA_DATA:
{
char
AAmodel[1024];
if(tr->partitionData[i].protModels != PROT_FILE)
{
if(tr->partitionData[i].ascBias)
{
strcpy(AAmodel, "ASC_");
strcat(AAmodel, protModels[tr->partitionData[i].protModels]);
}
else
strcpy(AAmodel, protModels[tr->partitionData[i].protModels]);
if(tr->partitionData[i].usePredefinedProtFreqs == FALSE)
strcat(AAmodel, "F");
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
strcat(AAmodel, "X");
assert(!(tr->partitionData[i].optimizeBaseFrequencies && tr->partitionData[i].usePredefinedProtFreqs));
fprintf(newFile, "%s, ", AAmodel);
}
else
fprintf(newFile, "[%s], ", tr->partitionData[i].proteinSubstitutionFileName);
}
break;
case DNA_DATA:
if(tr->partitionData[i].ascBias)
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "ASC_DNAX, ");
else
fprintf(newFile, "ASC_DNA, ");
}
else
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "DNAX, ");
else
fprintf(newFile, "DNA, ");
}
break;
case BINARY_DATA:
if(tr->partitionData[i].ascBias)
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "ASC_BINX, ");
else
fprintf(newFile, "ASC_BIN, ");
}
else
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "BINX, ");
else
fprintf(newFile, "BIN, ");
}
break;
case GENERIC_32:
if(tr->partitionData[i].ascBias)
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "ASC_MULTIX, ");
else
fprintf(newFile, "ASC_MULTI, ");
}
else
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "MULTIX, ");
else
fprintf(newFile, "MULTI, ");
}
break;
case GENERIC_64:
if(tr->partitionData[i].ascBias)
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "ASC_CODONX, ");
else
fprintf(newFile, "ASC_CODON, ");
}
else
{
if(tr->partitionData[i].optimizeBaseFrequencies == TRUE)
fprintf(newFile, "CODONX, ");
else
fprintf(newFile, "CODON, ");
}
break;
default:
assert(0);
}
fprintf(newFile, "%s = ", tr->partitionData[i].partitionName);
while(k <= rdta->sites)
{
if(modelList[k] == i)
{
lower = k;
while((modelList[k + 1] == i) && (k <= rdta->sites))
k++;
upper = k;
if(lower == upper)
{
if(parts == 0)
fprintf(newFile, "%d", lower);
else
fprintf(newFile, ",%d", lower);
}
else
{
if(parts == 0)
fprintf(newFile, "%d-%d", lower, upper);
else
fprintf(newFile, ",%d-%d", lower, upper);
}
parts++;
}
k++;
}
fprintf(newFile, "\n");
}
}
fclose(newFile);
}
else
{
if(adef->useMultipleModel)
{
printBothOpen("\nA mixed model file with model assignments for undetermined\n");
printBothOpen("columns removed has already been printed to file %s\n",noDupModels);
}
}
if(!filexists(noDupFile))
{
FILE
*newFile;
if(adef->silent && (count || countUndeterminedColumns))
printBothOpen("\nIMPORTANT WARNING: Alignment validation warnings have been suppressed. Found %d duplicate %s and %d undetermined %s\n\n",
count, count > 1 ? "sequences" : "sequence", countUndeterminedColumns, countUndeterminedColumns > 1 ? "columns" : "column");
printBothOpen("Just in case you might need it, an alignment file with \n");
if(count && !countUndeterminedColumns)
printBothOpen("sequence duplicates removed is printed to file %s\n", noDupFile);
if(!count && countUndeterminedColumns)
printBothOpen("undetermined columns removed is printed to file %s\n", noDupFile);
if(count && countUndeterminedColumns)
printBothOpen("sequence duplicates and undetermined columns removed is printed to file %s\n", noDupFile);
newFile = myfopen(noDupFile, "wb");
fprintf(newFile, "%d %d\n", tr->mxtips - count, rdta->sites - countUndeterminedColumns);
for(i = 1; i < n; i++)
{
if(!omissionList[i])
{
fprintf(newFile, "%s ", tr->nameList[i]);
tipI = &(rdta->y[i][1]);
for(j = 0; j < rdta->sites; j++)
{
if(undeterminedList[j + 1] == 0)
fprintf(newFile, "%c", getInverseMeaning(tr->dataVector[j + 1], tipI[j]));
}
fprintf(newFile, "\n");
}
}
fclose(newFile);
}
else
{
if(count && !countUndeterminedColumns)
printBothOpen("An alignment file with sequence duplicates removed has already\n");
if(!count && countUndeterminedColumns)
printBothOpen("An alignment file with undetermined columns removed has already\n");
if(count && countUndeterminedColumns)
printBothOpen("An alignment file with undetermined columns and sequence duplicates removed has already\n");
printBothOpen("been printed to file %s\n", noDupFile);
}
}
}
rax_free(undeterminedList);
rax_free(omissionList);
rax_free(modelList);
}
static void generateBS(tree *tr, analdef *adef)
{
int
i,
j,
k,
w;
char outName[1024], buf[16];
FILE *of;
assert(adef->boot != 0);
for(i = 0; i < adef->multipleRuns; i++)
{
int
count = 0;
computeNextReplicate(tr, &adef->boot, (int*)NULL, (int*)NULL, FALSE, FALSE, adef);
count = 0;
for(j = 0; j < tr->cdta->endsite; j++)
count += tr->cdta->aliaswgt[j];
assert(count == tr->fullSites);
strcpy(outName, workdir);
strcat(outName, seq_file);
strcat(outName, ".BS");
sprintf(buf, "%d", i);
strcat(outName, buf);
printf("Printing replicate %d to %s\n", i, outName);
of = myfopen(outName, "wb");
fprintf(of, "%d %d\n", tr->mxtips, count);
for(j = 1; j <= tr->mxtips; j++)
{
unsigned char *tip = tr->yVector[tr->nodep[j]->number];
fprintf(of, "%s ", tr->nameList[j]);
for(k = 0; k < tr->cdta->endsite; k++)
{
for(w = 0; w < tr->cdta->aliaswgt[k]; w++)
fprintf(of, "%c", getInverseMeaning(tr->dataVector[k], tip[k]));
}
fprintf(of, "\n");
}
fclose(of);
}
}
static void splitMultiGene(tree *tr, rawdata *rdta)
{
int i, l;
int n = rdta->sites + 1;
int *modelFilter = (int *)rax_malloc(sizeof(int) * n);
int length, k;
unsigned char *tip;
FILE *outf;
char outFileName[2048];
/* char buf[16]; */
for(i = 0; i < tr->NumberOfModels; i++)
{
strcpy(outFileName, seq_file);
/*sprintf(buf, "%d", i);*/
/*strcat(outFileName, ".GENE.");*/
strcat(outFileName, ".");
strcat(outFileName, tr->partitionData[i].partitionName);
strcat(outFileName, ".phy");
/*strcat(outFileName, buf);*/
outf = myfopen(outFileName, "wb");
length = 0;
for(k = 1; k < n; k++)
{
if(tr->model[k] == i)
{
modelFilter[k] = 1;
length++;
}
else
modelFilter[k] = -1;
}
fprintf(outf, "%d %d\n", rdta->numsp, length);
for(l = 1; l <= rdta->numsp; l++)
{
fprintf(outf, "%s ", tr->nameList[l]);
tip = &(rdta->y[l][0]);
for(k = 1; k < n; k++)
{
if(modelFilter[k] == 1)
fprintf(outf, "%c", getInverseMeaning(tr->dataVector[k], tip[k]));
}
fprintf(outf, "\n");
}
fclose(outf);
printf("Wrote individual gene/partition alignment to file %s\n", outFileName);
}
rax_free(modelFilter);
printf("Wrote all %d individual gene/partition alignments\n", tr->NumberOfModels);
printf("Exiting normally\n");
}
static int countTaxaInTopology(void)
{
FILE
*f = myfopen(tree_file, "rb");
int
c,
taxaCount = 0;
while((c = fgetc(f)) != EOF)
{
if(c == '(' || c == ',')
{
c = fgetc(f);
if(c == '(' || c == ',')
ungetc(c, f);
else
{
do
{
c = fgetc(f);
}
while(c != ':' && c != ')' && c != ',');
taxaCount++;
ungetc(c, f);
}
}
}
printBothOpen("Found a total of %d taxa in tree file %s\n", taxaCount, tree_file);
fclose(f);
return taxaCount;
}
static void allocPartitions(tree *tr)
{
int
i,
maxCategories = tr->maxCategories;
for(i = 0; i < tr->NumberOfModels; i++)
{
const partitionLengths
*pl = getPartitionLengths(&(tr->partitionData[i]));
if(tr->useFastScaling)
tr->partitionData[i].globalScaler = (unsigned int *)rax_calloc(2 * tr->mxtips, sizeof(unsigned int));
tr->partitionData[i].left = (double *)rax_malloc(pl->leftLength * (maxCategories + 1) * sizeof(double));
tr->partitionData[i].right = (double *)rax_malloc(pl->rightLength * (maxCategories + 1) * sizeof(double));
tr->partitionData[i].EIGN = (double*)rax_malloc(pl->eignLength * sizeof(double));
tr->partitionData[i].EV = (double*)rax_malloc(pl->evLength * sizeof(double));
tr->partitionData[i].EI = (double*)rax_malloc(pl->eiLength * sizeof(double));
tr->partitionData[i].substRates = (double *)rax_malloc(pl->substRatesLength * sizeof(double));
tr->partitionData[i].frequencies = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
tr->partitionData[i].freqExponents = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
tr->partitionData[i].tipVector = (double *)rax_malloc(pl->tipVectorLength * sizeof(double));
tr->partitionData[i].invariableFrequencies = (double *)rax_malloc(pl->states * sizeof(double));
#ifdef _HET
tr->partitionData[i].EIGN_TIP = (double*)rax_malloc(pl->eignLength * sizeof(double));
tr->partitionData[i].EV_TIP = (double*)rax_malloc(pl->evLength * sizeof(double));
tr->partitionData[i].EI_TIP = (double*)rax_malloc(pl->eiLength * sizeof(double));
tr->partitionData[i].substRates_TIP = (double *)rax_malloc(pl->substRatesLength * sizeof(double));
tr->partitionData[i].tipVector_TIP = (double *)rax_malloc(pl->tipVectorLength * sizeof(double));
#endif
if(tr->partitionData[i].protModels == LG4 || tr->partitionData[i].protModels == LG4X)
{
int
k;
for(k = 0; k < 4; k++)
{
tr->partitionData[i].EIGN_LG4[k] = (double*)rax_malloc(pl->eignLength * sizeof(double));
tr->partitionData[i].EV_LG4[k] = (double*)rax_malloc(pl->evLength * sizeof(double));
tr->partitionData[i].EI_LG4[k] = (double*)rax_malloc(pl->eiLength * sizeof(double));
tr->partitionData[i].substRates_LG4[k] = (double *)rax_malloc(pl->substRatesLength * sizeof(double));
tr->partitionData[i].frequencies_LG4[k] = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
tr->partitionData[i].tipVector_LG4[k] = (double *)rax_malloc(pl->tipVectorLength * sizeof(double));
}
}
tr->partitionData[i].symmetryVector = (int *)rax_malloc(pl->symmetryVectorLength * sizeof(int));
tr->partitionData[i].frequencyGrouping = (int *)rax_malloc(pl->frequencyGroupingLength * sizeof(int));
tr->partitionData[i].perSiteRates = (double *)rax_malloc(sizeof(double) * tr->maxCategories);
tr->partitionData[i].unscaled_perSiteRates = (double *)rax_malloc(sizeof(double) * tr->maxCategories);
tr->partitionData[i].nonGTR = FALSE;
tr->partitionData[i].gammaRates = (double*)rax_malloc(sizeof(double) * 4);
tr->partitionData[i].yVector = (unsigned char **)rax_malloc(sizeof(unsigned char*) * (tr->mxtips + 1));
tr->partitionData[i].xVector = (double **)rax_malloc(sizeof(double*) * tr->innerNodes);
tr->partitionData[i].xSpaceVector = (size_t *)rax_calloc(tr->innerNodes, sizeof(size_t));
tr->partitionData[i].expVector = (int **)rax_malloc(sizeof(int*) * tr->innerNodes);
tr->partitionData[i].expSpaceVector = (size_t *)rax_calloc(tr->innerNodes, sizeof(size_t));
tr->partitionData[i].mxtips = tr->mxtips;
//andre-opt
tr->partitionData[i].presenceMap = (unsigned int *)rax_calloc((size_t)tr->mxtips + 1 , sizeof(unsigned int));
#ifndef _USE_PTHREADS
{
int j;
for(j = 1; j <= tr->mxtips; j++)
tr->partitionData[i].yVector[j] = &(tr->yVector[j][tr->partitionData[i].lower]);
}
#endif
}
}
#ifndef _USE_PTHREADS
static void allocNodex (tree *tr)
{
size_t
i,
model,
offset,
memoryRequirements = 0;
allocPartitions(tr);
for(model = 0; model < (size_t)tr->NumberOfModels; model++)
{
size_t
width = tr->partitionData[model].upper - tr->partitionData[model].lower;
int
undetermined,
j;
memoryRequirements += (size_t)(tr->discreteRateCategories) * (size_t)(tr->partitionData[model].states) * width;
//asc
if(tr->partitionData[model].ascBias)
{
tr->partitionData[model].ascOffset = 4 * tr->partitionData[model].states * tr->partitionData[model].states;
tr->partitionData[model].ascVector = (double *)rax_malloc(((size_t)tr->innerNodes) *
((size_t)tr->partitionData[model].ascOffset) *
sizeof(double));
tr->partitionData[model].ascExpVector = (int *)rax_calloc(((size_t)tr->innerNodes) * ((size_t)tr->partitionData[model].states),
sizeof(int));
tr->partitionData[model].ascMissingVector = (int *)rax_calloc((size_t)(tr->mxtips + 1), sizeof(int));
tr->partitionData[model].ascSumBuffer = (double *)rax_malloc(((size_t)tr->partitionData[model].ascOffset) *
sizeof(double));
}
//asc
tr->partitionData[model].gapVectorLength = ((int)width / 32) + 1;
tr->partitionData[model].gapVector = (unsigned int*)rax_calloc(tr->partitionData[model].gapVectorLength * 2 * tr->mxtips, sizeof(unsigned int));
tr->partitionData[model].initialGapVectorSize = tr->partitionData[model].gapVectorLength * 2 * tr->mxtips * sizeof(int);
/* always multiply by 4 due to frequent switching between CAT and GAMMA in standard RAxML */
tr->partitionData[model].gapColumn = (double *)rax_malloc(((size_t)tr->innerNodes) *
((size_t)4) *
((size_t)(tr->partitionData[model].states)) *
sizeof(double));
undetermined = getUndetermined(tr->partitionData[model].dataType);
for(j = 1; j <= tr->mxtips; j++)
for(i = 0; i < width; i++)
if(tr->partitionData[model].yVector[j][i] == undetermined)
tr->partitionData[model].gapVector[tr->partitionData[model].gapVectorLength * j + i / 32] |= mask32[i % 32];
}
tr->perSiteLL = (double *)rax_malloc((size_t)tr->cdta->endsite * sizeof(double));
assert(tr->perSiteLL != NULL);
tr->sumBuffer = (double *)rax_malloc(memoryRequirements * sizeof(double));
assert(tr->sumBuffer != NULL);
offset = 0;
/* C-OPT for initial testing tr->NumberOfModels will be 1 */
for(model = 0; model < (size_t)tr->NumberOfModels; model++)
{
size_t
lower = tr->partitionData[model].lower,
width = tr->partitionData[model].upper - lower;
/* TODO all of this must be reset/adapted when fixModelIndices is called ! */
tr->partitionData[model].sumBuffer = &tr->sumBuffer[offset];
tr->partitionData[model].perSiteLL = &tr->perSiteLL[lower];
tr->partitionData[model].wgt = &tr->cdta->aliaswgt[lower];
tr->partitionData[model].invariant = &tr->invariant[lower];
tr->partitionData[model].rateCategory = &tr->cdta->rateCategory[lower];
offset += (size_t)(tr->discreteRateCategories) * (size_t)(tr->partitionData[model].states) * width;
}
for(i = 0; i < tr->innerNodes; i++)
{
for(model = 0; model < (size_t)tr->NumberOfModels; model++)
{
tr->partitionData[model].expVector[i] = (int*)NULL;
tr->partitionData[model].xVector[i] = (double*)NULL;
}
}
}
#endif
static void initAdef(analdef *adef)
{
adef->useSecondaryStructure = FALSE;
adef->bootstrapBranchLengths = FALSE;
adef->model = M_GTRCAT;
adef->max_rearrange = 21;
adef->stepwidth = 5;
adef->initial = adef->bestTrav = 10;
adef->initialSet = FALSE;
adef->restart = FALSE;
adef->mode = BIG_RAPID_MODE;
adef->categories = 25;
adef->boot = 0;
adef->rapidBoot = 0;
adef->useWeightFile = FALSE;
adef->checkpoints = 0;
adef->startingTreeOnly = 0;
adef->multipleRuns = 1;
adef->useMultipleModel = FALSE;
adef->likelihoodEpsilon = 0.1;
adef->constraint = FALSE;
adef->grouping = FALSE;
adef->randomStartingTree = FALSE;
adef->parsimonySeed = 0;
adef->constraintSeed = 0;
adef->proteinMatrix = JTT;
adef->protEmpiricalFreqs = 0;
adef->outgroup = FALSE;
adef->useInvariant = FALSE;
adef->permuteTreeoptimize = FALSE;
adef->useInvariant = FALSE;
adef->allInOne = FALSE;
adef->likelihoodTest = FALSE;
adef->perGeneBranchLengths = FALSE;
adef->generateBS = FALSE;
adef->bootStopping = FALSE;
adef->gapyness = 0.0;
adef->similarityFilterMode = 0;
adef->useExcludeFile = FALSE;
adef->userProteinModel = FALSE;
adef->computeELW = FALSE;
adef->computeDistance = FALSE;
adef->compressPatterns = TRUE;
adef->readTaxaOnly = FALSE;
adef->useBinaryModelFile = FALSE;
adef->leaveDropMode = FALSE;
adef->slidingWindowSize = 100;
adef->checkForUndeterminedSequences = TRUE;
adef->useQuartetGrouping = FALSE;
adef->alignmentFileType = PHYLIP;
adef->calculateIC = FALSE;
adef->verboseIC = FALSE;
adef->stepwiseAdditionOnly = FALSE;
adef->optimizeBaseFrequencies = FALSE;
adef->ascertainmentBias = FALSE;
adef->rellBootstrap = FALSE;
adef->mesquite = FALSE;
adef->silent = FALSE;
adef->noSequenceCheck = FALSE;
adef->useBFGS = TRUE;
}
static int modelExists(char *model, analdef *adef)
{
int
i;
char
thisModel[1024];
/********** BINARY ********************/
if(strcmp(model, "BINGAMMAI\0") == 0)
{
adef->model = M_BINGAMMA;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "BINGAMMA\0") == 0)
{
adef->model = M_BINGAMMA;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "ASC_BINGAMMA\0") == 0)
{
adef->model = M_BINGAMMA;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "BINCAT\0") == 0)
{
adef->model = M_BINCAT;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "BINCATI\0") == 0)
{
adef->model = M_BINCAT;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "ASC_BINCAT\0") == 0)
{
adef->model = M_BINCAT;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "BINGAMMAX\0") == 0)
{
adef->model = M_BINGAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_BINGAMMAX\0") == 0)
{
adef->model = M_BINGAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "BINCATX\0") == 0)
{
adef->model = M_BINCAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_BINCATX\0") == 0)
{
adef->model = M_BINCAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "BINGAMMAIX\0") == 0)
{
adef->model = M_BINGAMMA;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "BINCATIX\0") == 0)
{
adef->model = M_BINCAT;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/*********** 32 state ****************************/
if(strcmp(model, "MULTIGAMMAI\0") == 0)
{
adef->model = M_32GAMMA;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "MULTIGAMMA\0") == 0)
{
adef->model = M_32GAMMA;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "ASC_MULTIGAMMA\0") == 0)
{
adef->model = M_32GAMMA;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "MULTICAT\0") == 0)
{
adef->model = M_32CAT;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "MULTICATI\0") == 0)
{
adef->model = M_32CAT;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "ASC_MULTICAT\0") == 0)
{
adef->model = M_32CAT;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "MULTIGAMMAX\0") == 0)
{
adef->model = M_32GAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_MULTIGAMMAX\0") == 0)
{
adef->model = M_32GAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "MULTICATX\0") == 0)
{
adef->model = M_32CAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_MULTICATX\0") == 0)
{
adef->model = M_32CAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "MULTIGAMMAIX\0") == 0)
{
adef->model = M_32GAMMA;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "MULTICATIX\0") == 0)
{
adef->model = M_32CAT;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/*********** 64 state ****************************/
if(strcmp(model, "CODONGAMMAI\0") == 0)
{
adef->model = M_64GAMMA;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "CODONGAMMA\0") == 0)
{
adef->model = M_64GAMMA;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "ASC_CODONGAMMA\0") == 0)
{
adef->model = M_64GAMMA;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "CODONCAT\0") == 0)
{
adef->model = M_64CAT;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "CODONCATI\0") == 0)
{
adef->model = M_64CAT;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "ASC_CODONCAT\0") == 0)
{
adef->model = M_64CAT;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "CODONGAMMAX\0") == 0)
{
adef->model = M_64GAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_CODONGAMMAX\0") == 0)
{
adef->model = M_64GAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "CODONCATX\0") == 0)
{
adef->model = M_64CAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_CODONCATX\0") == 0)
{
adef->model = M_64CAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "CODONGAMMAIX\0") == 0)
{
adef->model = M_64GAMMA;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "CODONCATIX\0") == 0)
{
adef->model = M_64CAT;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/*********** DNA **********************/
if(strcmp(model, "GTRGAMMAI\0") == 0)
{
adef->model = M_GTRGAMMA;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "GTRGAMMA\0") == 0)
{
adef->model = M_GTRGAMMA;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "ASC_GTRGAMMA\0") == 0)
{
adef->model = M_GTRGAMMA;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "GTRCAT\0") == 0)
{
adef->model = M_GTRCAT;
adef->useInvariant = FALSE;
return 1;
}
if(strcmp(model, "GTRCATI\0") == 0)
{
adef->model = M_GTRCAT;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "ASC_GTRCAT\0") == 0)
{
adef->model = M_GTRCAT;
adef->useInvariant = FALSE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "GTRGAMMAX\0") == 0)
{
adef->model = M_GTRGAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
//printf("opt base freqs!\n");
return 1;
}
if(strcmp(model, "ASC_GTRGAMMAX\0") == 0)
{
adef->model = M_GTRGAMMA;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
//printf("opt base freqs!\n");
return 1;
}
if(strcmp(model, "GTRCATX\0") == 0)
{
adef->model = M_GTRCAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_GTRCATX\0") == 0)
{
adef->model = M_GTRCAT;
adef->useInvariant = FALSE;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "GTRGAMMAIX\0") == 0)
{
adef->model = M_GTRGAMMA;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "GTRCATI\0") == 0)
{
adef->model = M_GTRCAT;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "GTRCATIX\0") == 0)
{
adef->model = M_GTRCAT;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/*************** AA GTR ********************/
/* TODO empirical FREQS */
if(strcmp(model, "PROTCATGTR\0") == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 1;
return 1;
}
if(strcmp(model, "PROTCATIGTR\0") == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "PROTGAMMAGTR\0") == 0)
{
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 1;
return 1;
}
if(strcmp(model, "ASC_PROTGAMMAGTR\0") == 0)
{
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 1;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "PROTGAMMAIGTR\0") == 0)
{
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR;
adef->useInvariant = TRUE;
adef->protEmpiricalFreqs = 1;
return 1;
}
if(strcmp(model, "PROTCATGTRX\0") == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_PROTCATGTRX\0") == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "PROTGAMMAGTRX\0") == 0)
{
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_PROTGAMMAGTRX\0") == 0)
{
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "PROTGAMMAIGTRX\0") == 0)
{
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR;
adef->useInvariant = TRUE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "PROTCATIGTRX\0") == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR;
adef->useInvariant = TRUE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/*************** AA GTR_UNLINKED ********************/
if(strcmp(model, "PROTCATGTR_UNLINKED\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 1;
return 1;
}
if(strcmp(model, "PROTCATGTR_UNLINKED_X\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "PROTCATIGTR_UNLINKED\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = TRUE;
adef->protEmpiricalFreqs = 1;
return 1;
}
if(strcmp(model, "ASC_PROTCATGTR_UNLINKED\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 1;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "ASC_PROTCATGTR_UNLINKED_X\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "PROTGAMMAGTR_UNLINKED\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 1;
return 1;
}
if(strcmp(model, "PROTGAMMAGTR_UNLINKED_X\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "ASC_PROTGAMMAGTR_UNLINKED\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 1;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "ASC_PROTGAMMAGTR_UNLINKED_X\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = FALSE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
if(strcmp(model, "PROTGAMMAIGTR_UNLINKED\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = TRUE;
return 1;
}
if(strcmp(model, "PROTGAMMAIGTR_UNLINKED_X\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTGAMMA;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = TRUE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
if(strcmp(model, "PROTCATIGTR_UNLINKED_X\0") == 0)
{
printf("Advisory: GTR_UNLINKED only has an effect if specified in the partition file\n");
adef->model = M_PROTCAT;
adef->proteinMatrix = GTR_UNLINKED;
adef->useInvariant = TRUE;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/****************** AA ************************/
for(i = 0; i < NUM_PROT_MODELS - 2; i++)
{
/* check CAT */
strcpy(thisModel, "PROTCAT");
strcat(thisModel, protModels[i]);
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
return 1;
}
/* check CATF */
strcpy(thisModel, "PROTCAT");
strcat(thisModel, protModels[i]);
strcat(thisModel, "F");
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->protEmpiricalFreqs = 1;
return 1;
}
/* check CATX */
strcpy(thisModel, "PROTCAT");
strcat(thisModel, protModels[i]);
strcat(thisModel, "X");
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/* check CATI */
strcpy(thisModel, "PROTCATI");
strcat(thisModel, protModels[i]);
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->useInvariant = TRUE;
return 1;
}
/* check CATIF */
strcpy(thisModel, "PROTCATI");
strcat(thisModel, protModels[i]);
strcat(thisModel, "F");
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->protEmpiricalFreqs = 1;
adef->useInvariant = TRUE;
return 1;
}
/* check CATIX */
strcpy(thisModel, "PROTCATI");
strcat(thisModel, protModels[i]);
strcat(thisModel, "X");
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->protEmpiricalFreqs = 1;
adef->useInvariant = TRUE;
adef->optimizeBaseFrequencies = TRUE;
return 1;
}
/**************check CAT ASC ***********************************/
/* check CAT */
strcpy(thisModel, "ASC_PROTCAT");
strcat(thisModel, protModels[i]);
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->ascertainmentBias = TRUE;
return 1;
}
/* check CATF */
strcpy(thisModel, "ASC_PROTCAT");
strcat(thisModel, protModels[i]);
strcat(thisModel, "F");
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->protEmpiricalFreqs = 1;
adef->ascertainmentBias = TRUE;
return 1;
}
/* check CATX */
strcpy(thisModel, "ASC_PROTCAT");
strcat(thisModel, protModels[i]);
strcat(thisModel, "X");
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTCAT;
adef->proteinMatrix = i;
adef->protEmpiricalFreqs = 0;
adef->optimizeBaseFrequencies = TRUE;
adef->ascertainmentBias = TRUE;
return 1;
}
/****************check GAMMA ************************/
strcpy(thisModel, "PROTGAMMA");
strcat(thisModel, protModels[i]);
if(strcmp(model, thisModel) == 0)
{
adef->model = M_PROTGAMMA;
adef->proteinMatrix = i;
adef->useInvariant = FALSE;
return